Files
keyboard/keyBoard/Class/AiTalk/VM/Segmenter.m

149 lines
3.9 KiB
Objective-C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//
// Segmenter.m
// keyBoard
//
// Created by Mac on 2026/1/15.
//
#import "Segmenter.h"
// Private state of Segmenter, kept out of the public header.
@interface Segmenter ()
// Text received through -appendToken: that has not yet been cut into a
// segment.
@property(nonatomic, strong) NSMutableString *buffer;
// Finished segments waiting to be handed out (and cleared) by
// -popReadySegments.
@property(nonatomic, strong) NSMutableArray<NSString *> *readySegments;
@end
@implementation Segmenter

// Creates an empty segmenter. The forced-split threshold defaults to 30
// UTF-16 code units.
- (instancetype)init {
  self = [super init];
  if (self) {
    _buffer = [[NSMutableString alloc] init];
    _readySegments = [[NSMutableArray alloc] init];
    _maxCharacterThreshold = 30;
  }
  return self;
}

#pragma mark - Public Methods

// Appends `token` to the pending buffer and immediately cuts off every segment
// that is now complete. nil and empty tokens are ignored (messaging nil yields
// a length of 0).
- (void)appendToken:(NSString *)token {
  if (token.length == 0) {
    return;
  }
  [self.buffer appendString:token];
  // See whether the new text completed one or more segments.
  [self checkAndSplit];
}

// Returns an immutable snapshot of the segments completed so far and clears
// the internal queue, so each segment is handed out exactly once.
- (NSArray<NSString *> *)popReadySegments {
  NSArray<NSString *> *segments = [self.readySegments copy];
  [self.readySegments removeAllObjects];
  return segments;
}

// Empties the pending buffer and returns its whitespace-trimmed content, or
// nil when nothing but whitespace was left.
- (NSString *)flushRemainingSegment {
  NSString *remaining = [self.buffer copy];
  [self.buffer setString:@""];
  // Strip leading and trailing whitespace/newlines.
  remaining = [remaining
      stringByTrimmingCharactersInSet:[NSCharacterSet
                                          whitespaceAndNewlineCharacterSet]];
  return remaining.length > 0 ? remaining : nil;
}

// Discards both the pending buffer and any segments not yet popped.
- (void)reset {
  [self.buffer setString:@""];
  [self.readySegments removeAllObjects];
}

#pragma mark - Private Methods

// Repeatedly cuts segments off the front of the buffer until no split
// condition holds any more. A cut happens
//   1. right after the first sentence-ending character, or, if there is none,
//   2. once the buffer has reached maxCharacterThreshold: preferably after a
//      soft break character (see -findBestBreakPoint:), otherwise at the
//      threshold itself.
- (void)checkAndSplit {
  // Sentence-ending punctuation. Both the fullwidth forms (written as escapes
  // so they cannot be confused with, or normalised into, their ASCII
  // look-alikes) and the ASCII forms are accepted.
  NSCharacterSet *sentenceEnders = [NSCharacterSet
      characterSetWithCharactersInString:@"\u3002\uFF01\uFF1F!?\n"];

  // A threshold <= 0 disables forced splitting. Without this guard a threshold
  // of 0 would cut zero characters per iteration and never terminate.
  NSInteger threshold = (NSInteger)self.maxCharacterThreshold;

  while (self.buffer.length > 0) {
    NSString *pending = self.buffer;
    NSUInteger cutIndex = NSNotFound;

    NSRange enderRange = [pending rangeOfCharacterFromSet:sentenceEnders];
    if (enderRange.location != NSNotFound) {
      // Cut right after the sentence ender.
      cutIndex = NSMaxRange(enderRange);
    } else if (threshold > 0 && pending.length >= (NSUInteger)threshold) {
      // No sentence ender but the buffer is too long: force a split,
      // preferably at a comma/space-like character.
      NSRange breakRange = [self findBestBreakPoint:pending];
      if (breakRange.location != NSNotFound) {
        cutIndex = NSMaxRange(breakRange);
      } else {
        // No soft break point: hard cut at the threshold, nudged so that a
        // surrogate pair / composed character sequence is never torn apart.
        cutIndex = [self composedCharacterBoundaryNearIndex:(NSUInteger)threshold
                                                   inString:pending];
      }
    }

    if (cutIndex == NSNotFound) {
      // No split condition is met; wait for more tokens.
      break;
    }
    [self moveBufferPrefixToReadySegments:cutIndex];
  }
}

// Removes the first `index` UTF-16 units from the buffer and, if they contain
// anything besides whitespace, queues the trimmed text as a ready segment.
- (void)moveBufferPrefixToReadySegments:(NSUInteger)index {
  NSUInteger cut = MIN(index, self.buffer.length);
  NSString *segment = [[self.buffer substringToIndex:cut]
      stringByTrimmingCharactersInSet:[NSCharacterSet
                                          whitespaceAndNewlineCharacterSet]];
  if (segment.length > 0) {
    [self.readySegments addObject:segment];
  }
  [self.buffer deleteCharactersInRange:NSMakeRange(0, cut)];
}

// Returns `index` if it already lies on a composed-character boundary of
// `text`. Otherwise returns the start of the sequence containing it, or the
// end of that sequence when the start is 0, so that a cut always makes
// progress. The result never exceeds text.length.
- (NSUInteger)composedCharacterBoundaryNearIndex:(NSUInteger)index
                                        inString:(NSString *)text {
  if (index == 0 || index >= text.length) {
    return MIN(index, text.length);
  }
  NSRange sequence = [text rangeOfComposedCharacterSequenceAtIndex:index];
  if (sequence.location == index) {
    return index;
  }
  return sequence.location > 0 ? sequence.location : NSMaxRange(sequence);
}

// Finds the last soft break character (comma, enumeration comma, semicolon,
// colon or space, in ASCII or fullwidth form) located at or after index
// maxCharacterThreshold / 2. Searching backwards keeps as much text as
// possible in the segment. Returns a range of length 1 at the break
// character, or {NSNotFound, 0} when there is none.
- (NSRange)findBestBreakPoint:(NSString *)text {
  NSCharacterSet *breakChars = [NSCharacterSet
      characterSetWithCharactersInString:@",;: \uFF0C\u3001\uFF1B\uFF1A"];

  // Work in unsigned space with explicit bounds instead of counting a signed
  // index down past zero, which could wrap around if the threshold property is
  // declared with an unsigned type.
  NSInteger threshold = (NSInteger)self.maxCharacterThreshold;
  NSUInteger lowerBound = threshold > 0 ? (NSUInteger)threshold / 2 : 0;
  NSUInteger length = text.length;
  if (lowerBound >= length) {
    return NSMakeRange(NSNotFound, 0);
  }

  return [text rangeOfCharacterFromSet:breakChars
                               options:NSBackwardsSearch
                                 range:NSMakeRange(lowerBound,
                                                   length - lowerBound)];
}

@end