149 lines
3.9 KiB
Objective-C
149 lines
3.9 KiB
Objective-C
//
|
||
// Segmenter.m
|
||
// keyBoard
|
||
//
|
||
// Created by Mac on 2026/1/15.
|
||
//
|
||
|
||
#import "Segmenter.h"
|
||
|
||
// Private state of Segmenter, declared in a class extension.
@interface Segmenter ()

/// Completed segments that have been cut from the buffer and are waiting to
/// be handed out by -popReadySegments.
@property(strong, nonatomic) NSMutableArray<NSString *> *readySegments;

/// Text received through -appendToken: that has not yet been cut into a
/// segment.
@property(strong, nonatomic) NSMutableString *buffer;

@end
|
||
|
||
@implementation Segmenter

- (instancetype)init {
  self = [super init];
  if (self) {
    _buffer = [[NSMutableString alloc] init];
    _readySegments = [[NSMutableArray alloc] init];
    // Default buffer length (in UTF-16 code units, as reported by -length)
    // at which a split is forced even without sentence-ending punctuation.
    _maxCharacterThreshold = 30;
  }
  return self;
}

#pragma mark - Public Methods

/// Appends a piece of incoming text to the internal buffer and moves every
/// segment that is now complete into the ready queue.
/// @param token Text to append. nil and empty strings are ignored.
- (void)appendToken:(NSString *)token {
  // Messaging nil returns 0, so this single check covers nil and @"".
  if (token.length == 0) {
    return;
  }

  [self.buffer appendString:token];

  // See whether the buffer can be cut into segments now.
  [self checkAndSplit];
}

/// Returns all segments that are ready and removes them from the queue.
/// @return A snapshot of the queued segments in the order they were cut;
///         empty when nothing is ready.
- (NSArray<NSString *> *)popReadySegments {
  NSArray<NSString *> *segments = [self.readySegments copy];
  [self.readySegments removeAllObjects];
  return segments;
}

/// Empties the buffer and returns whatever text was still pending.
/// @return The buffered text with leading/trailing whitespace and newlines
///         removed, or nil when nothing but whitespace was left.
- (NSString *)flushRemainingSegment {
  // Take an immutable snapshot before clearing the mutable buffer.
  NSString *remaining = [self.buffer copy];
  [self.buffer setString:@""];

  remaining = [remaining
      stringByTrimmingCharactersInSet:[NSCharacterSet
                                          whitespaceAndNewlineCharacterSet]];

  return remaining.length > 0 ? remaining : nil;
}

/// Discards both the buffered text and any segments not yet popped.
- (void)reset {
  [self.buffer setString:@""];
  [self.readySegments removeAllObjects];
}

#pragma mark - Private Methods

/// The forced-split length as an unsigned value; 0 means forced splitting is
/// disabled.
///
/// The declared type of maxCharacterThreshold lives in the header, which is
/// not visible from this file. Reading it through a signed value makes a zero
/// or negative setting switch forced splitting off, instead of producing a
/// zero-length cut that would keep the split loop spinning forever.
- (NSUInteger)forcedSplitLimit {
  NSInteger configured = (NSInteger)self.maxCharacterThreshold;
  return configured > 0 ? (NSUInteger)configured : 0;
}

/// Removes the first `length` UTF-16 units from the buffer and queues them as
/// a segment. The segment is trimmed of surrounding whitespace/newlines and is
/// dropped (but still removed from the buffer) when nothing remains.
- (void)consumeSegmentOfLength:(NSUInteger)length {
  NSString *segment = [self.buffer substringToIndex:length];
  segment = [segment
      stringByTrimmingCharactersInSet:[NSCharacterSet
                                          whitespaceAndNewlineCharacterSet]];

  if (segment.length > 0) {
    [self.readySegments addObject:segment];
  }

  // Drop the consumed prefix.
  [self.buffer deleteCharactersInRange:NSMakeRange(0, length)];
}

/// Returns how many UTF-16 units to cut for a hard split near `limit` without
/// slicing through a composed character sequence (surrogate pair, base
/// character plus combining marks). Always returns at least 1 for non-empty
/// text and a positive limit, so the caller is guaranteed to make progress.
- (NSUInteger)forcedCutLengthForText:(NSString *)text limit:(NSUInteger)limit {
  if (limit >= text.length) {
    return text.length;
  }

  NSRange sequence = [text rangeOfComposedCharacterSequenceAtIndex:limit];
  if (sequence.location == limit) {
    // The cut already falls on a character boundary.
    return limit;
  }

  // The cut would land inside a sequence: back up to its start, unless that
  // would yield an empty cut, in which case take the whole sequence.
  return sequence.location > 0 ? sequence.location : NSMaxRange(sequence);
}

/// Repeatedly cuts segments off the front of the buffer:
///  1. at the first sentence-ending character, if there is one;
///  2. otherwise, once the buffer has reached the forced-split limit, at the
///     break point chosen by -findBestBreakPoint:, or, failing that, at the
///     limit itself;
///  3. otherwise it stops and waits for more text.
- (void)checkAndSplit {
  // Sentence-ending punctuation: ideographic full stop, ASCII and full-width
  // exclamation/question marks (\uFF01, \uFF1F), and newline.
  NSCharacterSet *sentenceEnders = [NSCharacterSet
      characterSetWithCharactersInString:@"。!?\uFF01\uFF1F\n"];
  NSUInteger limit = [self forcedSplitLimit];

  while (self.buffer.length > 0) {
    NSUInteger cutLength = 0;

    // Look for the first sentence-ending character.
    NSRange enderRange = [self.buffer rangeOfCharacterFromSet:sentenceEnders];

    if (enderRange.location != NSNotFound) {
      // Cut right after the punctuation mark.
      cutLength = NSMaxRange(enderRange);
    } else if (limit > 0 && self.buffer.length >= limit) {
      // No sentence end, but the buffer is too long: force a split,
      // preferably at a space, comma or similar soft break.
      NSRange breakRange = [self findBestBreakPoint:self.buffer];

      if (breakRange.location != NSNotFound) {
        cutLength = NSMaxRange(breakRange);
      } else {
        // No soft break available: hard cut at the limit.
        cutLength = [self forcedCutLengthForText:self.buffer limit:limit];
      }
    } else {
      // Nothing to cut yet.
      break;
    }

    [self consumeSegmentOfLength:cutLength];
  }
}

/// Finds the last soft break character (comma, enumeration comma, semicolon,
/// colon or space) in `text`, ignoring the part before half of the
/// forced-split limit so that a forced segment does not become too short.
/// @param text The text to search.
/// @return The range of the chosen break character, or {NSNotFound, 0} when
///         there is none in the searched part.
- (NSRange)findBestBreakPoint:(NSString *)text {
  // ASCII and full-width (\uFF0C, \uFF1B, \uFF1A) comma, semicolon and colon,
  // plus the ideographic enumeration comma and the space.
  NSCharacterSet *breakChars = [NSCharacterSet
      characterSetWithCharactersInString:@",\uFF0C、;\uFF1B:\uFF1A "];

  NSUInteger lowerBound = [self forcedSplitLimit] / 2;
  if (text.length <= lowerBound) {
    return NSMakeRange(NSNotFound, 0);
  }

  // Search backwards so the segment keeps as much text as possible.
  NSRange searchRange = NSMakeRange(lowerBound, text.length - lowerBound);
  NSRange found = [text rangeOfCharacterFromSet:breakChars
                                        options:NSBackwardsSearch
                                          range:searchRange];
  return found.location != NSNotFound ? found : NSMakeRange(NSNotFound, 0);
}

@end
|