//
// ConversationOrchestrator.m
// keyBoard
//
// Created by Mac on 2026/1/15.
//
|
||
#import "ConversationOrchestrator.h"
|
||
#import "ASRStreamClient.h"
|
||
#import "AudioCaptureManager.h"
|
||
#import "AudioSessionManager.h"
|
||
#import "LLMStreamClient.h"
|
||
#import "Segmenter.h"
|
||
#import "SubtitleSync.h"
|
||
#import "TTSPlaybackPipeline.h"
|
||
#import "TTSServiceClient.h"
|
||
|
||
// Class extension: private state and protocol conformances. The
// orchestrator drives the full voice loop:
// capture -> ASR -> LLM -> segmentation -> TTS -> playback -> subtitles.
@interface ConversationOrchestrator () <
    AudioSessionManagerDelegate, AudioCaptureManagerDelegate,
    ASRStreamClientDelegate, LLMStreamClientDelegate, TTSServiceClientDelegate,
    TTSPlaybackPipelineDelegate>

// Modules (collaborators created and wired in -setupModules).
@property(nonatomic, strong) AudioSessionManager *audioSession;
@property(nonatomic, strong) AudioCaptureManager *audioCapture;
@property(nonatomic, strong) ASRStreamClient *asrClient;
@property(nonatomic, strong) LLMStreamClient *llmClient;
@property(nonatomic, strong) Segmenter *segmenter;
@property(nonatomic, strong) TTSServiceClient *ttsClient;
@property(nonatomic, strong) TTSPlaybackPipeline *playbackPipeline;
@property(nonatomic, strong) SubtitleSync *subtitleSync;

// State. All mutations happen inside blocks dispatched to
// orchestratorQueue (see the public entry points below).
@property(nonatomic, assign) ConversationState state;
// Stable for the lifetime of this object; sent with every LLM request.
@property(nonatomic, copy) NSString *conversationId;
// Regenerated for each recording in -startRecording.
@property(nonatomic, copy) NSString *currentSessionId;

// Text tracking for the in-flight assistant response.
@property(nonatomic, strong) NSMutableString *fullAssistantText;
// Maps "seg_N" segment IDs to the segment's text (used by subtitle sync).
@property(nonatomic, strong)
    NSMutableDictionary<NSString *, NSString *> *segmentTextMap;
// Monotonic counter used to mint the next "seg_N" identifier.
@property(nonatomic, assign) NSInteger segmentCounter;

// Serial queue that serializes all orchestration work.
@property(nonatomic, strong) dispatch_queue_t orchestratorQueue;

@end
|
||
|
||
@implementation ConversationOrchestrator
|
||
|
||
#pragma mark - Initialization
|
||
|
||
// Initializes the serial work queue, the idle state, the per-conversation
// identifier, the response-tracking containers, and all collaborators.
- (instancetype)init {
  self = [super init];
  if (self) {
    // All orchestration work (state changes, client callbacks) is
    // funneled through this serial queue.
    _orchestratorQueue = dispatch_queue_create(
        "com.keyboard.aitalk.orchestrator", DISPATCH_QUEUE_SERIAL);
    _state = ConversationStateIdle;
    // One conversation ID per orchestrator instance; passed to the LLM
    // client with every request.
    _conversationId = [[NSUUID UUID] UUIDString];

    _fullAssistantText = [[NSMutableString alloc] init];
    _segmentTextMap = [[NSMutableDictionary alloc] init];
    _segmentCounter = 0;

    // NOTE(review): messages self from -init; safe as long as this class
    // is not subclassed with an overridden -setupModules.
    [self setupModules];
  }
  return self;
}
|
||
|
||
// Creates and wires all collaborator modules; self becomes the delegate
// of every component that reports back asynchronously.
- (void)setupModules {
  // Audio Session (shared singleton).
  self.audioSession = [AudioSessionManager sharedManager];
  self.audioSession.delegate = self;

  // Audio Capture.
  self.audioCapture = [[AudioCaptureManager alloc] init];
  self.audioCapture.delegate = self;

  // ASR Client.
  self.asrClient = [[ASRStreamClient alloc] init];
  self.asrClient.delegate = self;

  // LLM Client.
  self.llmClient = [[LLMStreamClient alloc] init];
  self.llmClient.delegate = self;

  // Segmenter (no delegate; polled via -popReadySegments /
  // -flushRemainingSegment).
  self.segmenter = [[Segmenter alloc] init];

  // TTS Client.
  self.ttsClient = [[TTSServiceClient alloc] init];
  self.ttsClient.delegate = self;
  // ElevenLabs configuration (proxied through the backend).
  self.ttsClient.voiceId = @"JBFqnCBsd6RMkjVDRZzb"; // default voice "George"
  self.ttsClient.languageCode = @"zh"; // Chinese
  self.ttsClient.expectedPayloadType =
      TTSPayloadTypeURL; // URL mode (simpler than raw chunk streaming)

  // Playback Pipeline.
  self.playbackPipeline = [[TTSPlaybackPipeline alloc] init];
  self.playbackPipeline.delegate = self;

  // Subtitle Sync (queried in -pipelineDidUpdatePlaybackTime:segmentId:).
  self.subtitleSync = [[SubtitleSync alloc] init];
}
|
||
|
||
#pragma mark - Configuration Setters
|
||
|
||
// Custom setter: keep an immutable copy locally and forward the URL to
// the ASR client so both always agree.
- (void)setAsrServerURL:(NSString *)asrServerURL {
  self.asrClient.serverURL = asrServerURL;
  _asrServerURL = [asrServerURL copy];
}
|
||
|
||
// Custom setter: keep an immutable copy locally and forward the URL to
// the LLM client so both always agree.
- (void)setLlmServerURL:(NSString *)llmServerURL {
  self.llmClient.serverURL = llmServerURL;
  _llmServerURL = [llmServerURL copy];
}
|
||
|
||
// Custom setter: keep an immutable copy locally and forward the URL to
// the TTS client so both always agree.
- (void)setTtsServerURL:(NSString *)ttsServerURL {
  self.ttsClient.serverURL = ttsServerURL;
  _ttsServerURL = [ttsServerURL copy];
}
|
||
|
||
#pragma mark - User Actions
|
||
|
||
// User pressed the record button. If the assistant is mid-response,
// barge in first; then start recording once microphone permission is
// confirmed.
- (void)userDidPressRecord {
  dispatch_async(self.orchestratorQueue, ^{
    NSLog(@"[Orchestrator] userDidPressRecord, current state: %ld",
          (long)self.state);

    // Interrupt an in-progress response (speaking or thinking).
    BOOL shouldInterrupt = (self.state == ConversationStateSpeaking ||
                            self.state == ConversationStateThinking);
    if (shouldInterrupt) {
      [self performBargein];
    }

    // Fast path: permission already granted.
    if ([self.audioSession hasMicrophonePermission]) {
      [self startRecording];
      return;
    }

    // Ask for permission; on grant, hop back onto the orchestrator queue
    // before recording. Denial is surfaced through the
    // AudioSessionManagerDelegate callback.
    [self.audioSession requestMicrophonePermission:^(BOOL granted) {
      if (!granted) {
        return;
      }
      dispatch_async(self.orchestratorQueue, ^{
        [self startRecording];
      });
    }];
  });
}
|
||
|
||
// User released the record button: stop capturing, request the final ASR
// transcript, and transition to the recognizing state.
- (void)userDidReleaseRecord {
  dispatch_async(self.orchestratorQueue, ^{
    NSLog(@"[Orchestrator] userDidReleaseRecord, current state: %ld",
          (long)self.state);

    // Ignore stray releases (e.g. after an interruption already reset us).
    if (self.state != ConversationStateListening) {
      return;
    }

    // Stop microphone capture.
    [self.audioCapture stopCapture];

    // Request the final ASR result; the transcript arrives via
    // -asrClientDidReceiveFinalText:.
    [self.asrClient finalize];

    // Update state.
    [self updateState:ConversationStateRecognizing];
  });
}
|
||
|
||
// Hard-stops the whole conversation: cancels every in-flight request,
// stops capture and playback, and returns to idle.
- (void)stop {
  dispatch_async(self.orchestratorQueue, ^{
    [self cancelAll];
    [self updateState:ConversationStateIdle];
  });
}
|
||
|
||
#pragma mark - Private: Recording
|
||
|
||
// Starts a new capture + ASR session. Runs on orchestratorQueue.
// On any failure the partially-built session is unwound before returning.
- (void)startRecording {
  // Configure the shared audio session for conversation use.
  NSError *error = nil;
  if (![self.audioSession configureForConversation:&error]) {
    [self reportError:error];
    return;
  }

  if (![self.audioSession activateSession:&error]) {
    [self reportError:error];
    return;
  }

  // Mint a fresh session ID for this utterance.
  self.currentSessionId = [[NSUUID UUID] UUIDString];

  // Start the ASR stream first so captured frames have a sink.
  [self.asrClient startWithSessionId:self.currentSessionId];

  // Start microphone capture. On failure, unwind what was set up above:
  // cancel the ASR stream AND deactivate the audio session.
  // FIX: the original left the just-activated session active on this
  // path, keeping the audio hardware claimed with nothing recording.
  if (![self.audioCapture startCapture:&error]) {
    [self reportError:error];
    [self.asrClient cancel];
    [self.audioSession deactivateSession];
    return;
  }

  // Everything is running; we are now listening.
  [self updateState:ConversationStateListening];
}
|
||
|
||
#pragma mark - Private: Barge-in (打断)
|
||
|
||
// Barge-in: the user pressed record while the assistant was thinking or
// speaking. Cancels all downstream work and clears the partial response.
// Deliberately leaves self.state untouched — the caller performs the
// next transition (e.g. via -startRecording).
- (void)performBargein {
  NSLog(@"[Orchestrator] Performing barge-in");

  // Cancel all in-flight requests.
  [self.ttsClient cancel];
  [self.llmClient cancel];
  [self.asrClient cancel];

  // Stop playback.
  [self.playbackPipeline stop];

  // Clear per-response tracking state.
  [self.segmenter reset];
  [self.segmentTextMap removeAllObjects];
  [self.fullAssistantText setString:@""];
  self.segmentCounter = 0;
}
|
||
|
||
// Tears everything down: capture, all three network clients, playback,
// the segmenter, and — last — the shared audio session. Used by -stop
// and by audio-session interruptions.
- (void)cancelAll {
  [self.audioCapture stopCapture];
  [self.asrClient cancel];
  [self.llmClient cancel];
  [self.ttsClient cancel];
  [self.playbackPipeline stop];
  [self.segmenter reset];
  // Deactivate the shared audio session only after everything that might
  // use it has stopped.
  [self.audioSession deactivateSession];
}
|
||
|
||
#pragma mark - Private: State Management
|
||
|
||
// Transitions to newState (no-op if unchanged) and notifies UI callbacks
// on the main queue: onStateChange always, plus onSpeakingStart /
// onSpeakingEnd on the speaking-state edges.
- (void)updateState:(ConversationState)newState {
  ConversationState oldState = self.state;
  if (oldState == newState) {
    return;
  }
  self.state = newState;

  NSLog(@"[Orchestrator] State: %ld -> %ld", (long)oldState, (long)newState);

  // Precompute the speaking-edge transitions before hopping queues.
  BOOL enteredSpeaking = (newState == ConversationStateSpeaking &&
                          oldState != ConversationStateSpeaking);
  BOOL leftSpeaking = (oldState == ConversationStateSpeaking &&
                       newState != ConversationStateSpeaking);

  dispatch_async(dispatch_get_main_queue(), ^{
    if (self.onStateChange) {
      self.onStateChange(newState);
    }
    if (enteredSpeaking && self.onSpeakingStart) {
      self.onSpeakingStart();
    }
    if (leftSpeaking && self.onSpeakingEnd) {
      self.onSpeakingEnd();
    }
  });
}
|
||
|
||
// Logs the error and forwards it to the onError callback on the main
// queue (callback is optional; skipped when nil).
- (void)reportError:(NSError *)error {
  NSLog(@"[Orchestrator] Error: %@", error.localizedDescription);

  dispatch_async(dispatch_get_main_queue(), ^{
    void (^errorHandler)(NSError *) = self.onError;
    if (errorHandler != nil) {
      errorHandler(error);
    }
  });
}
|
||
|
||
#pragma mark - AudioCaptureManagerDelegate
|
||
|
||
// Capture callback: forward each PCM frame straight to the ASR stream.
// No hop onto orchestratorQueue — presumably to keep audio latency low;
// assumes ASRStreamClient tolerates calls from the capture thread
// (TODO confirm).
- (void)audioCaptureManagerDidOutputPCMFrame:(NSData *)pcmFrame {
  // Send to ASR.
  [self.asrClient sendAudioPCMFrame:pcmFrame];
}
|
||
|
||
// Capture callback: surface the current input level (RMS) to the UI on
// the main queue, e.g. for a volume meter.
- (void)audioCaptureManagerDidUpdateRMS:(float)rms {
  dispatch_async(dispatch_get_main_queue(), ^{
    if (self.onVolumeUpdate) {
      self.onVolumeUpdate(rms);
    }
  });
}
|
||
|
||
#pragma mark - AudioSessionManagerDelegate
|
||
|
||
// Audio-session interruption. When the interruption begins, tear
// everything down and return to idle. The "ended" case is intentionally
// ignored here — nothing resumes automatically (the user must press
// record again).
- (void)audioSessionManagerDidInterrupt:(KBAudioSessionInterruptionType)type {
  dispatch_async(self.orchestratorQueue, ^{
    if (type == KBAudioSessionInterruptionTypeBegan) {
      // Interruption began: stop capture and playback.
      [self cancelAll];
      [self updateState:ConversationStateIdle];
    }
  });
}
|
||
|
||
// Microphone permission was denied: surface a user-facing error telling
// the user to enable it in Settings.
- (void)audioSessionManagerMicrophonePermissionDenied {
  NSDictionary *userInfo =
      @{NSLocalizedDescriptionKey : @"请在设置中开启麦克风权限"};
  NSError *error = [NSError errorWithDomain:@"ConversationOrchestrator"
                                       code:-1
                                   userInfo:userInfo];
  [self reportError:error];
}
|
||
|
||
#pragma mark - ASRStreamClientDelegate
|
||
|
||
// Streaming ASR hypothesis: forwarded to the UI on the main queue.
- (void)asrClientDidReceivePartialText:(NSString *)text {
  dispatch_async(dispatch_get_main_queue(), ^{
    if (self.onPartialText) {
      self.onPartialText(text);
    }
  });
}
|
||
|
||
// Final ASR transcript arrived. Forwards it to the UI, then — if it is
// non-blank — resets the response trackers, starts the playback
// pipeline, and kicks off the LLM request.
- (void)asrClientDidReceiveFinalText:(NSString *)text {
  dispatch_async(self.orchestratorQueue, ^{
    NSLog(@"[Orchestrator] ASR final text: %@", text);

    // Hand the user's final utterance to the UI.
    dispatch_async(dispatch_get_main_queue(), ^{
      if (self.onUserFinalText) {
        self.onUserFinalText(text);
      }
    });

    // FIX: treat whitespace-only transcripts like empty ones. Previously
    // a blank-but-nonempty result (e.g. "\n") passed the length check and
    // triggered a pointless LLM/TTS round.
    NSString *trimmed = [text
        stringByTrimmingCharactersInSet:[NSCharacterSet
                                            whitespaceAndNewlineCharacterSet]];
    if (trimmed.length == 0) {
      [self updateState:ConversationStateIdle];
      return;
    }

    // Move to thinking and start the LLM round.
    [self updateState:ConversationStateThinking];

    // Reset per-response tracking for the new answer.
    [self.fullAssistantText setString:@""];
    [self.segmentTextMap removeAllObjects];
    self.segmentCounter = 0;
    [self.segmenter reset];

    // Start the playback pipeline. A failure here is logged but not
    // fatal — text still streams (NOTE(review): consider reporting it).
    NSError *error = nil;
    if (![self.playbackPipeline start:&error]) {
      NSLog(@"[Orchestrator] Failed to start playback pipeline: %@",
            error.localizedDescription);
    }

    // Send the (original, untrimmed) user text to the LLM.
    [self.llmClient sendUserText:text conversationId:self.conversationId];
  });
}
|
||
|
||
// ASR stream failed: report the error and drop back to idle.
- (void)asrClientDidFail:(NSError *)error {
  dispatch_async(self.orchestratorQueue, ^{
    [self reportError:error];
    [self updateState:ConversationStateIdle];
  });
}
|
||
|
||
#pragma mark - LLMStreamClientDelegate
|
||
|
||
// One streamed LLM token: accumulate it into the full response, feed the
// segmenter, and fire a TTS request for every segment that just became
// ready.
- (void)llmClientDidReceiveToken:(NSString *)token {
  dispatch_async(self.orchestratorQueue, ^{
    [self.fullAssistantText appendString:token];
    [self.segmenter appendToken:token];

    // Drain all segments the segmenter considers complete.
    NSArray<NSString *> *readySegments = [self.segmenter popReadySegments];
    [readySegments enumerateObjectsUsingBlock:^(NSString *segmentText,
                                                NSUInteger idx, BOOL *stop) {
      [self requestTTSForSegment:segmentText];
    }];
  });
}
|
||
|
||
// LLM stream finished: flush any trailing text the segmenter still holds
// into one last TTS request, then hand the complete response to the UI.
- (void)llmClientDidComplete {
  dispatch_async(self.orchestratorQueue, ^{
    NSLog(@"[Orchestrator] LLM complete");

    // Flush the trailing (unterminated) segment, if any.
    NSString *remaining = [self.segmenter flushRemainingSegment];
    if (remaining && remaining.length > 0) {
      [self requestTTSForSegment:remaining];
    }

    // Snapshot the accumulated text before hopping to the main queue.
    NSString *fullText = [self.fullAssistantText copy];
    dispatch_async(dispatch_get_main_queue(), ^{
      if (self.onAssistantFullText) {
        self.onAssistantFullText(fullText);
      }
    });
  });
}
|
||
|
||
// LLM stream failed: report the error and drop back to idle.
- (void)llmClientDidFail:(NSError *)error {
  dispatch_async(self.orchestratorQueue, ^{
    [self reportError:error];
    [self updateState:ConversationStateIdle];
  });
}
|
||
|
||
#pragma mark - Private: TTS Request
|
||
|
||
// Assigns the next "seg_N" identifier to segmentText, records the text
// for subtitle lookup, and asks the TTS client to synthesize it.
// Runs on orchestratorQueue (callers dispatch there).
- (void)requestTTSForSegment:(NSString *)segmentText {
  // Mint a monotonically increasing segment ID.
  NSInteger index = self.segmentCounter;
  self.segmentCounter = index + 1;
  NSString *segmentId = [NSString stringWithFormat:@"seg_%ld", (long)index];

  // Remember the text so subtitle sync can look it up during playback.
  self.segmentTextMap[segmentId] = segmentText;

  NSLog(@"[Orchestrator] Requesting TTS for segment %@: %@", segmentId,
        segmentText);

  [self.ttsClient requestTTSForText:segmentText segmentId:segmentId];
}
|
||
|
||
#pragma mark - TTSServiceClientDelegate
|
||
|
||
// TTS produced a playable URL for a segment: queue it for playback and,
// on the first audible segment, flip Thinking -> Speaking.
- (void)ttsClientDidReceiveURL:(NSURL *)url segmentId:(NSString *)segmentId {
  dispatch_async(self.orchestratorQueue, ^{
    [self.playbackPipeline enqueueURL:url segmentId:segmentId];

    // Only the first enqueue while still thinking changes the state.
    if (self.state != ConversationStateThinking) {
      return;
    }
    [self updateState:ConversationStateSpeaking];
  });
}
|
||
|
||
// TTS produced a raw audio chunk for a segment: queue it for playback
// and, on the first audible segment, flip Thinking -> Speaking.
- (void)ttsClientDidReceiveAudioChunk:(NSData *)chunk
                          payloadType:(TTSPayloadType)type
                            segmentId:(NSString *)segmentId {
  dispatch_async(self.orchestratorQueue, ^{
    [self.playbackPipeline enqueueChunk:chunk
                            payloadType:type
                              segmentId:segmentId];

    // Only the first enqueue while still thinking changes the state.
    if (self.state != ConversationStateThinking) {
      return;
    }
    [self updateState:ConversationStateSpeaking];
  });
}
|
||
|
||
// TTS finished streaming a segment's audio: tell the pipeline no more
// chunks are coming for it.
- (void)ttsClientDidFinishSegment:(NSString *)segmentId {
  dispatch_async(self.orchestratorQueue, ^{
    [self.playbackPipeline markSegmentComplete:segmentId];
  });
}
|
||
|
||
// A TTS request failed. Only reports the error; unlike the ASR/LLM
// failure paths the state is left alone — presumably so already-queued
// segments keep playing (NOTE(review): confirm this is intentional).
- (void)ttsClientDidFail:(NSError *)error {
  dispatch_async(self.orchestratorQueue, ^{
    [self reportError:error];
  });
}
|
||
|
||
#pragma mark - TTSPlaybackPipelineDelegate
|
||
|
||
// Playback started for a segment. The duration parameter is unused here;
// subtitle pacing queries the pipeline for duration on each tick in
// -pipelineDidUpdatePlaybackTime:segmentId:.
- (void)pipelineDidStartSegment:(NSString *)segmentId
                       duration:(NSTimeInterval)duration {
  NSLog(@"[Orchestrator] Started playing segment: %@", segmentId);
}
|
||
|
||
// Playback progress tick: compute how much of the current segment's text
// should be visible (typewriter-style subtitles) and push it to the UI.
- (void)pipelineDidUpdatePlaybackTime:(NSTimeInterval)time
                            segmentId:(NSString *)segmentId {
  dispatch_async(self.orchestratorQueue, ^{
    // Look up the text recorded for this segment; ignore unknown IDs
    // (e.g. after a barge-in cleared segmentTextMap).
    NSString *segmentText = self.segmentTextMap[segmentId];
    if (!segmentText)
      return;

    // Compute the visible portion from elapsed time vs. total duration.
    NSTimeInterval duration =
        [self.playbackPipeline durationForSegment:segmentId];
    NSString *visibleText =
        [self.subtitleSync visibleTextForFullText:segmentText
                                      currentTime:time
                                         duration:duration];

    // TODO: accumulate the text of earlier segments here for a full
    // typewriter effect across the whole response.
    // Simplified implementation: only the current segment is shown.
    dispatch_async(dispatch_get_main_queue(), ^{
      if (self.onAssistantVisibleText) {
        self.onAssistantVisibleText(visibleText);
      }
    });
  });
}
|
||
|
||
// A single segment finished playing; log only. Turn-level cleanup happens
// in -pipelineDidFinishAllSegments.
- (void)pipelineDidFinishSegment:(NSString *)segmentId {
  NSLog(@"[Orchestrator] Finished playing segment: %@", segmentId);
}
|
||
|
||
// Every queued segment has played: the assistant's turn is over. Return
// to idle and release the audio session.
- (void)pipelineDidFinishAllSegments {
  dispatch_async(self.orchestratorQueue, ^{
    NSLog(@"[Orchestrator] All segments finished");

    // Back to idle.
    [self updateState:ConversationStateIdle];
    [self.audioSession deactivateSession];
  });
}
|
||
|
||
// Playback pipeline failed: report the error and drop back to idle.
- (void)pipelineDidFail:(NSError *)error {
  dispatch_async(self.orchestratorQueue, ^{
    [self reportError:error];
    [self updateState:ConversationStateIdle];
  });
}
|
||
|
||
@end
|