//
//  ConversationOrchestrator.m
//  keyBoard
//
//  Created by Mac on 2026/1/15.
//

#import "ConversationOrchestrator.h"
#import "ASRStreamClient.h"
#import "AudioCaptureManager.h"
#import "AudioSessionManager.h"
#import "LLMStreamClient.h"
#import "Segmenter.h"
#import "SubtitleSync.h"
#import "TTSPlaybackPipeline.h"
#import "TTSServiceClient.h"

@interface ConversationOrchestrator () <AudioSessionManagerDelegate,
                                        AudioCaptureManagerDelegate,
                                        ASRStreamClientDelegate,
                                        LLMStreamClientDelegate,
                                        TTSServiceClientDelegate,
                                        TTSPlaybackPipelineDelegate>

// Modules
@property(nonatomic, strong) AudioSessionManager *audioSession;
@property(nonatomic, strong) AudioCaptureManager *audioCapture;
@property(nonatomic, strong) ASRStreamClient *asrClient;
@property(nonatomic, strong) LLMStreamClient *llmClient;
@property(nonatomic, strong) Segmenter *segmenter;
@property(nonatomic, strong) TTSServiceClient *ttsClient;
@property(nonatomic, strong) TTSPlaybackPipeline *playbackPipeline;
@property(nonatomic, strong) SubtitleSync *subtitleSync;

// State
@property(nonatomic, assign) ConversationState state;
@property(nonatomic, copy) NSString *conversationId;
@property(nonatomic, copy) NSString *currentSessionId;

// Text tracking
@property(nonatomic, strong)
    NSMutableDictionary<NSString *, NSString *> *segmentTextMap;
@property(nonatomic, strong) NSMutableString *fullAssistantText;
@property(nonatomic, assign) NSInteger segmentCounter;

// Queue
@property(nonatomic, strong) dispatch_queue_t orchestratorQueue;

@end

@implementation ConversationOrchestrator

#pragma mark - Initialization

- (instancetype)init {
  self = [super init];
  if (self) {
    _orchestratorQueue = dispatch_queue_create(
        "com.keyboard.aitalk.orchestrator", DISPATCH_QUEUE_SERIAL);
    _state = ConversationStateIdle;
    _conversationId = [[NSUUID UUID] UUIDString];
    _fullAssistantText = [[NSMutableString alloc] init];
    _segmentTextMap = [[NSMutableDictionary alloc] init];
    _segmentCounter = 0;
    [self setupModules];
  }
  return self;
}

- (void)setupModules {
  // Audio Session
  self.audioSession = [AudioSessionManager sharedManager];
  self.audioSession.delegate = self;

  // Audio Capture
  self.audioCapture = [[AudioCaptureManager alloc] init];
  self.audioCapture.delegate = self;

  // ASR Client
  self.asrClient = [[ASRStreamClient alloc] init];
  self.asrClient.delegate = self;

  // LLM Client
  self.llmClient = [[LLMStreamClient alloc] init];
  self.llmClient.delegate = self;

  // Segmenter
  self.segmenter = [[Segmenter alloc] init];

  // TTS Client
  self.ttsClient = [[TTSServiceClient alloc] init];
  self.ttsClient.delegate = self;

  // Playback Pipeline
  self.playbackPipeline = [[TTSPlaybackPipeline alloc] init];
  self.playbackPipeline.delegate = self;

  // Subtitle Sync
  self.subtitleSync = [[SubtitleSync alloc] init];
}

#pragma mark - Configuration Setters

- (void)setAsrServerURL:(NSString *)asrServerURL {
  _asrServerURL = [asrServerURL copy];
  self.asrClient.serverURL = asrServerURL;
}

- (void)setLlmServerURL:(NSString *)llmServerURL {
  _llmServerURL = [llmServerURL copy];
  self.llmClient.serverURL = llmServerURL;
}

- (void)setTtsServerURL:(NSString *)ttsServerURL {
  _ttsServerURL = [ttsServerURL copy];
  self.ttsClient.serverURL = ttsServerURL;
}

#pragma mark - User Actions
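// State machine overview (derived from the transitions implemented below):
//
//   Idle        -> Listening    record pressed, microphone permission granted
//   Listening   -> Recognizing  record released; ASR finalizing
//   Recognizing -> Thinking     final ASR text received, LLM streaming
//                  (or -> Idle if the final text is empty)
//   Thinking    -> Speaking     first TTS audio enqueued for playback
//   Speaking/Thinking -> Listening   record pressed again (barge-in)
//   Any         -> Idle         stop, error, interruption, or playback done
//
// All transitions are serialized on orchestratorQueue; UI callbacks are
// dispatched to the main queue in -updateState: and -reportError:.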
- (void)userDidPressRecord {
  dispatch_async(self.orchestratorQueue, ^{
    NSLog(@"[Orchestrator] userDidPressRecord, current state: %ld",
          (long)self.state);

    // If we are currently speaking or thinking, barge in first
    if (self.state == ConversationStateSpeaking ||
        self.state == ConversationStateThinking) {
      [self performBargein];
    }

    // Check microphone permission
    if (![self.audioSession hasMicrophonePermission]) {
      [self.audioSession requestMicrophonePermission:^(BOOL granted) {
        if (granted) {
          dispatch_async(self.orchestratorQueue, ^{
            [self startRecording];
          });
        }
      }];
      return;
    }

    [self startRecording];
  });
}

- (void)userDidReleaseRecord {
  dispatch_async(self.orchestratorQueue, ^{
    NSLog(@"[Orchestrator] userDidReleaseRecord, current state: %ld",
          (long)self.state);

    if (self.state != ConversationStateListening) {
      return;
    }

    // Stop capture
    [self.audioCapture stopCapture];

    // Request the final ASR result
    [self.asrClient finalize];

    // Update state
    [self updateState:ConversationStateRecognizing];
  });
}

- (void)stop {
  dispatch_async(self.orchestratorQueue, ^{
    [self cancelAll];
    [self updateState:ConversationStateIdle];
  });
}

#pragma mark - Private: Recording

- (void)startRecording {
  // Configure the audio session
  NSError *error = nil;
  if (![self.audioSession configureForConversation:&error]) {
    [self reportError:error];
    return;
  }
  if (![self.audioSession activateSession:&error]) {
    [self reportError:error];
    return;
  }

  // Generate a new session ID
  self.currentSessionId = [[NSUUID UUID] UUIDString];

  // Start ASR
  [self.asrClient startWithSessionId:self.currentSessionId];

  // Start audio capture
  if (![self.audioCapture startCapture:&error]) {
    [self reportError:error];
    [self.asrClient cancel];
    return;
  }

  // Update state
  [self updateState:ConversationStateListening];
}

#pragma mark - Private: Barge-in

- (void)performBargein {
  NSLog(@"[Orchestrator] Performing barge-in");

  // Cancel all in-flight requests
  [self.ttsClient cancel];
  [self.llmClient cancel];
  [self.asrClient cancel];

  // Stop playback
  [self.playbackPipeline stop];

  // Clear text/segment state
  [self.segmenter reset];
  [self.segmentTextMap removeAllObjects];
  [self.fullAssistantText setString:@""];
  self.segmentCounter = 0;
}

- (void)cancelAll {
  [self.audioCapture stopCapture];
  [self.asrClient cancel];
  [self.llmClient cancel];
  [self.ttsClient cancel];
  [self.playbackPipeline stop];
  [self.segmenter reset];
  [self.audioSession deactivateSession];
}

#pragma mark - Private: State Management

- (void)updateState:(ConversationState)newState {
  if (self.state == newState) return;

  ConversationState oldState = self.state;
  self.state = newState;
  NSLog(@"[Orchestrator] State: %ld -> %ld", (long)oldState, (long)newState);

  dispatch_async(dispatch_get_main_queue(), ^{
    if (self.onStateChange) {
      self.onStateChange(newState);
    }
    // Special-state callbacks
    if (newState == ConversationStateSpeaking &&
        oldState != ConversationStateSpeaking) {
      if (self.onSpeakingStart) {
        self.onSpeakingStart();
      }
    }
    if (oldState == ConversationStateSpeaking &&
        newState != ConversationStateSpeaking) {
      if (self.onSpeakingEnd) {
        self.onSpeakingEnd();
      }
    }
  });
}

- (void)reportError:(NSError *)error {
  NSLog(@"[Orchestrator] Error: %@", error.localizedDescription);
  dispatch_async(dispatch_get_main_queue(), ^{
    if (self.onError) {
      self.onError(error);
    }
  });
}

#pragma mark - AudioCaptureManagerDelegate

- (void)audioCaptureManagerDidOutputPCMFrame:(NSData *)pcmFrame {
  // Forward to ASR
  [self.asrClient sendAudioPCMFrame:pcmFrame];
}

- (void)audioCaptureManagerDidUpdateRMS:(float)rms {
  dispatch_async(dispatch_get_main_queue(), ^{
    if (self.onVolumeUpdate) {
      self.onVolumeUpdate(rms);
    }
  });
}

#pragma mark - AudioSessionManagerDelegate

- (void)audioSessionManagerDidInterrupt:(KBAudioSessionInterruptionType)type {
  dispatch_async(self.orchestratorQueue, ^{
    if (type == KBAudioSessionInterruptionTypeBegan) {
      // Interruption began: stop capture and playback
      [self cancelAll];
      [self updateState:ConversationStateIdle];
    }
  });
}

- (void)audioSessionManagerMicrophonePermissionDenied {
  NSError *error = [NSError
      errorWithDomain:@"ConversationOrchestrator"
                 code:-1
             userInfo:@{
               NSLocalizedDescriptionKey :
                   @"Please enable microphone access in Settings"
             }];
  [self reportError:error];
}
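// Streaming dataflow (happy path), handled by the delegate methods below:
//
//   ASR final text -> LLMStreamClient (token stream)
//     -> Segmenter (splits the token stream into speakable segments)
//     -> TTSServiceClient (one request per segment, keyed by segmentId)
//     -> TTSPlaybackPipeline (queues and plays segments in order)
//     -> SubtitleSync (reveals text in step with playback time)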
#pragma mark - ASRStreamClientDelegate

- (void)asrClientDidReceivePartialText:(NSString *)text {
  dispatch_async(dispatch_get_main_queue(), ^{
    if (self.onPartialText) {
      self.onPartialText(text);
    }
  });
}

- (void)asrClientDidReceiveFinalText:(NSString *)text {
  dispatch_async(self.orchestratorQueue, ^{
    NSLog(@"[Orchestrator] ASR final text: %@", text);

    // Deliver the user's text to the callback
    dispatch_async(dispatch_get_main_queue(), ^{
      if (self.onUserFinalText) {
        self.onUserFinalText(text);
      }
    });

    // If the text is empty, return to idle
    if (text.length == 0) {
      [self updateState:ConversationStateIdle];
      return;
    }

    // Update state and start the LLM request
    [self updateState:ConversationStateThinking];

    // Reset text tracking
    [self.fullAssistantText setString:@""];
    [self.segmentTextMap removeAllObjects];
    self.segmentCounter = 0;
    [self.segmenter reset];

    // Start the playback pipeline
    NSError *error = nil;
    if (![self.playbackPipeline start:&error]) {
      NSLog(@"[Orchestrator] Failed to start playback pipeline: %@",
            error.localizedDescription);
    }

    // Send the LLM request
    [self.llmClient sendUserText:text conversationId:self.conversationId];
  });
}

- (void)asrClientDidFail:(NSError *)error {
  dispatch_async(self.orchestratorQueue, ^{
    [self reportError:error];
    [self updateState:ConversationStateIdle];
  });
}

#pragma mark - LLMStreamClientDelegate

- (void)llmClientDidReceiveToken:(NSString *)token {
  dispatch_async(self.orchestratorQueue, ^{
    // Append to the full text
    [self.fullAssistantText appendString:token];

    // Feed the segmenter
    [self.segmenter appendToken:token];

    // Check for segments that are ready for TTS
    NSArray<NSString *> *segments = [self.segmenter popReadySegments];
    for (NSString *segmentText in segments) {
      [self requestTTSForSegment:segmentText];
    }
  });
}

- (void)llmClientDidComplete {
  dispatch_async(self.orchestratorQueue, ^{
    NSLog(@"[Orchestrator] LLM complete");

    // Flush the remaining segment
    NSString *remaining = [self.segmenter flushRemainingSegment];
    if (remaining && remaining.length > 0) {
      [self requestTTSForSegment:remaining];
    }

    // Deliver the full text
    NSString *fullText = [self.fullAssistantText copy];
    dispatch_async(dispatch_get_main_queue(), ^{
      if (self.onAssistantFullText) {
        self.onAssistantFullText(fullText);
      }
    });
  });
}

- (void)llmClientDidFail:(NSError *)error {
  dispatch_async(self.orchestratorQueue, ^{
    [self reportError:error];
    [self updateState:ConversationStateIdle];
  });
}

#pragma mark - Private: TTS Request

- (void)requestTTSForSegment:(NSString *)segmentText {
  NSString *segmentId =
      [NSString stringWithFormat:@"seg_%ld", (long)self.segmentCounter++];

  // Record the segment text so subtitles can look it up during playback
  self.segmentTextMap[segmentId] = segmentText;

  NSLog(@"[Orchestrator] Requesting TTS for segment %@: %@", segmentId,
        segmentText);

  // Request TTS
  [self.ttsClient requestTTSForText:segmentText segmentId:segmentId];
}

#pragma mark - TTSServiceClientDelegate

- (void)ttsClientDidReceiveURL:(NSURL *)url segmentId:(NSString *)segmentId {
  dispatch_async(self.orchestratorQueue, ^{
    [self.playbackPipeline enqueueURL:url segmentId:segmentId];

    // If we are still thinking, switch to speaking
    if (self.state == ConversationStateThinking) {
      [self updateState:ConversationStateSpeaking];
    }
  });
}

- (void)ttsClientDidReceiveAudioChunk:(NSData *)chunk
                          payloadType:(TTSPayloadType)type
                            segmentId:(NSString *)segmentId {
  dispatch_async(self.orchestratorQueue, ^{
    [self.playbackPipeline enqueueChunk:chunk
                            payloadType:type
                              segmentId:segmentId];

    // If we are still thinking, switch to speaking
    if (self.state == ConversationStateThinking) {
      [self updateState:ConversationStateSpeaking];
    }
  });
}

- (void)ttsClientDidFinishSegment:(NSString *)segmentId {
  dispatch_async(self.orchestratorQueue, ^{
    [self.playbackPipeline markSegmentComplete:segmentId];
  });
}

- (void)ttsClientDidFail:(NSError *)error {
  dispatch_async(self.orchestratorQueue, ^{
    [self reportError:error];
  });
}
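// Subtitle timing note: -pipelineDidUpdatePlaybackTime:segmentId: below maps
// the playback position within a segment to a visible prefix of that
// segment's text. SubtitleSync is assumed to interpolate linearly over the
// segment duration, so the revealed text tracks the audio only approximately.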
#pragma mark - TTSPlaybackPipelineDelegate

- (void)pipelineDidStartSegment:(NSString *)segmentId
                       duration:(NSTimeInterval)duration {
  NSLog(@"[Orchestrator] Started playing segment: %@", segmentId);
}

- (void)pipelineDidUpdatePlaybackTime:(NSTimeInterval)time
                            segmentId:(NSString *)segmentId {
  dispatch_async(self.orchestratorQueue, ^{
    // Look up the segment text
    NSString *segmentText = self.segmentTextMap[segmentId];
    if (!segmentText) return;

    // Compute the visible text
    NSTimeInterval duration =
        [self.playbackPipeline durationForSegment:segmentId];
    NSString *visibleText =
        [self.subtitleSync visibleTextForFullText:segmentText
                                      currentTime:time
                                         duration:duration];

    // TODO: Accumulate the text of earlier segments here to get a complete
    // typewriter effect. Simplified implementation: only the current
    // segment is shown.
    dispatch_async(dispatch_get_main_queue(), ^{
      if (self.onAssistantVisibleText) {
        self.onAssistantVisibleText(visibleText);
      }
    });
  });
}

- (void)pipelineDidFinishSegment:(NSString *)segmentId {
  NSLog(@"[Orchestrator] Finished playing segment: %@", segmentId);
}

- (void)pipelineDidFinishAllSegments {
  dispatch_async(self.orchestratorQueue, ^{
    NSLog(@"[Orchestrator] All segments finished");

    // Return to idle
    [self updateState:ConversationStateIdle];
    [self.audioSession deactivateSession];
  });
}

- (void)pipelineDidFail:(NSError *)error {
  dispatch_async(self.orchestratorQueue, ^{
    [self reportError:error];
    [self updateState:ConversationStateIdle];
  });
}

@end
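// Usage sketch (a minimal example, not the definitive integration; assumes
// ConversationOrchestrator.h exposes the server-URL and callback block
// properties referenced above, and the URLs below are placeholders):
//
//   ConversationOrchestrator *orchestrator =
//       [[ConversationOrchestrator alloc] init];
//   orchestrator.asrServerURL = @"wss://example.com/asr";   // placeholder
//   orchestrator.llmServerURL = @"https://example.com/llm"; // placeholder
//   orchestrator.ttsServerURL = @"https://example.com/tts"; // placeholder
//   orchestrator.onPartialText = ^(NSString *text) { /* live transcript */ };
//   orchestrator.onAssistantVisibleText = ^(NSString *text) { /* subtitles */ };
//   orchestrator.onError = ^(NSError *error) { /* surface to the user */ };
//
//   // Push-to-talk: hold to record, release to send.
//   [orchestrator userDidPressRecord];
//   // ... user speaks ...
//   [orchestrator userDidReleaseRecord];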