Add voice WebSocket support and related modules; not tested yet

commit b021fd308f
parent 169a1929d7
Date:   2026-01-16 13:38:03 +08:00

33 changed files with 5098 additions and 8 deletions

@@ -0,0 +1,527 @@
//
// ConversationOrchestrator.m
// keyBoard
//
// Created by Mac on 2026/1/15.
//
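// Orchestrates the voice-conversation loop: microphone capture, streaming ASR,
// LLM token streaming, sentence segmentation, TTS synthesis, playback, and
// subtitle synchronization.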
#import "ConversationOrchestrator.h"
#import "ASRStreamClient.h"
#import "AudioCaptureManager.h"
#import "AudioSessionManager.h"
#import "LLMStreamClient.h"
#import "Segmenter.h"
#import "SubtitleSync.h"
#import "TTSPlaybackPipeline.h"
#import "TTSServiceClient.h"
@interface ConversationOrchestrator () <
AudioSessionManagerDelegate, AudioCaptureManagerDelegate,
ASRStreamClientDelegate, LLMStreamClientDelegate, TTSServiceClientDelegate,
TTSPlaybackPipelineDelegate>
// Sub-modules
@property(nonatomic, strong) AudioSessionManager *audioSession;
@property(nonatomic, strong) AudioCaptureManager *audioCapture;
@property(nonatomic, strong) ASRStreamClient *asrClient;
@property(nonatomic, strong) LLMStreamClient *llmClient;
@property(nonatomic, strong) Segmenter *segmenter;
@property(nonatomic, strong) TTSServiceClient *ttsClient;
@property(nonatomic, strong) TTSPlaybackPipeline *playbackPipeline;
@property(nonatomic, strong) SubtitleSync *subtitleSync;
// Conversation state
@property(nonatomic, assign) ConversationState state;
@property(nonatomic, copy) NSString *conversationId;
@property(nonatomic, copy) NSString *currentSessionId;
// Assistant text accumulation and segment bookkeeping
@property(nonatomic, strong) NSMutableString *fullAssistantText;
@property(nonatomic, strong)
NSMutableDictionary<NSString *, NSString *> *segmentTextMap;
@property(nonatomic, assign) NSInteger segmentCounter;
// Serial queue that serializes all orchestration work
@property(nonatomic, strong) dispatch_queue_t orchestratorQueue;
@end
@implementation ConversationOrchestrator
#pragma mark - Initialization
- (instancetype)init {
self = [super init];
if (self) {
_orchestratorQueue = dispatch_queue_create(
"com.keyboard.aitalk.orchestrator", DISPATCH_QUEUE_SERIAL);
_state = ConversationStateIdle;
_conversationId = [[NSUUID UUID] UUIDString];
_fullAssistantText = [[NSMutableString alloc] init];
_segmentTextMap = [[NSMutableDictionary alloc] init];
_segmentCounter = 0;
[self setupModules];
}
return self;
}
- (void)setupModules {
// Audio Session
self.audioSession = [AudioSessionManager sharedManager];
self.audioSession.delegate = self;
// Audio Capture
self.audioCapture = [[AudioCaptureManager alloc] init];
self.audioCapture.delegate = self;
// ASR Client
self.asrClient = [[ASRStreamClient alloc] init];
self.asrClient.delegate = self;
// LLM Client
self.llmClient = [[LLMStreamClient alloc] init];
self.llmClient.delegate = self;
// Segmenter
self.segmenter = [[Segmenter alloc] init];
// TTS Client
self.ttsClient = [[TTSServiceClient alloc] init];
self.ttsClient.delegate = self;
// Playback Pipeline
self.playbackPipeline = [[TTSPlaybackPipeline alloc] init];
self.playbackPipeline.delegate = self;
// Subtitle Sync
self.subtitleSync = [[SubtitleSync alloc] init];
}
#pragma mark - Configuration Setters
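// Propagate configured server URLs to the underlying clients.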
- (void)setAsrServerURL:(NSString *)asrServerURL {
_asrServerURL = [asrServerURL copy];
self.asrClient.serverURL = asrServerURL;
}
- (void)setLlmServerURL:(NSString *)llmServerURL {
_llmServerURL = [llmServerURL copy];
self.llmClient.serverURL = llmServerURL;
}
- (void)setTtsServerURL:(NSString *)ttsServerURL {
_ttsServerURL = [ttsServerURL copy];
self.ttsClient.serverURL = ttsServerURL;
}
#pragma mark - User Actions
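// Push-to-talk entry points: press starts capture (cancelling any ongoing
// reply), release finalizes the ASR stream.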
- (void)userDidPressRecord {
dispatch_async(self.orchestratorQueue, ^{
NSLog(@"[Orchestrator] userDidPressRecord, current state: %ld",
(long)self.state);
// Barge-in: if the assistant is thinking or speaking, cancel it first
if (self.state == ConversationStateSpeaking ||
self.state == ConversationStateThinking) {
[self performBargein];
}
// Make sure we have microphone permission before recording
if (![self.audioSession hasMicrophonePermission]) {
[self.audioSession requestMicrophonePermission:^(BOOL granted) {
if (granted) {
dispatch_async(self.orchestratorQueue, ^{
[self startRecording];
});
}
}];
return;
}
[self startRecording];
});
}
- (void)userDidReleaseRecord {
dispatch_async(self.orchestratorQueue, ^{
NSLog(@"[Orchestrator] userDidReleaseRecord, current state: %ld",
(long)self.state);
if (self.state != ConversationStateListening) {
return;
}
// Stop capturing audio
[self.audioCapture stopCapture];
// Tell the ASR stream that no more audio is coming
[self.asrClient finalize];
// Wait for the final recognition result
[self updateState:ConversationStateRecognizing];
});
}
- (void)stop {
dispatch_async(self.orchestratorQueue, ^{
[self cancelAll];
[self updateState:ConversationStateIdle];
});
}
#pragma mark - Private: Recording
- (void)startRecording {
// Configure and activate the audio session
NSError *error = nil;
if (![self.audioSession configureForConversation:&error]) {
[self reportError:error];
return;
}
if (![self.audioSession activateSession:&error]) {
[self reportError:error];
return;
}
// New session ID for this utterance
self.currentSessionId = [[NSUUID UUID] UUIDString];
// Open the ASR streaming connection
[self.asrClient startWithSessionId:self.currentSessionId];
// Start microphone capture
if (![self.audioCapture startCapture:&error]) {
[self reportError:error];
[self.asrClient cancel];
return;
}
// Now listening
[self updateState:ConversationStateListening];
}
#pragma mark - Private: Barge-in
- (void)performBargein {
NSLog(@"[Orchestrator] Performing barge-in");
// Cancel all in-flight requests
[self.ttsClient cancel];
[self.llmClient cancel];
[self.asrClient cancel];
// Stop playback immediately
[self.playbackPipeline stop];
// Reset segmentation and accumulated text
[self.segmenter reset];
[self.segmentTextMap removeAllObjects];
[self.fullAssistantText setString:@""];
self.segmentCounter = 0;
}
- (void)cancelAll {
[self.audioCapture stopCapture];
[self.asrClient cancel];
[self.llmClient cancel];
[self.ttsClient cancel];
[self.playbackPipeline stop];
[self.segmenter reset];
[self.audioSession deactivateSession];
}
#pragma mark - Private: State Management
- (void)updateState:(ConversationState)newState {
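// Skip redundant transitions so callbacks only fire on real state changes.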
if (self.state == newState)
return;
ConversationState oldState = self.state;
self.state = newState;
NSLog(@"[Orchestrator] State: %ld -> %ld", (long)oldState, (long)newState);
dispatch_async(dispatch_get_main_queue(), ^{
if (self.onStateChange) {
self.onStateChange(newState);
}
// Edge-triggered speaking start/end callbacks
if (newState == ConversationStateSpeaking &&
oldState != ConversationStateSpeaking) {
if (self.onSpeakingStart) {
self.onSpeakingStart();
}
}
if (oldState == ConversationStateSpeaking &&
newState != ConversationStateSpeaking) {
if (self.onSpeakingEnd) {
self.onSpeakingEnd();
}
}
});
}
- (void)reportError:(NSError *)error {
NSLog(@"[Orchestrator] Error: %@", error.localizedDescription);
dispatch_async(dispatch_get_main_queue(), ^{
if (self.onError) {
self.onError(error);
}
});
}
#pragma mark - AudioCaptureManagerDelegate
- (void)audioCaptureManagerDidOutputPCMFrame:(NSData *)pcmFrame {
// Forward the PCM frame to the ASR stream
[self.asrClient sendAudioPCMFrame:pcmFrame];
}
- (void)audioCaptureManagerDidUpdateRMS:(float)rms {
dispatch_async(dispatch_get_main_queue(), ^{
if (self.onVolumeUpdate) {
self.onVolumeUpdate(rms);
}
});
}
#pragma mark - AudioSessionManagerDelegate
- (void)audioSessionManagerDidInterrupt:(KBAudioSessionInterruptionType)type {
dispatch_async(self.orchestratorQueue, ^{
if (type == KBAudioSessionInterruptionTypeBegan) {
// Interruption began: tear everything down and go idle
[self cancelAll];
[self updateState:ConversationStateIdle];
}
});
}
- (void)audioSessionManagerMicrophonePermissionDenied {
NSError *error =
[NSError errorWithDomain:@"ConversationOrchestrator"
code:-1
userInfo:@{
NSLocalizedDescriptionKey : @"请在设置中开启麦克风权限"
}];
[self reportError:error];
}
#pragma mark - ASRStreamClientDelegate
- (void)asrClientDidReceivePartialText:(NSString *)text {
dispatch_async(dispatch_get_main_queue(), ^{
if (self.onPartialText) {
self.onPartialText(text);
}
});
}
- (void)asrClientDidReceiveFinalText:(NSString *)text {
dispatch_async(self.orchestratorQueue, ^{
NSLog(@"[Orchestrator] ASR final text: %@", text);
// Hand the final user text to the UI
dispatch_async(dispatch_get_main_queue(), ^{
if (self.onUserFinalText) {
self.onUserFinalText(text);
}
});
// Nothing recognized: back to idle
if (text.length == 0) {
[self updateState:ConversationStateIdle];
return;
}
// Thinking while the LLM responds
[self updateState:ConversationStateThinking];
// Reset assistant text and segment bookkeeping
[self.fullAssistantText setString:@""];
[self.segmentTextMap removeAllObjects];
self.segmentCounter = 0;
[self.segmenter reset];
// Start the playback pipeline early so audio can play as soon as segments arrive
NSError *error = nil;
if (![self.playbackPipeline start:&error]) {
NSLog(@"[Orchestrator] Failed to start playback pipeline: %@",
error.localizedDescription);
}
// Kick off the LLM stream
[self.llmClient sendUserText:text conversationId:self.conversationId];
});
}
- (void)asrClientDidFail:(NSError *)error {
dispatch_async(self.orchestratorQueue, ^{
[self reportError:error];
[self updateState:ConversationStateIdle];
});
}
#pragma mark - LLMStreamClientDelegate
- (void)llmClientDidReceiveToken:(NSString *)token {
dispatch_async(self.orchestratorQueue, ^{
// Accumulate the full assistant reply
[self.fullAssistantText appendString:token];
// Feed the token to the segmenter
[self.segmenter appendToken:token];
// Request TTS for every segment that is ready
NSArray<NSString *> *segments = [self.segmenter popReadySegments];
for (NSString *segmentText in segments) {
[self requestTTSForSegment:segmentText];
}
});
}
- (void)llmClientDidComplete {
dispatch_async(self.orchestratorQueue, ^{
NSLog(@"[Orchestrator] LLM complete");
// Flush whatever is left in the segmenter
NSString *remaining = [self.segmenter flushRemainingSegment];
if (remaining && remaining.length > 0) {
[self requestTTSForSegment:remaining];
}
// Deliver the complete assistant text to the UI
NSString *fullText = [self.fullAssistantText copy];
dispatch_async(dispatch_get_main_queue(), ^{
if (self.onAssistantFullText) {
self.onAssistantFullText(fullText);
}
});
});
}
- (void)llmClientDidFail:(NSError *)error {
dispatch_async(self.orchestratorQueue, ^{
[self reportError:error];
[self updateState:ConversationStateIdle];
});
}
#pragma mark - Private: TTS Request
- (void)requestTTSForSegment:(NSString *)segmentText {
NSString *segmentId =
[NSString stringWithFormat:@"seg_%ld", (long)self.segmentCounter++];
// Remember the text so subtitles can be synced during playback
self.segmentTextMap[segmentId] = segmentText;
NSLog(@"[Orchestrator] Requesting TTS for segment %@: %@", segmentId,
segmentText);
// Ask the TTS service to synthesize this segment
[self.ttsClient requestTTSForText:segmentText segmentId:segmentId];
}
#pragma mark - TTSServiceClientDelegate
- (void)ttsClientDidReceiveURL:(NSURL *)url segmentId:(NSString *)segmentId {
dispatch_async(self.orchestratorQueue, ^{
[self.playbackPipeline enqueueURL:url segmentId:segmentId];
// First audio for the reply arrived: Thinking -> Speaking
if (self.state == ConversationStateThinking) {
[self updateState:ConversationStateSpeaking];
}
});
}
- (void)ttsClientDidReceiveAudioChunk:(NSData *)chunk
payloadType:(TTSPayloadType)type
segmentId:(NSString *)segmentId {
dispatch_async(self.orchestratorQueue, ^{
[self.playbackPipeline enqueueChunk:chunk
payloadType:type
segmentId:segmentId];
// First audio for the reply arrived: Thinking -> Speaking
if (self.state == ConversationStateThinking) {
[self updateState:ConversationStateSpeaking];
}
});
}
- (void)ttsClientDidFinishSegment:(NSString *)segmentId {
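// No more audio is coming for this segment; let the pipeline finish it.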
dispatch_async(self.orchestratorQueue, ^{
[self.playbackPipeline markSegmentComplete:segmentId];
});
}
- (void)ttsClientDidFail:(NSError *)error {
dispatch_async(self.orchestratorQueue, ^{
[self reportError:error];
});
}
#pragma mark - TTSPlaybackPipelineDelegate
- (void)pipelineDidStartSegment:(NSString *)segmentId
duration:(NSTimeInterval)duration {
NSLog(@"[Orchestrator] Started playing segment: %@", segmentId);
}
- (void)pipelineDidUpdatePlaybackTime:(NSTimeInterval)time
segmentId:(NSString *)segmentId {
dispatch_async(self.orchestratorQueue, ^{
// Text of the segment currently playing
NSString *segmentText = self.segmentTextMap[segmentId];
if (!segmentText)
return;
// Map playback progress to the amount of text that should be visible
NSTimeInterval duration =
[self.playbackPipeline durationForSegment:segmentId];
NSString *visibleText =
[self.subtitleSync visibleTextForFullText:segmentText
currentTime:time
duration:duration];
// TODO:
// Push the visible text to the UI
dispatch_async(dispatch_get_main_queue(), ^{
if (self.onAssistantVisibleText) {
self.onAssistantVisibleText(visibleText);
}
});
});
}
- (void)pipelineDidFinishSegment:(NSString *)segmentId {
NSLog(@"[Orchestrator] Finished playing segment: %@", segmentId);
}
- (void)pipelineDidFinishAllSegments {
dispatch_async(self.orchestratorQueue, ^{
NSLog(@"[Orchestrator] All segments finished");
// Playback done: back to idle and release the audio session
[self updateState:ConversationStateIdle];
[self.audioSession deactivateSession];
});
}
- (void)pipelineDidFail:(NSError *)error {
dispatch_async(self.orchestratorQueue, ^{
[self reportError:error];
[self updateState:ConversationStateIdle];
});
}
@end