diff --git a/keyBoard.xcodeproj/project.pbxproj b/keyBoard.xcodeproj/project.pbxproj index 57d99a3..bdcf1bb 100644 --- a/keyBoard.xcodeproj/project.pbxproj +++ b/keyBoard.xcodeproj/project.pbxproj @@ -195,6 +195,9 @@ 04D1F6B22EDFF10A00B12345 /* KBSkinInstallBridge.m in Sources */ = {isa = PBXBuildFile; fileRef = 04D1F6B12EDFF10A00B12345 /* KBSkinInstallBridge.m */; }; 04D1F6B32EDFF10A00B12345 /* KBSkinInstallBridge.m in Sources */ = {isa = PBXBuildFile; fileRef = 04D1F6B12EDFF10A00B12345 /* KBSkinInstallBridge.m */; }; 04E0383E2F1A7C30002CA5A0 /* KBCustomTabBar.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E0383D2F1A7C30002CA5A0 /* KBCustomTabBar.m */; }; + 04E038D82F20BFFB002CA5A0 /* websocket-api.md in Resources */ = {isa = PBXBuildFile; fileRef = 04E038D72F20BFFB002CA5A0 /* websocket-api.md */; }; + 04E038DD2F20C420002CA5A0 /* VoiceChatStreamingManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E038DA2F20C420002CA5A0 /* VoiceChatStreamingManager.m */; }; + 04E038DE2F20C420002CA5A0 /* VoiceChatWebSocketClient.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E038DC2F20C420002CA5A0 /* VoiceChatWebSocketClient.m */; }; 04E161832F10E6470022C23B /* normal_hei_them.zip in Resources */ = {isa = PBXBuildFile; fileRef = 04E161812F10E6470022C23B /* normal_hei_them.zip */; }; 04E161842F10E6470022C23B /* normal_them.zip in Resources */ = {isa = PBXBuildFile; fileRef = 04E161822F10E6470022C23B /* normal_them.zip */; }; 04FC95672EB0546C007BD342 /* KBKey.m in Sources */ = {isa = PBXBuildFile; fileRef = 04FC95652EB0546C007BD342 /* KBKey.m */; }; @@ -608,6 +611,11 @@ 04D1F6B12EDFF10A00B12345 /* KBSkinInstallBridge.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBSkinInstallBridge.m; sourceTree = ""; }; 04E0383C2F1A7C30002CA5A0 /* KBCustomTabBar.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KBCustomTabBar.h; sourceTree = ""; }; 04E0383D2F1A7C30002CA5A0 /* KBCustomTabBar.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBCustomTabBar.m; sourceTree = ""; }; + 04E038D72F20BFFB002CA5A0 /* websocket-api.md */ = {isa = PBXFileReference; lastKnownFileType = net.daringfireball.markdown; path = "websocket-api.md"; sourceTree = ""; }; + 04E038D92F20C420002CA5A0 /* VoiceChatStreamingManager.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = VoiceChatStreamingManager.h; sourceTree = ""; }; + 04E038DA2F20C420002CA5A0 /* VoiceChatStreamingManager.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = VoiceChatStreamingManager.m; sourceTree = ""; }; + 04E038DB2F20C420002CA5A0 /* VoiceChatWebSocketClient.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = VoiceChatWebSocketClient.h; sourceTree = ""; }; + 04E038DC2F20C420002CA5A0 /* VoiceChatWebSocketClient.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = VoiceChatWebSocketClient.m; sourceTree = ""; }; 04E161812F10E6470022C23B /* normal_hei_them.zip */ = {isa = PBXFileReference; lastKnownFileType = archive.zip; path = normal_hei_them.zip; sourceTree = ""; }; 04E161822F10E6470022C23B /* normal_them.zip */ = {isa = PBXFileReference; lastKnownFileType = archive.zip; path = normal_them.zip; sourceTree = ""; }; 04FC953A2EAFAE56007BD342 /* KeyBoardPrefixHeader.pch */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KeyBoardPrefixHeader.pch; sourceTree = ""; }; @@ -986,6 +994,10 @@ 046086AE2F19239B00757C95 /* TTSPlaybackPipeline.m */, 046086AF2F19239B00757C95 
/* TTSServiceClient.h */, 046086B02F19239B00757C95 /* TTSServiceClient.m */, + 04E038D92F20C420002CA5A0 /* VoiceChatStreamingManager.h */, + 04E038DA2F20C420002CA5A0 /* VoiceChatStreamingManager.m */, + 04E038DB2F20C420002CA5A0 /* VoiceChatWebSocketClient.h */, + 04E038DC2F20C420002CA5A0 /* VoiceChatWebSocketClient.m */, ); path = VM; sourceTree = ""; @@ -994,6 +1006,7 @@ isa = PBXGroup; children = ( 046086742F191CC700757C95 /* AI技术分析.txt */, + 04E038D72F20BFFB002CA5A0 /* websocket-api.md */, 0460866C2F191A5100757C95 /* M */, 0460866D2F191A5100757C95 /* V */, 0460866E2F191A5100757C95 /* VC */, @@ -2003,6 +2016,7 @@ buildActionMask = 2147483647; files = ( 04286A0F2ECDA71B00CE730C /* 001.zip in Resources */, + 04E038D82F20BFFB002CA5A0 /* websocket-api.md in Resources */, 0479200B2ED87CEE004E8522 /* permiss_video.mp4 in Resources */, 04C6EABA2EAF86530089C901 /* Assets.xcassets in Resources */, 04A9FE212EB893F10020DB6D /* Localizable.strings in Resources */, @@ -2235,6 +2249,8 @@ 05A1B2D22F5B1A2B3C4D5E60 /* KBSearchThemeModel.m in Sources */, 047C65102EBCA8DD0035E841 /* HomeRankContentVC.m in Sources */, 047C655C2EBCD0F80035E841 /* UIView+KBShadow.m in Sources */, + 04E038DD2F20C420002CA5A0 /* VoiceChatStreamingManager.m in Sources */, + 04E038DE2F20C420002CA5A0 /* VoiceChatWebSocketClient.m in Sources */, 049FB2262EC3136D00FAB05D /* KBPersonInfoItemCell.m in Sources */, 048908C32EBE32B800FABA60 /* KBSearchVC.m in Sources */, 049FB20B2EC1C13800FAB05D /* KBSkinBottomActionView.m in Sources */, diff --git a/keyBoard/Class/AiTalk/VC/KBAiMainVC.m b/keyBoard/Class/AiTalk/VC/KBAiMainVC.m index 624cbed..71ddccf 100644 --- a/keyBoard/Class/AiTalk/VC/KBAiMainVC.m +++ b/keyBoard/Class/AiTalk/VC/KBAiMainVC.m @@ -11,8 +11,11 @@ #import "KBAiChatView.h" #import "KBAiRecordButton.h" #import "LSTPopView.h" +#import "VoiceChatStreamingManager.h" +#import "KBUserSessionManager.h" -@interface KBAiMainVC () +@interface KBAiMainVC () @property(nonatomic, weak) LSTPopView *popView; // UI @@ -28,6 +31,13 @@ // 核心模块 @property(nonatomic, strong) ConversationOrchestrator *orchestrator; +@property(nonatomic, strong) VoiceChatStreamingManager *streamingManager; + +// 文本跟踪 +@property(nonatomic, strong) NSMutableString *assistantVisibleText; + +// 日志节流 +@property(nonatomic, assign) NSTimeInterval lastRMSLogTime; @end @@ -44,6 +54,7 @@ [self setupUI]; [self setupOrchestrator]; + [self setupStreamingManager]; } - (void)viewWillAppear:(BOOL)animated { @@ -56,6 +67,7 @@ // 页面消失时停止对话 [self.orchestrator stop]; + [self.streamingManager disconnect]; } - (void)viewDidLayoutSubviews { @@ -184,11 +196,15 @@ - (void)setupOrchestrator { self.orchestrator = [[ConversationOrchestrator alloc] init]; - // 配置服务器地址(TODO: 替换为实际地址) - // self.orchestrator.asrServerURL = @"wss://your-asr-server.com/ws/asr"; - // self.orchestrator.llmServerURL = - // @"https://your-llm-server.com/api/chat/stream"; - // self.orchestrator.ttsServerURL = @"https://your-tts-server.com/api/tts"; + // 配置服务器地址 + // 1. ASR 语音识别服务(WebSocket) + self.orchestrator.asrServerURL = @"ws://192.168.2.21:7529/ws/asr"; + + // 2. LLM 大语言模型服务(HTTP Stream) + self.orchestrator.llmServerURL = @"http://192.168.2.21:7529/api/chat/stream"; + + // 3. 
TTS 语音合成服务(HTTP) + self.orchestrator.ttsServerURL = @"http://192.168.2.21:7529/api/tts/stream"; __weak typeof(self) weakSelf = self; @@ -278,6 +294,16 @@ }; } +#pragma mark - Streaming Manager + +- (void)setupStreamingManager { + self.streamingManager = [[VoiceChatStreamingManager alloc] init]; + self.streamingManager.delegate = self; + self.streamingManager.serverURL = @"ws://192.168.2.21:7529/api/ws/chat"; + self.assistantVisibleText = [[NSMutableString alloc] init]; + self.lastRMSLogTime = 0; +} + #pragma mark - 事件 - (void)showComment { CGFloat customViewHeight = KB_SCREEN_HEIGHT * (0.8); @@ -367,16 +393,112 @@ #pragma mark - KBAiRecordButtonDelegate - (void)recordButtonDidBeginPress:(KBAiRecordButton *)button { - [self.orchestrator userDidPressRecord]; + NSLog(@"[KBAiMainVC] Record button began press"); + NSString *token = [[KBUserSessionManager shared] accessToken] ?: @""; + if (token.length == 0) { + [[KBUserSessionManager shared] goLoginVC]; + return; + } + + self.statusLabel.text = @"正在连接..."; + self.recordButton.state = KBAiRecordButtonStateRecording; + [self.streamingManager startWithToken:token language:@"en" voiceId:nil]; } - (void)recordButtonDidEndPress:(KBAiRecordButton *)button { - [self.orchestrator userDidReleaseRecord]; + NSLog(@"[KBAiMainVC] Record button end press"); + [self.streamingManager stopAndFinalize]; } - (void)recordButtonDidCancelPress:(KBAiRecordButton *)button { - // 取消录音(同样调用 release,ASR 会返回空或部分结果) - [self.orchestrator userDidReleaseRecord]; + NSLog(@"[KBAiMainVC] Record button cancel press"); + [self.streamingManager cancel]; +} + +#pragma mark - VoiceChatStreamingManagerDelegate + +- (void)voiceChatStreamingManagerDidConnect { + self.statusLabel.text = @"已连接,准备中..."; +} + +- (void)voiceChatStreamingManagerDidDisconnect:(NSError *_Nullable)error { + self.recordButton.state = KBAiRecordButtonStateNormal; + if (error) { + [self showError:error]; + } +} + +- (void)voiceChatStreamingManagerDidStartSession:(NSString *)sessionId { + self.statusLabel.text = @"正在聆听..."; + self.recordButton.state = KBAiRecordButtonStateRecording; +} + +- (void)voiceChatStreamingManagerDidStartTurn:(NSInteger)turnIndex { + self.statusLabel.text = @"正在聆听..."; + self.recordButton.state = KBAiRecordButtonStateRecording; +} + +- (void)voiceChatStreamingManagerDidReceiveEagerEndOfTurnWithTranscript:(NSString *)text + confidence:(double)confidence { + self.statusLabel.text = @"准备响应..."; +} + +- (void)voiceChatStreamingManagerDidResumeTurn { + self.statusLabel.text = @"正在聆听..."; +} + +- (void)voiceChatStreamingManagerDidUpdateRMS:(float)rms { + [self.recordButton updateVolumeRMS:rms]; + NSTimeInterval now = [[NSDate date] timeIntervalSince1970]; + if (now - self.lastRMSLogTime >= 1.0) { + self.lastRMSLogTime = now; + NSLog(@"[KBAiMainVC] RMS: %.3f", rms); + } +} + +- (void)voiceChatStreamingManagerDidReceiveInterimTranscript:(NSString *)text { + self.statusLabel.text = text.length > 0 ? 
text : @"正在识别..."; +} + +- (void)voiceChatStreamingManagerDidReceiveFinalTranscript:(NSString *)text { + if (text.length > 0) { + [self.chatView addUserMessage:text]; + } +} + +- (void)voiceChatStreamingManagerDidReceiveLLMStart { + self.statusLabel.text = @"AI 正在思考..."; + [self.assistantVisibleText setString:@""]; + [self.chatView addAssistantMessage:@""]; +} + +- (void)voiceChatStreamingManagerDidReceiveLLMToken:(NSString *)token { + if (token.length == 0) { + return; + } + + [self.assistantVisibleText appendString:token]; + [self.chatView updateLastAssistantMessage:self.assistantVisibleText]; +} + +- (void)voiceChatStreamingManagerDidReceiveAudioChunk:(NSData *)audioData { +} + +- (void)voiceChatStreamingManagerDidCompleteWithTranscript:(NSString *)transcript + aiResponse:(NSString *)aiResponse { + NSString *finalText = aiResponse.length > 0 ? aiResponse + : self.assistantVisibleText; + if (finalText.length > 0) { + [self.chatView updateLastAssistantMessage:finalText]; + [self.chatView markLastAssistantMessageComplete]; + } + self.recordButton.state = KBAiRecordButtonStateNormal; + self.statusLabel.text = @"完成"; +} + +- (void)voiceChatStreamingManagerDidFail:(NSError *)error { + self.recordButton.state = KBAiRecordButtonStateNormal; + [self showError:error]; } @end diff --git a/keyBoard/Class/AiTalk/VM/ConversationOrchestrator.m b/keyBoard/Class/AiTalk/VM/ConversationOrchestrator.m index c376ee5..5a497ce 100644 --- a/keyBoard/Class/AiTalk/VM/ConversationOrchestrator.m +++ b/keyBoard/Class/AiTalk/VM/ConversationOrchestrator.m @@ -90,6 +90,11 @@ // TTS Client self.ttsClient = [[TTSServiceClient alloc] init]; self.ttsClient.delegate = self; + // ElevenLabs 配置(通过后端代理) + self.ttsClient.voiceId = @"JBFqnCBsd6RMkjVDRZzb"; // 默认语音 George + self.ttsClient.languageCode = @"zh"; // 中文 + self.ttsClient.expectedPayloadType = + TTSPayloadTypeURL; // 使用 URL 模式(简单) // Playback Pipeline self.playbackPipeline = [[TTSPlaybackPipeline alloc] init]; diff --git a/keyBoard/Class/AiTalk/VM/TTSServiceClient.h b/keyBoard/Class/AiTalk/VM/TTSServiceClient.h index 79bb3ec..d0118a5 100644 --- a/keyBoard/Class/AiTalk/VM/TTSServiceClient.h +++ b/keyBoard/Class/AiTalk/VM/TTSServiceClient.h @@ -41,6 +41,12 @@ typedef NS_ENUM(NSInteger, TTSPayloadType) { /// TTS 服务器 URL @property(nonatomic, copy) NSString *serverURL; +/// 语音 ID(ElevenLabs voice ID) +@property(nonatomic, copy) NSString *voiceId; + +/// 语言代码(如 "zh", "en") +@property(nonatomic, copy) NSString *languageCode; + /// 当前期望的返回类型(由服务端配置决定) @property(nonatomic, assign) TTSPayloadType expectedPayloadType; diff --git a/keyBoard/Class/AiTalk/VM/TTSServiceClient.m b/keyBoard/Class/AiTalk/VM/TTSServiceClient.m index e6b9401..f525a46 100644 --- a/keyBoard/Class/AiTalk/VM/TTSServiceClient.m +++ b/keyBoard/Class/AiTalk/VM/TTSServiceClient.m @@ -94,6 +94,8 @@ NSDictionary *body = @{ @"text" : text, @"segmentId" : segmentId, + @"voiceId" : self.voiceId ?: @"JBFqnCBsd6RMkjVDRZzb", + @"languageCode" : self.languageCode ?: @"zh", @"format" : @"mp3" // 或 m4a }; @@ -184,6 +186,8 @@ NSDictionary *requestDict = @{ @"text" : text, @"segmentId" : segmentId, + @"voiceId" : self.voiceId ?: @"JBFqnCBsd6RMkjVDRZzb", + @"languageCode" : self.languageCode ?: @"zh", @"format" : [self formatStringForPayloadType:self.expectedPayloadType] }; diff --git a/keyBoard/Class/AiTalk/VM/VoiceChatStreamingManager.h b/keyBoard/Class/AiTalk/VM/VoiceChatStreamingManager.h new file mode 100644 index 0000000..37c4b1a --- /dev/null +++ b/keyBoard/Class/AiTalk/VM/VoiceChatStreamingManager.h @@ -0,0 +1,53 @@ 
+// +// VoiceChatStreamingManager.h +// keyBoard +// +// Created by Mac on 2026/1/21. +// + +#import + +NS_ASSUME_NONNULL_BEGIN + +@protocol VoiceChatStreamingManagerDelegate +@optional +- (void)voiceChatStreamingManagerDidConnect; +- (void)voiceChatStreamingManagerDidDisconnect:(NSError *_Nullable)error; +- (void)voiceChatStreamingManagerDidStartSession:(NSString *)sessionId; +- (void)voiceChatStreamingManagerDidStartTurn:(NSInteger)turnIndex; +- (void)voiceChatStreamingManagerDidReceiveEagerEndOfTurnWithTranscript:(NSString *)text + confidence:(double)confidence; +- (void)voiceChatStreamingManagerDidResumeTurn; +- (void)voiceChatStreamingManagerDidUpdateRMS:(float)rms; +- (void)voiceChatStreamingManagerDidReceiveInterimTranscript:(NSString *)text; +- (void)voiceChatStreamingManagerDidReceiveFinalTranscript:(NSString *)text; +- (void)voiceChatStreamingManagerDidReceiveLLMStart; +- (void)voiceChatStreamingManagerDidReceiveLLMToken:(NSString *)token; +- (void)voiceChatStreamingManagerDidReceiveAudioChunk:(NSData *)audioData; +- (void)voiceChatStreamingManagerDidCompleteWithTranscript:(NSString *)transcript + aiResponse:(NSString *)aiResponse; +- (void)voiceChatStreamingManagerDidFail:(NSError *)error; +@end + +/// Manager for realtime recording and streaming. +@interface VoiceChatStreamingManager : NSObject + +@property(nonatomic, weak) id delegate; + +/// Base WebSocket URL, e.g. wss://api.yourdomain.com/api/ws/chat +@property(nonatomic, copy) NSString *serverURL; + +@property(nonatomic, assign, readonly, getter=isStreaming) BOOL streaming; +@property(nonatomic, copy, readonly, nullable) NSString *sessionId; + +- (void)startWithToken:(NSString *)token + language:(nullable NSString *)language + voiceId:(nullable NSString *)voiceId; + +- (void)stopAndFinalize; +- (void)cancel; +- (void)disconnect; + +@end + +NS_ASSUME_NONNULL_END diff --git a/keyBoard/Class/AiTalk/VM/VoiceChatStreamingManager.m b/keyBoard/Class/AiTalk/VM/VoiceChatStreamingManager.m new file mode 100644 index 0000000..ad078f7 --- /dev/null +++ b/keyBoard/Class/AiTalk/VM/VoiceChatStreamingManager.m @@ -0,0 +1,376 @@ +// +// VoiceChatStreamingManager.m +// keyBoard +// +// Created by Mac on 2026/1/21. 
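+//  职责:协调 AudioSessionManager / AudioCaptureManager / VoiceChatWebSocketClient,
+//  负责麦克风权限、音频会话激活与 PCM 推流的生命周期管理。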
+// + +#import "VoiceChatStreamingManager.h" +#import "AudioCaptureManager.h" +#import "AudioSessionManager.h" +#import "VoiceChatWebSocketClient.h" + +static NSString *const kVoiceChatStreamingManagerErrorDomain = + @"VoiceChatStreamingManager"; + +@interface VoiceChatStreamingManager () + +@property(nonatomic, strong) AudioSessionManager *audioSession; +@property(nonatomic, strong) AudioCaptureManager *audioCapture; +@property(nonatomic, strong) VoiceChatWebSocketClient *webSocketClient; +@property(nonatomic, strong) dispatch_queue_t stateQueue; + +@property(nonatomic, assign) BOOL streaming; +@property(nonatomic, copy) NSString *sessionId; + +@property(nonatomic, copy) NSString *pendingToken; +@property(nonatomic, copy) NSString *pendingLanguage; +@property(nonatomic, copy) NSString *pendingVoiceId; + +@end + +@implementation VoiceChatStreamingManager + +- (instancetype)init { + self = [super init]; + if (self) { + _stateQueue = dispatch_queue_create("com.keyboard.aitalk.voicechat.manager", + DISPATCH_QUEUE_SERIAL); + + _audioSession = [AudioSessionManager sharedManager]; + _audioSession.delegate = self; + + _audioCapture = [[AudioCaptureManager alloc] init]; + _audioCapture.delegate = self; + + _webSocketClient = [[VoiceChatWebSocketClient alloc] init]; + _webSocketClient.delegate = self; + + _serverURL = @"ws://192.168.2.21:7529/api/ws/chat?token="; + _webSocketClient.serverURL = _serverURL; + } + return self; +} + +- (void)dealloc { + [self disconnect]; +} + +- (void)setServerURL:(NSString *)serverURL { + _serverURL = [serverURL copy]; + self.webSocketClient.serverURL = _serverURL; +} + +#pragma mark - Public Methods + +- (void)startWithToken:(NSString *)token + language:(nullable NSString *)language + voiceId:(nullable NSString *)voiceId { + dispatch_async(self.stateQueue, ^{ + self.pendingToken = token ?: @""; + self.pendingLanguage = language ?: @""; + self.pendingVoiceId = voiceId ?: @""; + [self.webSocketClient disableAudioSending]; + [self startInternal]; + }); +} + +- (void)stopAndFinalize { + dispatch_async(self.stateQueue, ^{ + if (self.streaming) { + [self.audioCapture stopCapture]; + self.streaming = NO; + } + [self.webSocketClient disableAudioSending]; + [self.webSocketClient endAudio]; + }); +} + +- (void)cancel { + dispatch_async(self.stateQueue, ^{ + if (self.streaming) { + [self.audioCapture stopCapture]; + self.streaming = NO; + } + [self.webSocketClient disableAudioSending]; + [self.webSocketClient cancel]; + self.sessionId = nil; + }); +} + +- (void)disconnect { + dispatch_async(self.stateQueue, ^{ + if (self.streaming) { + [self.audioCapture stopCapture]; + self.streaming = NO; + } + [self.webSocketClient disableAudioSending]; + [self.webSocketClient disconnect]; + [self.audioSession deactivateSession]; + self.sessionId = nil; + }); +} + +#pragma mark - Private Methods + +- (void)startInternal { + if (self.pendingToken.length == 0) { + NSLog(@"[VoiceChatStreamingManager] Start failed: token is empty"); + [self reportErrorWithMessage:@"Token is required"]; + return; + } + + if (![self.audioSession hasMicrophonePermission]) { + __weak typeof(self) weakSelf = self; + [self.audioSession requestMicrophonePermission:^(BOOL granted) { + __strong typeof(weakSelf) strongSelf = weakSelf; + if (!strongSelf) { + return; + } + if (!granted) { + [strongSelf reportErrorWithMessage:@"Microphone permission denied"]; + return; + } + dispatch_async(strongSelf.stateQueue, ^{ + [strongSelf startInternal]; + }); + }]; + return; + } + + NSError *error = nil; + if (![self.audioSession 
configureForConversation:&error]) { + [self reportError:error]; + return; + } + + if (![self.audioSession activateSession:&error]) { + [self reportError:error]; + return; + } + + if (self.serverURL.length == 0) { + NSLog(@"[VoiceChatStreamingManager] Start failed: server URL is empty"); + [self reportErrorWithMessage:@"Server URL is required"]; + return; + } + + NSLog(@"[VoiceChatStreamingManager] Start streaming, server: %@", + self.serverURL); + self.webSocketClient.serverURL = self.serverURL; + [self.webSocketClient connectWithToken:self.pendingToken]; +} + +- (void)reportError:(NSError *)error { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatStreamingManagerDidFail:)]) { + [self.delegate voiceChatStreamingManagerDidFail:error]; + } + }); +} + +- (void)reportErrorWithMessage:(NSString *)message { + NSError *error = [NSError errorWithDomain:kVoiceChatStreamingManagerErrorDomain + code:-1 + userInfo:@{ + NSLocalizedDescriptionKey : message ?: @"" + }]; + [self reportError:error]; +} + +#pragma mark - AudioCaptureManagerDelegate + +- (void)audioCaptureManagerDidOutputPCMFrame:(NSData *)pcmFrame { + if (!self.streaming) { + return; + } + [self.webSocketClient sendAudioPCMFrame:pcmFrame]; +} + +- (void)audioCaptureManagerDidUpdateRMS:(float)rms { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatStreamingManagerDidUpdateRMS:)]) { + [self.delegate voiceChatStreamingManagerDidUpdateRMS:rms]; + } + }); +} + +#pragma mark - AudioSessionManagerDelegate + +- (void)audioSessionManagerDidInterrupt:(KBAudioSessionInterruptionType)type { + if (type == KBAudioSessionInterruptionTypeBegan) { + [self cancel]; + } +} + +- (void)audioSessionManagerMicrophonePermissionDenied { + [self reportErrorWithMessage:@"Microphone permission denied"]; +} + +#pragma mark - VoiceChatWebSocketClientDelegate + +- (void)voiceChatClientDidConnect { + dispatch_async(self.stateQueue, ^{ + [self.webSocketClient startSessionWithLanguage:self.pendingLanguage + voiceId:self.pendingVoiceId]; + }); + + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatStreamingManagerDidConnect)]) { + [self.delegate voiceChatStreamingManagerDidConnect]; + } + }); +} + +- (void)voiceChatClientDidDisconnect:(NSError *_Nullable)error { + dispatch_async(self.stateQueue, ^{ + if (self.streaming) { + [self.audioCapture stopCapture]; + self.streaming = NO; + } + [self.audioSession deactivateSession]; + self.sessionId = nil; + }); + + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatStreamingManagerDidDisconnect:)]) { + [self.delegate voiceChatStreamingManagerDidDisconnect:error]; + } + }); +} + +- (void)voiceChatClientDidStartSession:(NSString *)sessionId { + dispatch_async(self.stateQueue, ^{ + self.sessionId = sessionId; + + NSError *error = nil; + if (![self.audioCapture startCapture:&error]) { + [self reportError:error]; + [self.webSocketClient cancel]; + return; + } + + self.streaming = YES; + [self.webSocketClient enableAudioSending]; + + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatStreamingManagerDidStartSession:)]) { + [self.delegate voiceChatStreamingManagerDidStartSession:sessionId]; + } + }); + }); +} + +- (void)voiceChatClientDidStartTurn:(NSInteger)turnIndex { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate 
respondsToSelector:@selector + (voiceChatStreamingManagerDidStartTurn:)]) { + [self.delegate voiceChatStreamingManagerDidStartTurn:turnIndex]; + } + }); +} + +- (void)voiceChatClientDidReceiveEagerEndOfTurnWithTranscript:(NSString *)text + confidence:(double)confidence { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate + respondsToSelector:@selector + (voiceChatStreamingManagerDidReceiveEagerEndOfTurnWithTranscript: + confidence:)]) { + [self.delegate + voiceChatStreamingManagerDidReceiveEagerEndOfTurnWithTranscript:text + confidence:confidence]; + } + }); +} + +- (void)voiceChatClientDidResumeTurn { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatStreamingManagerDidResumeTurn)]) { + [self.delegate voiceChatStreamingManagerDidResumeTurn]; + } + }); +} + +- (void)voiceChatClientDidReceiveInterimTranscript:(NSString *)text { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatStreamingManagerDidReceiveInterimTranscript:)]) { + [self.delegate voiceChatStreamingManagerDidReceiveInterimTranscript:text]; + } + }); +} + +- (void)voiceChatClientDidReceiveFinalTranscript:(NSString *)text { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatStreamingManagerDidReceiveFinalTranscript:)]) { + [self.delegate voiceChatStreamingManagerDidReceiveFinalTranscript:text]; + } + }); +} + +- (void)voiceChatClientDidReceiveLLMStart { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatStreamingManagerDidReceiveLLMStart)]) { + [self.delegate voiceChatStreamingManagerDidReceiveLLMStart]; + } + }); +} + +- (void)voiceChatClientDidReceiveLLMToken:(NSString *)token { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatStreamingManagerDidReceiveLLMToken:)]) { + [self.delegate voiceChatStreamingManagerDidReceiveLLMToken:token]; + } + }); +} + +- (void)voiceChatClientDidReceiveAudioChunk:(NSData *)audioData { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatStreamingManagerDidReceiveAudioChunk:)]) { + [self.delegate voiceChatStreamingManagerDidReceiveAudioChunk:audioData]; + } + }); +} + +- (void)voiceChatClientDidCompleteWithTranscript:(NSString *)transcript + aiResponse:(NSString *)aiResponse { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatStreamingManagerDidCompleteWithTranscript: + aiResponse:)]) { + [self.delegate voiceChatStreamingManagerDidCompleteWithTranscript:transcript + aiResponse:aiResponse]; + } + }); +} + +- (void)voiceChatClientDidReceiveErrorCode:(NSString *)code + message:(NSString *)message { + NSString *desc = message.length > 0 ? message : @"Server error"; + NSError *error = [NSError errorWithDomain:kVoiceChatStreamingManagerErrorDomain + code:-2 + userInfo:@{ + NSLocalizedDescriptionKey : desc, + @"code" : code ?: @"" + }]; + [self reportError:error]; +} + +- (void)voiceChatClientDidFail:(NSError *)error { + [self reportError:error]; +} + +@end diff --git a/keyBoard/Class/AiTalk/VM/VoiceChatWebSocketClient.h b/keyBoard/Class/AiTalk/VM/VoiceChatWebSocketClient.h new file mode 100644 index 0000000..5c3bc19 --- /dev/null +++ b/keyBoard/Class/AiTalk/VM/VoiceChatWebSocketClient.h @@ -0,0 +1,57 @@ +// +// VoiceChatWebSocketClient.h +// keyBoard +// +// Created by Mac on 2026/1/21. 
+// + +#import + +NS_ASSUME_NONNULL_BEGIN + +@protocol VoiceChatWebSocketClientDelegate +@optional +- (void)voiceChatClientDidConnect; +- (void)voiceChatClientDidDisconnect:(NSError *_Nullable)error; +- (void)voiceChatClientDidStartSession:(NSString *)sessionId; +- (void)voiceChatClientDidStartTurn:(NSInteger)turnIndex; +- (void)voiceChatClientDidReceiveEagerEndOfTurnWithTranscript:(NSString *)text + confidence:(double)confidence; +- (void)voiceChatClientDidResumeTurn; +- (void)voiceChatClientDidReceiveInterimTranscript:(NSString *)text; +- (void)voiceChatClientDidReceiveFinalTranscript:(NSString *)text; +- (void)voiceChatClientDidReceiveLLMStart; +- (void)voiceChatClientDidReceiveLLMToken:(NSString *)token; +- (void)voiceChatClientDidReceiveAudioChunk:(NSData *)audioData; +- (void)voiceChatClientDidCompleteWithTranscript:(NSString *)transcript + aiResponse:(NSString *)aiResponse; +- (void)voiceChatClientDidReceiveErrorCode:(NSString *)code + message:(NSString *)message; +- (void)voiceChatClientDidFail:(NSError *)error; +@end + +/// WebSocket client for realtime voice chat. +@interface VoiceChatWebSocketClient : NSObject + +@property(nonatomic, weak) id delegate; + +/// Base WebSocket URL, e.g. wss://api.yourdomain.com/api/ws/chat +@property(nonatomic, copy) NSString *serverURL; + +@property(nonatomic, assign, readonly, getter=isConnected) BOOL connected; +@property(nonatomic, copy, readonly, nullable) NSString *sessionId; + +- (void)connectWithToken:(NSString *)token; +- (void)disconnect; + +- (void)startSessionWithLanguage:(nullable NSString *)language + voiceId:(nullable NSString *)voiceId; +- (void)enableAudioSending; +- (void)disableAudioSending; +- (void)sendAudioPCMFrame:(NSData *)pcmFrame; +- (void)endAudio; +- (void)cancel; + +@end + +NS_ASSUME_NONNULL_END diff --git a/keyBoard/Class/AiTalk/VM/VoiceChatWebSocketClient.m b/keyBoard/Class/AiTalk/VM/VoiceChatWebSocketClient.m new file mode 100644 index 0000000..36c2a2a --- /dev/null +++ b/keyBoard/Class/AiTalk/VM/VoiceChatWebSocketClient.m @@ -0,0 +1,457 @@ +// +// VoiceChatWebSocketClient.m +// keyBoard +// +// Created by Mac on 2026/1/21. 
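+//  实现说明:基于 NSURLSessionWebSocketTask;文本帧承载 JSON 控制 / 状态消息,
+//  二进制帧上行为 PCM 音频、下行为 TTS 音频块(协议见 websocket-api.md)。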
+// + +#import "VoiceChatWebSocketClient.h" + +static NSString *const kVoiceChatWebSocketClientErrorDomain = + @"VoiceChatWebSocketClient"; + +@interface VoiceChatWebSocketClient () + +@property(nonatomic, strong) NSURLSession *urlSession; +@property(nonatomic, strong) NSURLSessionWebSocketTask *webSocketTask; +@property(nonatomic, strong) dispatch_queue_t networkQueue; +@property(nonatomic, assign) BOOL connected; +@property(nonatomic, copy) NSString *sessionId; +@property(nonatomic, assign) BOOL audioSendingEnabled; + +@end + +@implementation VoiceChatWebSocketClient + +- (instancetype)init { + self = [super init]; + if (self) { + _networkQueue = dispatch_queue_create("com.keyboard.aitalk.voicechat.ws", + DISPATCH_QUEUE_SERIAL); + _serverURL = @"wss://api.yourdomain.com/api/ws/chat"; + _audioSendingEnabled = NO; + } + return self; +} + +- (void)dealloc { + [self disconnect]; +} + +#pragma mark - Public Methods + +- (void)connectWithToken:(NSString *)token { + dispatch_async(self.networkQueue, ^{ + [self disconnectInternal]; + + NSURL *url = [self buildURLWithToken:token]; + if (!url) { + [self reportErrorWithMessage:@"Invalid server URL"]; + return; + } + + NSLog(@"[VoiceChatWebSocketClient] Connecting: %@", url.absoluteString); + + NSURLSessionConfiguration *config = + [NSURLSessionConfiguration defaultSessionConfiguration]; + config.timeoutIntervalForRequest = 30; + config.timeoutIntervalForResource = 300; + + self.urlSession = [NSURLSession sessionWithConfiguration:config + delegate:self + delegateQueue:nil]; + + self.webSocketTask = [self.urlSession webSocketTaskWithURL:url]; + [self.webSocketTask resume]; + [self receiveMessage]; + }); +} + +- (void)disconnect { + dispatch_async(self.networkQueue, ^{ + BOOL shouldNotify = self.webSocketTask != nil; + if (shouldNotify) { + NSLog(@"[VoiceChatWebSocketClient] Disconnect requested"); + } + [self disconnectInternal]; + if (shouldNotify) { + [self notifyDisconnect:nil]; + } + }); +} + +- (void)startSessionWithLanguage:(nullable NSString *)language + voiceId:(nullable NSString *)voiceId { + NSMutableDictionary *message = [NSMutableDictionary dictionary]; + message[@"type"] = @"session_start"; + + NSMutableDictionary *config = [NSMutableDictionary dictionary]; + if (language.length > 0) { + config[@"language"] = language; + } + if (voiceId.length > 0) { + config[@"voice_id"] = voiceId; + } + if (config.count > 0) { + message[@"config"] = config; + } + + [self sendJSON:message]; +} + +- (void)enableAudioSending { + dispatch_async(self.networkQueue, ^{ + self.audioSendingEnabled = YES; + }); +} + +- (void)disableAudioSending { + dispatch_async(self.networkQueue, ^{ + self.audioSendingEnabled = NO; + }); +} + +- (void)sendAudioPCMFrame:(NSData *)pcmFrame { + if (!self.connected || !self.webSocketTask || pcmFrame.length == 0) { + return; + } + + dispatch_async(self.networkQueue, ^{ + if (!self.audioSendingEnabled) { + return; + } + if (!self.connected || !self.webSocketTask) { + return; + } + NSURLSessionWebSocketMessage *message = + [[NSURLSessionWebSocketMessage alloc] initWithData:pcmFrame]; + [self.webSocketTask + sendMessage:message + completionHandler:^(NSError *_Nullable error) { + if (error) { + [self reportError:error]; + } else { + NSLog(@"[VoiceChatWebSocketClient] Sent audio frame: %lu bytes", + (unsigned long)pcmFrame.length); + } + }]; + }); +} + +- (void)endAudio { + NSLog(@"[VoiceChatWebSocketClient] Sending audio_end"); + [self sendJSON:@{ @"type" : @"audio_end" }]; +} + +- (void)cancel { + NSLog(@"[VoiceChatWebSocketClient] 
Sending cancel"); + [self sendJSON:@{ @"type" : @"cancel" }]; +} + +#pragma mark - Private Methods + +- (NSURL *)buildURLWithToken:(NSString *)token { + if (self.serverURL.length == 0) { + return nil; + } + + NSURLComponents *components = + [NSURLComponents componentsWithString:self.serverURL]; + if (!components) { + return nil; + } + + if (token.length > 0) { + NSMutableArray *items = + components.queryItems.mutableCopy ?: [NSMutableArray array]; + BOOL didReplace = NO; + for (NSUInteger i = 0; i < items.count; i++) { + NSURLQueryItem *item = items[i]; + if ([item.name isEqualToString:@"token"]) { + items[i] = [NSURLQueryItem queryItemWithName:@"token" value:token]; + didReplace = YES; + break; + } + } + if (!didReplace) { + [items addObject:[NSURLQueryItem queryItemWithName:@"token" + value:token]]; + } + components.queryItems = items; + } + + return components.URL; +} + +- (void)sendJSON:(NSDictionary *)dict { + if (!self.webSocketTask) { + return; + } + + NSError *jsonError = nil; + NSData *jsonData = [NSJSONSerialization dataWithJSONObject:dict + options:0 + error:&jsonError]; + if (jsonError) { + [self reportError:jsonError]; + return; + } + + NSString *jsonString = + [[NSString alloc] initWithData:jsonData + encoding:NSUTF8StringEncoding]; + if (!jsonString) { + [self reportErrorWithMessage:@"Failed to encode JSON message"]; + return; + } + + dispatch_async(self.networkQueue, ^{ + NSURLSessionWebSocketMessage *message = + [[NSURLSessionWebSocketMessage alloc] initWithString:jsonString]; + [self.webSocketTask + sendMessage:message + completionHandler:^(NSError *_Nullable error) { + if (error) { + [self reportError:error]; + } + }]; + }); +} + +- (void)receiveMessage { + if (!self.webSocketTask) { + return; + } + + __weak typeof(self) weakSelf = self; + [self.webSocketTask receiveMessageWithCompletionHandler:^( + NSURLSessionWebSocketMessage *_Nullable message, + NSError *_Nullable error) { + __strong typeof(weakSelf) strongSelf = weakSelf; + if (!strongSelf) { + return; + } + + if (error) { + if (error.code != NSURLErrorCancelled && error.code != 57) { + [strongSelf notifyDisconnect:error]; + [strongSelf disconnectInternal]; + } + return; + } + + if (message.type == NSURLSessionWebSocketMessageTypeString) { + NSLog(@"[VoiceChatWebSocketClient] Received text: %@", message.string); + [strongSelf handleTextMessage:message.string]; + } else if (message.type == NSURLSessionWebSocketMessageTypeData) { + NSLog(@"[VoiceChatWebSocketClient] Received binary: %lu bytes", + (unsigned long)message.data.length); + [strongSelf handleBinaryMessage:message.data]; + } + + [strongSelf receiveMessage]; + }]; +} + +- (void)handleTextMessage:(NSString *)text { + if (text.length == 0) { + return; + } + + NSData *data = [text dataUsingEncoding:NSUTF8StringEncoding]; + if (!data) { + return; + } + + NSError *jsonError = nil; + NSDictionary *json = [NSJSONSerialization JSONObjectWithData:data + options:0 + error:&jsonError]; + if (jsonError) { + [self reportError:jsonError]; + return; + } + + NSString *type = json[@"type"]; + if (type.length == 0) { + return; + } + + if ([type isEqualToString:@"session_started"]) { + NSString *sessionId = json[@"session_id"] ?: @""; + self.sessionId = sessionId; + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatClientDidStartSession:)]) { + [self.delegate voiceChatClientDidStartSession:sessionId]; + } + }); + } else if ([type isEqualToString:@"transcript_interim"]) { + NSString *transcript = json[@"text"] ?: @""; + 
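// interim 文本可能被后续消息覆盖,仅用于实时 UI 展示;最终结果以 transcript_final 为准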
dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatClientDidReceiveInterimTranscript:)]) { + [self.delegate voiceChatClientDidReceiveInterimTranscript:transcript]; + } + }); + } else if ([type isEqualToString:@"transcript_final"]) { + NSString *transcript = json[@"text"] ?: @""; + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatClientDidReceiveFinalTranscript:)]) { + [self.delegate voiceChatClientDidReceiveFinalTranscript:transcript]; + } + }); + } else if ([type isEqualToString:@"turn_start"]) { + NSInteger turnIndex = [json[@"turn_index"] integerValue]; + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatClientDidStartTurn:)]) { + [self.delegate voiceChatClientDidStartTurn:turnIndex]; + } + }); + } else if ([type isEqualToString:@"eager_eot"]) { + NSString *transcript = json[@"transcript"] ?: @""; + double confidence = [json[@"confidence"] doubleValue]; + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatClientDidReceiveEagerEndOfTurnWithTranscript: + confidence:)]) { + [self.delegate + voiceChatClientDidReceiveEagerEndOfTurnWithTranscript:transcript + confidence:confidence]; + } + }); + } else if ([type isEqualToString:@"turn_resumed"]) { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatClientDidResumeTurn)]) { + [self.delegate voiceChatClientDidResumeTurn]; + } + }); + } else if ([type isEqualToString:@"llm_start"]) { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate + respondsToSelector:@selector(voiceChatClientDidReceiveLLMStart)]) { + [self.delegate voiceChatClientDidReceiveLLMStart]; + } + }); + } else if ([type isEqualToString:@"llm_token"]) { + NSString *token = json[@"token"] ?: @""; + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate + respondsToSelector:@selector(voiceChatClientDidReceiveLLMToken:)]) { + [self.delegate voiceChatClientDidReceiveLLMToken:token]; + } + }); + } else if ([type isEqualToString:@"complete"]) { + NSString *transcript = json[@"transcript"] ?: @""; + NSString *aiResponse = json[@"ai_response"] ?: @""; + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatClientDidCompleteWithTranscript: + aiResponse:)]) { + [self.delegate voiceChatClientDidCompleteWithTranscript:transcript + aiResponse:aiResponse]; + } + }); + } else if ([type isEqualToString:@"error"]) { + NSString *code = json[@"code"] ?: @""; + NSString *message = json[@"message"] ?: @""; + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatClientDidReceiveErrorCode:message:)]) { + [self.delegate voiceChatClientDidReceiveErrorCode:code + message:message]; + } + }); + } +} + +- (void)handleBinaryMessage:(NSData *)data { + if (data.length == 0) { + return; + } + + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate + respondsToSelector:@selector(voiceChatClientDidReceiveAudioChunk:)]) { + [self.delegate voiceChatClientDidReceiveAudioChunk:data]; + } + }); +} + +- (void)disconnectInternal { + self.connected = NO; + self.sessionId = nil; + self.audioSendingEnabled = NO; + + if (self.webSocketTask) { + [self.webSocketTask + cancelWithCloseCode:NSURLSessionWebSocketCloseCodeNormalClosure + reason:nil]; + self.webSocketTask = nil; + } + + if (self.urlSession) { + 
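// NSURLSession 会强引用其 delegate,必须 invalidateAndCancel 才能打破强引用并释放会话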
[self.urlSession invalidateAndCancel]; + self.urlSession = nil; + } +} + +- (void)reportError:(NSError *)error { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector(voiceChatClientDidFail:)]) { + [self.delegate voiceChatClientDidFail:error]; + } + }); +} + +- (void)reportErrorWithMessage:(NSString *)message { + NSError *error = [NSError errorWithDomain:kVoiceChatWebSocketClientErrorDomain + code:-1 + userInfo:@{ + NSLocalizedDescriptionKey : message ?: @"" + }]; + [self reportError:error]; +} + +- (void)notifyDisconnect:(NSError *_Nullable)error { + self.connected = NO; + + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (voiceChatClientDidDisconnect:)]) { + [self.delegate voiceChatClientDidDisconnect:error]; + } + }); +} + +#pragma mark - NSURLSessionWebSocketDelegate + +- (void)URLSession:(NSURLSession *)session + webSocketTask:(NSURLSessionWebSocketTask *)webSocketTask + didOpenWithProtocol:(NSString *)protocol { + self.connected = YES; + NSLog(@"[VoiceChatWebSocketClient] Connected"); + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector(voiceChatClientDidConnect)]) { + [self.delegate voiceChatClientDidConnect]; + } + }); +} + +- (void)URLSession:(NSURLSession *)session + webSocketTask:(NSURLSessionWebSocketTask *)webSocketTask + didCloseWithCode:(NSURLSessionWebSocketCloseCode)closeCode + reason:(NSData *)reason { + if (!self.webSocketTask) { + return; + } + NSLog(@"[VoiceChatWebSocketClient] Closed with code: %ld", + (long)closeCode); + [self notifyDisconnect:nil]; + [self disconnectInternal]; +} + +@end diff --git a/keyBoard/Class/AiTalk/websocket-api.md b/keyBoard/Class/AiTalk/websocket-api.md new file mode 100644 index 0000000..1373f4e --- /dev/null +++ b/keyBoard/Class/AiTalk/websocket-api.md @@ -0,0 +1,771 @@ +# 实时语音对话 WebSocket API 文档 + +> Version: 2.0.0 (Flux) +> Last Updated: 2026-01-21 +> Author: Backend Team + +--- + +## 概述 + +本文档描述实时语音对话 WebSocket API,用于 iOS 客户端与后端进行实时语音交互。 + +**v2.0 更新**: 升级为 Deepgram Flux 模型,支持智能轮次检测和 EagerEndOfTurn 提前响应。 + +### 核心特性 +- **智能轮次检测**: Flux 模型语义理解,自动判断用户说完(非简单静默检测) +- **EagerEndOfTurn**: 提前启动 LLM 响应,进一步降低延迟 +- **实时语音识别**: 边说边识别,实时显示转写文本 +- **流式响应**: AI 响应边生成边返回,无需等待完整响应 +- **流式音频**: TTS 音频边合成边播放,极低延迟 +- **Barge-in 支持**: 用户可以打断 AI 说话 + +### 性能指标 +| 指标 | 目标值 | 说明 | +|------|--------|------| +| 端点检测延迟 | ~260ms | Flux 智能检测 | +| TTFA (首音频延迟) | < 300ms | EagerEndOfTurn 优化 | +| 端到端延迟 | < 1.5秒 | 完整对话周期 | +| 实时转写延迟 | < 100ms | 中间结果 | + +--- + +## 连接信息 + +### WebSocket 端点 + +``` +生产环境: wss://api.yourdomain.com/api/ws/chat?token={sa_token} +开发环境: ws://localhost:7529/api/ws/chat?token={sa_token} +``` + +### 认证方式 + +通过 URL Query 参数传递 Sa-Token: + +``` +ws://host:port/api/ws/chat?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9... +``` + +| 参数 | 类型 | 必填 | 描述 | +|------|------|------|------| +| token | String | ✅ | Sa-Token 登录令牌,通过 Apple Sign-In 获取 | + +### 认证失败 + +如果 token 无效或过期,WebSocket 连接将被拒绝(HTTP 403)。 + +--- + +## 消息格式 + +### 通用规则 + +1. **文本消息**: JSON 格式,用于控制指令和状态通知 +2. **二进制消息**: 原始字节,用于音频数据传输 +3. **编码**: UTF-8 + +--- + +## 客户端 → 服务端消息 + +### 1. 
开始会话 (session_start)
+
+**发送时机**: 建立 WebSocket 连接后,准备开始录音前
+
+```json
+{
+  "type": "session_start",
+  "config": {
+    "language": "en",
+    "voice_id": "a5zfmqTslZJBP0jutmVY"
+  }
+}
+```
+
+| 字段 | 类型 | 必填 | 描述 |
+|------|------|------|------|
+| type | String | ✅ | 固定值 `session_start` |
+| config | Object | ❌ | 会话配置(可选) |
+| config.language | String | ❌ | 语音识别语言,默认 `en` |
+| config.voice_id | String | ❌ | TTS 声音 ID,默认使用服务端配置 |
+
+**响应**: 服务端返回 `session_started` 消息
+
+---
+
+### 2. 音频数据 (Binary)
+
+**发送时机**: 用户正在录音时,持续发送音频数据
+
+**格式**: Binary WebSocket Frame,直接发送原始音频字节
+
+**音频规格要求**:
+
+| 参数 | 值 | 说明 |
+|------|------|------|
+| 编码格式 | PCM (Linear16) | 未压缩的脉冲编码调制 |
+| 采样率 | 16000 Hz | 16kHz |
+| 位深度 | 16-bit | 有符号整数 |
+| 声道数 | 1 (Mono) | 单声道 |
+| 字节序 | Little-Endian | 小端序 |
+
+**iOS 代码示例**:
+
+```swift
+// AVAudioEngine 配置
+let format = AVAudioFormat(
+    commonFormat: .pcmFormatInt16,
+    sampleRate: 16000,
+    channels: 1,
+    interleaved: true
+)!
+
+// 发送音频数据
+// 注意:inputNode 的 tap 格式需与硬件输入格式一致;实机上通常先按硬件格式采样,
+// 再用 AVAudioConverter 转为 16kHz / Int16,此处仅为示意
+audioEngine.inputNode.installTap(
+    onBus: 0,
+    bufferSize: 1024,
+    format: format
+) { buffer, time in
+    let audioData = buffer.int16ChannelData![0]
+    let byteCount = Int(buffer.frameLength) * 2 // 16-bit = 2 bytes
+    let data = Data(bytes: audioData, count: byteCount)
+
+    webSocket.write(data: data)
+}
+```
+
+**发送频率**: 建议每 20-100ms 发送一次,每次约 640-3200 字节(16 kHz × 16-bit × 单声道 ≈ 32 字节/ms)
+
+---
+
+### 3. 结束录音 (audio_end)
+
+**发送时机**: 用户停止录音(松开录音按钮)
+
+```json
+{
+  "type": "audio_end"
+}
+```
+
+| 字段 | 类型 | 必填 | 描述 |
+|------|------|------|------|
+| type | String | ✅ | 固定值 `audio_end` |
+
+**说明**: 发送此消息后,服务端将完成语音识别并开始生成 AI 响应
+
+---
+
+### 4. 取消会话 (cancel)
+
+**发送时机**: 用户主动取消对话(如点击取消按钮)
+
+```json
+{
+  "type": "cancel"
+}
+```
+
+| 字段 | 类型 | 必填 | 描述 |
+|------|------|------|------|
+| type | String | ✅ | 固定值 `cancel` |
+
+**说明**: 服务端将停止所有处理,不再返回任何消息
+
+---
+
+## 服务端 → 客户端消息
+
+### 1. 会话已启动 (session_started)
+
+**接收时机**: 发送 `session_start` 后
+
+```json
+{
+  "type": "session_started",
+  "session_id": "abc123-def456-ghi789"
+}
+```
+
+| 字段 | 类型 | 描述 |
+|------|------|------|
+| type | String | 固定值 `session_started` |
+| session_id | String | 服务端分配的会话 ID |
+
+**客户端处理**: 收到此消息后,可以开始发送音频数据
+
+---
+
+### 2. 轮次开始 (turn_start) 🆕
+
+**接收时机**: 用户开始说话时(Flux 检测到语音活动)
+
+```json
+{
+  "type": "turn_start",
+  "turn_index": 0
+}
+```
+
+| 字段 | 类型 | 描述 |
+|------|------|------|
+| type | String | 固定值 `turn_start` |
+| turn_index | Integer | 当前轮次索引(从 0 开始) |
+
+**客户端处理**:
+- 可显示"正在听..."状态
+- 准备接收转写结果
+
+---
+
+### 3. 中间转写结果 (transcript_interim)
+
+**接收时机**: 用户说话过程中,实时返回
+
+```json
+{
+  "type": "transcript_interim",
+  "text": "Hello how are",
+  "is_final": false
+}
+```
+
+| 字段 | 类型 | 描述 |
+|------|------|------|
+| type | String | 固定值 `transcript_interim` |
+| text | String | 当前识别到的文本(可能会变化) |
+| is_final | Boolean | 固定为 `false` |
+
+**客户端处理**:
+- 实时更新 UI 显示转写文本
+- 此文本可能会被后续消息覆盖
+- 可用于显示"正在识别..."效果
+
+---
+
+### 4. 最终转写结果 (transcript_final)
+
+**接收时机**: 一句话识别完成时
+
+```json
+{
+  "type": "transcript_final",
+  "text": "Hello, how are you?"
+}
+```
+
+| 字段 | 类型 | 描述 |
+|------|------|------|
+| type | String | 固定值 `transcript_final` |
+| text | String | 最终确定的转写文本 |
+
+**客户端处理**:
+- 用此文本替换之前的中间结果
+- 此文本不会再变化
+
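+下面是一个处理中间 / 最终转写的最小客户端示意(`transcriptLabel`、`messages` 为示例中假设的 UI 与数据结构,并非本接口定义),仅供参考:
+
+```swift
+// 示意:interim 只刷新临时显示,final 才写入对话历史
+func handleTranscript(_ json: [String: Any]) {
+    guard let type = json["type"] as? String,
+          let text = json["text"] as? String else { return }
+    switch type {
+    case "transcript_interim":
+        transcriptLabel.text = text            // 可能被后续消息覆盖
+    case "transcript_final":
+        transcriptLabel.text = text            // 最终文本,不再变化
+        messages.append("user: \(text)")       // 写入对话历史(假设 messages: [String])
+    default:
+        break
+    }
+}
+```
+
+---
+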
+### 5. 提前端点检测 (eager_eot) 🆕
+
+**接收时机**: Flux 检测到用户可能说完时(置信度达到阈值)
+
+```json
+{
+  "type": "eager_eot",
+  "transcript": "Hello, how are you",
+  "confidence": 0.65
+}
+```
+
+| 字段 | 类型 | 描述 |
+|------|------|------|
+| type | String | 固定值 `eager_eot` |
+| transcript | String | 当前转写文本 |
+| confidence | Double | 端点置信度 (0.0-1.0) |
+
+**客户端处理**:
+- 这是一个**预测性事件**,表示用户可能说完了
+- 服务端已开始提前准备 LLM 响应
+- 可显示"准备响应..."状态
+- **注意**: 用户可能继续说话,此时会收到 `turn_resumed`
+
+---
+
+### 6. 轮次恢复 (turn_resumed) 🆕
+
+**接收时机**: 收到 `eager_eot` 后,用户继续说话
+
+```json
+{
+  "type": "turn_resumed"
+}
+```
+
+| 字段 | 类型 | 描述 |
+|------|------|------|
+| type | String | 固定值 `turn_resumed` |
+
+**客户端处理**:
+- 用户继续说话,之前的 `eager_eot` 是误判
+- 服务端已取消正在准备的草稿响应
+- 恢复"正在听..."状态
+- 继续接收 `transcript_interim` 更新
+
+---
+
+### 7. LLM 开始生成 (llm_start)
+
+**接收时机**: 语音识别完成,AI 开始生成响应
+
+```json
+{
+  "type": "llm_start"
+}
+```
+
+| 字段 | 类型 | 描述 |
+|------|------|------|
+| type | String | 固定值 `llm_start` |
+
+**客户端处理**:
+- 可显示"AI 正在思考..."状态
+- 准备接收 AI 响应文本和音频
+
+---
+
+### 8. LLM Token (llm_token)
+
+**接收时机**: AI 生成过程中,逐 token 返回
+
+```json
+{
+  "type": "llm_token",
+  "token": "Hi"
+}
+```
+
+| 字段 | 类型 | 描述 |
+|------|------|------|
+| type | String | 固定值 `llm_token` |
+| token | String | AI 输出的单个 token(词或字符片段) |
+
+**客户端处理**:
+- 可选择实现打字机效果
+- 逐个 token 追加显示 AI 响应文本
+- 如不需要打字效果,可忽略此消息
+
+---
+
+### 9. 音频数据 (Binary)
+
+**接收时机**: TTS 合成过程中,流式返回音频
+
+**格式**: Binary WebSocket Frame,MP3 音频块
+
+**音频规格**:
+
+| 参数 | 值 |
+|------|------|
+| 格式 | MP3 |
+| 采样率 | 44100 Hz |
+| 比特率 | 64 kbps |
+| 声道 | 单声道 |
+
+**客户端处理**:
+
+```swift
+// 使用 AVAudioEngine 或 AudioQueue 播放流式音频
+webSocket.onEvent = { event in
+    switch event {
+    case .binary(let data):
+        // 方案1: 追加到缓冲区,使用 AVAudioPlayerNode
+        audioBuffer.append(data)
+        playBufferedAudio()
+
+        // 方案2: 使用 AVAudioEngine + AVAudioCompressedBuffer
+        // 方案3: 累积后使用 AVAudioPlayer
+    default:
+        break // 其他事件略
+    }
+}
+```
+
+**重要提示**:
+- 音频是分块返回的,需要正确拼接或流式播放
+- 每个二进制消息是 MP3 数据的一部分
+- 收到 `complete` 消息后,音频传输完成
+
+---
+
+### 10. 处理完成 (complete)
+
+**接收时机**: AI 响应生成完成,所有音频已发送
+
+```json
+{
+  "type": "complete",
+  "transcript": "Hello, how are you?",
+  "ai_response": "Hi! I'm doing great, thanks for asking!"
+}
+```
+
+| 字段 | 类型 | 描述 |
+|------|------|------|
+| type | String | 固定值 `complete` |
+| transcript | String | 完整的用户语音转写文本 |
+| ai_response | String | 完整的 AI 响应文本 |
+
+**客户端处理**:
+- 更新 UI 显示完整对话
+- 可开始下一轮对话
+- 建议保存对话历史
+
+---
+
+### 11. 
错误 (error) + +**接收时机**: 处理过程中发生错误 + +```json +{ + "type": "error", + "code": "DEEPGRAM_ERROR", + "message": "Speech recognition failed" +} +``` + +| 字段 | 类型 | 描述 | +|------|------|------| +| type | String | 固定值 `error` | +| code | String | 错误代码 | +| message | String | 错误描述 | + +**错误代码列表**: + +| 错误代码 | 描述 | 建议处理 | +|----------|------|----------| +| PARSE_ERROR | 消息解析失败 | 检查消息格式 | +| DEEPGRAM_ERROR | 语音识别服务错误 | 重试或提示用户 | +| DEEPGRAM_INIT_ERROR | 语音识别初始化失败 | 重新开始会话 | +| LLM_ERROR | AI 生成错误 | 重试或提示用户 | +| PIPELINE_ERROR | 处理流程错误 | 重新开始会话 | +| EMPTY_TRANSCRIPT | 未检测到语音 | 提示用户重新说话 | + +**客户端处理**: +- 显示友好的错误提示 +- 根据错误类型决定是否重试 + +--- + +## 完整交互流程 + +### 时序图 + +``` +iOS Client Server + | | + |------ WebSocket Connect --------->| + | ?token=xxx | + | | + |<-------- Connected ---------------| + | | + |------ session_start ------------->| + | | + |<----- session_started ------------| + | {session_id: "abc"} | + | | + |======= 用户开始说话 ===============| + | | + |------ Binary (audio) ------------>| + |------ Binary (audio) ------------>| + |<----- transcript_interim ---------| + | {text: "Hello"} | + |------ Binary (audio) ------------>| + |<----- transcript_interim ---------| + | {text: "Hello how"} | + |------ Binary (audio) ------------>| + |<----- transcript_final -----------| + | {text: "Hello, how are you?"}| + | | + |======= 用户停止说话 ===============| + | | + |------ audio_end ----------------->| + | | + |<----- llm_start ------------------| + | | + |<----- llm_token ------------------| + | {token: "Hi"} | + |<----- llm_token ------------------| + | {token: "!"} | + |<----- Binary (mp3) ---------------| + |<----- Binary (mp3) ---------------| + |<----- llm_token ------------------| + | {token: " I'm"} | + |<----- Binary (mp3) ---------------| + | ... | + |<----- complete -------------------| + | {transcript, ai_response} | + | | + |======= 可以开始下一轮 =============| + | | +``` + +--- + +## iOS 代码示例 + +### 完整 Swift 实现 + +```swift +import Foundation +import Starscream // WebSocket 库 + +class VoiceChatManager: WebSocketDelegate { + + private var socket: WebSocket? + private var audioBuffer = Data() + + // MARK: - 回调 + var onSessionStarted: ((String) -> Void)? + var onTranscriptInterim: ((String) -> Void)? + var onTranscriptFinal: ((String) -> Void)? + var onLLMStart: (() -> Void)? + var onLLMToken: ((String) -> Void)? + var onAudioChunk: ((Data) -> Void)? + var onComplete: ((String, String) -> Void)? + var onError: ((String, String) -> Void)? + + // MARK: - 连接 + func connect(token: String) { + let urlString = "wss://api.yourdomain.com/api/ws/chat?token=\(token)" + guard let url = URL(string: urlString) else { return } + + var request = URLRequest(url: url) + request.timeoutInterval = 30 + + socket = WebSocket(request: request) + socket?.delegate = self + socket?.connect() + } + + func disconnect() { + socket?.disconnect() + socket = nil + } + + // MARK: - 发送消息 + func startSession(language: String = "en", voiceId: String? = nil) { + var config: [String: Any] = ["language": language] + if let voiceId = voiceId { + config["voice_id"] = voiceId + } + + let message: [String: Any] = [ + "type": "session_start", + "config": config + ] + + sendJSON(message) + } + + func sendAudio(_ data: Data) { + socket?.write(data: data) + } + + func endAudio() { + sendJSON(["type": "audio_end"]) + } + + func cancel() { + sendJSON(["type": "cancel"]) + } + + private func sendJSON(_ dict: [String: Any]) { + guard let data = try? 
JSONSerialization.data(withJSONObject: dict), + let string = String(data: data, encoding: .utf8) else { return } + socket?.write(string: string) + } + + // MARK: - WebSocketDelegate + func didReceive(event: WebSocketEvent, client: WebSocketClient) { + switch event { + case .connected(_): + print("WebSocket connected") + + case .disconnected(let reason, let code): + print("WebSocket disconnected: \(reason) (\(code))") + + case .text(let text): + handleTextMessage(text) + + case .binary(let data): + // 收到 MP3 音频数据 + onAudioChunk?(data) + + case .error(let error): + print("WebSocket error: \(error?.localizedDescription ?? "unknown")") + + default: + break + } + } + + private func handleTextMessage(_ text: String) { + guard let data = text.data(using: .utf8), + let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any], + let type = json["type"] as? String else { return } + + switch type { + case "session_started": + if let sessionId = json["session_id"] as? String { + onSessionStarted?(sessionId) + } + + case "transcript_interim": + if let text = json["text"] as? String { + onTranscriptInterim?(text) + } + + case "transcript_final": + if let text = json["text"] as? String { + onTranscriptFinal?(text) + } + + case "llm_start": + onLLMStart?() + + case "llm_token": + if let token = json["token"] as? String { + onLLMToken?(token) + } + + case "complete": + if let transcript = json["transcript"] as? String, + let aiResponse = json["ai_response"] as? String { + onComplete?(transcript, aiResponse) + } + + case "error": + if let code = json["code"] as? String, + let message = json["message"] as? String { + onError?(code, message) + } + + default: + print("Unknown message type: \(type)") + } + } +} +``` + +### 使用示例 + +```swift +class VoiceChatViewController: UIViewController { + + let chatManager = VoiceChatManager() + let audioRecorder = AudioRecorder() // 自定义录音类 + let audioPlayer = StreamingAudioPlayer() // 自定义流式播放类 + + override func viewDidLoad() { + super.viewDidLoad() + setupCallbacks() + } + + func setupCallbacks() { + chatManager.onSessionStarted = { [weak self] sessionId in + print("Session started: \(sessionId)") + // 开始录音 + self?.audioRecorder.start { audioData in + self?.chatManager.sendAudio(audioData) + } + } + + chatManager.onTranscriptInterim = { [weak self] text in + self?.transcriptLabel.text = text + "..." + } + + chatManager.onTranscriptFinal = { [weak self] text in + self?.transcriptLabel.text = text + } + + chatManager.onLLMStart = { [weak self] in + self?.statusLabel.text = "AI is thinking..." + } + + chatManager.onLLMToken = { [weak self] token in + self?.aiResponseLabel.text = (self?.aiResponseLabel.text ?? "") + token + } + + chatManager.onAudioChunk = { [weak self] data in + self?.audioPlayer.appendData(data) + } + + chatManager.onComplete = { [weak self] transcript, aiResponse in + self?.statusLabel.text = "Complete" + self?.addToHistory(user: transcript, ai: aiResponse) + } + + chatManager.onError = { [weak self] code, message in + self?.showError(message) + } + } + + @IBAction func startTapped(_ sender: UIButton) { + // 连接并开始会话 + chatManager.connect(token: AuthManager.shared.saToken) + chatManager.onSessionStarted = { [weak self] _ in + self?.chatManager.startSession() + } + } + + @IBAction func stopTapped(_ sender: UIButton) { + audioRecorder.stop() + chatManager.endAudio() + } + + @IBAction func cancelTapped(_ sender: UIButton) { + audioRecorder.stop() + audioPlayer.stop() + chatManager.cancel() + } +} +``` + +--- + +## 注意事项 + +### 1. 
音频录制
+- 必须使用 PCM 16-bit, 16kHz, Mono 格式
+- 建议每 20-100ms 发送一次音频数据
+- 录音权限需要在 Info.plist 中声明(`NSMicrophoneUsageDescription`)
+
+### 2. 音频播放
+- 返回的是 MP3 格式音频块
+- 需要实现流式播放或缓冲播放(缓冲播放的最小示例见文末附录)
+- 建议使用 AVAudioEngine 实现低延迟播放
+
+### 3. 网络处理
+- 实现自动重连机制
+- 处理网络切换场景
+- 设置合理的超时时间
+
+### 4. 用户体验
+- 显示实时转写文本
+- 显示 AI 响应状态
+- 提供取消按钮
+- 处理录音权限被拒绝的情况
+
+### 5. 调试建议
+- 生产环境使用 `wss://` 确保传输安全
+- 本地开发可使用 `ws://`
+- 检查 Sa-Token 是否过期
+
+---
+
+## 版本历史
+
+| 版本 | 日期 | 变更 |
+|------|------|------|
+| 1.0.0 | 2026-01-21 | 初始版本 |
+| 2.0.0 | 2026-01-21 | 升级为 Deepgram Flux,新增 `turn_start` / `eager_eot` / `turn_resumed` 事件 |
+
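+---
+
+## 附录:缓冲播放示例(参考)
+
+下面给出注意事项第 2 条提到的"缓冲播放"的最小 Swift 示意:累积所有 MP3 块,收到 `complete` 后一次性播放。这是延迟最高但实现最简单的方案("方案 3");真正的低延迟流式播放需要自行解码 MP3(如 AudioFileStream / AVAudioConverter),此处不展开。`BufferedMP3Player` 为示例命名,并非本接口定义。
+
+```swift
+import AVFoundation
+
+/// 最简缓冲播放:收到 complete 后整段播放
+final class BufferedMP3Player {
+    private var buffer = Data()
+    private var player: AVAudioPlayer?
+
+    /// 收到二进制音频块时追加
+    func appendChunk(_ data: Data) {
+        buffer.append(data)
+    }
+
+    /// 收到 complete 消息后调用
+    func playBuffered() {
+        guard !buffer.isEmpty else { return }
+        do {
+            player = try AVAudioPlayer(data: buffer) // AVAudioPlayer 可直接播放 MP3 数据
+            player?.prepareToPlay()
+            player?.play()
+        } catch {
+            print("MP3 playback failed: \(error)")
+        }
+        buffer = Data()
+    }
+
+    func stop() {
+        player?.stop()
+        player = nil
+        buffer = Data()
+    }
+}
+```
+
+配合前文 `VoiceChatManager` 使用:在 `onAudioChunk` 中调用 `appendChunk(_:)`,在 `onComplete` 中调用 `playBuffered()`。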