2026-01-21 17:25:38 +08:00
parent d1d47336c2
commit 36c0b0b210
10 changed files with 1877 additions and 10 deletions

View File

@@ -195,6 +195,9 @@
04D1F6B22EDFF10A00B12345 /* KBSkinInstallBridge.m in Sources */ = {isa = PBXBuildFile; fileRef = 04D1F6B12EDFF10A00B12345 /* KBSkinInstallBridge.m */; };
04D1F6B32EDFF10A00B12345 /* KBSkinInstallBridge.m in Sources */ = {isa = PBXBuildFile; fileRef = 04D1F6B12EDFF10A00B12345 /* KBSkinInstallBridge.m */; };
04E0383E2F1A7C30002CA5A0 /* KBCustomTabBar.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E0383D2F1A7C30002CA5A0 /* KBCustomTabBar.m */; };
04E038D82F20BFFB002CA5A0 /* websocket-api.md in Resources */ = {isa = PBXBuildFile; fileRef = 04E038D72F20BFFB002CA5A0 /* websocket-api.md */; };
04E038DD2F20C420002CA5A0 /* VoiceChatStreamingManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E038DA2F20C420002CA5A0 /* VoiceChatStreamingManager.m */; };
04E038DE2F20C420002CA5A0 /* VoiceChatWebSocketClient.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E038DC2F20C420002CA5A0 /* VoiceChatWebSocketClient.m */; };
04E161832F10E6470022C23B /* normal_hei_them.zip in Resources */ = {isa = PBXBuildFile; fileRef = 04E161812F10E6470022C23B /* normal_hei_them.zip */; };
04E161842F10E6470022C23B /* normal_them.zip in Resources */ = {isa = PBXBuildFile; fileRef = 04E161822F10E6470022C23B /* normal_them.zip */; };
04FC95672EB0546C007BD342 /* KBKey.m in Sources */ = {isa = PBXBuildFile; fileRef = 04FC95652EB0546C007BD342 /* KBKey.m */; };
@@ -608,6 +611,11 @@
04D1F6B12EDFF10A00B12345 /* KBSkinInstallBridge.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBSkinInstallBridge.m; sourceTree = "<group>"; };
04E0383C2F1A7C30002CA5A0 /* KBCustomTabBar.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KBCustomTabBar.h; sourceTree = "<group>"; };
04E0383D2F1A7C30002CA5A0 /* KBCustomTabBar.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBCustomTabBar.m; sourceTree = "<group>"; };
04E038D72F20BFFB002CA5A0 /* websocket-api.md */ = {isa = PBXFileReference; lastKnownFileType = net.daringfireball.markdown; path = "websocket-api.md"; sourceTree = "<group>"; };
04E038D92F20C420002CA5A0 /* VoiceChatStreamingManager.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = VoiceChatStreamingManager.h; sourceTree = "<group>"; };
04E038DA2F20C420002CA5A0 /* VoiceChatStreamingManager.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = VoiceChatStreamingManager.m; sourceTree = "<group>"; };
04E038DB2F20C420002CA5A0 /* VoiceChatWebSocketClient.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = VoiceChatWebSocketClient.h; sourceTree = "<group>"; };
04E038DC2F20C420002CA5A0 /* VoiceChatWebSocketClient.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = VoiceChatWebSocketClient.m; sourceTree = "<group>"; };
04E161812F10E6470022C23B /* normal_hei_them.zip */ = {isa = PBXFileReference; lastKnownFileType = archive.zip; path = normal_hei_them.zip; sourceTree = "<group>"; };
04E161822F10E6470022C23B /* normal_them.zip */ = {isa = PBXFileReference; lastKnownFileType = archive.zip; path = normal_them.zip; sourceTree = "<group>"; };
04FC953A2EAFAE56007BD342 /* KeyBoardPrefixHeader.pch */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KeyBoardPrefixHeader.pch; sourceTree = "<group>"; };
@@ -986,6 +994,10 @@
046086AE2F19239B00757C95 /* TTSPlaybackPipeline.m */,
046086AF2F19239B00757C95 /* TTSServiceClient.h */,
046086B02F19239B00757C95 /* TTSServiceClient.m */,
04E038D92F20C420002CA5A0 /* VoiceChatStreamingManager.h */,
04E038DA2F20C420002CA5A0 /* VoiceChatStreamingManager.m */,
04E038DB2F20C420002CA5A0 /* VoiceChatWebSocketClient.h */,
04E038DC2F20C420002CA5A0 /* VoiceChatWebSocketClient.m */,
);
path = VM;
sourceTree = "<group>";
@@ -994,6 +1006,7 @@
isa = PBXGroup;
children = (
046086742F191CC700757C95 /* AI技术分析.txt */,
04E038D72F20BFFB002CA5A0 /* websocket-api.md */,
0460866C2F191A5100757C95 /* M */,
0460866D2F191A5100757C95 /* V */,
0460866E2F191A5100757C95 /* VC */,
@@ -2003,6 +2016,7 @@
buildActionMask = 2147483647;
files = (
04286A0F2ECDA71B00CE730C /* 001.zip in Resources */,
04E038D82F20BFFB002CA5A0 /* websocket-api.md in Resources */,
0479200B2ED87CEE004E8522 /* permiss_video.mp4 in Resources */,
04C6EABA2EAF86530089C901 /* Assets.xcassets in Resources */,
04A9FE212EB893F10020DB6D /* Localizable.strings in Resources */,
@@ -2235,6 +2249,8 @@
05A1B2D22F5B1A2B3C4D5E60 /* KBSearchThemeModel.m in Sources */,
047C65102EBCA8DD0035E841 /* HomeRankContentVC.m in Sources */,
047C655C2EBCD0F80035E841 /* UIView+KBShadow.m in Sources */,
04E038DD2F20C420002CA5A0 /* VoiceChatStreamingManager.m in Sources */,
04E038DE2F20C420002CA5A0 /* VoiceChatWebSocketClient.m in Sources */,
049FB2262EC3136D00FAB05D /* KBPersonInfoItemCell.m in Sources */,
048908C32EBE32B800FABA60 /* KBSearchVC.m in Sources */,
049FB20B2EC1C13800FAB05D /* KBSkinBottomActionView.m in Sources */,

View File

@@ -11,8 +11,11 @@
#import "KBAiChatView.h" #import "KBAiChatView.h"
#import "KBAiRecordButton.h" #import "KBAiRecordButton.h"
#import "LSTPopView.h" #import "LSTPopView.h"
#import "VoiceChatStreamingManager.h"
#import "KBUserSessionManager.h"
@interface KBAiMainVC () <KBAiRecordButtonDelegate, VoiceChatStreamingManagerDelegate>
@property(nonatomic, weak) LSTPopView *popView;
// UI
@@ -28,6 +31,13 @@
//
@property(nonatomic, strong) ConversationOrchestrator *orchestrator;
@property(nonatomic, strong) VoiceChatStreamingManager *streamingManager;
// Accumulated assistant text currently shown in the chat view
@property(nonatomic, strong) NSMutableString *assistantVisibleText;
// Timestamp of the last RMS log, used to throttle logging
@property(nonatomic, assign) NSTimeInterval lastRMSLogTime;
@end
@@ -44,6 +54,7 @@
[self setupUI];
[self setupOrchestrator];
[self setupStreamingManager];
}
- (void)viewWillAppear:(BOOL)animated {
@@ -56,6 +67,7 @@
//
[self.orchestrator stop];
[self.streamingManager disconnect];
}
- (void)viewDidLayoutSubviews {
@@ -184,11 +196,15 @@
- (void)setupOrchestrator {
self.orchestrator = [[ConversationOrchestrator alloc] init];
// Configure the service endpoints
// 1. ASR WebSocket
self.orchestrator.asrServerURL = @"ws://192.168.2.21:7529/ws/asr";
// 2. LLM HTTP stream
self.orchestrator.llmServerURL = @"http://192.168.2.21:7529/api/chat/stream";
// 3. TTS HTTP
self.orchestrator.ttsServerURL = @"http://192.168.2.21:7529/api/tts/stream";
__weak typeof(self) weakSelf = self;
@@ -278,6 +294,16 @@
};
}
#pragma mark - Streaming Manager
- (void)setupStreamingManager {
self.streamingManager = [[VoiceChatStreamingManager alloc] init];
self.streamingManager.delegate = self;
self.streamingManager.serverURL = @"ws://192.168.2.21:7529/api/ws/chat";
self.assistantVisibleText = [[NSMutableString alloc] init];
self.lastRMSLogTime = 0;
}
#pragma mark -
- (void)showComment {
CGFloat customViewHeight = KB_SCREEN_HEIGHT * (0.8);
@@ -367,16 +393,112 @@
#pragma mark - KBAiRecordButtonDelegate
- (void)recordButtonDidBeginPress:(KBAiRecordButton *)button {
NSLog(@"[KBAiMainVC] Record button began press");
NSString *token = [[KBUserSessionManager shared] accessToken] ?: @"";
if (token.length == 0) {
[[KBUserSessionManager shared] goLoginVC];
return;
}
self.statusLabel.text = @"正在连接...";
self.recordButton.state = KBAiRecordButtonStateRecording;
[self.streamingManager startWithToken:token language:@"en" voiceId:nil];
}
- (void)recordButtonDidEndPress:(KBAiRecordButton *)button {
NSLog(@"[KBAiMainVC] Record button ended press");
[self.streamingManager stopAndFinalize];
}
- (void)recordButtonDidCancelPress:(KBAiRecordButton *)button {
NSLog(@"[KBAiMainVC] Record button cancelled press");
[self.streamingManager cancel];
}
#pragma mark - VoiceChatStreamingManagerDelegate
- (void)voiceChatStreamingManagerDidConnect {
self.statusLabel.text = @"已连接,准备中...";
}
- (void)voiceChatStreamingManagerDidDisconnect:(NSError *_Nullable)error {
self.recordButton.state = KBAiRecordButtonStateNormal;
if (error) {
[self showError:error];
}
}
- (void)voiceChatStreamingManagerDidStartSession:(NSString *)sessionId {
self.statusLabel.text = @"正在聆听...";
self.recordButton.state = KBAiRecordButtonStateRecording;
}
- (void)voiceChatStreamingManagerDidStartTurn:(NSInteger)turnIndex {
self.statusLabel.text = @"正在聆听...";
self.recordButton.state = KBAiRecordButtonStateRecording;
}
- (void)voiceChatStreamingManagerDidReceiveEagerEndOfTurnWithTranscript:(NSString *)text
confidence:(double)confidence {
self.statusLabel.text = @"准备响应...";
}
- (void)voiceChatStreamingManagerDidResumeTurn {
self.statusLabel.text = @"正在聆听...";
}
- (void)voiceChatStreamingManagerDidUpdateRMS:(float)rms {
[self.recordButton updateVolumeRMS:rms];
NSTimeInterval now = [[NSDate date] timeIntervalSince1970];
if (now - self.lastRMSLogTime >= 1.0) {
self.lastRMSLogTime = now;
NSLog(@"[KBAiMainVC] RMS: %.3f", rms);
}
}
- (void)voiceChatStreamingManagerDidReceiveInterimTranscript:(NSString *)text {
self.statusLabel.text = text.length > 0 ? text : @"正在识别...";
}
- (void)voiceChatStreamingManagerDidReceiveFinalTranscript:(NSString *)text {
if (text.length > 0) {
[self.chatView addUserMessage:text];
}
}
- (void)voiceChatStreamingManagerDidReceiveLLMStart {
self.statusLabel.text = @"AI 正在思考...";
[self.assistantVisibleText setString:@""];
[self.chatView addAssistantMessage:@""];
}
- (void)voiceChatStreamingManagerDidReceiveLLMToken:(NSString *)token {
if (token.length == 0) {
return;
}
[self.assistantVisibleText appendString:token];
[self.chatView updateLastAssistantMessage:self.assistantVisibleText];
}
- (void)voiceChatStreamingManagerDidReceiveAudioChunk:(NSData *)audioData {
// Streamed TTS audio is not played in this controller yet.
}
- (void)voiceChatStreamingManagerDidCompleteWithTranscript:(NSString *)transcript
aiResponse:(NSString *)aiResponse {
NSString *finalText = aiResponse.length > 0 ? aiResponse
: self.assistantVisibleText;
if (finalText.length > 0) {
[self.chatView updateLastAssistantMessage:finalText];
[self.chatView markLastAssistantMessageComplete];
}
self.recordButton.state = KBAiRecordButtonStateNormal;
self.statusLabel.text = @"完成";
}
- (void)voiceChatStreamingManagerDidFail:(NSError *)error {
self.recordButton.state = KBAiRecordButtonStateNormal;
[self showError:error];
}
@end

View File

@@ -90,6 +90,11 @@
// TTS Client
self.ttsClient = [[TTSServiceClient alloc] init];
self.ttsClient.delegate = self;
// ElevenLabs voice configuration
self.ttsClient.voiceId = @"JBFqnCBsd6RMkjVDRZzb"; // George
self.ttsClient.languageCode = @"zh"; // Chinese
self.ttsClient.expectedPayloadType =
TTSPayloadTypeURL; // expect a URL payload from the server
// Playback Pipeline
self.playbackPipeline = [[TTSPlaybackPipeline alloc] init];

View File

@@ -41,6 +41,12 @@ typedef NS_ENUM(NSInteger, TTSPayloadType) {
/// TTS server URL
@property(nonatomic, copy) NSString *serverURL;
/// Voice ID (ElevenLabs voice ID)
@property(nonatomic, copy) NSString *voiceId;
/// Language code (e.g. "zh", "en")
@property(nonatomic, copy) NSString *languageCode;
/// Expected payload type (determined by the server configuration)
@property(nonatomic, assign) TTSPayloadType expectedPayloadType;

View File

@@ -94,6 +94,8 @@
NSDictionary *body = @{
@"text" : text,
@"segmentId" : segmentId,
@"voiceId" : self.voiceId ?: @"JBFqnCBsd6RMkjVDRZzb",
@"languageCode" : self.languageCode ?: @"zh",
@"format" : @"mp3" // or m4a
};
@@ -184,6 +186,8 @@
NSDictionary *requestDict = @{
@"text" : text,
@"segmentId" : segmentId,
@"voiceId" : self.voiceId ?: @"JBFqnCBsd6RMkjVDRZzb",
@"languageCode" : self.languageCode ?: @"zh",
@"format" : [self formatStringForPayloadType:self.expectedPayloadType]
};

View File

@@ -0,0 +1,53 @@
//
// VoiceChatStreamingManager.h
// keyBoard
//
// Created by Mac on 2026/1/21.
//
#import <Foundation/Foundation.h>
NS_ASSUME_NONNULL_BEGIN
@protocol VoiceChatStreamingManagerDelegate <NSObject>
@optional
- (void)voiceChatStreamingManagerDidConnect;
- (void)voiceChatStreamingManagerDidDisconnect:(NSError *_Nullable)error;
- (void)voiceChatStreamingManagerDidStartSession:(NSString *)sessionId;
- (void)voiceChatStreamingManagerDidStartTurn:(NSInteger)turnIndex;
- (void)voiceChatStreamingManagerDidReceiveEagerEndOfTurnWithTranscript:(NSString *)text
confidence:(double)confidence;
- (void)voiceChatStreamingManagerDidResumeTurn;
- (void)voiceChatStreamingManagerDidUpdateRMS:(float)rms;
- (void)voiceChatStreamingManagerDidReceiveInterimTranscript:(NSString *)text;
- (void)voiceChatStreamingManagerDidReceiveFinalTranscript:(NSString *)text;
- (void)voiceChatStreamingManagerDidReceiveLLMStart;
- (void)voiceChatStreamingManagerDidReceiveLLMToken:(NSString *)token;
- (void)voiceChatStreamingManagerDidReceiveAudioChunk:(NSData *)audioData;
- (void)voiceChatStreamingManagerDidCompleteWithTranscript:(NSString *)transcript
aiResponse:(NSString *)aiResponse;
- (void)voiceChatStreamingManagerDidFail:(NSError *)error;
@end
/// Manager for realtime recording and streaming.
@interface VoiceChatStreamingManager : NSObject
@property(nonatomic, weak) id<VoiceChatStreamingManagerDelegate> delegate;
/// Base WebSocket URL, e.g. wss://api.yourdomain.com/api/ws/chat
@property(nonatomic, copy) NSString *serverURL;
@property(nonatomic, assign, readonly, getter=isStreaming) BOOL streaming;
@property(nonatomic, copy, readonly, nullable) NSString *sessionId;
- (void)startWithToken:(NSString *)token
language:(nullable NSString *)language
voiceId:(nullable NSString *)voiceId;
- (void)stopAndFinalize;
- (void)cancel;
- (void)disconnect;
@end
NS_ASSUME_NONNULL_END

View File

@@ -0,0 +1,376 @@
//
// VoiceChatStreamingManager.m
// keyBoard
//
// Created by Mac on 2026/1/21.
//
#import "VoiceChatStreamingManager.h"
#import "AudioCaptureManager.h"
#import "AudioSessionManager.h"
#import "VoiceChatWebSocketClient.h"
static NSString *const kVoiceChatStreamingManagerErrorDomain =
@"VoiceChatStreamingManager";
@interface VoiceChatStreamingManager () <AudioSessionManagerDelegate,
AudioCaptureManagerDelegate,
VoiceChatWebSocketClientDelegate>
@property(nonatomic, strong) AudioSessionManager *audioSession;
@property(nonatomic, strong) AudioCaptureManager *audioCapture;
@property(nonatomic, strong) VoiceChatWebSocketClient *webSocketClient;
@property(nonatomic, strong) dispatch_queue_t stateQueue;
@property(nonatomic, assign) BOOL streaming;
@property(nonatomic, copy) NSString *sessionId;
@property(nonatomic, copy) NSString *pendingToken;
@property(nonatomic, copy) NSString *pendingLanguage;
@property(nonatomic, copy) NSString *pendingVoiceId;
@end
@implementation VoiceChatStreamingManager
- (instancetype)init {
self = [super init];
if (self) {
_stateQueue = dispatch_queue_create("com.keyboard.aitalk.voicechat.manager",
DISPATCH_QUEUE_SERIAL);
_audioSession = [AudioSessionManager sharedManager];
_audioSession.delegate = self;
_audioCapture = [[AudioCaptureManager alloc] init];
_audioCapture.delegate = self;
_webSocketClient = [[VoiceChatWebSocketClient alloc] init];
_webSocketClient.delegate = self;
_serverURL = @"ws://192.168.2.21:7529/api/ws/chat?token=";
_webSocketClient.serverURL = _serverURL;
}
return self;
}
- (void)dealloc {
[self disconnect];
}
- (void)setServerURL:(NSString *)serverURL {
_serverURL = [serverURL copy];
self.webSocketClient.serverURL = _serverURL;
}
#pragma mark - Public Methods
- (void)startWithToken:(NSString *)token
language:(nullable NSString *)language
voiceId:(nullable NSString *)voiceId {
dispatch_async(self.stateQueue, ^{
self.pendingToken = token ?: @"";
self.pendingLanguage = language ?: @"";
self.pendingVoiceId = voiceId ?: @"";
[self.webSocketClient disableAudioSending];
[self startInternal];
});
}
- (void)stopAndFinalize {
dispatch_async(self.stateQueue, ^{
if (self.streaming) {
[self.audioCapture stopCapture];
self.streaming = NO;
}
[self.webSocketClient disableAudioSending];
[self.webSocketClient endAudio];
});
}
- (void)cancel {
dispatch_async(self.stateQueue, ^{
if (self.streaming) {
[self.audioCapture stopCapture];
self.streaming = NO;
}
[self.webSocketClient disableAudioSending];
[self.webSocketClient cancel];
self.sessionId = nil;
});
}
- (void)disconnect {
dispatch_async(self.stateQueue, ^{
if (self.streaming) {
[self.audioCapture stopCapture];
self.streaming = NO;
}
[self.webSocketClient disableAudioSending];
[self.webSocketClient disconnect];
[self.audioSession deactivateSession];
self.sessionId = nil;
});
}
#pragma mark - Private Methods
- (void)startInternal {
if (self.pendingToken.length == 0) {
NSLog(@"[VoiceChatStreamingManager] Start failed: token is empty");
[self reportErrorWithMessage:@"Token is required"];
return;
}
if (![self.audioSession hasMicrophonePermission]) {
__weak typeof(self) weakSelf = self;
[self.audioSession requestMicrophonePermission:^(BOOL granted) {
__strong typeof(weakSelf) strongSelf = weakSelf;
if (!strongSelf) {
return;
}
if (!granted) {
[strongSelf reportErrorWithMessage:@"Microphone permission denied"];
return;
}
dispatch_async(strongSelf.stateQueue, ^{
[strongSelf startInternal];
});
}];
return;
}
NSError *error = nil;
if (![self.audioSession configureForConversation:&error]) {
[self reportError:error];
return;
}
if (![self.audioSession activateSession:&error]) {
[self reportError:error];
return;
}
if (self.serverURL.length == 0) {
NSLog(@"[VoiceChatStreamingManager] Start failed: server URL is empty");
[self reportErrorWithMessage:@"Server URL is required"];
return;
}
NSLog(@"[VoiceChatStreamingManager] Start streaming, server: %@",
self.serverURL);
self.webSocketClient.serverURL = self.serverURL;
[self.webSocketClient connectWithToken:self.pendingToken];
}
- (void)reportError:(NSError *)error {
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatStreamingManagerDidFail:)]) {
[self.delegate voiceChatStreamingManagerDidFail:error];
}
});
}
- (void)reportErrorWithMessage:(NSString *)message {
NSError *error = [NSError errorWithDomain:kVoiceChatStreamingManagerErrorDomain
code:-1
userInfo:@{
NSLocalizedDescriptionKey : message ?: @""
}];
[self reportError:error];
}
#pragma mark - AudioCaptureManagerDelegate
- (void)audioCaptureManagerDidOutputPCMFrame:(NSData *)pcmFrame {
if (!self.streaming) {
return;
}
[self.webSocketClient sendAudioPCMFrame:pcmFrame];
}
- (void)audioCaptureManagerDidUpdateRMS:(float)rms {
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatStreamingManagerDidUpdateRMS:)]) {
[self.delegate voiceChatStreamingManagerDidUpdateRMS:rms];
}
});
}
#pragma mark - AudioSessionManagerDelegate
- (void)audioSessionManagerDidInterrupt:(KBAudioSessionInterruptionType)type {
if (type == KBAudioSessionInterruptionTypeBegan) {
[self cancel];
}
}
- (void)audioSessionManagerMicrophonePermissionDenied {
[self reportErrorWithMessage:@"Microphone permission denied"];
}
#pragma mark - VoiceChatWebSocketClientDelegate
- (void)voiceChatClientDidConnect {
dispatch_async(self.stateQueue, ^{
[self.webSocketClient startSessionWithLanguage:self.pendingLanguage
voiceId:self.pendingVoiceId];
});
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatStreamingManagerDidConnect)]) {
[self.delegate voiceChatStreamingManagerDidConnect];
}
});
}
- (void)voiceChatClientDidDisconnect:(NSError *_Nullable)error {
dispatch_async(self.stateQueue, ^{
if (self.streaming) {
[self.audioCapture stopCapture];
self.streaming = NO;
}
[self.audioSession deactivateSession];
self.sessionId = nil;
});
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatStreamingManagerDidDisconnect:)]) {
[self.delegate voiceChatStreamingManagerDidDisconnect:error];
}
});
}
- (void)voiceChatClientDidStartSession:(NSString *)sessionId {
dispatch_async(self.stateQueue, ^{
self.sessionId = sessionId;
NSError *error = nil;
if (![self.audioCapture startCapture:&error]) {
[self reportError:error];
[self.webSocketClient cancel];
return;
}
self.streaming = YES;
[self.webSocketClient enableAudioSending];
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatStreamingManagerDidStartSession:)]) {
[self.delegate voiceChatStreamingManagerDidStartSession:sessionId];
}
});
});
}
- (void)voiceChatClientDidStartTurn:(NSInteger)turnIndex {
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatStreamingManagerDidStartTurn:)]) {
[self.delegate voiceChatStreamingManagerDidStartTurn:turnIndex];
}
});
}
- (void)voiceChatClientDidReceiveEagerEndOfTurnWithTranscript:(NSString *)text
confidence:(double)confidence {
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate
respondsToSelector:@selector
(voiceChatStreamingManagerDidReceiveEagerEndOfTurnWithTranscript:
confidence:)]) {
[self.delegate
voiceChatStreamingManagerDidReceiveEagerEndOfTurnWithTranscript:text
confidence:confidence];
}
});
}
- (void)voiceChatClientDidResumeTurn {
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatStreamingManagerDidResumeTurn)]) {
[self.delegate voiceChatStreamingManagerDidResumeTurn];
}
});
}
- (void)voiceChatClientDidReceiveInterimTranscript:(NSString *)text {
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatStreamingManagerDidReceiveInterimTranscript:)]) {
[self.delegate voiceChatStreamingManagerDidReceiveInterimTranscript:text];
}
});
}
- (void)voiceChatClientDidReceiveFinalTranscript:(NSString *)text {
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatStreamingManagerDidReceiveFinalTranscript:)]) {
[self.delegate voiceChatStreamingManagerDidReceiveFinalTranscript:text];
}
});
}
- (void)voiceChatClientDidReceiveLLMStart {
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatStreamingManagerDidReceiveLLMStart)]) {
[self.delegate voiceChatStreamingManagerDidReceiveLLMStart];
}
});
}
- (void)voiceChatClientDidReceiveLLMToken:(NSString *)token {
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatStreamingManagerDidReceiveLLMToken:)]) {
[self.delegate voiceChatStreamingManagerDidReceiveLLMToken:token];
}
});
}
- (void)voiceChatClientDidReceiveAudioChunk:(NSData *)audioData {
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatStreamingManagerDidReceiveAudioChunk:)]) {
[self.delegate voiceChatStreamingManagerDidReceiveAudioChunk:audioData];
}
});
}
- (void)voiceChatClientDidCompleteWithTranscript:(NSString *)transcript
aiResponse:(NSString *)aiResponse {
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatStreamingManagerDidCompleteWithTranscript:
aiResponse:)]) {
[self.delegate voiceChatStreamingManagerDidCompleteWithTranscript:transcript
aiResponse:aiResponse];
}
});
}
- (void)voiceChatClientDidReceiveErrorCode:(NSString *)code
message:(NSString *)message {
NSString *desc = message.length > 0 ? message : @"Server error";
NSError *error = [NSError errorWithDomain:kVoiceChatStreamingManagerErrorDomain
code:-2
userInfo:@{
NSLocalizedDescriptionKey : desc,
@"code" : code ?: @""
}];
[self reportError:error];
}
- (void)voiceChatClientDidFail:(NSError *)error {
[self reportError:error];
}
@end

View File

@@ -0,0 +1,57 @@
//
// VoiceChatWebSocketClient.h
// keyBoard
//
// Created by Mac on 2026/1/21.
//
#import <Foundation/Foundation.h>
NS_ASSUME_NONNULL_BEGIN
@protocol VoiceChatWebSocketClientDelegate <NSObject>
@optional
- (void)voiceChatClientDidConnect;
- (void)voiceChatClientDidDisconnect:(NSError *_Nullable)error;
- (void)voiceChatClientDidStartSession:(NSString *)sessionId;
- (void)voiceChatClientDidStartTurn:(NSInteger)turnIndex;
- (void)voiceChatClientDidReceiveEagerEndOfTurnWithTranscript:(NSString *)text
confidence:(double)confidence;
- (void)voiceChatClientDidResumeTurn;
- (void)voiceChatClientDidReceiveInterimTranscript:(NSString *)text;
- (void)voiceChatClientDidReceiveFinalTranscript:(NSString *)text;
- (void)voiceChatClientDidReceiveLLMStart;
- (void)voiceChatClientDidReceiveLLMToken:(NSString *)token;
- (void)voiceChatClientDidReceiveAudioChunk:(NSData *)audioData;
- (void)voiceChatClientDidCompleteWithTranscript:(NSString *)transcript
aiResponse:(NSString *)aiResponse;
- (void)voiceChatClientDidReceiveErrorCode:(NSString *)code
message:(NSString *)message;
- (void)voiceChatClientDidFail:(NSError *)error;
@end
/// WebSocket client for realtime voice chat.
@interface VoiceChatWebSocketClient : NSObject
@property(nonatomic, weak) id<VoiceChatWebSocketClientDelegate> delegate;
/// Base WebSocket URL, e.g. wss://api.yourdomain.com/api/ws/chat
@property(nonatomic, copy) NSString *serverURL;
@property(nonatomic, assign, readonly, getter=isConnected) BOOL connected;
@property(nonatomic, copy, readonly, nullable) NSString *sessionId;
- (void)connectWithToken:(NSString *)token;
- (void)disconnect;
- (void)startSessionWithLanguage:(nullable NSString *)language
voiceId:(nullable NSString *)voiceId;
- (void)enableAudioSending;
- (void)disableAudioSending;
- (void)sendAudioPCMFrame:(NSData *)pcmFrame;
- (void)endAudio;
- (void)cancel;
@end
NS_ASSUME_NONNULL_END

View File

@@ -0,0 +1,457 @@
//
// VoiceChatWebSocketClient.m
// keyBoard
//
// Created by Mac on 2026/1/21.
//
#import "VoiceChatWebSocketClient.h"
static NSString *const kVoiceChatWebSocketClientErrorDomain =
@"VoiceChatWebSocketClient";
@interface VoiceChatWebSocketClient () <NSURLSessionWebSocketDelegate>
@property(nonatomic, strong) NSURLSession *urlSession;
@property(nonatomic, strong) NSURLSessionWebSocketTask *webSocketTask;
@property(nonatomic, strong) dispatch_queue_t networkQueue;
@property(nonatomic, assign) BOOL connected;
@property(nonatomic, copy) NSString *sessionId;
@property(nonatomic, assign) BOOL audioSendingEnabled;
@end
@implementation VoiceChatWebSocketClient
- (instancetype)init {
self = [super init];
if (self) {
_networkQueue = dispatch_queue_create("com.keyboard.aitalk.voicechat.ws",
DISPATCH_QUEUE_SERIAL);
_serverURL = @"wss://api.yourdomain.com/api/ws/chat";
_audioSendingEnabled = NO;
}
return self;
}
- (void)dealloc {
[self disconnect];
}
#pragma mark - Public Methods
- (void)connectWithToken:(NSString *)token {
dispatch_async(self.networkQueue, ^{
[self disconnectInternal];
NSURL *url = [self buildURLWithToken:token];
if (!url) {
[self reportErrorWithMessage:@"Invalid server URL"];
return;
}
NSLog(@"[VoiceChatWebSocketClient] Connecting: %@", url.absoluteString);
NSURLSessionConfiguration *config =
[NSURLSessionConfiguration defaultSessionConfiguration];
config.timeoutIntervalForRequest = 30;
config.timeoutIntervalForResource = 300;
self.urlSession = [NSURLSession sessionWithConfiguration:config
delegate:self
delegateQueue:nil];
self.webSocketTask = [self.urlSession webSocketTaskWithURL:url];
[self.webSocketTask resume];
[self receiveMessage];
});
}
- (void)disconnect {
dispatch_async(self.networkQueue, ^{
BOOL shouldNotify = self.webSocketTask != nil;
if (shouldNotify) {
NSLog(@"[VoiceChatWebSocketClient] Disconnect requested");
}
[self disconnectInternal];
if (shouldNotify) {
[self notifyDisconnect:nil];
}
});
}
- (void)startSessionWithLanguage:(nullable NSString *)language
voiceId:(nullable NSString *)voiceId {
NSMutableDictionary *message = [NSMutableDictionary dictionary];
message[@"type"] = @"session_start";
NSMutableDictionary *config = [NSMutableDictionary dictionary];
if (language.length > 0) {
config[@"language"] = language;
}
if (voiceId.length > 0) {
config[@"voice_id"] = voiceId;
}
if (config.count > 0) {
message[@"config"] = config;
}
[self sendJSON:message];
}
- (void)enableAudioSending {
dispatch_async(self.networkQueue, ^{
self.audioSendingEnabled = YES;
});
}
- (void)disableAudioSending {
dispatch_async(self.networkQueue, ^{
self.audioSendingEnabled = NO;
});
}
- (void)sendAudioPCMFrame:(NSData *)pcmFrame {
if (!self.connected || !self.webSocketTask || pcmFrame.length == 0) {
return;
}
dispatch_async(self.networkQueue, ^{
if (!self.audioSendingEnabled) {
return;
}
if (!self.connected || !self.webSocketTask) {
return;
}
NSURLSessionWebSocketMessage *message =
[[NSURLSessionWebSocketMessage alloc] initWithData:pcmFrame];
[self.webSocketTask
sendMessage:message
completionHandler:^(NSError *_Nullable error) {
if (error) {
[self reportError:error];
} else {
NSLog(@"[VoiceChatWebSocketClient] Sent audio frame: %lu bytes",
(unsigned long)pcmFrame.length);
}
}];
});
}
- (void)endAudio {
NSLog(@"[VoiceChatWebSocketClient] Sending audio_end");
[self sendJSON:@{ @"type" : @"audio_end" }];
}
- (void)cancel {
NSLog(@"[VoiceChatWebSocketClient] Sending cancel");
[self sendJSON:@{ @"type" : @"cancel" }];
}
#pragma mark - Private Methods
- (NSURL *)buildURLWithToken:(NSString *)token {
if (self.serverURL.length == 0) {
return nil;
}
NSURLComponents *components =
[NSURLComponents componentsWithString:self.serverURL];
if (!components) {
return nil;
}
if (token.length > 0) {
NSMutableArray<NSURLQueryItem *> *items =
components.queryItems.mutableCopy ?: [NSMutableArray array];
BOOL didReplace = NO;
for (NSUInteger i = 0; i < items.count; i++) {
NSURLQueryItem *item = items[i];
if ([item.name isEqualToString:@"token"]) {
items[i] = [NSURLQueryItem queryItemWithName:@"token" value:token];
didReplace = YES;
break;
}
}
if (!didReplace) {
[items addObject:[NSURLQueryItem queryItemWithName:@"token"
value:token]];
}
components.queryItems = items;
}
return components.URL;
}
- (void)sendJSON:(NSDictionary *)dict {
if (!self.webSocketTask) {
return;
}
NSError *jsonError = nil;
NSData *jsonData = [NSJSONSerialization dataWithJSONObject:dict
options:0
error:&jsonError];
if (jsonError) {
[self reportError:jsonError];
return;
}
NSString *jsonString =
[[NSString alloc] initWithData:jsonData
encoding:NSUTF8StringEncoding];
if (!jsonString) {
[self reportErrorWithMessage:@"Failed to encode JSON message"];
return;
}
dispatch_async(self.networkQueue, ^{
NSURLSessionWebSocketMessage *message =
[[NSURLSessionWebSocketMessage alloc] initWithString:jsonString];
[self.webSocketTask
sendMessage:message
completionHandler:^(NSError *_Nullable error) {
if (error) {
[self reportError:error];
}
}];
});
}
- (void)receiveMessage {
if (!self.webSocketTask) {
return;
}
__weak typeof(self) weakSelf = self;
[self.webSocketTask receiveMessageWithCompletionHandler:^(
NSURLSessionWebSocketMessage *_Nullable message,
NSError *_Nullable error) {
__strong typeof(weakSelf) strongSelf = weakSelf;
if (!strongSelf) {
return;
}
if (error) {
// 57 = ENOTCONN ("Socket is not connected"), seen when the task was already torn down
if (error.code != NSURLErrorCancelled && error.code != 57) {
[strongSelf notifyDisconnect:error];
[strongSelf disconnectInternal];
}
return;
}
if (message.type == NSURLSessionWebSocketMessageTypeString) {
NSLog(@"[VoiceChatWebSocketClient] Received text: %@", message.string);
[strongSelf handleTextMessage:message.string];
} else if (message.type == NSURLSessionWebSocketMessageTypeData) {
NSLog(@"[VoiceChatWebSocketClient] Received binary: %lu bytes",
(unsigned long)message.data.length);
[strongSelf handleBinaryMessage:message.data];
}
[strongSelf receiveMessage];
}];
}
- (void)handleTextMessage:(NSString *)text {
if (text.length == 0) {
return;
}
NSData *data = [text dataUsingEncoding:NSUTF8StringEncoding];
if (!data) {
return;
}
NSError *jsonError = nil;
NSDictionary *json = [NSJSONSerialization JSONObjectWithData:data
options:0
error:&jsonError];
if (jsonError) {
[self reportError:jsonError];
return;
}
NSString *type = json[@"type"];
if (type.length == 0) {
return;
}
if ([type isEqualToString:@"session_started"]) {
NSString *sessionId = json[@"session_id"] ?: @"";
self.sessionId = sessionId;
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatClientDidStartSession:)]) {
[self.delegate voiceChatClientDidStartSession:sessionId];
}
});
} else if ([type isEqualToString:@"transcript_interim"]) {
NSString *transcript = json[@"text"] ?: @"";
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatClientDidReceiveInterimTranscript:)]) {
[self.delegate voiceChatClientDidReceiveInterimTranscript:transcript];
}
});
} else if ([type isEqualToString:@"transcript_final"]) {
NSString *transcript = json[@"text"] ?: @"";
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatClientDidReceiveFinalTranscript:)]) {
[self.delegate voiceChatClientDidReceiveFinalTranscript:transcript];
}
});
} else if ([type isEqualToString:@"turn_start"]) {
NSInteger turnIndex = [json[@"turn_index"] integerValue];
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatClientDidStartTurn:)]) {
[self.delegate voiceChatClientDidStartTurn:turnIndex];
}
});
} else if ([type isEqualToString:@"eager_eot"]) {
NSString *transcript = json[@"transcript"] ?: @"";
double confidence = [json[@"confidence"] doubleValue];
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatClientDidReceiveEagerEndOfTurnWithTranscript:
confidence:)]) {
[self.delegate
voiceChatClientDidReceiveEagerEndOfTurnWithTranscript:transcript
confidence:confidence];
}
});
} else if ([type isEqualToString:@"turn_resumed"]) {
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatClientDidResumeTurn)]) {
[self.delegate voiceChatClientDidResumeTurn];
}
});
} else if ([type isEqualToString:@"llm_start"]) {
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate
respondsToSelector:@selector(voiceChatClientDidReceiveLLMStart)]) {
[self.delegate voiceChatClientDidReceiveLLMStart];
}
});
} else if ([type isEqualToString:@"llm_token"]) {
NSString *token = json[@"token"] ?: @"";
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate
respondsToSelector:@selector(voiceChatClientDidReceiveLLMToken:)]) {
[self.delegate voiceChatClientDidReceiveLLMToken:token];
}
});
} else if ([type isEqualToString:@"complete"]) {
NSString *transcript = json[@"transcript"] ?: @"";
NSString *aiResponse = json[@"ai_response"] ?: @"";
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatClientDidCompleteWithTranscript:
aiResponse:)]) {
[self.delegate voiceChatClientDidCompleteWithTranscript:transcript
aiResponse:aiResponse];
}
});
} else if ([type isEqualToString:@"error"]) {
NSString *code = json[@"code"] ?: @"";
NSString *message = json[@"message"] ?: @"";
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatClientDidReceiveErrorCode:message:)]) {
[self.delegate voiceChatClientDidReceiveErrorCode:code
message:message];
}
});
}
}
- (void)handleBinaryMessage:(NSData *)data {
if (data.length == 0) {
return;
}
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate
respondsToSelector:@selector(voiceChatClientDidReceiveAudioChunk:)]) {
[self.delegate voiceChatClientDidReceiveAudioChunk:data];
}
});
}
- (void)disconnectInternal {
self.connected = NO;
self.sessionId = nil;
self.audioSendingEnabled = NO;
if (self.webSocketTask) {
[self.webSocketTask
cancelWithCloseCode:NSURLSessionWebSocketCloseCodeNormalClosure
reason:nil];
self.webSocketTask = nil;
}
if (self.urlSession) {
[self.urlSession invalidateAndCancel];
self.urlSession = nil;
}
}
- (void)reportError:(NSError *)error {
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector(voiceChatClientDidFail:)]) {
[self.delegate voiceChatClientDidFail:error];
}
});
}
- (void)reportErrorWithMessage:(NSString *)message {
NSError *error = [NSError errorWithDomain:kVoiceChatWebSocketClientErrorDomain
code:-1
userInfo:@{
NSLocalizedDescriptionKey : message ?: @""
}];
[self reportError:error];
}
- (void)notifyDisconnect:(NSError *_Nullable)error {
self.connected = NO;
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector
(voiceChatClientDidDisconnect:)]) {
[self.delegate voiceChatClientDidDisconnect:error];
}
});
}
#pragma mark - NSURLSessionWebSocketDelegate
- (void)URLSession:(NSURLSession *)session
webSocketTask:(NSURLSessionWebSocketTask *)webSocketTask
didOpenWithProtocol:(NSString *)protocol {
self.connected = YES;
NSLog(@"[VoiceChatWebSocketClient] Connected");
dispatch_async(dispatch_get_main_queue(), ^{
if ([self.delegate respondsToSelector:@selector(voiceChatClientDidConnect)]) {
[self.delegate voiceChatClientDidConnect];
}
});
}
- (void)URLSession:(NSURLSession *)session
webSocketTask:(NSURLSessionWebSocketTask *)webSocketTask
didCloseWithCode:(NSURLSessionWebSocketCloseCode)closeCode
reason:(NSData *)reason {
if (!self.webSocketTask) {
return;
}
NSLog(@"[VoiceChatWebSocketClient] Closed with code: %ld",
(long)closeCode);
[self notifyDisconnect:nil];
[self disconnectInternal];
}
@end

View File

@@ -0,0 +1,771 @@
# Realtime Voice Chat WebSocket API Documentation
> Version: 2.0.0 (Flux)
> Last Updated: 2026-01-21
> Author: Backend Team
---
## Overview
This document describes the realtime voice chat WebSocket API used by the iOS client for realtime voice interaction with the backend.
**v2.0 update**: upgraded to the Deepgram Flux model, adding intelligent turn detection and EagerEndOfTurn early responses.
### Core Features
- **Intelligent turn detection**: the Flux model uses semantic understanding to decide when the user has finished speaking (not simple silence detection)
- **EagerEndOfTurn**: the LLM response is started early to further reduce latency
- **Realtime speech recognition**: transcripts are produced and displayed while the user is still speaking
- **Streaming responses**: AI responses are returned as they are generated, with no need to wait for the full response
- **Streaming audio**: TTS audio is played as it is synthesized, for very low latency
- **Barge-in support**: the user can interrupt the AI while it is speaking
### Performance Targets
| Metric | Target | Notes |
|------|--------|------|
| Endpoint detection latency | ~260 ms | Flux intelligent detection |
| TTFA (time to first audio) | < 300 ms | EagerEndOfTurn optimization |
| End-to-end latency | < 1.5 s | Full conversation cycle |
| Realtime transcript latency | < 100 ms | Interim results |
---
## Connection
### WebSocket Endpoint
```
Production:  wss://api.yourdomain.com/api/ws/chat?token={sa_token}
Development: ws://localhost:7529/api/ws/chat?token={sa_token}
```
### Authentication
The Sa-Token is passed as a URL query parameter:
```
ws://host:port/api/ws/chat?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9...
```
| Parameter | Type | Required | Description |
|------|------|------|------|
| token | String | Yes | Sa-Token login token (obtained via Apple Sign-In) |
### Authentication Failure
If the token is invalid or expired, the WebSocket connection is rejected (HTTP 403).
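As a minimal sketch of the token handling (the helper name `makeChatURL` is illustrative, not part of the API), `URLComponents` can attach the token as a query item and take care of percent-encoding:

```swift
import Foundation

/// Builds the chat WebSocket URL with the Sa-Token attached as a query item.
/// `baseURL` is assumed to be one of the endpoints listed above.
func makeChatURL(baseURL: String, token: String) -> URL? {
    guard var components = URLComponents(string: baseURL) else { return nil }
    var items = components.queryItems ?? []
    items.removeAll { $0.name == "token" }          // avoid duplicate token parameters
    items.append(URLQueryItem(name: "token", value: token))
    components.queryItems = items                   // URLComponents percent-encodes the value
    return components.url
}

// Usage:
// let url = makeChatURL(baseURL: "ws://localhost:7529/api/ws/chat", token: saToken)
```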
---
## Message Format
### General Rules
1. **Text messages**: JSON, used for control commands and status notifications
2. **Binary messages**: raw bytes, used for audio data
3. **Encoding**: UTF-8
---
## Client → Server Messages
### 1. Start Session (session_start)
**When to send**: after the WebSocket connection is established, before recording begins
```json
{
"type": "session_start",
"config": {
"language": "en",
"voice_id": "a5zfmqTslZJBP0jutmVY"
}
}
```
| Field | Type | Required | Description |
|------|------|------|------|
| type | String | Yes | Fixed value `session_start` |
| config | Object | No | Session configuration (optional) |
| config.language | String | No | Speech recognition language, default `en` |
| config.voice_id | String | No | TTS voice ID, defaults to the server-side configuration |
**Response**: the server returns a `session_started` message
---
### 2. Audio Data (Binary)
**When to send**: continuously while the user is recording
**Format**: binary WebSocket frames carrying raw audio bytes
**Audio requirements**:
| Parameter | Value | Notes |
|------|------|------|
| Encoding | PCM (Linear16) | Uncompressed pulse-code modulation |
| Sample rate | 16000 Hz | 16 kHz |
| Bit depth | 16-bit | Signed integer |
| Channels | 1 (Mono) | Single channel |
| Byte order | Little-endian | |
**iOS code example**:
```swift
// AVAudioEngine configuration
let format = AVAudioFormat(
commonFormat: .pcmFormatInt16,
sampleRate: 16000,
channels: 1,
interleaved: true
)!
// Send audio data
// (if the input node's hardware format differs from 16 kHz Int16, convert with AVAudioConverter before sending)
audioEngine.inputNode.installTap(
onBus: 0,
bufferSize: 1024,
format: format
) { buffer, time in
let audioData = buffer.int16ChannelData![0]
let byteCount = Int(buffer.frameLength) * 2 // 16-bit = 2 bytes
let data = Data(bytes: audioData, count: byteCount)
webSocket.write(data: data)
}
```
**Send rate**: roughly every 20-100 ms; at 16 kHz, 16-bit mono this is 320-1600 samples (640-3200 bytes) per frame
---
### 3. End Recording (audio_end)
**When to send**: when the user stops recording (releases the record button)
```json
{
"type": "audio_end"
}
```
| Field | Type | Required | Description |
|------|------|------|------|
| type | String | Yes | Fixed value `audio_end` |
**Note**: after this message the server finalizes speech recognition and starts generating the AI response
---
### 4. Cancel Session (cancel)
**When to send**: when the user actively cancels the conversation (e.g. taps a cancel button)
```json
{
"type": "cancel"
}
```
| Field | Type | Required | Description |
|------|------|------|------|
| type | String | Yes | Fixed value `cancel` |
**Note**: the server stops all processing and sends no further messages
---
## Server → Client Messages
### 1. Session Started (session_started)
**When received**: after sending `session_start`
```json
{
"type": "session_started",
"session_id": "abc123-def456-ghi789"
}
```
| Field | Type | Description |
|------|------|------|
| type | String | Fixed value `session_started` |
| session_id | String | Session ID assigned by the server |
**Client handling**: after receiving this message the client may start sending audio data
---
### 2. Turn Start (turn_start) 🆕
**When received**: when the user starts speaking (Flux detects voice activity)
```json
{
"type": "turn_start",
"turn_index": 0
}
```
| Field | Type | Description |
|------|------|------|
| type | String | Fixed value `turn_start` |
| turn_index | Integer | Current turn index, starting at 0 |
**Client handling**:
- May show a "Listening..." state
- Prepare to receive transcript results
---
### 3. Interim Transcript (transcript_interim)
**When received**: continuously while the user is speaking
```json
{
"type": "transcript_interim",
"text": "Hello how are",
"is_final": false
}
```
| Field | Type | Description |
|------|------|------|
| type | String | Fixed value `transcript_interim` |
| text | String | Text recognized so far (may still change) |
| is_final | Boolean | Always `false` |
**Client handling**:
- Update the displayed transcript in real time
- This text may be overwritten by later messages
- Can be used for a "recognizing..." effect
---
### 4. Final Transcript (transcript_final)
**When received**: when recognition of an utterance is complete
```json
{
"type": "transcript_final",
"text": "Hello, how are you?"
}
```
| Field | Type | Description |
|------|------|------|
| type | String | Fixed value `transcript_final` |
| text | String | Final transcript text |
**Client handling**:
- Replace the previous interim result with this text
- This text will not change again
---
### 5. Eager End of Turn (eager_eot) 🆕
**When received**: when Flux detects that the user has probably finished speaking (confidence reaches the threshold)
```json
{
"type": "eager_eot",
"transcript": "Hello, how are you",
"confidence": 0.65
}
```
| Field | Type | Description |
|------|------|------|
| type | String | Fixed value `eager_eot` |
| transcript | String | Current transcript text |
| confidence | Double | End-of-turn confidence (0.0-1.0) |
**Client handling**:
- This is a **predictive event**: the user has probably finished speaking
- The server has already started preparing the LLM response
- May show a "Preparing response..." state
- **Note**: the user may continue speaking, in which case a `turn_resumed` follows
---
### 6. Turn Resumed (turn_resumed) 🆕
**When received**: when the user continues speaking after an `eager_eot`
```json
{
"type": "turn_resumed"
}
```
| Field | Type | Description |
|------|------|------|
| type | String | Fixed value `turn_resumed` |
**Client handling** (see the sketch below):
- The user kept speaking; the earlier `eager_eot` was a false positive
- The server has cancelled the draft response it was preparing
- Return to the "Listening..." state
- Continue receiving `transcript_interim` updates
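A minimal sketch of how a client might react to the Flux-specific events above; the helper function and the status strings are illustrative and not part of the API:

```swift
import UIKit

/// Minimal handling of the Flux-specific events (hypothetical helper).
func handleFluxEvent(_ json: [String: Any], statusLabel: UILabel) {
    guard let type = json["type"] as? String else { return }
    switch type {
    case "turn_start":
        // User started speaking
        statusLabel.text = "Listening..."
    case "eager_eot":
        // Predictive end of turn; the server is already drafting a response
        let confidence = json["confidence"] as? Double ?? 0
        statusLabel.text = "Preparing response... (\(Int(confidence * 100))%)"
    case "turn_resumed":
        // The eager_eot was a false positive; keep listening
        statusLabel.text = "Listening..."
    default:
        break
    }
}
```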
---
### 7. LLM Start (llm_start)
**When received**: speech recognition is complete and the AI starts generating a response
```json
{
"type": "llm_start"
}
```
| Field | Type | Description |
|------|------|------|
| type | String | Fixed value `llm_start` |
**Client handling**:
- May show an "AI is thinking..." state
- Prepare to receive the AI response text and audio
---
### 8. LLM Token (llm_token)
**When received**: repeatedly while the AI generates its response, one token at a time
```json
{
"type": "llm_token",
"token": "Hi"
}
```
| Field | Type | Description |
|------|------|------|
| type | String | Fixed value `llm_token` |
| token | String | A single token of AI output (a word or character fragment) |
**Client handling**:
- Optionally implement a typewriter effect
- Append tokens one by one to the displayed AI response text
- Can be ignored if no typing effect is needed
---
### 9. Audio Data (Binary)
**When received**: streamed while TTS synthesis is in progress
**Format**: binary WebSocket frames containing MP3 audio chunks
**Audio specification**:
| Parameter | Value |
|------|------|
| Format | MP3 |
| Sample rate | 44100 Hz |
| Bitrate | 64 kbps |
| Channels | Mono |
**Client handling**:
```swift
// Play the streamed audio with AVAudioEngine or AudioQueue
webSocket.onEvent = { event in
switch event {
case .binary(let data):
// Option 1: append to a buffer and play with AVAudioPlayerNode
audioBuffer.append(data)
playBufferedAudio()
// Option 2: AVAudioEngine + AVAudioCompressedBuffer
// Option 3: accumulate everything, then play with AVAudioPlayer
default:
break
}
}
```
**Important**:
- Audio arrives in chunks and must be concatenated correctly or played as a stream
- Each binary message is one part of the MP3 data
- Audio transmission is finished once the `complete` message arrives (a minimal playback sketch follows)
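Of the three options listed in the snippet above, the simplest is to accumulate the chunks and play them once `complete` arrives. A hedged sketch (class and method names are illustrative):

```swift
import AVFoundation

/// Accumulates streamed MP3 chunks and plays them when the turn is complete.
final class SimpleTTSPlayer {
    private var mp3Data = Data()
    private var player: AVAudioPlayer?

    func appendChunk(_ chunk: Data) {
        mp3Data.append(chunk)          // each binary frame is one slice of the MP3 stream
    }

    func playAndReset() {
        defer { mp3Data = Data() }
        guard !mp3Data.isEmpty else { return }
        player = try? AVAudioPlayer(data: mp3Data)
        player?.play()
    }
}

// Usage: call appendChunk(_:) for every binary frame,
// and playAndReset() when the "complete" message is received.
```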
---
### 10. Processing Complete (complete)
**When received**: the AI response has been fully generated and all audio has been sent
```json
{
"type": "complete",
"transcript": "Hello, how are you?",
"ai_response": "Hi! I'm doing great, thanks for asking!"
}
```
| Field | Type | Description |
|------|------|------|
| type | String | Fixed value `complete` |
| transcript | String | Full transcript of the user's speech |
| ai_response | String | Full AI response text |
**Client handling**:
- Update the UI with the complete exchange
- The next turn can now begin
- Saving the exchange to the conversation history is recommended
---
### 11. Error (error)
**When received**: whenever an error occurs during processing
```json
{
"type": "error",
"code": "DEEPGRAM_ERROR",
"message": "Speech recognition failed"
}
```
| Field | Type | Description |
|------|------|------|
| type | String | Fixed value `error` |
| code | String | Error code |
| message | String | Error description |
**Error codes**:
| Error code | Description | Suggested handling |
|----------|------|----------|
| PARSE_ERROR | Message could not be parsed | Check the message format |
| DEEPGRAM_ERROR | Speech recognition service error | Retry or notify the user |
| DEEPGRAM_INIT_ERROR | Speech recognition failed to initialize | Restart the session |
| LLM_ERROR | AI generation error | Retry or notify the user |
| PIPELINE_ERROR | Processing pipeline error | Restart the session |
| EMPTY_TRANSCRIPT | No speech detected | Ask the user to speak again |
**Client handling** (see the sketch below):
- Show a friendly error message
- Decide whether to retry based on the error type
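A minimal sketch of retry handling; the mapping from error code to action is an assumption based on the suggestions in the table above, not something the server mandates:

```swift
import Foundation

/// How the client reacts to a server error (mapping is an assumption based on the table above).
enum ErrorAction {
    case retry, restartSession, promptUser
}

func action(forErrorCode code: String) -> ErrorAction {
    switch code {
    case "DEEPGRAM_ERROR", "LLM_ERROR":
        return .retry                 // transient service errors: retry the turn
    case "DEEPGRAM_INIT_ERROR", "PIPELINE_ERROR":
        return .restartSession        // pipeline-level failures: start a new session
    case "EMPTY_TRANSCRIPT", "PARSE_ERROR":
        return .promptUser            // ask the user to speak again / fix the message
    default:
        return .promptUser
    }
}
```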
---
## Full Interaction Flow
### Sequence Diagram
```
iOS Client Server
| |
|------ WebSocket Connect --------->|
| ?token=xxx |
| |
|<-------- Connected ---------------|
| |
|------ session_start ------------->|
| |
|<----- session_started ------------|
| {session_id: "abc"} |
| |
|======= User starts speaking ======|
| |
|------ Binary (audio) ------------>|
|------ Binary (audio) ------------>|
|<----- transcript_interim ---------|
| {text: "Hello"} |
|------ Binary (audio) ------------>|
|<----- transcript_interim ---------|
| {text: "Hello how"} |
|------ Binary (audio) ------------>|
|<----- transcript_final -----------|
| {text: "Hello, how are you?"}|
| |
|======= User stops speaking =======|
| |
|------ audio_end ----------------->|
| |
|<----- llm_start ------------------|
| |
|<----- llm_token ------------------|
| {token: "Hi"} |
|<----- llm_token ------------------|
| {token: "!"} |
|<----- Binary (mp3) ---------------|
|<----- Binary (mp3) ---------------|
|<----- llm_token ------------------|
| {token: " I'm"} |
|<----- Binary (mp3) ---------------|
| ... |
|<----- complete -------------------|
| {transcript, ai_response} |
| |
|====== Next turn can begin ========|
| |
```
---
## iOS Code Example
### Full Swift Implementation
```swift
import Foundation
import Starscream // WebSocket library
class VoiceChatManager: WebSocketDelegate {
private var socket: WebSocket?
private var audioBuffer = Data()
// MARK: - Callbacks
var onConnected: (() -> Void)?
var onSessionStarted: ((String) -> Void)?
var onTranscriptInterim: ((String) -> Void)?
var onTranscriptFinal: ((String) -> Void)?
var onLLMStart: (() -> Void)?
var onLLMToken: ((String) -> Void)?
var onAudioChunk: ((Data) -> Void)?
var onComplete: ((String, String) -> Void)?
var onError: ((String, String) -> Void)?
// MARK: - Connection
func connect(token: String) {
let urlString = "wss://api.yourdomain.com/api/ws/chat?token=\(token)"
guard let url = URL(string: urlString) else { return }
var request = URLRequest(url: url)
request.timeoutInterval = 30
socket = WebSocket(request: request)
socket?.delegate = self
socket?.connect()
}
func disconnect() {
socket?.disconnect()
socket = nil
}
// MARK: - Sending Messages
func startSession(language: String = "en", voiceId: String? = nil) {
var config: [String: Any] = ["language": language]
if let voiceId = voiceId {
config["voice_id"] = voiceId
}
let message: [String: Any] = [
"type": "session_start",
"config": config
]
sendJSON(message)
}
func sendAudio(_ data: Data) {
socket?.write(data: data)
}
func endAudio() {
sendJSON(["type": "audio_end"])
}
func cancel() {
sendJSON(["type": "cancel"])
}
private func sendJSON(_ dict: [String: Any]) {
guard let data = try? JSONSerialization.data(withJSONObject: dict),
let string = String(data: data, encoding: .utf8) else { return }
socket?.write(string: string)
}
// MARK: - WebSocketDelegate
func didReceive(event: WebSocketEvent, client: WebSocketClient) {
switch event {
case .connected(_):
print("WebSocket connected")
case .disconnected(let reason, let code):
print("WebSocket disconnected: \(reason) (\(code))")
case .text(let text):
handleTextMessage(text)
case .binary(let data):
// Received MP3 audio data
onAudioChunk?(data)
case .error(let error):
print("WebSocket error: \(error?.localizedDescription ?? "unknown")")
default:
break
}
}
private func handleTextMessage(_ text: String) {
guard let data = text.data(using: .utf8),
let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
let type = json["type"] as? String else { return }
switch type {
case "session_started":
if let sessionId = json["session_id"] as? String {
onSessionStarted?(sessionId)
}
case "transcript_interim":
if let text = json["text"] as? String {
onTranscriptInterim?(text)
}
case "transcript_final":
if let text = json["text"] as? String {
onTranscriptFinal?(text)
}
case "llm_start":
onLLMStart?()
case "llm_token":
if let token = json["token"] as? String {
onLLMToken?(token)
}
case "complete":
if let transcript = json["transcript"] as? String,
let aiResponse = json["ai_response"] as? String {
onComplete?(transcript, aiResponse)
}
case "error":
if let code = json["code"] as? String,
let message = json["message"] as? String {
onError?(code, message)
}
default:
print("Unknown message type: \(type)")
}
}
}
```
### Usage Example
```swift
class VoiceChatViewController: UIViewController {
let chatManager = VoiceChatManager()
let audioRecorder = AudioRecorder() // custom recording class
let audioPlayer = StreamingAudioPlayer() // custom streaming playback class
override func viewDidLoad() {
super.viewDidLoad()
setupCallbacks()
}
func setupCallbacks() {
chatManager.onSessionStarted = { [weak self] sessionId in
print("Session started: \(sessionId)")
// Start recording
self?.audioRecorder.start { audioData in
self?.chatManager.sendAudio(audioData)
}
}
chatManager.onTranscriptInterim = { [weak self] text in
self?.transcriptLabel.text = text + "..."
}
chatManager.onTranscriptFinal = { [weak self] text in
self?.transcriptLabel.text = text
}
chatManager.onLLMStart = { [weak self] in
self?.statusLabel.text = "AI is thinking..."
}
chatManager.onLLMToken = { [weak self] token in
self?.aiResponseLabel.text = (self?.aiResponseLabel.text ?? "") + token
}
chatManager.onAudioChunk = { [weak self] data in
self?.audioPlayer.appendData(data)
}
chatManager.onComplete = { [weak self] transcript, aiResponse in
self?.statusLabel.text = "Complete"
self?.addToHistory(user: transcript, ai: aiResponse)
}
chatManager.onError = { [weak self] code, message in
self?.showError(message)
}
}
@IBAction func startTapped(_ sender: UIButton) {
// Connect, then start the session once the socket is open
chatManager.onConnected = { [weak self] in
self?.chatManager.startSession()
}
chatManager.connect(token: AuthManager.shared.saToken)
}
@IBAction func stopTapped(_ sender: UIButton) {
audioRecorder.stop()
chatManager.endAudio()
}
@IBAction func cancelTapped(_ sender: UIButton) {
audioRecorder.stop()
audioPlayer.stop()
chatManager.cancel()
}
}
```
---
## Notes
### 1. Audio Recording
- Audio must be PCM, 16-bit, 16 kHz, mono
- Send audio data every 20-100 ms
- The microphone permission must be declared in Info.plist (see the sketch below)
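A minimal sketch of the permission setup; the usage-description wording is illustrative:

```swift
import AVFoundation

// Info.plist must contain:
//   <key>NSMicrophoneUsageDescription</key>
//   <string>Used for realtime voice chat</string>

func requestMicrophoneAccess(_ completion: @escaping (Bool) -> Void) {
    AVAudioSession.sharedInstance().requestRecordPermission { granted in
        DispatchQueue.main.async {
            completion(granted)   // only start recording if granted is true
        }
    }
}
```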
### 2. Audio Playback
- The server returns MP3 audio chunks
- Implement streaming or buffered playback
- AVAudioEngine is recommended for low-latency playback
### 3. Networking
- Implement an automatic reconnection mechanism (a sketch follows this list)
- Handle network-switch scenarios (Wi-Fi/cellular)
- Use reasonable timeouts
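A minimal reconnection sketch with exponential backoff, reusing the `VoiceChatManager` from the example above; the delays and retry cap are illustrative, not required by the API:

```swift
import Foundation

final class ReconnectingChat {
    let chatManager = VoiceChatManager()
    private var retryCount = 0

    func connect(token: String) {
        chatManager.connect(token: token)
    }

    /// Call from the disconnect path (e.g. the .disconnected WebSocket event).
    func scheduleReconnect(token: String) {
        guard retryCount < 5 else { return }               // give up after a few attempts
        let delay = min(pow(2.0, Double(retryCount)), 30)  // 1, 2, 4, 8, 16 s, capped at 30 s
        retryCount += 1
        DispatchQueue.main.asyncAfter(deadline: .now() + delay) { [weak self] in
            self?.chatManager.connect(token: token)
        }
    }

    func resetBackoff() { retryCount = 0 }                 // call once connected again
}
```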
### 4. User Experience
- Show the realtime transcript
- Show the AI response status
- Provide a cancel button
- Handle the case where the microphone permission is denied
### 5. Debugging Tips
- Use `wss://` in production for transport security
- `ws://` may be used for local development
- Check whether the Sa-Token has expired
---
## Version History
| Version | Date | Changes |
|------|------|------|
| 1.0.0 | 2026-01-21 | Initial version |
| 2.0.0 | 2026-01-21 | Deepgram Flux upgrade: intelligent turn detection, eager_eot / turn_resumed events |