@@ -195,6 +195,9 @@
|
||||
04D1F6B22EDFF10A00B12345 /* KBSkinInstallBridge.m in Sources */ = {isa = PBXBuildFile; fileRef = 04D1F6B12EDFF10A00B12345 /* KBSkinInstallBridge.m */; };
|
||||
04D1F6B32EDFF10A00B12345 /* KBSkinInstallBridge.m in Sources */ = {isa = PBXBuildFile; fileRef = 04D1F6B12EDFF10A00B12345 /* KBSkinInstallBridge.m */; };
|
||||
04E0383E2F1A7C30002CA5A0 /* KBCustomTabBar.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E0383D2F1A7C30002CA5A0 /* KBCustomTabBar.m */; };
|
||||
04E038D82F20BFFB002CA5A0 /* websocket-api.md in Resources */ = {isa = PBXBuildFile; fileRef = 04E038D72F20BFFB002CA5A0 /* websocket-api.md */; };
|
||||
04E038DD2F20C420002CA5A0 /* VoiceChatStreamingManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E038DA2F20C420002CA5A0 /* VoiceChatStreamingManager.m */; };
|
||||
04E038DE2F20C420002CA5A0 /* VoiceChatWebSocketClient.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E038DC2F20C420002CA5A0 /* VoiceChatWebSocketClient.m */; };
|
||||
04E161832F10E6470022C23B /* normal_hei_them.zip in Resources */ = {isa = PBXBuildFile; fileRef = 04E161812F10E6470022C23B /* normal_hei_them.zip */; };
|
||||
04E161842F10E6470022C23B /* normal_them.zip in Resources */ = {isa = PBXBuildFile; fileRef = 04E161822F10E6470022C23B /* normal_them.zip */; };
|
||||
04FC95672EB0546C007BD342 /* KBKey.m in Sources */ = {isa = PBXBuildFile; fileRef = 04FC95652EB0546C007BD342 /* KBKey.m */; };
|
||||
@@ -608,6 +611,11 @@
|
||||
04D1F6B12EDFF10A00B12345 /* KBSkinInstallBridge.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBSkinInstallBridge.m; sourceTree = "<group>"; };
|
||||
04E0383C2F1A7C30002CA5A0 /* KBCustomTabBar.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KBCustomTabBar.h; sourceTree = "<group>"; };
|
||||
04E0383D2F1A7C30002CA5A0 /* KBCustomTabBar.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBCustomTabBar.m; sourceTree = "<group>"; };
|
||||
04E038D72F20BFFB002CA5A0 /* websocket-api.md */ = {isa = PBXFileReference; lastKnownFileType = net.daringfireball.markdown; path = "websocket-api.md"; sourceTree = "<group>"; };
|
||||
04E038D92F20C420002CA5A0 /* VoiceChatStreamingManager.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = VoiceChatStreamingManager.h; sourceTree = "<group>"; };
|
||||
04E038DA2F20C420002CA5A0 /* VoiceChatStreamingManager.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = VoiceChatStreamingManager.m; sourceTree = "<group>"; };
|
||||
04E038DB2F20C420002CA5A0 /* VoiceChatWebSocketClient.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = VoiceChatWebSocketClient.h; sourceTree = "<group>"; };
|
||||
04E038DC2F20C420002CA5A0 /* VoiceChatWebSocketClient.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = VoiceChatWebSocketClient.m; sourceTree = "<group>"; };
|
||||
04E161812F10E6470022C23B /* normal_hei_them.zip */ = {isa = PBXFileReference; lastKnownFileType = archive.zip; path = normal_hei_them.zip; sourceTree = "<group>"; };
|
||||
04E161822F10E6470022C23B /* normal_them.zip */ = {isa = PBXFileReference; lastKnownFileType = archive.zip; path = normal_them.zip; sourceTree = "<group>"; };
|
||||
04FC953A2EAFAE56007BD342 /* KeyBoardPrefixHeader.pch */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KeyBoardPrefixHeader.pch; sourceTree = "<group>"; };
|
||||
@@ -986,6 +994,10 @@
|
||||
046086AE2F19239B00757C95 /* TTSPlaybackPipeline.m */,
|
||||
046086AF2F19239B00757C95 /* TTSServiceClient.h */,
|
||||
046086B02F19239B00757C95 /* TTSServiceClient.m */,
|
||||
04E038D92F20C420002CA5A0 /* VoiceChatStreamingManager.h */,
|
||||
04E038DA2F20C420002CA5A0 /* VoiceChatStreamingManager.m */,
|
||||
04E038DB2F20C420002CA5A0 /* VoiceChatWebSocketClient.h */,
|
||||
04E038DC2F20C420002CA5A0 /* VoiceChatWebSocketClient.m */,
|
||||
);
|
||||
path = VM;
|
||||
sourceTree = "<group>";
|
||||
@@ -994,6 +1006,7 @@
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
046086742F191CC700757C95 /* AI技术分析.txt */,
|
||||
04E038D72F20BFFB002CA5A0 /* websocket-api.md */,
|
||||
0460866C2F191A5100757C95 /* M */,
|
||||
0460866D2F191A5100757C95 /* V */,
|
||||
0460866E2F191A5100757C95 /* VC */,
|
||||
@@ -2003,6 +2016,7 @@
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
04286A0F2ECDA71B00CE730C /* 001.zip in Resources */,
|
||||
04E038D82F20BFFB002CA5A0 /* websocket-api.md in Resources */,
|
||||
0479200B2ED87CEE004E8522 /* permiss_video.mp4 in Resources */,
|
||||
04C6EABA2EAF86530089C901 /* Assets.xcassets in Resources */,
|
||||
04A9FE212EB893F10020DB6D /* Localizable.strings in Resources */,
|
||||
@@ -2235,6 +2249,8 @@
|
||||
05A1B2D22F5B1A2B3C4D5E60 /* KBSearchThemeModel.m in Sources */,
|
||||
047C65102EBCA8DD0035E841 /* HomeRankContentVC.m in Sources */,
|
||||
047C655C2EBCD0F80035E841 /* UIView+KBShadow.m in Sources */,
|
||||
04E038DD2F20C420002CA5A0 /* VoiceChatStreamingManager.m in Sources */,
|
||||
04E038DE2F20C420002CA5A0 /* VoiceChatWebSocketClient.m in Sources */,
|
||||
049FB2262EC3136D00FAB05D /* KBPersonInfoItemCell.m in Sources */,
|
||||
048908C32EBE32B800FABA60 /* KBSearchVC.m in Sources */,
|
||||
049FB20B2EC1C13800FAB05D /* KBSkinBottomActionView.m in Sources */,
|
||||
|
||||
@@ -11,8 +11,11 @@
|
||||
#import "KBAiChatView.h"
|
||||
#import "KBAiRecordButton.h"
|
||||
#import "LSTPopView.h"
|
||||
#import "VoiceChatStreamingManager.h"
|
||||
#import "KBUserSessionManager.h"
|
||||
|
||||
@interface KBAiMainVC () <KBAiRecordButtonDelegate>
|
||||
@interface KBAiMainVC () <KBAiRecordButtonDelegate,
|
||||
VoiceChatStreamingManagerDelegate>
|
||||
@property(nonatomic, weak) LSTPopView *popView;
|
||||
|
||||
// UI
|
||||
@@ -28,6 +31,13 @@
|
||||
|
||||
// 核心模块
|
||||
@property(nonatomic, strong) ConversationOrchestrator *orchestrator;
|
||||
@property(nonatomic, strong) VoiceChatStreamingManager *streamingManager;
|
||||
|
||||
// 文本跟踪
|
||||
@property(nonatomic, strong) NSMutableString *assistantVisibleText;
|
||||
|
||||
// 日志节流
|
||||
@property(nonatomic, assign) NSTimeInterval lastRMSLogTime;
|
||||
|
||||
@end
|
||||
|
||||
@@ -44,6 +54,7 @@
|
||||
|
||||
[self setupUI];
|
||||
[self setupOrchestrator];
|
||||
[self setupStreamingManager];
|
||||
}
|
||||
|
||||
- (void)viewWillAppear:(BOOL)animated {
|
||||
@@ -56,6 +67,7 @@
|
||||
|
||||
// 页面消失时停止对话
|
||||
[self.orchestrator stop];
|
||||
[self.streamingManager disconnect];
|
||||
}
|
||||
|
||||
- (void)viewDidLayoutSubviews {
|
||||
@@ -184,11 +196,15 @@
|
||||
- (void)setupOrchestrator {
|
||||
self.orchestrator = [[ConversationOrchestrator alloc] init];
|
||||
|
||||
// 配置服务器地址(TODO: 替换为实际地址)
|
||||
// self.orchestrator.asrServerURL = @"wss://your-asr-server.com/ws/asr";
|
||||
// self.orchestrator.llmServerURL =
|
||||
// @"https://your-llm-server.com/api/chat/stream";
|
||||
// self.orchestrator.ttsServerURL = @"https://your-tts-server.com/api/tts";
|
||||
// 配置服务器地址
|
||||
// 1. ASR 语音识别服务(WebSocket)
|
||||
self.orchestrator.asrServerURL = @"ws://192.168.2.21:7529/ws/asr";
|
||||
|
||||
// 2. LLM 大语言模型服务(HTTP Stream)
|
||||
self.orchestrator.llmServerURL = @"http://192.168.2.21:7529/api/chat/stream";
|
||||
|
||||
// 3. TTS 语音合成服务(HTTP)
|
||||
self.orchestrator.ttsServerURL = @"http://192.168.2.21:7529/api/tts/stream";
|
||||
|
||||
__weak typeof(self) weakSelf = self;
|
||||
|
||||
@@ -278,6 +294,16 @@
|
||||
};
|
||||
}
|
||||
|
||||
#pragma mark - Streaming Manager
|
||||
|
||||
- (void)setupStreamingManager {
|
||||
self.streamingManager = [[VoiceChatStreamingManager alloc] init];
|
||||
self.streamingManager.delegate = self;
|
||||
self.streamingManager.serverURL = @"ws://192.168.2.21:7529/api/ws/chat";
|
||||
self.assistantVisibleText = [[NSMutableString alloc] init];
|
||||
self.lastRMSLogTime = 0;
|
||||
}
|
||||
|
||||
#pragma mark - 事件
|
||||
- (void)showComment {
|
||||
CGFloat customViewHeight = KB_SCREEN_HEIGHT * (0.8);
|
||||
@@ -367,16 +393,112 @@
|
||||
#pragma mark - KBAiRecordButtonDelegate
|
||||
|
||||
- (void)recordButtonDidBeginPress:(KBAiRecordButton *)button {
|
||||
[self.orchestrator userDidPressRecord];
|
||||
NSLog(@"[KBAiMainVC] Record button began press");
|
||||
NSString *token = [[KBUserSessionManager shared] accessToken] ?: @"";
|
||||
if (token.length == 0) {
|
||||
[[KBUserSessionManager shared] goLoginVC];
|
||||
return;
|
||||
}
|
||||
|
||||
self.statusLabel.text = @"正在连接...";
|
||||
self.recordButton.state = KBAiRecordButtonStateRecording;
|
||||
[self.streamingManager startWithToken:token language:@"en" voiceId:nil];
|
||||
}
|
||||
|
||||
- (void)recordButtonDidEndPress:(KBAiRecordButton *)button {
|
||||
[self.orchestrator userDidReleaseRecord];
|
||||
NSLog(@"[KBAiMainVC] Record button end press");
|
||||
[self.streamingManager stopAndFinalize];
|
||||
}
|
||||
|
||||
- (void)recordButtonDidCancelPress:(KBAiRecordButton *)button {
|
||||
// 取消录音(同样调用 release,ASR 会返回空或部分结果)
|
||||
[self.orchestrator userDidReleaseRecord];
|
||||
NSLog(@"[KBAiMainVC] Record button cancel press");
|
||||
[self.streamingManager cancel];
|
||||
}
|
||||
|
||||
#pragma mark - VoiceChatStreamingManagerDelegate
|
||||
|
||||
- (void)voiceChatStreamingManagerDidConnect {
|
||||
self.statusLabel.text = @"已连接,准备中...";
|
||||
}
|
||||
|
||||
- (void)voiceChatStreamingManagerDidDisconnect:(NSError *_Nullable)error {
|
||||
self.recordButton.state = KBAiRecordButtonStateNormal;
|
||||
if (error) {
|
||||
[self showError:error];
|
||||
}
|
||||
}
|
||||
|
||||
- (void)voiceChatStreamingManagerDidStartSession:(NSString *)sessionId {
|
||||
self.statusLabel.text = @"正在聆听...";
|
||||
self.recordButton.state = KBAiRecordButtonStateRecording;
|
||||
}
|
||||
|
||||
- (void)voiceChatStreamingManagerDidStartTurn:(NSInteger)turnIndex {
|
||||
self.statusLabel.text = @"正在聆听...";
|
||||
self.recordButton.state = KBAiRecordButtonStateRecording;
|
||||
}
|
||||
|
||||
- (void)voiceChatStreamingManagerDidReceiveEagerEndOfTurnWithTranscript:(NSString *)text
|
||||
confidence:(double)confidence {
|
||||
self.statusLabel.text = @"准备响应...";
|
||||
}
|
||||
|
||||
- (void)voiceChatStreamingManagerDidResumeTurn {
|
||||
self.statusLabel.text = @"正在聆听...";
|
||||
}
|
||||
|
||||
- (void)voiceChatStreamingManagerDidUpdateRMS:(float)rms {
|
||||
[self.recordButton updateVolumeRMS:rms];
|
||||
NSTimeInterval now = [[NSDate date] timeIntervalSince1970];
|
||||
if (now - self.lastRMSLogTime >= 1.0) {
|
||||
self.lastRMSLogTime = now;
|
||||
NSLog(@"[KBAiMainVC] RMS: %.3f", rms);
|
||||
}
|
||||
}
|
||||
|
||||
- (void)voiceChatStreamingManagerDidReceiveInterimTranscript:(NSString *)text {
|
||||
self.statusLabel.text = text.length > 0 ? text : @"正在识别...";
|
||||
}
|
||||
|
||||
- (void)voiceChatStreamingManagerDidReceiveFinalTranscript:(NSString *)text {
|
||||
if (text.length > 0) {
|
||||
[self.chatView addUserMessage:text];
|
||||
}
|
||||
}
|
||||
|
||||
- (void)voiceChatStreamingManagerDidReceiveLLMStart {
|
||||
self.statusLabel.text = @"AI 正在思考...";
|
||||
[self.assistantVisibleText setString:@""];
|
||||
[self.chatView addAssistantMessage:@""];
|
||||
}
|
||||
|
||||
- (void)voiceChatStreamingManagerDidReceiveLLMToken:(NSString *)token {
|
||||
if (token.length == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
[self.assistantVisibleText appendString:token];
|
||||
[self.chatView updateLastAssistantMessage:self.assistantVisibleText];
|
||||
}
|
||||
|
||||
- (void)voiceChatStreamingManagerDidReceiveAudioChunk:(NSData *)audioData {
|
||||
}
|
||||
|
||||
- (void)voiceChatStreamingManagerDidCompleteWithTranscript:(NSString *)transcript
|
||||
aiResponse:(NSString *)aiResponse {
|
||||
NSString *finalText = aiResponse.length > 0 ? aiResponse
|
||||
: self.assistantVisibleText;
|
||||
if (finalText.length > 0) {
|
||||
[self.chatView updateLastAssistantMessage:finalText];
|
||||
[self.chatView markLastAssistantMessageComplete];
|
||||
}
|
||||
self.recordButton.state = KBAiRecordButtonStateNormal;
|
||||
self.statusLabel.text = @"完成";
|
||||
}
|
||||
|
||||
- (void)voiceChatStreamingManagerDidFail:(NSError *)error {
|
||||
self.recordButton.state = KBAiRecordButtonStateNormal;
|
||||
[self showError:error];
|
||||
}
|
||||
|
||||
@end
|
||||
|
||||
@@ -90,6 +90,11 @@
|
||||
// TTS Client
|
||||
self.ttsClient = [[TTSServiceClient alloc] init];
|
||||
self.ttsClient.delegate = self;
|
||||
// ElevenLabs 配置(通过后端代理)
|
||||
self.ttsClient.voiceId = @"JBFqnCBsd6RMkjVDRZzb"; // 默认语音 George
|
||||
self.ttsClient.languageCode = @"zh"; // 中文
|
||||
self.ttsClient.expectedPayloadType =
|
||||
TTSPayloadTypeURL; // 使用 URL 模式(简单)
|
||||
|
||||
// Playback Pipeline
|
||||
self.playbackPipeline = [[TTSPlaybackPipeline alloc] init];
|
||||
|
||||
@@ -41,6 +41,12 @@ typedef NS_ENUM(NSInteger, TTSPayloadType) {
|
||||
/// TTS 服务器 URL
|
||||
@property(nonatomic, copy) NSString *serverURL;
|
||||
|
||||
/// 语音 ID(ElevenLabs voice ID)
|
||||
@property(nonatomic, copy) NSString *voiceId;
|
||||
|
||||
/// 语言代码(如 "zh", "en")
|
||||
@property(nonatomic, copy) NSString *languageCode;
|
||||
|
||||
/// 当前期望的返回类型(由服务端配置决定)
|
||||
@property(nonatomic, assign) TTSPayloadType expectedPayloadType;
|
||||
|
||||
|
||||
@@ -94,6 +94,8 @@
|
||||
NSDictionary *body = @{
|
||||
@"text" : text,
|
||||
@"segmentId" : segmentId,
|
||||
@"voiceId" : self.voiceId ?: @"JBFqnCBsd6RMkjVDRZzb",
|
||||
@"languageCode" : self.languageCode ?: @"zh",
|
||||
@"format" : @"mp3" // 或 m4a
|
||||
};
|
||||
|
||||
@@ -184,6 +186,8 @@
|
||||
NSDictionary *requestDict = @{
|
||||
@"text" : text,
|
||||
@"segmentId" : segmentId,
|
||||
@"voiceId" : self.voiceId ?: @"JBFqnCBsd6RMkjVDRZzb",
|
||||
@"languageCode" : self.languageCode ?: @"zh",
|
||||
@"format" : [self formatStringForPayloadType:self.expectedPayloadType]
|
||||
};
|
||||
|
||||
|
||||
53
keyBoard/Class/AiTalk/VM/VoiceChatStreamingManager.h
Normal file
@@ -0,0 +1,53 @@
|
||||
//
|
||||
// VoiceChatStreamingManager.h
|
||||
// keyBoard
|
||||
//
|
||||
// Created by Mac on 2026/1/21.
|
||||
//
|
||||
|
||||
#import <Foundation/Foundation.h>
|
||||
|
||||
NS_ASSUME_NONNULL_BEGIN
|
||||
|
||||
@protocol VoiceChatStreamingManagerDelegate <NSObject>
|
||||
@optional
|
||||
- (void)voiceChatStreamingManagerDidConnect;
|
||||
- (void)voiceChatStreamingManagerDidDisconnect:(NSError *_Nullable)error;
|
||||
- (void)voiceChatStreamingManagerDidStartSession:(NSString *)sessionId;
|
||||
- (void)voiceChatStreamingManagerDidStartTurn:(NSInteger)turnIndex;
|
||||
- (void)voiceChatStreamingManagerDidReceiveEagerEndOfTurnWithTranscript:(NSString *)text
|
||||
confidence:(double)confidence;
|
||||
- (void)voiceChatStreamingManagerDidResumeTurn;
|
||||
- (void)voiceChatStreamingManagerDidUpdateRMS:(float)rms;
|
||||
- (void)voiceChatStreamingManagerDidReceiveInterimTranscript:(NSString *)text;
|
||||
- (void)voiceChatStreamingManagerDidReceiveFinalTranscript:(NSString *)text;
|
||||
- (void)voiceChatStreamingManagerDidReceiveLLMStart;
|
||||
- (void)voiceChatStreamingManagerDidReceiveLLMToken:(NSString *)token;
|
||||
- (void)voiceChatStreamingManagerDidReceiveAudioChunk:(NSData *)audioData;
|
||||
- (void)voiceChatStreamingManagerDidCompleteWithTranscript:(NSString *)transcript
|
||||
aiResponse:(NSString *)aiResponse;
|
||||
- (void)voiceChatStreamingManagerDidFail:(NSError *)error;
|
||||
@end
|
||||
|
||||
/// Manager for realtime recording and streaming.
|
||||
@interface VoiceChatStreamingManager : NSObject
|
||||
|
||||
@property(nonatomic, weak) id<VoiceChatStreamingManagerDelegate> delegate;
|
||||
|
||||
/// Base WebSocket URL, e.g. wss://api.yourdomain.com/api/ws/chat
|
||||
@property(nonatomic, copy) NSString *serverURL;
|
||||
|
||||
@property(nonatomic, assign, readonly, getter=isStreaming) BOOL streaming;
|
||||
@property(nonatomic, copy, readonly, nullable) NSString *sessionId;
|
||||
|
||||
- (void)startWithToken:(NSString *)token
|
||||
language:(nullable NSString *)language
|
||||
voiceId:(nullable NSString *)voiceId;
|
||||
|
||||
- (void)stopAndFinalize;
|
||||
- (void)cancel;
|
||||
- (void)disconnect;
|
||||
|
||||
@end
|
||||
|
||||
NS_ASSUME_NONNULL_END
|
||||
376
keyBoard/Class/AiTalk/VM/VoiceChatStreamingManager.m
Normal file
@@ -0,0 +1,376 @@
|
||||
//
|
||||
// VoiceChatStreamingManager.m
|
||||
// keyBoard
|
||||
//
|
||||
// Created by Mac on 2026/1/21.
|
||||
//
|
||||
|
||||
#import "VoiceChatStreamingManager.h"
|
||||
#import "AudioCaptureManager.h"
|
||||
#import "AudioSessionManager.h"
|
||||
#import "VoiceChatWebSocketClient.h"
|
||||
|
||||
static NSString *const kVoiceChatStreamingManagerErrorDomain =
|
||||
@"VoiceChatStreamingManager";
|
||||
|
||||
@interface VoiceChatStreamingManager () <AudioSessionManagerDelegate,
|
||||
AudioCaptureManagerDelegate,
|
||||
VoiceChatWebSocketClientDelegate>
|
||||
|
||||
@property(nonatomic, strong) AudioSessionManager *audioSession;
|
||||
@property(nonatomic, strong) AudioCaptureManager *audioCapture;
|
||||
@property(nonatomic, strong) VoiceChatWebSocketClient *webSocketClient;
|
||||
@property(nonatomic, strong) dispatch_queue_t stateQueue;
|
||||
|
||||
@property(nonatomic, assign) BOOL streaming;
|
||||
@property(nonatomic, copy) NSString *sessionId;
|
||||
|
||||
@property(nonatomic, copy) NSString *pendingToken;
|
||||
@property(nonatomic, copy) NSString *pendingLanguage;
|
||||
@property(nonatomic, copy) NSString *pendingVoiceId;
|
||||
|
||||
@end
|
||||
|
||||
@implementation VoiceChatStreamingManager
|
||||
|
||||
- (instancetype)init {
|
||||
self = [super init];
|
||||
if (self) {
|
||||
_stateQueue = dispatch_queue_create("com.keyboard.aitalk.voicechat.manager",
|
||||
DISPATCH_QUEUE_SERIAL);
|
||||
|
||||
_audioSession = [AudioSessionManager sharedManager];
|
||||
_audioSession.delegate = self;
|
||||
|
||||
_audioCapture = [[AudioCaptureManager alloc] init];
|
||||
_audioCapture.delegate = self;
|
||||
|
||||
_webSocketClient = [[VoiceChatWebSocketClient alloc] init];
|
||||
_webSocketClient.delegate = self;
|
||||
|
||||
_serverURL = @"ws://192.168.2.21:7529/api/ws/chat?token=";
|
||||
_webSocketClient.serverURL = _serverURL;
|
||||
}
|
||||
return self;
|
||||
}
|
||||
|
||||
- (void)dealloc {
|
||||
[self disconnect];
|
||||
}
|
||||
|
||||
- (void)setServerURL:(NSString *)serverURL {
|
||||
_serverURL = [serverURL copy];
|
||||
self.webSocketClient.serverURL = _serverURL;
|
||||
}
|
||||
|
||||
#pragma mark - Public Methods
|
||||
|
||||
- (void)startWithToken:(NSString *)token
|
||||
language:(nullable NSString *)language
|
||||
voiceId:(nullable NSString *)voiceId {
|
||||
dispatch_async(self.stateQueue, ^{
|
||||
self.pendingToken = token ?: @"";
|
||||
self.pendingLanguage = language ?: @"";
|
||||
self.pendingVoiceId = voiceId ?: @"";
|
||||
[self.webSocketClient disableAudioSending];
|
||||
[self startInternal];
|
||||
});
|
||||
}
|
||||
|
||||
- (void)stopAndFinalize {
|
||||
dispatch_async(self.stateQueue, ^{
|
||||
if (self.streaming) {
|
||||
[self.audioCapture stopCapture];
|
||||
self.streaming = NO;
|
||||
}
|
||||
[self.webSocketClient disableAudioSending];
|
||||
[self.webSocketClient endAudio];
|
||||
});
|
||||
}
|
||||
|
||||
- (void)cancel {
|
||||
dispatch_async(self.stateQueue, ^{
|
||||
if (self.streaming) {
|
||||
[self.audioCapture stopCapture];
|
||||
self.streaming = NO;
|
||||
}
|
||||
[self.webSocketClient disableAudioSending];
|
||||
[self.webSocketClient cancel];
|
||||
self.sessionId = nil;
|
||||
});
|
||||
}
|
||||
|
||||
- (void)disconnect {
|
||||
dispatch_async(self.stateQueue, ^{
|
||||
if (self.streaming) {
|
||||
[self.audioCapture stopCapture];
|
||||
self.streaming = NO;
|
||||
}
|
||||
[self.webSocketClient disableAudioSending];
|
||||
[self.webSocketClient disconnect];
|
||||
[self.audioSession deactivateSession];
|
||||
self.sessionId = nil;
|
||||
});
|
||||
}
|
||||
|
||||
#pragma mark - Private Methods
|
||||
|
||||
- (void)startInternal {
|
||||
if (self.pendingToken.length == 0) {
|
||||
NSLog(@"[VoiceChatStreamingManager] Start failed: token is empty");
|
||||
[self reportErrorWithMessage:@"Token is required"];
|
||||
return;
|
||||
}
|
||||
|
||||
if (![self.audioSession hasMicrophonePermission]) {
|
||||
__weak typeof(self) weakSelf = self;
|
||||
[self.audioSession requestMicrophonePermission:^(BOOL granted) {
|
||||
__strong typeof(weakSelf) strongSelf = weakSelf;
|
||||
if (!strongSelf) {
|
||||
return;
|
||||
}
|
||||
if (!granted) {
|
||||
[strongSelf reportErrorWithMessage:@"Microphone permission denied"];
|
||||
return;
|
||||
}
|
||||
dispatch_async(strongSelf.stateQueue, ^{
|
||||
[strongSelf startInternal];
|
||||
});
|
||||
}];
|
||||
return;
|
||||
}
|
||||
|
||||
NSError *error = nil;
|
||||
if (![self.audioSession configureForConversation:&error]) {
|
||||
[self reportError:error];
|
||||
return;
|
||||
}
|
||||
|
||||
if (![self.audioSession activateSession:&error]) {
|
||||
[self reportError:error];
|
||||
return;
|
||||
}
|
||||
|
||||
if (self.serverURL.length == 0) {
|
||||
NSLog(@"[VoiceChatStreamingManager] Start failed: server URL is empty");
|
||||
[self reportErrorWithMessage:@"Server URL is required"];
|
||||
return;
|
||||
}
|
||||
|
||||
NSLog(@"[VoiceChatStreamingManager] Start streaming, server: %@",
|
||||
self.serverURL);
|
||||
self.webSocketClient.serverURL = self.serverURL;
|
||||
[self.webSocketClient connectWithToken:self.pendingToken];
|
||||
}
|
||||
|
||||
- (void)reportError:(NSError *)error {
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatStreamingManagerDidFail:)]) {
|
||||
[self.delegate voiceChatStreamingManagerDidFail:error];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
- (void)reportErrorWithMessage:(NSString *)message {
|
||||
NSError *error = [NSError errorWithDomain:kVoiceChatStreamingManagerErrorDomain
|
||||
code:-1
|
||||
userInfo:@{
|
||||
NSLocalizedDescriptionKey : message ?: @""
|
||||
}];
|
||||
[self reportError:error];
|
||||
}
|
||||
|
||||
#pragma mark - AudioCaptureManagerDelegate
|
||||
|
||||
- (void)audioCaptureManagerDidOutputPCMFrame:(NSData *)pcmFrame {
|
||||
if (!self.streaming) {
|
||||
return;
|
||||
}
|
||||
[self.webSocketClient sendAudioPCMFrame:pcmFrame];
|
||||
}
|
||||
|
||||
- (void)audioCaptureManagerDidUpdateRMS:(float)rms {
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatStreamingManagerDidUpdateRMS:)]) {
|
||||
[self.delegate voiceChatStreamingManagerDidUpdateRMS:rms];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#pragma mark - AudioSessionManagerDelegate
|
||||
|
||||
- (void)audioSessionManagerDidInterrupt:(KBAudioSessionInterruptionType)type {
|
||||
if (type == KBAudioSessionInterruptionTypeBegan) {
|
||||
[self cancel];
|
||||
}
|
||||
}
|
||||
|
||||
- (void)audioSessionManagerMicrophonePermissionDenied {
|
||||
[self reportErrorWithMessage:@"Microphone permission denied"];
|
||||
}
|
||||
|
||||
#pragma mark - VoiceChatWebSocketClientDelegate
|
||||
|
||||
- (void)voiceChatClientDidConnect {
|
||||
dispatch_async(self.stateQueue, ^{
|
||||
[self.webSocketClient startSessionWithLanguage:self.pendingLanguage
|
||||
voiceId:self.pendingVoiceId];
|
||||
});
|
||||
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatStreamingManagerDidConnect)]) {
|
||||
[self.delegate voiceChatStreamingManagerDidConnect];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
- (void)voiceChatClientDidDisconnect:(NSError *_Nullable)error {
|
||||
dispatch_async(self.stateQueue, ^{
|
||||
if (self.streaming) {
|
||||
[self.audioCapture stopCapture];
|
||||
self.streaming = NO;
|
||||
}
|
||||
[self.audioSession deactivateSession];
|
||||
self.sessionId = nil;
|
||||
});
|
||||
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatStreamingManagerDidDisconnect:)]) {
|
||||
[self.delegate voiceChatStreamingManagerDidDisconnect:error];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
- (void)voiceChatClientDidStartSession:(NSString *)sessionId {
|
||||
dispatch_async(self.stateQueue, ^{
|
||||
self.sessionId = sessionId;
|
||||
|
||||
NSError *error = nil;
|
||||
if (![self.audioCapture startCapture:&error]) {
|
||||
[self reportError:error];
|
||||
[self.webSocketClient cancel];
|
||||
return;
|
||||
}
|
||||
|
||||
self.streaming = YES;
|
||||
[self.webSocketClient enableAudioSending];
|
||||
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatStreamingManagerDidStartSession:)]) {
|
||||
[self.delegate voiceChatStreamingManagerDidStartSession:sessionId];
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
- (void)voiceChatClientDidStartTurn:(NSInteger)turnIndex {
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatStreamingManagerDidStartTurn:)]) {
|
||||
[self.delegate voiceChatStreamingManagerDidStartTurn:turnIndex];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
- (void)voiceChatClientDidReceiveEagerEndOfTurnWithTranscript:(NSString *)text
|
||||
confidence:(double)confidence {
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate
|
||||
respondsToSelector:@selector
|
||||
(voiceChatStreamingManagerDidReceiveEagerEndOfTurnWithTranscript:
|
||||
confidence:)]) {
|
||||
[self.delegate
|
||||
voiceChatStreamingManagerDidReceiveEagerEndOfTurnWithTranscript:text
|
||||
confidence:confidence];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
- (void)voiceChatClientDidResumeTurn {
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatStreamingManagerDidResumeTurn)]) {
|
||||
[self.delegate voiceChatStreamingManagerDidResumeTurn];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
- (void)voiceChatClientDidReceiveInterimTranscript:(NSString *)text {
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatStreamingManagerDidReceiveInterimTranscript:)]) {
|
||||
[self.delegate voiceChatStreamingManagerDidReceiveInterimTranscript:text];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
- (void)voiceChatClientDidReceiveFinalTranscript:(NSString *)text {
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatStreamingManagerDidReceiveFinalTranscript:)]) {
|
||||
[self.delegate voiceChatStreamingManagerDidReceiveFinalTranscript:text];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
- (void)voiceChatClientDidReceiveLLMStart {
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatStreamingManagerDidReceiveLLMStart)]) {
|
||||
[self.delegate voiceChatStreamingManagerDidReceiveLLMStart];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
- (void)voiceChatClientDidReceiveLLMToken:(NSString *)token {
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatStreamingManagerDidReceiveLLMToken:)]) {
|
||||
[self.delegate voiceChatStreamingManagerDidReceiveLLMToken:token];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
- (void)voiceChatClientDidReceiveAudioChunk:(NSData *)audioData {
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatStreamingManagerDidReceiveAudioChunk:)]) {
|
||||
[self.delegate voiceChatStreamingManagerDidReceiveAudioChunk:audioData];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
- (void)voiceChatClientDidCompleteWithTranscript:(NSString *)transcript
|
||||
aiResponse:(NSString *)aiResponse {
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatStreamingManagerDidCompleteWithTranscript:
|
||||
aiResponse:)]) {
|
||||
[self.delegate voiceChatStreamingManagerDidCompleteWithTranscript:transcript
|
||||
aiResponse:aiResponse];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
- (void)voiceChatClientDidReceiveErrorCode:(NSString *)code
|
||||
message:(NSString *)message {
|
||||
NSString *desc = message.length > 0 ? message : @"Server error";
|
||||
NSError *error = [NSError errorWithDomain:kVoiceChatStreamingManagerErrorDomain
|
||||
code:-2
|
||||
userInfo:@{
|
||||
NSLocalizedDescriptionKey : desc,
|
||||
@"code" : code ?: @""
|
||||
}];
|
||||
[self reportError:error];
|
||||
}
|
||||
|
||||
- (void)voiceChatClientDidFail:(NSError *)error {
|
||||
[self reportError:error];
|
||||
}
|
||||
|
||||
@end
|
||||
57
keyBoard/Class/AiTalk/VM/VoiceChatWebSocketClient.h
Normal file
@@ -0,0 +1,57 @@
|
||||
//
|
||||
// VoiceChatWebSocketClient.h
|
||||
// keyBoard
|
||||
//
|
||||
// Created by Mac on 2026/1/21.
|
||||
//
|
||||
|
||||
#import <Foundation/Foundation.h>
|
||||
|
||||
NS_ASSUME_NONNULL_BEGIN
|
||||
|
||||
@protocol VoiceChatWebSocketClientDelegate <NSObject>
|
||||
@optional
|
||||
- (void)voiceChatClientDidConnect;
|
||||
- (void)voiceChatClientDidDisconnect:(NSError *_Nullable)error;
|
||||
- (void)voiceChatClientDidStartSession:(NSString *)sessionId;
|
||||
- (void)voiceChatClientDidStartTurn:(NSInteger)turnIndex;
|
||||
- (void)voiceChatClientDidReceiveEagerEndOfTurnWithTranscript:(NSString *)text
|
||||
confidence:(double)confidence;
|
||||
- (void)voiceChatClientDidResumeTurn;
|
||||
- (void)voiceChatClientDidReceiveInterimTranscript:(NSString *)text;
|
||||
- (void)voiceChatClientDidReceiveFinalTranscript:(NSString *)text;
|
||||
- (void)voiceChatClientDidReceiveLLMStart;
|
||||
- (void)voiceChatClientDidReceiveLLMToken:(NSString *)token;
|
||||
- (void)voiceChatClientDidReceiveAudioChunk:(NSData *)audioData;
|
||||
- (void)voiceChatClientDidCompleteWithTranscript:(NSString *)transcript
|
||||
aiResponse:(NSString *)aiResponse;
|
||||
- (void)voiceChatClientDidReceiveErrorCode:(NSString *)code
|
||||
message:(NSString *)message;
|
||||
- (void)voiceChatClientDidFail:(NSError *)error;
|
||||
@end
|
||||
|
||||
/// WebSocket client for realtime voice chat.
|
||||
@interface VoiceChatWebSocketClient : NSObject
|
||||
|
||||
@property(nonatomic, weak) id<VoiceChatWebSocketClientDelegate> delegate;
|
||||
|
||||
/// Base WebSocket URL, e.g. wss://api.yourdomain.com/api/ws/chat
|
||||
@property(nonatomic, copy) NSString *serverURL;
|
||||
|
||||
@property(nonatomic, assign, readonly, getter=isConnected) BOOL connected;
|
||||
@property(nonatomic, copy, readonly, nullable) NSString *sessionId;
|
||||
|
||||
- (void)connectWithToken:(NSString *)token;
|
||||
- (void)disconnect;
|
||||
|
||||
- (void)startSessionWithLanguage:(nullable NSString *)language
|
||||
voiceId:(nullable NSString *)voiceId;
|
||||
- (void)enableAudioSending;
|
||||
- (void)disableAudioSending;
|
||||
- (void)sendAudioPCMFrame:(NSData *)pcmFrame;
|
||||
- (void)endAudio;
|
||||
- (void)cancel;
|
||||
|
||||
@end
|
||||
|
||||
NS_ASSUME_NONNULL_END
|
||||
457
keyBoard/Class/AiTalk/VM/VoiceChatWebSocketClient.m
Normal file
@@ -0,0 +1,457 @@
|
||||
//
|
||||
// VoiceChatWebSocketClient.m
|
||||
// keyBoard
|
||||
//
|
||||
// Created by Mac on 2026/1/21.
|
||||
//
|
||||
|
||||
#import "VoiceChatWebSocketClient.h"
|
||||
|
||||
static NSString *const kVoiceChatWebSocketClientErrorDomain =
|
||||
@"VoiceChatWebSocketClient";
|
||||
|
||||
@interface VoiceChatWebSocketClient () <NSURLSessionWebSocketDelegate>
|
||||
|
||||
@property(nonatomic, strong) NSURLSession *urlSession;
|
||||
@property(nonatomic, strong) NSURLSessionWebSocketTask *webSocketTask;
|
||||
@property(nonatomic, strong) dispatch_queue_t networkQueue;
|
||||
@property(nonatomic, assign) BOOL connected;
|
||||
@property(nonatomic, copy) NSString *sessionId;
|
||||
@property(nonatomic, assign) BOOL audioSendingEnabled;
|
||||
|
||||
@end
|
||||
|
||||
@implementation VoiceChatWebSocketClient
|
||||
|
||||
- (instancetype)init {
|
||||
self = [super init];
|
||||
if (self) {
|
||||
_networkQueue = dispatch_queue_create("com.keyboard.aitalk.voicechat.ws",
|
||||
DISPATCH_QUEUE_SERIAL);
|
||||
_serverURL = @"wss://api.yourdomain.com/api/ws/chat";
|
||||
_audioSendingEnabled = NO;
|
||||
}
|
||||
return self;
|
||||
}
|
||||
|
||||
- (void)dealloc {
|
||||
[self disconnect];
|
||||
}
|
||||
|
||||
#pragma mark - Public Methods
|
||||
|
||||
- (void)connectWithToken:(NSString *)token {
|
||||
dispatch_async(self.networkQueue, ^{
|
||||
[self disconnectInternal];
|
||||
|
||||
NSURL *url = [self buildURLWithToken:token];
|
||||
if (!url) {
|
||||
[self reportErrorWithMessage:@"Invalid server URL"];
|
||||
return;
|
||||
}
|
||||
|
||||
NSLog(@"[VoiceChatWebSocketClient] Connecting: %@", url.absoluteString);
|
||||
|
||||
NSURLSessionConfiguration *config =
|
||||
[NSURLSessionConfiguration defaultSessionConfiguration];
|
||||
config.timeoutIntervalForRequest = 30;
|
||||
config.timeoutIntervalForResource = 300;
|
||||
|
||||
self.urlSession = [NSURLSession sessionWithConfiguration:config
|
||||
delegate:self
|
||||
delegateQueue:nil];
|
||||
|
||||
self.webSocketTask = [self.urlSession webSocketTaskWithURL:url];
|
||||
[self.webSocketTask resume];
|
||||
[self receiveMessage];
|
||||
});
|
||||
}
|
||||
|
||||
- (void)disconnect {
|
||||
dispatch_async(self.networkQueue, ^{
|
||||
BOOL shouldNotify = self.webSocketTask != nil;
|
||||
if (shouldNotify) {
|
||||
NSLog(@"[VoiceChatWebSocketClient] Disconnect requested");
|
||||
}
|
||||
[self disconnectInternal];
|
||||
if (shouldNotify) {
|
||||
[self notifyDisconnect:nil];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
- (void)startSessionWithLanguage:(nullable NSString *)language
|
||||
voiceId:(nullable NSString *)voiceId {
|
||||
NSMutableDictionary *message = [NSMutableDictionary dictionary];
|
||||
message[@"type"] = @"session_start";
|
||||
|
||||
NSMutableDictionary *config = [NSMutableDictionary dictionary];
|
||||
if (language.length > 0) {
|
||||
config[@"language"] = language;
|
||||
}
|
||||
if (voiceId.length > 0) {
|
||||
config[@"voice_id"] = voiceId;
|
||||
}
|
||||
if (config.count > 0) {
|
||||
message[@"config"] = config;
|
||||
}
|
||||
|
||||
[self sendJSON:message];
|
||||
}
|
||||
|
||||
- (void)enableAudioSending {
|
||||
dispatch_async(self.networkQueue, ^{
|
||||
self.audioSendingEnabled = YES;
|
||||
});
|
||||
}
|
||||
|
||||
- (void)disableAudioSending {
|
||||
dispatch_async(self.networkQueue, ^{
|
||||
self.audioSendingEnabled = NO;
|
||||
});
|
||||
}
|
||||
|
||||
- (void)sendAudioPCMFrame:(NSData *)pcmFrame {
|
||||
if (!self.connected || !self.webSocketTask || pcmFrame.length == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
dispatch_async(self.networkQueue, ^{
|
||||
if (!self.audioSendingEnabled) {
|
||||
return;
|
||||
}
|
||||
if (!self.connected || !self.webSocketTask) {
|
||||
return;
|
||||
}
|
||||
NSURLSessionWebSocketMessage *message =
|
||||
[[NSURLSessionWebSocketMessage alloc] initWithData:pcmFrame];
|
||||
[self.webSocketTask
|
||||
sendMessage:message
|
||||
completionHandler:^(NSError *_Nullable error) {
|
||||
if (error) {
|
||||
[self reportError:error];
|
||||
} else {
|
||||
NSLog(@"[VoiceChatWebSocketClient] Sent audio frame: %lu bytes",
|
||||
(unsigned long)pcmFrame.length);
|
||||
}
|
||||
}];
|
||||
});
|
||||
}
|
||||
|
||||
- (void)endAudio {
|
||||
NSLog(@"[VoiceChatWebSocketClient] Sending audio_end");
|
||||
[self sendJSON:@{ @"type" : @"audio_end" }];
|
||||
}
|
||||
|
||||
- (void)cancel {
|
||||
NSLog(@"[VoiceChatWebSocketClient] Sending cancel");
|
||||
[self sendJSON:@{ @"type" : @"cancel" }];
|
||||
}
|
||||
|
||||
#pragma mark - Private Methods
|
||||
|
||||
- (NSURL *)buildURLWithToken:(NSString *)token {
|
||||
if (self.serverURL.length == 0) {
|
||||
return nil;
|
||||
}
|
||||
|
||||
NSURLComponents *components =
|
||||
[NSURLComponents componentsWithString:self.serverURL];
|
||||
if (!components) {
|
||||
return nil;
|
||||
}
|
||||
|
||||
if (token.length > 0) {
|
||||
NSMutableArray<NSURLQueryItem *> *items =
|
||||
components.queryItems.mutableCopy ?: [NSMutableArray array];
|
||||
BOOL didReplace = NO;
|
||||
for (NSUInteger i = 0; i < items.count; i++) {
|
||||
NSURLQueryItem *item = items[i];
|
||||
if ([item.name isEqualToString:@"token"]) {
|
||||
items[i] = [NSURLQueryItem queryItemWithName:@"token" value:token];
|
||||
didReplace = YES;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!didReplace) {
|
||||
[items addObject:[NSURLQueryItem queryItemWithName:@"token"
|
||||
value:token]];
|
||||
}
|
||||
components.queryItems = items;
|
||||
}
|
||||
|
||||
return components.URL;
|
||||
}
|
||||
|
||||
- (void)sendJSON:(NSDictionary *)dict {
|
||||
if (!self.webSocketTask) {
|
||||
return;
|
||||
}
|
||||
|
||||
NSError *jsonError = nil;
|
||||
NSData *jsonData = [NSJSONSerialization dataWithJSONObject:dict
|
||||
options:0
|
||||
error:&jsonError];
|
||||
if (jsonError) {
|
||||
[self reportError:jsonError];
|
||||
return;
|
||||
}
|
||||
|
||||
NSString *jsonString =
|
||||
[[NSString alloc] initWithData:jsonData
|
||||
encoding:NSUTF8StringEncoding];
|
||||
if (!jsonString) {
|
||||
[self reportErrorWithMessage:@"Failed to encode JSON message"];
|
||||
return;
|
||||
}
|
||||
|
||||
dispatch_async(self.networkQueue, ^{
|
||||
NSURLSessionWebSocketMessage *message =
|
||||
[[NSURLSessionWebSocketMessage alloc] initWithString:jsonString];
|
||||
[self.webSocketTask
|
||||
sendMessage:message
|
||||
completionHandler:^(NSError *_Nullable error) {
|
||||
if (error) {
|
||||
[self reportError:error];
|
||||
}
|
||||
}];
|
||||
});
|
||||
}
|
||||
|
||||
- (void)receiveMessage {
|
||||
if (!self.webSocketTask) {
|
||||
return;
|
||||
}
|
||||
|
||||
__weak typeof(self) weakSelf = self;
|
||||
[self.webSocketTask receiveMessageWithCompletionHandler:^(
|
||||
NSURLSessionWebSocketMessage *_Nullable message,
|
||||
NSError *_Nullable error) {
|
||||
__strong typeof(weakSelf) strongSelf = weakSelf;
|
||||
if (!strongSelf) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (error) {
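      // NSURLErrorCancelled and POSIX error 57 (ENOTCONN, socket already closed) are
      // expected when we tear down the connection ourselves, so they are not reported as failures.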
|
||||
if (error.code != NSURLErrorCancelled && error.code != 57) {
|
||||
[strongSelf notifyDisconnect:error];
|
||||
[strongSelf disconnectInternal];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (message.type == NSURLSessionWebSocketMessageTypeString) {
|
||||
NSLog(@"[VoiceChatWebSocketClient] Received text: %@", message.string);
|
||||
[strongSelf handleTextMessage:message.string];
|
||||
} else if (message.type == NSURLSessionWebSocketMessageTypeData) {
|
||||
NSLog(@"[VoiceChatWebSocketClient] Received binary: %lu bytes",
|
||||
(unsigned long)message.data.length);
|
||||
[strongSelf handleBinaryMessage:message.data];
|
||||
}
|
||||
|
||||
[strongSelf receiveMessage];
|
||||
}];
|
||||
}
|
||||
|
||||
- (void)handleTextMessage:(NSString *)text {
|
||||
if (text.length == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
NSData *data = [text dataUsingEncoding:NSUTF8StringEncoding];
|
||||
if (!data) {
|
||||
return;
|
||||
}
|
||||
|
||||
NSError *jsonError = nil;
|
||||
NSDictionary *json = [NSJSONSerialization JSONObjectWithData:data
|
||||
options:0
|
||||
error:&jsonError];
|
||||
if (jsonError) {
|
||||
[self reportError:jsonError];
|
||||
return;
|
||||
}
|
||||
|
||||
NSString *type = json[@"type"];
|
||||
if (type.length == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if ([type isEqualToString:@"session_started"]) {
|
||||
NSString *sessionId = json[@"session_id"] ?: @"";
|
||||
self.sessionId = sessionId;
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatClientDidStartSession:)]) {
|
||||
[self.delegate voiceChatClientDidStartSession:sessionId];
|
||||
}
|
||||
});
|
||||
} else if ([type isEqualToString:@"transcript_interim"]) {
|
||||
NSString *transcript = json[@"text"] ?: @"";
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatClientDidReceiveInterimTranscript:)]) {
|
||||
[self.delegate voiceChatClientDidReceiveInterimTranscript:transcript];
|
||||
}
|
||||
});
|
||||
} else if ([type isEqualToString:@"transcript_final"]) {
|
||||
NSString *transcript = json[@"text"] ?: @"";
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatClientDidReceiveFinalTranscript:)]) {
|
||||
[self.delegate voiceChatClientDidReceiveFinalTranscript:transcript];
|
||||
}
|
||||
});
|
||||
} else if ([type isEqualToString:@"turn_start"]) {
|
||||
NSInteger turnIndex = [json[@"turn_index"] integerValue];
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatClientDidStartTurn:)]) {
|
||||
[self.delegate voiceChatClientDidStartTurn:turnIndex];
|
||||
}
|
||||
});
|
||||
} else if ([type isEqualToString:@"eager_eot"]) {
|
||||
NSString *transcript = json[@"transcript"] ?: @"";
|
||||
double confidence = [json[@"confidence"] doubleValue];
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatClientDidReceiveEagerEndOfTurnWithTranscript:
|
||||
confidence:)]) {
|
||||
[self.delegate
|
||||
voiceChatClientDidReceiveEagerEndOfTurnWithTranscript:transcript
|
||||
confidence:confidence];
|
||||
}
|
||||
});
|
||||
} else if ([type isEqualToString:@"turn_resumed"]) {
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatClientDidResumeTurn)]) {
|
||||
[self.delegate voiceChatClientDidResumeTurn];
|
||||
}
|
||||
});
|
||||
} else if ([type isEqualToString:@"llm_start"]) {
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate
|
||||
respondsToSelector:@selector(voiceChatClientDidReceiveLLMStart)]) {
|
||||
[self.delegate voiceChatClientDidReceiveLLMStart];
|
||||
}
|
||||
});
|
||||
} else if ([type isEqualToString:@"llm_token"]) {
|
||||
NSString *token = json[@"token"] ?: @"";
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate
|
||||
respondsToSelector:@selector(voiceChatClientDidReceiveLLMToken:)]) {
|
||||
[self.delegate voiceChatClientDidReceiveLLMToken:token];
|
||||
}
|
||||
});
|
||||
} else if ([type isEqualToString:@"complete"]) {
|
||||
NSString *transcript = json[@"transcript"] ?: @"";
|
||||
NSString *aiResponse = json[@"ai_response"] ?: @"";
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatClientDidCompleteWithTranscript:
|
||||
aiResponse:)]) {
|
||||
[self.delegate voiceChatClientDidCompleteWithTranscript:transcript
|
||||
aiResponse:aiResponse];
|
||||
}
|
||||
});
|
||||
} else if ([type isEqualToString:@"error"]) {
|
||||
NSString *code = json[@"code"] ?: @"";
|
||||
NSString *message = json[@"message"] ?: @"";
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatClientDidReceiveErrorCode:message:)]) {
|
||||
[self.delegate voiceChatClientDidReceiveErrorCode:code
|
||||
message:message];
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
- (void)handleBinaryMessage:(NSData *)data {
|
||||
if (data.length == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate
|
||||
respondsToSelector:@selector(voiceChatClientDidReceiveAudioChunk:)]) {
|
||||
[self.delegate voiceChatClientDidReceiveAudioChunk:data];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
- (void)disconnectInternal {
|
||||
self.connected = NO;
|
||||
self.sessionId = nil;
|
||||
self.audioSendingEnabled = NO;
|
||||
|
||||
if (self.webSocketTask) {
|
||||
[self.webSocketTask
|
||||
cancelWithCloseCode:NSURLSessionWebSocketCloseCodeNormalClosure
|
||||
reason:nil];
|
||||
self.webSocketTask = nil;
|
||||
}
|
||||
|
||||
if (self.urlSession) {
|
||||
[self.urlSession invalidateAndCancel];
|
||||
self.urlSession = nil;
|
||||
}
|
||||
}
|
||||
|
||||
- (void)reportError:(NSError *)error {
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector(voiceChatClientDidFail:)]) {
|
||||
[self.delegate voiceChatClientDidFail:error];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
- (void)reportErrorWithMessage:(NSString *)message {
|
||||
NSError *error = [NSError errorWithDomain:kVoiceChatWebSocketClientErrorDomain
|
||||
code:-1
|
||||
userInfo:@{
|
||||
NSLocalizedDescriptionKey : message ?: @""
|
||||
}];
|
||||
[self reportError:error];
|
||||
}
|
||||
|
||||
- (void)notifyDisconnect:(NSError *_Nullable)error {
|
||||
self.connected = NO;
|
||||
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector
|
||||
(voiceChatClientDidDisconnect:)]) {
|
||||
[self.delegate voiceChatClientDidDisconnect:error];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#pragma mark - NSURLSessionWebSocketDelegate
|
||||
|
||||
- (void)URLSession:(NSURLSession *)session
|
||||
webSocketTask:(NSURLSessionWebSocketTask *)webSocketTask
|
||||
didOpenWithProtocol:(NSString *)protocol {
|
||||
self.connected = YES;
|
||||
NSLog(@"[VoiceChatWebSocketClient] Connected");
|
||||
dispatch_async(dispatch_get_main_queue(), ^{
|
||||
if ([self.delegate respondsToSelector:@selector(voiceChatClientDidConnect)]) {
|
||||
[self.delegate voiceChatClientDidConnect];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
- (void)URLSession:(NSURLSession *)session
|
||||
webSocketTask:(NSURLSessionWebSocketTask *)webSocketTask
|
||||
didCloseWithCode:(NSURLSessionWebSocketCloseCode)closeCode
|
||||
reason:(NSData *)reason {
|
||||
if (!self.webSocketTask) {
|
||||
return;
|
||||
}
|
||||
NSLog(@"[VoiceChatWebSocketClient] Closed with code: %ld",
|
||||
(long)closeCode);
|
||||
[self notifyDisconnect:nil];
|
||||
[self disconnectInternal];
|
||||
}
|
||||
|
||||
@end
|
||||
771
keyBoard/Class/AiTalk/websocket-api.md
Normal file
@@ -0,0 +1,771 @@
|
||||
# 实时语音对话 WebSocket API 文档
|
||||
|
||||
> Version: 2.0.0 (Flux)
|
||||
> Last Updated: 2026-01-21
|
||||
> Author: Backend Team
|
||||
|
||||
---
|
||||
|
||||
## 概述
|
||||
|
||||
本文档描述实时语音对话 WebSocket API,用于 iOS 客户端与后端进行实时语音交互。
|
||||
|
||||
**v2.0 更新**: 升级为 Deepgram Flux 模型,支持智能轮次检测和 EagerEndOfTurn 提前响应。
|
||||
|
||||
### 核心特性
|
||||
- **智能轮次检测**: Flux 模型语义理解,自动判断用户说完(非简单静默检测)
|
||||
- **EagerEndOfTurn**: 提前启动 LLM 响应,进一步降低延迟
|
||||
- **实时语音识别**: 边说边识别,实时显示转写文本
|
||||
- **流式响应**: AI 响应边生成边返回,无需等待完整响应
|
||||
- **流式音频**: TTS 音频边合成边播放,极低延迟
|
||||
- **Barge-in 支持**: 用户可以打断 AI 说话
|
||||
|
||||
### 性能指标
|
||||
| 指标 | 目标值 | 说明 |
|
||||
|------|--------|------|
|
||||
| 端点检测延迟 | ~260ms | Flux 智能检测 |
|
||||
| TTFA (首音频延迟) | < 300ms | EagerEndOfTurn 优化 |
|
||||
| 端到端延迟 | < 1.5秒 | 完整对话周期 |
|
||||
| 实时转写延迟 | < 100ms | 中间结果 |
|
||||
|
||||
---
|
||||
|
||||
## 连接信息
|
||||
|
||||
### WebSocket 端点
|
||||
|
||||
```
|
||||
生产环境: wss://api.yourdomain.com/api/ws/chat?token={sa_token}
|
||||
开发环境: ws://localhost:7529/api/ws/chat?token={sa_token}
|
||||
```
|
||||
|
||||
### 认证方式
|
||||
|
||||
通过 URL Query 参数传递 Sa-Token:
|
||||
|
||||
```
|
||||
ws://host:port/api/ws/chat?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9...
|
||||
```
|
||||
|
||||
| 参数 | 类型 | 必填 | 描述 |
|
||||
|------|------|------|------|
|
||||
| token | String | ✅ | Sa-Token 登录令牌,通过 Apple Sign-In 获取 |
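
客户端拼接连接地址时,可以用 `URLComponents` 追加 `token` 查询参数。下面是一个最小示意(Swift,`saToken` 为假设的变量名,域名请替换为实际环境):

```swift
// 在基础地址上追加 token 查询参数
var components = URLComponents(string: "wss://api.yourdomain.com/api/ws/chat")!
var items = components.queryItems ?? []
items.append(URLQueryItem(name: "token", value: saToken))
components.queryItems = items
let url = components.url!   // wss://.../api/ws/chat?token=...
```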
|
||||
|
||||
### 认证失败
|
||||
|
||||
如果 token 无效或过期,WebSocket 连接将被拒绝(HTTP 403)。
|
||||
|
||||
---
|
||||
|
||||
## 消息格式
|
||||
|
||||
### 通用规则
|
||||
|
||||
1. **文本消息**: JSON 格式,用于控制指令和状态通知
|
||||
2. **二进制消息**: 原始字节,用于音频数据传输
|
||||
3. **编码**: UTF-8
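
下面用文档后文同样使用的 Starscream 给出发送这两类消息的最小示意(`socket` 为已连接的 WebSocket 实例、`pcmFrame` 为一帧音频数据,均为假设的变量名):

```swift
// 文本帧:JSON 控制指令(UTF-8)
let control: [String: Any] = ["type": "audio_end"]
if let data = try? JSONSerialization.data(withJSONObject: control),
   let json = String(data: data, encoding: .utf8) {
    socket.write(string: json)
}

// 二进制帧:原始音频字节
socket.write(data: pcmFrame)
```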
|
||||
|
||||
---
|
||||
|
||||
## 客户端 → 服务端消息
|
||||
|
||||
### 1. 开始会话 (session_start)
|
||||
|
||||
**发送时机**: 建立 WebSocket 连接后,准备开始录音前
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "session_start",
|
||||
"config": {
|
||||
"language": "en",
|
||||
"voice_id": "a5zfmqTslZJBP0jutmVY"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
| 字段 | 类型 | 必填 | 描述 |
|
||||
|------|------|------|------|
|
||||
| type | String | ✅ | 固定值 `session_start` |
|
||||
| config | Object | ❌ | 会话配置(可选) |
|
||||
| config.language | String | ❌ | 语音识别语言,默认 `en` |
|
||||
| config.voice_id | String | ❌ | TTS 声音 ID,默认使用服务端配置 |
|
||||
|
||||
**响应**: 服务端返回 `session_started` 消息
|
||||
|
||||
---
|
||||
|
||||
### 2. 音频数据 (Binary)
|
||||
|
||||
**发送时机**: 用户正在录音时,持续发送音频数据
|
||||
|
||||
**格式**: Binary WebSocket Frame,直接发送原始音频字节
|
||||
|
||||
**音频规格要求**:
|
||||
|
||||
| 参数 | 值 | 说明 |
|
||||
|------|------|------|
|
||||
| 编码格式 | PCM (Linear16) | 未压缩的脉冲编码调制 |
|
||||
| 采样率 | 16000 Hz | 16kHz |
|
||||
| 位深度 | 16-bit | 有符号整数 |
|
||||
| 声道数 | 1 (Mono) | 单声道 |
|
||||
| 字节序 | Little-Endian | 小端序 |
|
||||
|
||||
**iOS 代码示例**:
|
||||
|
||||
```swift
|
||||
// AVAudioEngine 配置
|
||||
let format = AVAudioFormat(
|
||||
commonFormat: .pcmFormatInt16,
|
||||
sampleRate: 16000,
|
||||
channels: 1,
|
||||
interleaved: true
|
||||
)!
|
||||
|
||||
// 发送音频数据
|
||||
audioEngine.inputNode.installTap(
|
||||
onBus: 0,
|
||||
bufferSize: 1024,
|
||||
format: format
|
||||
) { buffer, time in
|
||||
let audioData = buffer.int16ChannelData![0]
|
||||
let byteCount = Int(buffer.frameLength) * 2 // 16-bit = 2 bytes
|
||||
let data = Data(bytes: audioData, count: byteCount)
|
||||
|
||||
webSocket.write(data: data)
|
||||
}
|
||||
```
|
||||
|
||||
**发送频率**: 建议每 20-100ms 发送一次,即每次 320-1600 个采样点(16kHz、16-bit 下为 640-3200 字节)
|
||||
|
||||
---
|
||||
|
||||
### 3. 结束录音 (audio_end)
|
||||
|
||||
**发送时机**: 用户停止录音(松开录音按钮)
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "audio_end"
|
||||
}
|
||||
```
|
||||
|
||||
| 字段 | 类型 | 必填 | 描述 |
|
||||
|------|------|------|------|
|
||||
| type | String | ✅ | 固定值 `audio_end` |
|
||||
|
||||
**说明**: 发送此消息后,服务端将完成语音识别并开始生成 AI 响应
|
||||
|
||||
---
|
||||
|
||||
### 4. 取消会话 (cancel)
|
||||
|
||||
**发送时机**: 用户主动取消对话(如点击取消按钮)
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "cancel"
|
||||
}
|
||||
```
|
||||
|
||||
| 字段 | 类型 | 必填 | 描述 |
|
||||
|------|------|------|------|
|
||||
| type | String | ✅ | 固定值 `cancel` |
|
||||
|
||||
**说明**: 服务端将停止所有处理,不再返回任何消息
|
||||
|
||||
---
|
||||
|
||||
## 服务端 → 客户端消息
|
||||
|
||||
### 1. 会话已启动 (session_started)
|
||||
|
||||
**接收时机**: 发送 `session_start` 后
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "session_started",
|
||||
"session_id": "abc123-def456-ghi789"
|
||||
}
|
||||
```
|
||||
|
||||
| 字段 | 类型 | 描述 |
|
||||
|------|------|------|
|
||||
| type | String | 固定值 `session_started` |
|
||||
| session_id | String | 服务端分配的会话 ID |
|
||||
|
||||
**客户端处理**: 收到此消息后,可以开始发送音频数据
|
||||
|
||||
---
|
||||
|
||||
### 2. 轮次开始 (turn_start) 🆕
|
||||
|
||||
**接收时机**: 用户开始说话时(Flux 检测到语音活动)
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "turn_start",
|
||||
"turn_index": 0
|
||||
}
|
||||
```
|
||||
|
||||
| 字段 | 类型 | 描述 |
|
||||
|------|------|------|
|
||||
| type | String | 固定值 `turn_start` |
|
||||
| turn_index | Integer | 当前轮次索引(从 0 开始) |
|
||||
|
||||
**客户端处理**:
|
||||
- 可显示"正在听..."状态
|
||||
- 准备接收转写结果
|
||||
|
||||
---
|
||||
|
||||
### 3. 中间转写结果 (transcript_interim)
|
||||
|
||||
**接收时机**: 用户说话过程中,实时返回
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "transcript_interim",
|
||||
"text": "Hello how are",
|
||||
"is_final": false
|
||||
}
|
||||
```
|
||||
|
||||
| 字段 | 类型 | 描述 |
|
||||
|------|------|------|
|
||||
| type | String | 固定值 `transcript_interim` |
|
||||
| text | String | 当前识别到的文本(可能会变化) |
|
||||
| is_final | Boolean | 固定为 `false` |
|
||||
|
||||
**客户端处理**:
|
||||
- 实时更新 UI 显示转写文本
|
||||
- 此文本可能会被后续消息覆盖
|
||||
- 可用于显示"正在识别..."效果
|
||||
|
||||
---
|
||||
|
||||
### 4. 最终转写结果 (transcript_final)
|
||||
|
||||
**接收时机**: 一句话识别完成时
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "transcript_final",
|
||||
"text": "Hello, how are you?"
|
||||
}
|
||||
```
|
||||
|
||||
| 字段 | 类型 | 描述 |
|
||||
|------|------|------|
|
||||
| type | String | 固定值 `transcript_final` |
|
||||
| text | String | 最终确定的转写文本 |
|
||||
|
||||
**客户端处理**:
|
||||
- 用此文本替换之前的中间结果
|
||||
- 此文本不会再变化
|
||||
|
||||
---
|
||||
|
||||
### 5. 提前端点检测 (eager_eot) 🆕
|
||||
|
||||
**接收时机**: Flux 检测到用户可能说完时(置信度达到阈值)
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "eager_eot",
|
||||
"transcript": "Hello, how are you",
|
||||
"confidence": 0.65
|
||||
}
|
||||
```
|
||||
|
||||
| 字段 | 类型 | 描述 |
|
||||
|------|------|------|
|
||||
| type | String | 固定值 `eager_eot` |
|
||||
| transcript | String | 当前转写文本 |
|
||||
| confidence | Double | 端点置信度 (0.0-1.0) |
|
||||
|
||||
**客户端处理**:
|
||||
- 这是一个**预测性事件**,表示用户可能说完了
|
||||
- 服务端已开始提前准备 LLM 响应
|
||||
- 可显示"准备响应..."状态
|
||||
- **注意**: 用户可能继续说话,此时会收到 `turn_resumed`
|
||||
|
||||
---
|
||||
|
||||
### 6. 轮次恢复 (turn_resumed) 🆕
|
||||
|
||||
**接收时机**: 收到 `eager_eot` 后,用户继续说话
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "turn_resumed"
|
||||
}
|
||||
```
|
||||
|
||||
| 字段 | 类型 | 描述 |
|
||||
|------|------|------|
|
||||
| type | String | 固定值 `turn_resumed` |
|
||||
|
||||
**客户端处理**:
|
||||
- 用户继续说话,之前的 `eager_eot` 是误判
|
||||
- 服务端已取消正在准备的草稿响应
|
||||
- 恢复"正在听..."状态
|
||||
- 继续接收 `transcript_interim` 更新
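
下面是处理 `eager_eot` / `turn_resumed` 的最小状态切换示意(Swift,`messageType`、`statusLabel` 为假设的变量名,仅作说明):

```swift
switch messageType {
case "eager_eot":
    // 预测性端点:服务端已在提前准备响应,但用户仍可能继续说话
    statusLabel.text = "准备响应..."
case "turn_resumed":
    // eager_eot 误判:回到聆听状态,继续接收 transcript_interim
    statusLabel.text = "正在聆听..."
default:
    break
}
```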
|
||||
|
||||
---
|
||||
|
||||
### 7. LLM 开始生成 (llm_start)
|
||||
|
||||
**接收时机**: 语音识别完成,AI 开始生成响应
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "llm_start"
|
||||
}
|
||||
```
|
||||
|
||||
| 字段 | 类型 | 描述 |
|
||||
|------|------|------|
|
||||
| type | String | 固定值 `llm_start` |
|
||||
|
||||
**客户端处理**:
|
||||
- 可显示"AI 正在思考..."状态
|
||||
- 准备接收 AI 响应文本和音频
|
||||
|
||||
---
|
||||
|
||||
### 8. LLM Token (llm_token)
|
||||
|
||||
**接收时机**: AI 生成过程中,逐 token 返回
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "llm_token",
|
||||
"token": "Hi"
|
||||
}
|
||||
```
|
||||
|
||||
| 字段 | 类型 | 描述 |
|
||||
|------|------|------|
|
||||
| type | String | 固定值 `llm_token` |
|
||||
| token | String | AI 输出的单个 token(词或字符片段) |
|
||||
|
||||
**客户端处理**:
|
||||
- 可选择实现打字机效果
|
||||
- 逐个 token 追加显示 AI 响应文本
|
||||
- 如不需要打字效果,可忽略此消息
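
打字机效果的一个最小示意(Swift,`assistantText`、`chatLabel` 为假设的变量名):

```swift
// 每收到一个 token 就追加并刷新显示
assistantText += token
chatLabel.text = assistantText
```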
|
||||
|
||||
---
|
||||
|
||||
### 9. 音频数据 (Binary)
|
||||
|
||||
**接收时机**: TTS 合成过程中,流式返回音频
|
||||
|
||||
**格式**: Binary WebSocket Frame,MP3 音频块
|
||||
|
||||
**音频规格**:
|
||||
|
||||
| 参数 | 值 |
|
||||
|------|------|
|
||||
| 格式 | MP3 |
|
||||
| 采样率 | 44100 Hz |
|
||||
| 比特率 | 64 kbps |
|
||||
| 声道 | 单声道 |
|
||||
|
||||
**客户端处理**:
|
||||
|
||||
```swift
|
||||
// 使用 AVAudioEngine 或 AudioQueue 播放流式音频
|
||||
webSocket.onEvent = { event in
|
||||
switch event {
|
||||
case .binary(let data):
|
||||
// 方案1: 追加到缓冲区,使用 AVAudioPlayerNode
|
||||
audioBuffer.append(data)
|
||||
playBufferedAudio()
|
||||
|
||||
// 方案2: 使用 AVAudioEngine + AVAudioCompressedBuffer
|
||||
// 方案3: 累积后使用 AVAudioPlayer
|
||||
default:
    break
}
|
||||
}
|
||||
```
|
||||
|
||||
**重要提示**:
|
||||
- 音频是分块返回的,需要正确拼接或流式播放
|
||||
- 每个二进制消息是 MP3 数据的一部分
|
||||
- 收到 `complete` 消息后,音频传输完成
|
||||
|
||||
---
|
||||
|
||||
### 10. 处理完成 (complete)
|
||||
|
||||
**接收时机**: AI 响应生成完成,所有音频已发送
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "complete",
|
||||
"transcript": "Hello, how are you?",
|
||||
"ai_response": "Hi! I'm doing great, thanks for asking!"
|
||||
}
|
||||
```
|
||||
|
||||
| 字段 | 类型 | 描述 |
|
||||
|------|------|------|
|
||||
| type | String | 固定值 `complete` |
|
||||
| transcript | String | 完整的用户语音转写文本 |
|
||||
| ai_response | String | 完整的 AI 响应文本 |
|
||||
|
||||
**客户端处理**:
|
||||
- 更新 UI 显示完整对话
|
||||
- 可开始下一轮对话
|
||||
- 建议保存对话历史
|
||||
|
||||
---
|
||||
|
||||
### 11. 错误 (error)
|
||||
|
||||
**接收时机**: 处理过程中发生错误
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "error",
|
||||
"code": "DEEPGRAM_ERROR",
|
||||
"message": "Speech recognition failed"
|
||||
}
|
||||
```
|
||||
|
||||
| 字段 | 类型 | 描述 |
|
||||
|------|------|------|
|
||||
| type | String | 固定值 `error` |
|
||||
| code | String | 错误代码 |
|
||||
| message | String | 错误描述 |
|
||||
|
||||
**错误代码列表**:
|
||||
|
||||
| 错误代码 | 描述 | 建议处理 |
|
||||
|----------|------|----------|
|
||||
| PARSE_ERROR | 消息解析失败 | 检查消息格式 |
|
||||
| DEEPGRAM_ERROR | 语音识别服务错误 | 重试或提示用户 |
|
||||
| DEEPGRAM_INIT_ERROR | 语音识别初始化失败 | 重新开始会话 |
|
||||
| LLM_ERROR | AI 生成错误 | 重试或提示用户 |
|
||||
| PIPELINE_ERROR | 处理流程错误 | 重新开始会话 |
|
||||
| EMPTY_TRANSCRIPT | 未检测到语音 | 提示用户重新说话 |
|
||||
|
||||
**客户端处理**:
|
||||
- 显示友好的错误提示
|
||||
- 根据错误类型决定是否重试
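
一个根据错误代码决定处理方式的示意(Swift,错误代码取自上表,`code` 为解析出的字符串,示意中的函数均为假设的占位):

```swift
switch code {
case "DEEPGRAM_ERROR", "LLM_ERROR":
    // 服务临时失败:提示用户重试本轮对话
    showRetryPrompt()
case "EMPTY_TRANSCRIPT":
    // 未检测到语音:提示用户重新说话
    showSpeakAgainPrompt()
default:
    // PARSE_ERROR / DEEPGRAM_INIT_ERROR / PIPELINE_ERROR 等:重新开始会话
    restartSession()
}
```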
---

## Complete Interaction Flow

### Sequence Diagram

```
iOS Client                              Server
    |                                      |
    |------ WebSocket Connect ------------>|
    |        ?token=xxx                    |
    |                                      |
    |<------ Connected --------------------|
    |                                      |
    |------ session_start ---------------->|
    |                                      |
    |<------ session_started --------------|
    |        {session_id: "abc"}           |
    |                                      |
    |======= user starts speaking =========|
    |                                      |
    |------ Binary (audio) --------------->|
    |------ Binary (audio) --------------->|
    |<------ transcript_interim -----------|
    |        {text: "Hello"}               |
    |------ Binary (audio) --------------->|
    |<------ transcript_interim -----------|
    |        {text: "Hello how"}           |
    |------ Binary (audio) --------------->|
    |<------ transcript_final -------------|
    |        {text: "Hello, how are you?"} |
    |                                      |
    |======= user stops speaking ==========|
    |                                      |
    |------ audio_end -------------------->|
    |                                      |
    |<------ llm_start --------------------|
    |                                      |
    |<------ llm_token --------------------|
    |        {token: "Hi"}                 |
    |<------ llm_token --------------------|
    |        {token: "!"}                  |
    |<------ Binary (mp3) -----------------|
    |<------ Binary (mp3) -----------------|
    |<------ llm_token --------------------|
    |        {token: " I'm"}               |
    |<------ Binary (mp3) -----------------|
    |        ...                           |
    |<------ complete ---------------------|
    |        {transcript, ai_response}     |
    |                                      |
    |======= ready for the next turn ======|
    |                                      |
```
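Read top to bottom, the diagram maps onto the client calls of the `VoiceChatManager` example below roughly as follows (a sketch; `saToken` and `pcmChunk` are placeholders):

```swift
// One conversation turn, in the order shown above.
chatManager.connect(token: saToken)        // -> WebSocket Connect (?token=...)
chatManager.startSession(language: "en")   // -> session_start, <- session_started
// ... while the user speaks, stream microphone chunks:
chatManager.sendAudio(pcmChunk)            // -> Binary (audio), <- transcript_interim / transcript_final
// ... when the user stops speaking:
chatManager.endAudio()                     // -> audio_end, then <- llm_start, llm_token, Binary (mp3)
// <- complete: the turn is finished and the next one can start.
```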
---

## iOS Code Example

### Complete Swift Implementation

```swift
import Foundation
import Starscream // WebSocket library

class VoiceChatManager: WebSocketDelegate {

    private var socket: WebSocket?
    private var audioBuffer = Data()

    // MARK: - Callbacks
    var onConnected: (() -> Void)?
    var onSessionStarted: ((String) -> Void)?
    var onTranscriptInterim: ((String) -> Void)?
    var onTranscriptFinal: ((String) -> Void)?
    var onLLMStart: (() -> Void)?
    var onLLMToken: ((String) -> Void)?
    var onAudioChunk: ((Data) -> Void)?
    var onComplete: ((String, String) -> Void)?
    var onError: ((String, String) -> Void)?

    // MARK: - Connection
    func connect(token: String) {
        let urlString = "wss://api.yourdomain.com/api/ws/chat?token=\(token)"
        guard let url = URL(string: urlString) else { return }

        var request = URLRequest(url: url)
        request.timeoutInterval = 30

        socket = WebSocket(request: request)
        socket?.delegate = self
        socket?.connect()
    }

    func disconnect() {
        socket?.disconnect()
        socket = nil
    }

    // MARK: - Sending messages
    func startSession(language: String = "en", voiceId: String? = nil) {
        var config: [String: Any] = ["language": language]
        if let voiceId = voiceId {
            config["voice_id"] = voiceId
        }

        let message: [String: Any] = [
            "type": "session_start",
            "config": config
        ]

        sendJSON(message)
    }

    func sendAudio(_ data: Data) {
        socket?.write(data: data)
    }

    func endAudio() {
        sendJSON(["type": "audio_end"])
    }

    func cancel() {
        sendJSON(["type": "cancel"])
    }

    private func sendJSON(_ dict: [String: Any]) {
        guard let data = try? JSONSerialization.data(withJSONObject: dict),
              let string = String(data: data, encoding: .utf8) else { return }
        socket?.write(string: string)
    }

    // MARK: - WebSocketDelegate
    func didReceive(event: WebSocketEvent, client: WebSocketClient) {
        switch event {
        case .connected(_):
            print("WebSocket connected")
            onConnected?()

        case .disconnected(let reason, let code):
            print("WebSocket disconnected: \(reason) (\(code))")

        case .text(let text):
            handleTextMessage(text)

        case .binary(let data):
            // Received an MP3 audio chunk
            onAudioChunk?(data)

        case .error(let error):
            print("WebSocket error: \(error?.localizedDescription ?? "unknown")")

        default:
            break
        }
    }

    private func handleTextMessage(_ text: String) {
        guard let data = text.data(using: .utf8),
              let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
              let type = json["type"] as? String else { return }

        switch type {
        case "session_started":
            if let sessionId = json["session_id"] as? String {
                onSessionStarted?(sessionId)
            }

        case "transcript_interim":
            if let text = json["text"] as? String {
                onTranscriptInterim?(text)
            }

        case "transcript_final":
            if let text = json["text"] as? String {
                onTranscriptFinal?(text)
            }

        case "llm_start":
            onLLMStart?()

        case "llm_token":
            if let token = json["token"] as? String {
                onLLMToken?(token)
            }

        case "complete":
            if let transcript = json["transcript"] as? String,
               let aiResponse = json["ai_response"] as? String {
                onComplete?(transcript, aiResponse)
            }

        case "error":
            if let code = json["code"] as? String,
               let message = json["message"] as? String {
                onError?(code, message)
            }

        default:
            print("Unknown message type: \(type)")
        }
    }
}
```

### Usage Example

```swift
import UIKit

class VoiceChatViewController: UIViewController {

    let chatManager = VoiceChatManager()
    let audioRecorder = AudioRecorder() // custom recording class
    let audioPlayer = StreamingAudioPlayer() // custom streaming playback class
    // transcriptLabel / statusLabel / aiResponseLabel are assumed IBOutlets.

    override func viewDidLoad() {
        super.viewDidLoad()
        setupCallbacks()
    }

    func setupCallbacks() {
        chatManager.onSessionStarted = { [weak self] sessionId in
            print("Session started: \(sessionId)")
            // Start recording
            self?.audioRecorder.start { audioData in
                self?.chatManager.sendAudio(audioData)
            }
        }

        chatManager.onTranscriptInterim = { [weak self] text in
            self?.transcriptLabel.text = text + "..."
        }

        chatManager.onTranscriptFinal = { [weak self] text in
            self?.transcriptLabel.text = text
        }

        chatManager.onLLMStart = { [weak self] in
            self?.statusLabel.text = "AI is thinking..."
        }

        chatManager.onLLMToken = { [weak self] token in
            self?.aiResponseLabel.text = (self?.aiResponseLabel.text ?? "") + token
        }

        chatManager.onAudioChunk = { [weak self] data in
            self?.audioPlayer.appendData(data)
        }

        chatManager.onComplete = { [weak self] transcript, aiResponse in
            self?.statusLabel.text = "Complete"
            self?.addToHistory(user: transcript, ai: aiResponse)
        }

        chatManager.onError = { [weak self] code, message in
            self?.showError(message)
        }
    }

    @IBAction func startTapped(_ sender: UIButton) {
        // Connect, then start a session once the socket is up;
        // onSessionStarted (set in setupCallbacks) will start recording.
        chatManager.onConnected = { [weak self] in
            self?.chatManager.startSession()
        }
        chatManager.connect(token: AuthManager.shared.saToken)
    }

    @IBAction func stopTapped(_ sender: UIButton) {
        audioRecorder.stop()
        chatManager.endAudio()
    }

    @IBAction func cancelTapped(_ sender: UIButton) {
        audioRecorder.stop()
        audioPlayer.stop()
        chatManager.cancel()
    }
}
```
---

## Notes

### 1. Audio recording
- Must use PCM 16-bit, 16 kHz, mono format
- Send audio data roughly every 20-100 ms
- The recording permission must be declared in Info.plist (a recording sketch follows this list)
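A minimal sketch of a recorder that satisfies these constraints, using `AVAudioEngine` plus `AVAudioConverter` to resample the device's native input format down to 16 kHz mono Int16 and emit roughly 100 ms chunks. The `PCMRecorder` name and callback shape are illustrative (the `AudioRecorder` placeholder in the usage example could be implemented this way); `NSMicrophoneUsageDescription` is the Info.plist key to declare.

```swift
import AVFoundation

/// Captures microphone audio as 16 kHz / mono / 16-bit PCM and hands ~100 ms
/// chunks to a callback (e.g. VoiceChatManager.sendAudio). Illustrative sketch.
final class PCMRecorder {
    private let engine = AVAudioEngine()
    private var converter: AVAudioConverter?

    private let targetFormat = AVAudioFormat(commonFormat: .pcmFormatInt16,
                                             sampleRate: 16_000,
                                             channels: 1,
                                             interleaved: true)!

    func start(onChunk: @escaping (Data) -> Void) throws {
        let session = AVAudioSession.sharedInstance()
        try session.setCategory(.record, mode: .measurement, options: [])
        try session.setActive(true)

        let input = engine.inputNode
        let inputFormat = input.outputFormat(forBus: 0)   // device-native format, e.g. 48 kHz float
        converter = AVAudioConverter(from: inputFormat, to: targetFormat)

        // Ask for ~100 ms of audio per tap callback (at the input sample rate).
        let tapFrames = AVAudioFrameCount(inputFormat.sampleRate / 10)

        input.installTap(onBus: 0, bufferSize: tapFrames, format: inputFormat) { [weak self] buffer, _ in
            guard let self = self, let converter = self.converter else { return }

            let ratio = self.targetFormat.sampleRate / inputFormat.sampleRate
            let capacity = AVAudioFrameCount(Double(buffer.frameLength) * ratio) + 1
            guard let outBuffer = AVAudioPCMBuffer(pcmFormat: self.targetFormat,
                                                   frameCapacity: capacity) else { return }

            // Feed the tap buffer into the converter exactly once per callback.
            var consumed = false
            converter.convert(to: outBuffer, error: nil) { _, outStatus in
                if consumed {
                    outStatus.pointee = .noDataNow
                    return nil
                }
                consumed = true
                outStatus.pointee = .haveData
                return buffer
            }

            // Wrap the converted Int16 samples in Data for the WebSocket.
            guard let channel = outBuffer.int16ChannelData else { return }
            let byteCount = Int(outBuffer.frameLength) * MemoryLayout<Int16>.size
            onChunk(Data(bytes: channel[0], count: byteCount))
        }

        engine.prepare()
        try engine.start()
    }

    func stop() {
        engine.inputNode.removeTap(onBus: 0)
        engine.stop()
    }
}
```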

### 2. Audio playback
- The returned audio is MP3 chunks
- Streaming or buffered playback must be implemented
- AVAudioEngine is recommended for low-latency playback

### 3. Network handling
- Implement an automatic reconnection mechanism (a backoff sketch follows this list)
- Handle network switching (e.g. Wi-Fi to cellular)
- Set reasonable timeouts
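A minimal sketch of exponential-backoff reconnection; the type and property names are illustrative. Call `scheduleReconnect` from the manager's `.disconnected` / `.error` handling and `reset()` once a connection succeeds:

```swift
import Foundation

/// Illustrative backoff policy: 1 s, 2 s, 4 s, ... capped at 30 s, give up after 5 tries.
final class ReconnectPolicy {
    private var retryCount = 0
    private let maxRetries = 5

    /// Delay before the next attempt, or nil when giving up.
    func nextDelay() -> TimeInterval? {
        guard retryCount < maxRetries else { return nil }
        let delay = min(pow(2.0, Double(retryCount)), 30.0)
        retryCount += 1
        return delay
    }

    func reset() { retryCount = 0 }
}

// Usage sketch (inside the view controller or manager):
let policy = ReconnectPolicy()

func scheduleReconnect(chatManager: VoiceChatManager, token: String) {
    guard let delay = policy.nextDelay() else { return }   // give up after maxRetries
    DispatchQueue.main.asyncAfter(deadline: .now() + delay) {
        chatManager.connect(token: token)
    }
}
```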

### 4. User experience
- Show the live transcription text
- Show the AI response status
- Provide a cancel button
- Handle the case where the recording permission is denied

### 5. Debugging tips
- Use `wss://` in production to keep the connection secure
- `ws://` may be used for local development
- Check whether the Sa-Token has expired

---

## Version History

| Version | Date | Changes |
|------|------|------|
| 1.0.0 | 2026-01-21 | Initial version |