feat(speech): 新增语音转文字功能
新增 Deepgram 集成,支持音频文件上传、格式校验与转写;补充相关错误码并放行 /speech/transcribe 接口
This commit is contained in:
@@ -2,6 +2,6 @@
|
|||||||
"active": true,
|
"active": true,
|
||||||
"started_at": "2026-01-26T13:01:18.447Z",
|
"started_at": "2026-01-26T13:01:18.447Z",
|
||||||
"original_prompt": "刚刚回滚了代码,现在AI陪聊角色评论需要使用KeyboardAiCompanionCommentLikeService添加一个评论点赞接口,用来记录点赞和取消点赞。 ulw",
|
"original_prompt": "刚刚回滚了代码,现在AI陪聊角色评论需要使用KeyboardAiCompanionCommentLikeService添加一个评论点赞接口,用来记录点赞和取消点赞。 ulw",
|
||||||
"reinforcement_count": 4,
|
"reinforcement_count": 5,
|
||||||
"last_checked_at": "2026-01-26T13:55:34.306Z"
|
"last_checked_at": "2026-01-27T05:14:53.054Z"
|
||||||
}
|
}
|
||||||
@@ -70,7 +70,11 @@ public enum ErrorCode {
|
|||||||
INVITE_CODE_ALREADY_BOUND(50028, "您已绑定过邀请码,无法重复绑定"),
|
INVITE_CODE_ALREADY_BOUND(50028, "您已绑定过邀请码,无法重复绑定"),
|
||||||
INVITE_CODE_CANNOT_BIND_SELF(50029, "不能绑定自己的邀请码"),
|
INVITE_CODE_CANNOT_BIND_SELF(50029, "不能绑定自己的邀请码"),
|
||||||
RECEIPT_ALREADY_PROCESSED(50027, "收据已处理"),
|
RECEIPT_ALREADY_PROCESSED(50027, "收据已处理"),
|
||||||
VIP_TRIAL_LIMIT_REACHED(50030, "今日体验次数已达上限,请开通会员");
|
VIP_TRIAL_LIMIT_REACHED(50030, "今日体验次数已达上限,请开通会员"),
|
||||||
|
AUDIO_FILE_EMPTY(40016, "音频文件不能为空"),
|
||||||
|
AUDIO_FILE_TOO_LARGE(40017, "音频文件过大"),
|
||||||
|
AUDIO_FORMAT_NOT_SUPPORTED(40018, "音频格式不支持"),
|
||||||
|
STT_SERVICE_ERROR(50031, "语音转文字服务异常");
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 状态码
|
* 状态码
|
||||||
|
|||||||
@@ -0,0 +1,34 @@
|
|||||||
|
package com.yolo.keyborad.config;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Deepgram STT 配置
|
||||||
|
*
|
||||||
|
* @author ziin
|
||||||
|
*/
|
||||||
|
@Data
|
||||||
|
@Component
|
||||||
|
@ConfigurationProperties(prefix = "deepgram")
|
||||||
|
public class DeepgramProperties {
|
||||||
|
|
||||||
|
/** API Key */
|
||||||
|
private String apiKey;
|
||||||
|
|
||||||
|
/** 基础 URL */
|
||||||
|
private String baseUrl = "https://api.deepgram.com/v1";
|
||||||
|
|
||||||
|
/** 模型 ID */
|
||||||
|
private String model = "nova-2";
|
||||||
|
|
||||||
|
/** 默认语言 */
|
||||||
|
private String language = "en";
|
||||||
|
|
||||||
|
/** 智能格式化 */
|
||||||
|
private Boolean smartFormat = true;
|
||||||
|
|
||||||
|
/** 添加标点符号 */
|
||||||
|
private Boolean punctuate = true;
|
||||||
|
}
|
||||||
@@ -114,7 +114,8 @@ public class SaTokenConfigure implements WebMvcConfigurer {
|
|||||||
"/chat/audio/*",
|
"/chat/audio/*",
|
||||||
"/ai-companion/page",
|
"/ai-companion/page",
|
||||||
"/chat/history",
|
"/chat/history",
|
||||||
"/ai-companion/comment/add"
|
"/ai-companion/comment/add",
|
||||||
|
"/speech/transcribe"
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@Bean
|
@Bean
|
||||||
|
|||||||
@@ -0,0 +1,34 @@
|
|||||||
|
package com.yolo.keyborad.controller;
|
||||||
|
|
||||||
|
import com.yolo.keyborad.common.BaseResponse;
|
||||||
|
import com.yolo.keyborad.common.ResultUtils;
|
||||||
|
import com.yolo.keyborad.model.vo.SpeechToTextVO;
|
||||||
|
import com.yolo.keyborad.service.DeepgramService;
|
||||||
|
import io.swagger.v3.oas.annotations.Operation;
|
||||||
|
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||||
|
import jakarta.annotation.Resource;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.springframework.web.bind.annotation.*;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 语音服务控制器
|
||||||
|
*
|
||||||
|
* @author ziin
|
||||||
|
*/
|
||||||
|
@RestController
|
||||||
|
@Slf4j
|
||||||
|
@RequestMapping("/speech")
|
||||||
|
@Tag(name = "语音服务", description = "语音相关功能接口")
|
||||||
|
public class SpeechController {
|
||||||
|
|
||||||
|
@Resource
|
||||||
|
private DeepgramService deepgramService;
|
||||||
|
|
||||||
|
@PostMapping("/transcribe")
|
||||||
|
@Operation(summary = "语音转文字", description = "上传音频文件并转换为文本")
|
||||||
|
public BaseResponse<SpeechToTextVO> transcribe(@RequestPart("file") MultipartFile file) {
|
||||||
|
SpeechToTextVO result = deepgramService.transcribe(file);
|
||||||
|
return ResultUtils.success(result);
|
||||||
|
}
|
||||||
|
}
|
||||||
32
src/main/java/com/yolo/keyborad/model/vo/SpeechToTextVO.java
Normal file
32
src/main/java/com/yolo/keyborad/model/vo/SpeechToTextVO.java
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
package com.yolo.keyborad.model.vo;
|
||||||
|
|
||||||
|
import io.swagger.v3.oas.annotations.media.Schema;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 语音转文字响应VO
|
||||||
|
*
|
||||||
|
* @author ziin
|
||||||
|
*/
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
@Schema(description = "语音转文字响应")
|
||||||
|
public class SpeechToTextVO {
|
||||||
|
|
||||||
|
@Schema(description = "转录文本")
|
||||||
|
private String transcript;
|
||||||
|
|
||||||
|
@Schema(description = "置信度")
|
||||||
|
private Double confidence;
|
||||||
|
|
||||||
|
@Schema(description = "音频时长(秒)")
|
||||||
|
private Double duration;
|
||||||
|
|
||||||
|
@Schema(description = "检测到的语言")
|
||||||
|
private String detectedLanguage;
|
||||||
|
}
|
||||||
29
src/main/java/com/yolo/keyborad/service/DeepgramService.java
Normal file
29
src/main/java/com/yolo/keyborad/service/DeepgramService.java
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
package com.yolo.keyborad.service;
|
||||||
|
|
||||||
|
import com.yolo.keyborad.model.vo.SpeechToTextVO;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Deepgram STT 语音转文字服务接口
|
||||||
|
*
|
||||||
|
* @author ziin
|
||||||
|
*/
|
||||||
|
public interface DeepgramService {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 将音频文件转换为文字(使用默认语言)
|
||||||
|
*
|
||||||
|
* @param audioFile 音频文件
|
||||||
|
* @return 语音转文字结果
|
||||||
|
*/
|
||||||
|
SpeechToTextVO transcribe(MultipartFile audioFile);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 将音频文件转换为文字(指定语言)
|
||||||
|
*
|
||||||
|
* @param audioFile 音频文件
|
||||||
|
* @param language 语言代码(如 en, zh, ja 等)
|
||||||
|
* @return 语音转文字结果
|
||||||
|
*/
|
||||||
|
SpeechToTextVO transcribe(MultipartFile audioFile, String language);
|
||||||
|
}
|
||||||
@@ -0,0 +1,182 @@
|
|||||||
|
package com.yolo.keyborad.service.impl;
|
||||||
|
|
||||||
|
import cn.hutool.core.util.StrUtil;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.yolo.keyborad.common.ErrorCode;
import com.yolo.keyborad.config.DeepgramProperties;
import com.yolo.keyborad.exception.BusinessException;
import com.yolo.keyborad.model.vo.SpeechToTextVO;
import com.yolo.keyborad.service.DeepgramService;
import jakarta.annotation.Resource;
import lombok.extern.slf4j.Slf4j;
import org.springframework.http.MediaType;
import org.springframework.stereotype.Service;
import org.springframework.web.client.RestClient;
import org.springframework.web.multipart.MultipartFile;

import java.io.IOException;
import java.net.URI;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Deepgram STT 语音转文字服务实现
|
||||||
|
* 参考: https://developers.deepgram.com/docs/getting-started-with-pre-recorded-audio
|
||||||
|
*
|
||||||
|
* @author ziin
|
||||||
|
*/
|
||||||
|
@Service
|
||||||
|
@Slf4j
|
||||||
|
public class DeepgramServiceImpl implements DeepgramService {
|
||||||
|
|
||||||
|
@Resource
|
||||||
|
private DeepgramProperties deepgramProperties;
|
||||||
|
|
||||||
|
@Resource
|
||||||
|
private RestClient restClient;
|
||||||
|
|
||||||
|
// 支持的音频MIME类型
|
||||||
|
private static final List<String> ALLOWED_AUDIO_TYPES = Arrays.asList(
|
||||||
|
"audio/wav", "audio/wave",
|
||||||
|
"audio/mp3", "audio/mpeg",
|
||||||
|
"audio/webm",
|
||||||
|
"audio/ogg",
|
||||||
|
"audio/flac",
|
||||||
|
"audio/m4a"
|
||||||
|
);
|
||||||
|
|
||||||
|
// 最大文件大小:20MB
|
||||||
|
private static final long MAX_FILE_SIZE = 20 * 1024 * 1024;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public SpeechToTextVO transcribe(MultipartFile audioFile) {
|
||||||
|
return transcribe(audioFile, deepgramProperties.getLanguage());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public SpeechToTextVO transcribe(MultipartFile audioFile, String language) {
|
||||||
|
// 1. 参数校验
|
||||||
|
validateAudioFile(audioFile);
|
||||||
|
|
||||||
|
if (StrUtil.isBlank(language)) {
|
||||||
|
language = deepgramProperties.getLanguage();
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. 获取音频Content-Type
|
||||||
|
String contentType = audioFile.getContentType();
|
||||||
|
if (StrUtil.isBlank(contentType) || !ALLOWED_AUDIO_TYPES.contains(contentType)) {
|
||||||
|
log.warn("不支持的音频格式: {}", contentType);
|
||||||
|
throw new BusinessException(ErrorCode.AUDIO_FORMAT_NOT_SUPPORTED);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3. 构建请求URL
|
||||||
|
String requestUrl = buildRequestUrl(language);
|
||||||
|
|
||||||
|
log.info("调用 Deepgram STT API, language: {}, contentType: {}, 文件大小: {} bytes",
|
||||||
|
language, contentType, audioFile.getSize());
|
||||||
|
long startTime = System.currentTimeMillis();
|
||||||
|
|
||||||
|
try {
|
||||||
|
// 4. 发送请求
|
||||||
|
byte[] audioBytes = audioFile.getBytes();
|
||||||
|
|
||||||
|
String responseJson = restClient.post()
|
||||||
|
.uri(requestUrl)
|
||||||
|
.contentType(MediaType.parseMediaType(contentType))
|
||||||
|
.header("Authorization", "Token " + deepgramProperties.getApiKey())
|
||||||
|
.body(audioBytes)
|
||||||
|
.retrieve()
|
||||||
|
.body(String.class);
|
||||||
|
|
||||||
|
long duration = System.currentTimeMillis() - startTime;
|
||||||
|
log.info("Deepgram STT API 响应成功, 耗时: {}ms", duration);
|
||||||
|
|
||||||
|
// 5. 解析响应
|
||||||
|
return parseResponse(responseJson);
|
||||||
|
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.error("读取音频文件失败", e);
|
||||||
|
throw new BusinessException(ErrorCode.SYSTEM_ERROR, "音频文件读取失败: " + e.getMessage());
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error("调用 Deepgram STT API 发生异常", e);
|
||||||
|
throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "语音转文字服务异常: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 校验音频文件
|
||||||
|
*/
|
||||||
|
private void validateAudioFile(MultipartFile audioFile) {
|
||||||
|
if (audioFile == null || audioFile.isEmpty()) {
|
||||||
|
throw new BusinessException(ErrorCode.AUDIO_FILE_EMPTY);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (audioFile.getSize() > MAX_FILE_SIZE) {
|
||||||
|
throw new BusinessException(ErrorCode.AUDIO_FILE_TOO_LARGE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 构建请求URL
|
||||||
|
*/
|
||||||
|
private String buildRequestUrl(String language) {
|
||||||
|
StringBuilder url = new StringBuilder(deepgramProperties.getBaseUrl());
|
||||||
|
url.append("/listen");
|
||||||
|
|
||||||
|
// 添加查询参数
|
||||||
|
url.append("?model=").append(deepgramProperties.getModel());
|
||||||
|
url.append("&language=").append(language);
|
||||||
|
|
||||||
|
if (deepgramProperties.getSmartFormat()) {
|
||||||
|
url.append("&smart_format=true");
|
||||||
|
}
|
||||||
|
if (deepgramProperties.getPunctuate()) {
|
||||||
|
url.append("&punctuate=true");
|
||||||
|
}
|
||||||
|
|
||||||
|
return url.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 解析响应JSON
|
||||||
|
*/
|
||||||
|
private SpeechToTextVO parseResponse(String responseJson) {
|
||||||
|
JSONObject jsonResponse = JSONObject.parseObject(responseJson);
|
||||||
|
|
||||||
|
// 解析 metadata
|
||||||
|
JSONObject metadata = jsonResponse.getJSONObject("metadata");
|
||||||
|
Double duration = metadata != null ? metadata.getDouble("duration") : null;
|
||||||
|
|
||||||
|
// 解析 results
|
||||||
|
JSONObject results = jsonResponse.getJSONObject("results");
|
||||||
|
if (results == null) {
|
||||||
|
throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "响应格式错误: 缺少 results");
|
||||||
|
}
|
||||||
|
|
||||||
|
JSONArray channels = results.getJSONArray("channels");
|
||||||
|
if (channels == null || channels.isEmpty()) {
|
||||||
|
throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "响应格式错误: 缺少 channels");
|
||||||
|
}
|
||||||
|
|
||||||
|
JSONObject channel = channels.getJSONObject(0);
|
||||||
|
JSONArray alternatives = channel.getJSONArray("alternatives");
|
||||||
|
if (alternatives == null || alternatives.isEmpty()) {
|
||||||
|
throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "响应格式错误: 缺少 alternatives");
|
||||||
|
}
|
||||||
|
|
||||||
|
JSONObject alternative = alternatives.getJSONObject(0);
|
||||||
|
String transcript = alternative.getString("transcript");
|
||||||
|
Double confidence = alternative.getDouble("confidence");
|
||||||
|
String detectedLanguage = channel.getString("detected_language");
|
||||||
|
|
||||||
|
log.info("转录成功, 文本长度: {}, 置信度: {}, 检测语言: {}",
|
||||||
|
transcript != null ? transcript.length() : 0, confidence, detectedLanguage);
|
||||||
|
|
||||||
|
return SpeechToTextVO.builder()
|
||||||
|
.transcript(transcript)
|
||||||
|
.confidence(confidence)
|
||||||
|
.duration(duration)
|
||||||
|
.detectedLanguage(detectedLanguage)
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -105,4 +105,11 @@ elevenlabs:
|
|||||||
api-key: sk_25339d32bb14c91f460ed9fce83a1951672f07846a7a10ce
|
api-key: sk_25339d32bb14c91f460ed9fce83a1951672f07846a7a10ce
|
||||||
voice-id: JBFqnCBsd6RMkjVDRZzb
|
voice-id: JBFqnCBsd6RMkjVDRZzb
|
||||||
model-id: eleven_turbo_v2_5
|
model-id: eleven_turbo_v2_5
|
||||||
output-format: mp3_44100_128
|
output-format: mp3_44100_128
|
||||||
|
|
||||||
|
deepgram:
|
||||||
|
api-key: 9c792eb63a65d644cbc95785155754cd1e84f8cf
|
||||||
|
model: nova-2
|
||||||
|
language: en
|
||||||
|
smart-format: true
|
||||||
|
punctuate: true
|
||||||
Reference in New Issue
Block a user