diff --git a/.omc/ultrawork-state.json b/.omc/ultrawork-state.json
index 198f9fc..d876b5c 100644
--- a/.omc/ultrawork-state.json
+++ b/.omc/ultrawork-state.json
@@ -2,6 +2,6 @@
   "active": true,
   "started_at": "2026-01-26T13:01:18.447Z",
   "original_prompt": "刚刚回滚了代码,现在AI陪聊角色评论需要使用KeyboardAiCompanionCommentLikeService添加一个评论点赞接口,用来记录点赞和取消点赞。 ulw",
-  "reinforcement_count": 4,
-  "last_checked_at": "2026-01-26T13:55:34.306Z"
+  "reinforcement_count": 5,
+  "last_checked_at": "2026-01-27T05:14:53.054Z"
 }
\ No newline at end of file
diff --git a/src/main/java/com/yolo/keyborad/common/ErrorCode.java b/src/main/java/com/yolo/keyborad/common/ErrorCode.java
index 59d4f4e..9c76eea 100644
--- a/src/main/java/com/yolo/keyborad/common/ErrorCode.java
+++ b/src/main/java/com/yolo/keyborad/common/ErrorCode.java
@@ -70,7 +70,11 @@ public enum ErrorCode {
     INVITE_CODE_ALREADY_BOUND(50028, "您已绑定过邀请码,无法重复绑定"),
     INVITE_CODE_CANNOT_BIND_SELF(50029, "不能绑定自己的邀请码"),
     RECEIPT_ALREADY_PROCESSED(50027, "收据已处理"),
-    VIP_TRIAL_LIMIT_REACHED(50030, "今日体验次数已达上限,请开通会员");
+    VIP_TRIAL_LIMIT_REACHED(50030, "今日体验次数已达上限,请开通会员"),
+    AUDIO_FILE_EMPTY(40016, "音频文件不能为空"),
+    AUDIO_FILE_TOO_LARGE(40017, "音频文件过大"),
+    AUDIO_FORMAT_NOT_SUPPORTED(40018, "音频格式不支持"),
+    STT_SERVICE_ERROR(50031, "语音转文字服务异常");
 
     /**
      * 状态码
diff --git a/src/main/java/com/yolo/keyborad/config/DeepgramProperties.java b/src/main/java/com/yolo/keyborad/config/DeepgramProperties.java
new file mode 100644
index 0000000..5138258
--- /dev/null
+++ b/src/main/java/com/yolo/keyborad/config/DeepgramProperties.java
@@ -0,0 +1,34 @@
+package com.yolo.keyborad.config;
+
+import lombok.Data;
+import org.springframework.boot.context.properties.ConfigurationProperties;
+import org.springframework.stereotype.Component;
+
+/**
+ * Deepgram STT configuration, bound from properties with the "deepgram" prefix.
+ *
+ * @author ziin
+ */
+@Data
+@Component
+@ConfigurationProperties(prefix = "deepgram")
+public class DeepgramProperties {
+
+    /** API key; no default is provided here. */
+    private String apiKey;
+
+    /** Base URL of the Deepgram API. */
+    private String baseUrl = "https://api.deepgram.com/v1";
+
+    /** Model ID. */
+    private String model = "nova-2";
+
+    /** Default language code. */
+    private String language = "en";
+
+    /** Smart formatting flag. */
+    private Boolean smartFormat = true;
+
+    /** Punctuation flag. */
+    private Boolean punctuate = true;
+}
diff --git a/src/main/java/com/yolo/keyborad/config/SaTokenConfigure.java b/src/main/java/com/yolo/keyborad/config/SaTokenConfigure.java
index 26d5837..439c34b 100644
--- a/src/main/java/com/yolo/keyborad/config/SaTokenConfigure.java
+++ b/src/main/java/com/yolo/keyborad/config/SaTokenConfigure.java
@@ -114,7 +114,8 @@ public class SaTokenConfigure implements WebMvcConfigurer {
                 "/chat/audio/*",
                 "/ai-companion/page",
                 "/chat/history",
-                "/ai-companion/comment/add"
+                "/ai-companion/comment/add",
+                "/speech/transcribe"
         };
     }
     @Bean
diff --git a/src/main/java/com/yolo/keyborad/controller/SpeechController.java b/src/main/java/com/yolo/keyborad/controller/SpeechController.java
new file mode 100644
index 0000000..27afa21
--- /dev/null
+++ b/src/main/java/com/yolo/keyborad/controller/SpeechController.java
@@ -0,0 +1,34 @@
+package com.yolo.keyborad.controller;
+
+import com.yolo.keyborad.common.BaseResponse;
+import com.yolo.keyborad.common.ResultUtils;
+import com.yolo.keyborad.model.vo.SpeechToTextVO;
+import com.yolo.keyborad.service.DeepgramService;
+import io.swagger.v3.oas.annotations.Operation;
+import io.swagger.v3.oas.annotations.tags.Tag;
+import jakarta.annotation.Resource;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.web.bind.annotation.*;
+import org.springframework.web.multipart.MultipartFile;
+
+/**
+ * Speech service controller; exposes POST /speech/transcribe and delegates to DeepgramService.
+ *
+ * @author ziin
+ */
+@RestController
+@Slf4j
+@RequestMapping("/speech")
+@Tag(name = "语音服务", description = "语音相关功能接口")
+public class SpeechController {
+
+    @Resource
+    private DeepgramService deepgramService;
+
+    @PostMapping("/transcribe")
+    @Operation(summary = "语音转文字", description = "上传音频文件并转换为文本")
+    public BaseResponse transcribe(@RequestPart("file") MultipartFile file) {
+        SpeechToTextVO result = deepgramService.transcribe(file);
+        return ResultUtils.success(result);
+    }
+}
diff --git a/src/main/java/com/yolo/keyborad/model/vo/SpeechToTextVO.java b/src/main/java/com/yolo/keyborad/model/vo/SpeechToTextVO.java
new file mode 100644
index 0000000..9be0996
--- /dev/null
+++ b/src/main/java/com/yolo/keyborad/model/vo/SpeechToTextVO.java
@@ -0,0 +1,32 @@
+package com.yolo.keyborad.model.vo;
+
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/**
+ * Speech-to-text response VO: transcript, confidence, duration and detected language.
+ *
+ * @author ziin
+ */
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+@Schema(description = "语音转文字响应")
+public class SpeechToTextVO {
+
+    @Schema(description = "转录文本")
+    private String transcript;
+
+    @Schema(description = "置信度")
+    private Double confidence;
+
+    @Schema(description = "音频时长(秒)")
+    private Double duration;
+
+    @Schema(description = "检测到的语言")
+    private String detectedLanguage;
+}
diff --git a/src/main/java/com/yolo/keyborad/service/DeepgramService.java b/src/main/java/com/yolo/keyborad/service/DeepgramService.java
new file mode 100644
index 0000000..cc2c358
--- /dev/null
+++ b/src/main/java/com/yolo/keyborad/service/DeepgramService.java
@@ -0,0 +1,29 @@
+package com.yolo.keyborad.service;
+
+import com.yolo.keyborad.model.vo.SpeechToTextVO;
+import org.springframework.web.multipart.MultipartFile;
+
+/**
+ * Deepgram STT speech-to-text service interface.
+ *
+ * @author ziin
+ */
+public interface DeepgramService {
+
+    /**
+     * Transcribes an audio file to text using the default language.
+     *
+     * @param audioFile audio file
+     * @return speech-to-text result
+     */
+    SpeechToTextVO transcribe(MultipartFile audioFile);
+
+    /**
+     * Transcribes an audio file to text in the specified language.
+     *
+     * @param audioFile audio file
+     * @param language  language code (e.g. en, zh, ja)
+     * @return speech-to-text result
+     */
+    SpeechToTextVO transcribe(MultipartFile audioFile, String language);
+}
diff --git a/src/main/java/com/yolo/keyborad/service/impl/DeepgramServiceImpl.java
b/src/main/java/com/yolo/keyborad/service/impl/DeepgramServiceImpl.java
new file mode 100644
index 0000000..94ac3bb
--- /dev/null
+++ b/src/main/java/com/yolo/keyborad/service/impl/DeepgramServiceImpl.java
@@ -0,0 +1,207 @@
+package com.yolo.keyborad.service.impl;
+
+import cn.hutool.core.util.StrUtil;
+import com.alibaba.fastjson.JSONArray;
+import com.alibaba.fastjson.JSONObject;
+import com.yolo.keyborad.common.ErrorCode;
+import com.yolo.keyborad.config.DeepgramProperties;
+import com.yolo.keyborad.exception.BusinessException;
+import com.yolo.keyborad.model.vo.SpeechToTextVO;
+import com.yolo.keyborad.service.DeepgramService;
+import jakarta.annotation.Resource;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.http.MediaType;
+import org.springframework.stereotype.Service;
+import org.springframework.web.client.RestClient;
+import org.springframework.web.multipart.MultipartFile;
+import org.springframework.web.util.UriComponentsBuilder;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Deepgram speech-to-text (STT) service implementation.
+ * Reference: https://developers.deepgram.com/docs/getting-started-with-pre-recorded-audio
+ *
+ * @author ziin
+ */
+@Service
+@Slf4j
+public class DeepgramServiceImpl implements DeepgramService {
+
+    @Resource
+    private DeepgramProperties deepgramProperties;
+
+    @Resource
+    private RestClient restClient;
+
+    /** Audio MIME types accepted for transcription. */
+    private static final List<String> ALLOWED_AUDIO_TYPES = Arrays.asList(
+            "audio/wav", "audio/wave",
+            "audio/mp3", "audio/mpeg",
+            "audio/webm",
+            "audio/ogg",
+            "audio/flac",
+            "audio/m4a"
+    );
+
+    /** Maximum accepted upload size: 20 MB. */
+    private static final long MAX_FILE_SIZE = 20 * 1024 * 1024;
+
+    /**
+     * Transcribes the audio file using the configured default language.
+     *
+     * @param audioFile uploaded audio file
+     * @return transcription result
+     */
+    @Override
+    public SpeechToTextVO transcribe(MultipartFile audioFile) {
+        return transcribe(audioFile, deepgramProperties.getLanguage());
+    }
+
+    /**
+     * Transcribes the audio file in the given language.
+     *
+     * @param audioFile uploaded audio file
+     * @param language  language code; falls back to the configured default when blank
+     * @return transcription result
+     * @throws BusinessException if the file is empty, too large or of an unsupported type,
+     *                           or if the Deepgram call or response parsing fails
+     */
+    @Override
+    public SpeechToTextVO transcribe(MultipartFile audioFile, String language) {
+        // 1. Validate the upload
+        validateAudioFile(audioFile);
+
+        if (StrUtil.isBlank(language)) {
+            language = deepgramProperties.getLanguage();
+        }
+
+        // 2. Check the audio Content-Type against the allow-list
+        String contentType = audioFile.getContentType();
+        if (StrUtil.isBlank(contentType) || !ALLOWED_AUDIO_TYPES.contains(contentType)) {
+            log.warn("不支持的音频格式: {}", contentType);
+            throw new BusinessException(ErrorCode.AUDIO_FORMAT_NOT_SUPPORTED);
+        }
+
+        // 3. Build the request URI
+        URI requestUri = buildRequestUri(language);
+
+        log.info("调用 Deepgram STT API, language: {}, contentType: {}, 文件大小: {} bytes",
+                language, contentType, audioFile.getSize());
+        long startTime = System.currentTimeMillis();
+
+        try {
+            // 4. Send the raw audio bytes
+            byte[] audioBytes = audioFile.getBytes();
+
+            String responseJson = restClient.post()
+                    .uri(requestUri)
+                    .contentType(MediaType.parseMediaType(contentType))
+                    .header("Authorization", "Token " + deepgramProperties.getApiKey())
+                    .body(audioBytes)
+                    .retrieve()
+                    .body(String.class);
+
+            long duration = System.currentTimeMillis() - startTime;
+            log.info("Deepgram STT API 响应成功, 耗时: {}ms", duration);
+
+            // 5. Parse the response
+            return parseResponse(responseJson);
+
+        } catch (BusinessException e) {
+            // Already a well-formed business error (e.g. from parseResponse); do not re-wrap it
+            throw e;
+        } catch (IOException e) {
+            log.error("读取音频文件失败", e);
+            throw new BusinessException(ErrorCode.SYSTEM_ERROR, "音频文件读取失败: " + e.getMessage());
+        } catch (Exception e) {
+            log.error("调用 Deepgram STT API 发生异常", e);
+            throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "语音转文字服务异常: " + e.getMessage());
+        }
+    }
+
+    /**
+     * Rejects a missing/empty upload or one larger than {@link #MAX_FILE_SIZE}.
+     */
+    private void validateAudioFile(MultipartFile audioFile) {
+        if (audioFile == null || audioFile.isEmpty()) {
+            throw new BusinessException(ErrorCode.AUDIO_FILE_EMPTY);
+        }
+
+        if (audioFile.getSize() > MAX_FILE_SIZE) {
+            throw new BusinessException(ErrorCode.AUDIO_FILE_TOO_LARGE);
+        }
+    }
+
+    /**
+     * Builds the {@code /listen} request URI. Query values are encoded so a caller-supplied
+     * language cannot inject extra parameters.
+     */
+    private URI buildRequestUri(String language) {
+        UriComponentsBuilder builder = UriComponentsBuilder
+                .fromUriString(deepgramProperties.getBaseUrl())
+                .path("/listen")
+                .queryParam("model", deepgramProperties.getModel())
+                .queryParam("language", language);
+
+        // Boolean.TRUE.equals avoids an unboxing NPE when a flag is configured as null
+        if (Boolean.TRUE.equals(deepgramProperties.getSmartFormat())) {
+            builder.queryParam("smart_format", true);
+        }
+        if (Boolean.TRUE.equals(deepgramProperties.getPunctuate())) {
+            builder.queryParam("punctuate", true);
+        }
+
+        return builder.build().encode().toUri();
+    }
+
+    /**
+     * Extracts transcript, confidence, duration and detected language from the response JSON,
+     * reading the first alternative of the first channel.
+     */
+    private SpeechToTextVO parseResponse(String responseJson) {
+        JSONObject jsonResponse = JSONObject.parseObject(responseJson);
+        if (jsonResponse == null) {
+            throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "响应格式错误: 响应为空");
+        }
+
+        // metadata (optional)
+        JSONObject metadata = jsonResponse.getJSONObject("metadata");
+        Double duration = metadata != null ? metadata.getDouble("duration") : null;
+
+        // results
+        JSONObject results = jsonResponse.getJSONObject("results");
+        if (results == null) {
+            throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "响应格式错误: 缺少 results");
+        }
+
+        JSONArray channels = results.getJSONArray("channels");
+        if (channels == null || channels.isEmpty()) {
+            throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "响应格式错误: 缺少 channels");
+        }
+
+        JSONObject channel = channels.getJSONObject(0);
+        JSONArray alternatives = channel.getJSONArray("alternatives");
+        if (alternatives == null || alternatives.isEmpty()) {
+            throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "响应格式错误: 缺少 alternatives");
+        }
+
+        JSONObject alternative = alternatives.getJSONObject(0);
+        String transcript = alternative.getString("transcript");
+        Double confidence = alternative.getDouble("confidence");
+        String detectedLanguage = channel.getString("detected_language");
+
+        log.info("转录成功, 文本长度: {}, 置信度: {}, 检测语言: {}",
+                transcript != null ? transcript.length() : 0, confidence, detectedLanguage);
+
+        return SpeechToTextVO.builder()
+                .transcript(transcript)
+                .confidence(confidence)
+                .duration(duration)
+                .detectedLanguage(detectedLanguage)
+                .build();
+    }
+}
diff --git a/src/main/resources/application-dev.yml b/src/main/resources/application-dev.yml
index 9174723..abe5162 100644
--- a/src/main/resources/application-dev.yml
+++ b/src/main/resources/application-dev.yml
@@ -105,4 +105,11 @@ elevenlabs:
   api-key: sk_25339d32bb14c91f460ed9fce83a1951672f07846a7a10ce
   voice-id: JBFqnCBsd6RMkjVDRZzb
   model-id: eleven_turbo_v2_5
-  output-format: mp3_44100_128
\ No newline at end of file
+  output-format: mp3_44100_128
+
+deepgram:
+  api-key: ${DEEPGRAM_API_KEY:}  # injected from the environment; never commit real keys
+  model: nova-2
+  language: en
+  smart-format: true
+  punctuate: true
\ No newline at end of file