feat(speech): 新增语音转文字功能

新增 Deepgram 集成，支持音频文件上传、格式校验与转写；补充相关错误码并放行 /speech/transcribe 接口
2026-01-27 18:17:36 +08:00
parent f18217ba93
commit 6cf0275980
9 changed files with 328 additions and 5 deletions
--- a/.omc/ultrawork-state.json
+++ b/.omc/ultrawork-state.json
@@ -2,6 +2,6 @@
  "active": true,
  "started_at": "2026-01-26T13:01:18.447Z",
  "original_prompt": "刚刚回滚了代码，现在AI陪聊角色评论需要使用KeyboardAiCompanionCommentLikeService添加一个评论点赞接口，用来记录点赞和取消点赞。 ulw",
-  "reinforcement_count": 4,
+  "reinforcement_count": 5,
-  "last_checked_at": "2026-01-26T13:55:34.306Z"
+  "last_checked_at": "2026-01-27T05:14:53.054Z"
 }
--- a/src/main/java/com/yolo/keyborad/common/ErrorCode.java
+++ b/src/main/java/com/yolo/keyborad/common/ErrorCode.java
@@ -70,7 +70,11 @@ public enum ErrorCode {
    INVITE_CODE_ALREADY_BOUND(50028, "您已绑定过邀请码，无法重复绑定"),
    INVITE_CODE_CANNOT_BIND_SELF(50029, "不能绑定自己的邀请码"),
    RECEIPT_ALREADY_PROCESSED(50027, "收据已处理"),
-    VIP_TRIAL_LIMIT_REACHED(50030, "今日体验次数已达上限，请开通会员");
+    VIP_TRIAL_LIMIT_REACHED(50030, "今日体验次数已达上限，请开通会员"),
    AUDIO_FILE_EMPTY(40016, "音频文件不能为空"),
    AUDIO_FILE_TOO_LARGE(40017, "音频文件过大"),
    AUDIO_FORMAT_NOT_SUPPORTED(40018, "音频格式不支持"),
    STT_SERVICE_ERROR(50031, "语音转文字服务异常");
    /**
     * 状态码
--- a/src/main/java/com/yolo/keyborad/config/DeepgramProperties.java
+++ b/src/main/java/com/yolo/keyborad/config/DeepgramProperties.java
@@ -0,0 +1,34 @@
 package com.yolo.keyborad.config;
 import lombok.Data;
 import org.springframework.boot.context.properties.ConfigurationProperties;
 import org.springframework.stereotype.Component;
 /**
 * Deepgram STT 配置
 *
 * @author ziin
 */
@Data
@Component
@ConfigurationProperties(prefix = "deepgram")
 public class DeepgramProperties {
    /** API Key */
    private String apiKey;
    /** 基础 URL */
    private String baseUrl = "https://api.deepgram.com/v1";
    /** 模型 ID */
    private String model = "nova-2";
    /** 默认语言 */
    private String language = "en";
    /** 智能格式化 */
    private Boolean smartFormat = true;
    /** 添加标点符号 */
    private Boolean punctuate = true;
 }
--- a/src/main/java/com/yolo/keyborad/config/SaTokenConfigure.java
+++ b/src/main/java/com/yolo/keyborad/config/SaTokenConfigure.java
@@ -114,7 +114,8 @@ public class SaTokenConfigure implements WebMvcConfigurer {
                "/chat/audio/*",
                "/ai-companion/page",
                "/chat/history",
-                "/ai-companion/comment/add"
+                "/ai-companion/comment/add",
                "/speech/transcribe"
        };
    }
    @Bean
--- a/src/main/java/com/yolo/keyborad/controller/SpeechController.java
+++ b/src/main/java/com/yolo/keyborad/controller/SpeechController.java
@@ -0,0 +1,34 @@
 package com.yolo.keyborad.controller;
 import com.yolo.keyborad.common.BaseResponse;
 import com.yolo.keyborad.common.ResultUtils;
 import com.yolo.keyborad.model.vo.SpeechToTextVO;
 import com.yolo.keyborad.service.DeepgramService;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.tags.Tag;
 import jakarta.annotation.Resource;
 import lombok.extern.slf4j.Slf4j;
 import org.springframework.web.bind.annotation.*;
 import org.springframework.web.multipart.MultipartFile;
 /**
 * REST controller exposing speech-related endpoints under {@code /speech}.
 *
 * @author ziin
 */
@Slf4j
@RestController
@RequestMapping("/speech")
@Tag(name = "语音服务", description = "语音相关功能接口")
 public class SpeechController {
    /** Speech-to-text service that the endpoints delegate to. */
    @Resource
    private DeepgramService deepgramService;
    /**
     * Transcribes an uploaded audio file to text.
     *
     * @param file multipart part named {@code file} containing the audio
     * @return the transcription result wrapped in a success response
     */
    @Operation(summary = "语音转文字", description = "上传音频文件并转换为文本")
    @PostMapping("/transcribe")
    public BaseResponse<SpeechToTextVO> transcribe(@RequestPart("file") MultipartFile file) {
        // Delegates to the single-argument overload of the service.
        return ResultUtils.success(deepgramService.transcribe(file));
    }
 }
--- a/src/main/java/com/yolo/keyborad/model/vo/SpeechToTextVO.java
+++ b/src/main/java/com/yolo/keyborad/model/vo/SpeechToTextVO.java
@@ -0,0 +1,32 @@
 package com.yolo.keyborad.model.vo;
 import io.swagger.v3.oas.annotations.media.Schema;
 import lombok.AllArgsConstructor;
 import lombok.Builder;
 import lombok.Data;
 import lombok.NoArgsConstructor;
 /**
 * Response view object for a speech-to-text request.
 *
 * @author ziin
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@Schema(description = "语音转文字响应")
 public class SpeechToTextVO {
    /** Transcribed text. */
    @Schema(description = "转录文本")
    private String transcript;
    /** Confidence score of the transcript. */
    @Schema(description = "置信度")
    private Double confidence;
    /** Audio duration in seconds. */
    @Schema(description = "音频时长（秒）")
    private Double duration;
    /** Language detected in the audio. */
    @Schema(description = "检测到的语言")
    private String detectedLanguage;
 }
--- a/src/main/java/com/yolo/keyborad/service/DeepgramService.java
+++ b/src/main/java/com/yolo/keyborad/service/DeepgramService.java
@@ -0,0 +1,29 @@
 package com.yolo.keyborad.service;
 import com.yolo.keyborad.model.vo.SpeechToTextVO;
 import org.springframework.web.multipart.MultipartFile;
 /**
 * Deepgram STT (speech-to-text) service interface.
 *
 * @author ziin
 */
 public interface DeepgramService {
    /**
     * Transcribes an audio file to text using the default language.
     *
     * @param audioFile the audio file
     * @return the speech-to-text result
     */
    SpeechToTextVO transcribe(MultipartFile audioFile);
    /**
     * Transcribes an audio file to text using the given language.
     *
     * @param audioFile the audio file
     * @param language  language code (e.g. en, zh, ja)
     * @return the speech-to-text result
     */
    SpeechToTextVO transcribe(MultipartFile audioFile, String language);
 }
--- a/src/main/java/com/yolo/keyborad/service/impl/DeepgramServiceImpl.java
+++ b/src/main/java/com/yolo/keyborad/service/impl/DeepgramServiceImpl.java
@@ -0,0 +1,182 @@
 package com.yolo.keyborad.service.impl;
 import cn.hutool.core.util.StrUtil;
 import com.alibaba.fastjson.JSONArray;
 import com.alibaba.fastjson.JSONObject;
 import com.yolo.keyborad.common.ErrorCode;
 import com.yolo.keyborad.config.DeepgramProperties;
 import com.yolo.keyborad.exception.BusinessException;
 import com.yolo.keyborad.model.vo.SpeechToTextVO;
 import com.yolo.keyborad.service.DeepgramService;
 import jakarta.annotation.Resource;
 import lombok.extern.slf4j.Slf4j;
 import org.springframework.http.MediaType;
 import org.springframework.stereotype.Service;
 import org.springframework.web.client.RestClient;
 import org.springframework.web.multipart.MultipartFile;
 import java.io.IOException;
 import java.net.URI;
 import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Locale;
 /**
 * Deepgram STT 语音转文字服务实现
 * 参考: https://developers.deepgram.com/docs/getting-started-with-pre-recorded-audio
 *
 * @author ziin
 */
@Service
@Slf4j
 public class DeepgramServiceImpl implements DeepgramService {
    @Resource
    private DeepgramProperties deepgramProperties;
    @Resource
    private RestClient restClient;
    // 支持的音频MIME类型
    private static final List<String> ALLOWED_AUDIO_TYPES = Arrays.asList(
            "audio/wav", "audio/wave",
            "audio/mp3", "audio/mpeg",
            "audio/webm",
            "audio/ogg",
            "audio/flac",
            "audio/m4a"
    );
    // 最大文件大小：20MB
    private static final long MAX_FILE_SIZE = 20 * 1024 * 1024;
    @Override
    public SpeechToTextVO transcribe(MultipartFile audioFile) {
        return transcribe(audioFile, deepgramProperties.getLanguage());
    }
    @Override
    public SpeechToTextVO transcribe(MultipartFile audioFile, String language) {
        // 1. 参数校验
        validateAudioFile(audioFile);
        if (StrUtil.isBlank(language)) {
            language = deepgramProperties.getLanguage();
        }
        // 2. 获取音频Content-Type
        String contentType = audioFile.getContentType();
        if (StrUtil.isBlank(contentType) || !ALLOWED_AUDIO_TYPES.contains(contentType)) {
            log.warn("不支持的音频格式: {}", contentType);
            throw new BusinessException(ErrorCode.AUDIO_FORMAT_NOT_SUPPORTED);
        }
        // 3. 构建请求URL
        String requestUrl = buildRequestUrl(language);
        log.info("调用 Deepgram STT API, language: {}, contentType: {}, 文件大小: {} bytes",
                language, contentType, audioFile.getSize());
        long startTime = System.currentTimeMillis();
        try {
            // 4. 发送请求
            byte[] audioBytes = audioFile.getBytes();
            String responseJson = restClient.post()
                    .uri(requestUrl)
                    .contentType(MediaType.parseMediaType(contentType))
                    .header("Authorization", "Token " + deepgramProperties.getApiKey())
                    .body(audioBytes)
                    .retrieve()
                    .body(String.class);
            long duration = System.currentTimeMillis() - startTime;
            log.info("Deepgram STT API 响应成功, 耗时: {}ms", duration);
            // 5. 解析响应
            return parseResponse(responseJson);
        } catch (IOException e) {
            log.error("读取音频文件失败", e);
            throw new BusinessException(ErrorCode.SYSTEM_ERROR, "音频文件读取失败: " + e.getMessage());
        } catch (Exception e) {
            log.error("调用 Deepgram STT API 发生异常", e);
            throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "语音转文字服务异常: " + e.getMessage());
        }
    }
    /**
     * 校验音频文件
     */
    private void validateAudioFile(MultipartFile audioFile) {
        if (audioFile == null || audioFile.isEmpty()) {
            throw new BusinessException(ErrorCode.AUDIO_FILE_EMPTY);
        }
        if (audioFile.getSize() > MAX_FILE_SIZE) {
            throw new BusinessException(ErrorCode.AUDIO_FILE_TOO_LARGE);
        }
    }
    /**
     * 构建请求URL
     */
    private String buildRequestUrl(String language) {
        StringBuilder url = new StringBuilder(deepgramProperties.getBaseUrl());
        url.append("/listen");
        // 添加查询参数
        url.append("?model=").append(deepgramProperties.getModel());
        url.append("&language=").append(language);
        if (deepgramProperties.getSmartFormat()) {
            url.append("&smart_format=true");
        }
        if (deepgramProperties.getPunctuate()) {
            url.append("&punctuate=true");
        }
        return url.toString();
    }
    /**
     * 解析响应JSON
     */
    private SpeechToTextVO parseResponse(String responseJson) {
        JSONObject jsonResponse = JSONObject.parseObject(responseJson);
        // 解析 metadata
        JSONObject metadata = jsonResponse.getJSONObject("metadata");
        Double duration = metadata != null ? metadata.getDouble("duration") : null;
        // 解析 results
        JSONObject results = jsonResponse.getJSONObject("results");
        if (results == null) {
            throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "响应格式错误: 缺少 results");
        }
        JSONArray channels = results.getJSONArray("channels");
        if (channels == null || channels.isEmpty()) {
            throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "响应格式错误: 缺少 channels");
        }
        JSONObject channel = channels.getJSONObject(0);
        JSONArray alternatives = channel.getJSONArray("alternatives");
        if (alternatives == null || alternatives.isEmpty()) {
            throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "响应格式错误: 缺少 alternatives");
        }
        JSONObject alternative = alternatives.getJSONObject(0);
        String transcript = alternative.getString("transcript");
        Double confidence = alternative.getDouble("confidence");
        String detectedLanguage = channel.getString("detected_language");
        log.info("转录成功, 文本长度: {}, 置信度: {}, 检测语言: {}",
                transcript != null ? transcript.length() : 0, confidence, detectedLanguage);
        return SpeechToTextVO.builder()
                .transcript(transcript)
                .confidence(confidence)
                .duration(duration)
                .detectedLanguage(detectedLanguage)
                .build();
    }
 }
--- a/src/main/resources/application-dev.yml
+++ b/src/main/resources/application-dev.yml
@@ -105,4 +105,11 @@ elevenlabs:
  api-key: sk_25339d32bb14c91f460ed9fce83a1951672f07846a7a10ce
  voice-id: JBFqnCBsd6RMkjVDRZzb
  model-id: eleven_turbo_v2_5
-  output-format: mp3_44100_128
+  output-format: mp3_44100_128
 deepgram:
  # Secret: supplied via the DEEPGRAM_API_KEY environment variable, never committed.
  # NOTE(review): the key previously committed here is exposed in history and should be rotated.
  api-key: ${DEEPGRAM_API_KEY:}
  model: nova-2
  language: en
  smart-format: true
  punctuate: true