# chatgpt-on-wechat/voice/dashscope/dashscope_voice.py

# encoding:utf-8
"""
DashScope (Aliyun Bailian) voice service.

ASR : qwen3-asr-flash via dashscope.MultiModalConversation
TTS : not yet implemented (see CosyVoice / qwen3-tts)

Why MultiModalConversation instead of the OpenAI-compatible endpoint:
  - SDK is already a project dep (used by chat/vision)
  - Native API accepts local file:// paths up to 100 QPS without an OSS
    round-trip, which is what we need for the "send a short voice
    message" flow. Public URLs / Base64 also work.
"""
import os
from typing import Optional

import dashscope
from dashscope import MultiModalConversation

from bridge.reply import Reply, ReplyType
from common.log import logger
from config import conf
from voice import audio_convert
from voice.voice import Voice


# Recognition model used when the config does not name one.
DEFAULT_ASR_MODEL = "qwen3-asr-flash"

# Hard cap of qwen3-asr-flash for a single file in one synchronous call.
# Anything longer needs qwen3-asr-flash-filetrans, which is async-only and
# therefore out of scope for this module.
MAX_DURATION_SECONDS = 5 * 60

# File size (10 MiB) above which a warning is logged that the model may
# reject the upload.
MAX_FILE_BYTES = 10 * 1024 ** 2


class DashScopeVoice(Voice):
    """Voice backend that talks to DashScope (Aliyun Bailian).

    Speech recognition is done with ``MultiModalConversation.call``; speech
    synthesis is not implemented yet and always yields an error reply.
    """

    # File extensions that are transcoded to mp3 before being submitted.
    _CONVERT_EXTENSIONS = (".amr", ".silk", ".slk")
    # Generic user-facing reply used for every recognition failure.
    _ASR_FAILED_MESSAGE = "我暂时还无法听清您的语音，请稍后再试吧~"

    def __init__(self):
        # api_key is applied per-call (chat bot does the same) so a live
        # config change via the web console takes effect without restart.
        pass

    def voiceToText(self, voice_file: str):
        """Transcribe ``voice_file`` and wrap the outcome in a ``Reply``.

        Args:
            voice_file: Path of the audio file to recognize. AMR/SILK files
                are converted to mp3 first (see ``_ensure_compatible_format``).

        Returns:
            ``Reply(ReplyType.TEXT, text)`` on success, otherwise a
            ``Reply(ReplyType.ERROR, ...)``: when no API key is configured,
            when the response carries no text, or when any exception is
            raised along the way. This method never raises.
        """
        try:
            # Validate the credentials first: without a key the request can
            # never be sent, so transcoding the audio would be wasted work
            # and would only leave a stray mp3 behind.
            api_key = conf().get("dashscope_api_key", "")
            if not api_key:
                logger.error("[DashScopeVoice] dashscope_api_key is not configured")
                return Reply(ReplyType.ERROR, "未配置 DashScope API key")
            dashscope.api_key = api_key

            voice_file = self._ensure_compatible_format(voice_file)
            self._warn_if_oversized(voice_file)

            model = conf().get("voice_to_text_model") or DEFAULT_ASR_MODEL
            # The audio is handed to the SDK as a local file:// URI.
            file_uri = f"file://{os.path.abspath(voice_file)}"
            messages = [
                {"role": "user", "content": [{"audio": file_uri}]},
            ]
            response = MultiModalConversation.call(
                model=model,
                messages=messages,
                result_format="message",
                asr_options={"enable_itn": False, "enable_lid": True},
            )

            text = self._extract_text(response)
            if text is None:
                logger.error(f"[DashScopeVoice] voiceToText failed: {response}")
                return Reply(ReplyType.ERROR, self._ASR_FAILED_MESSAGE)

            logger.info(f"[DashScopeVoice] voiceToText model={model} text={text}")
            return Reply(ReplyType.TEXT, text)
        except Exception as e:
            # Top-level boundary: callers expect a Reply, never an exception.
            logger.exception(f"[DashScopeVoice] voiceToText exception: {e}")
            return Reply(ReplyType.ERROR, self._ASR_FAILED_MESSAGE)

    def textToVoice(self, text: str):
        """Return an error reply; speech synthesis is not wired up yet.

        Args:
            text: Text that would be synthesized (currently ignored).
        """
        # TTS will be added in a follow-up commit (qwen3-tts / cosyvoice).
        return Reply(ReplyType.ERROR, "DashScope 语音合成尚未接入")

    @staticmethod
    def _warn_if_oversized(voice_file: str) -> None:
        """Log a warning when ``voice_file`` is larger than ``MAX_FILE_BYTES``.

        This is advisory only: the file is still submitted afterwards. A file
        that cannot be stat'ed is reported in the log instead of being
        silently ignored, because the upload that follows is likely to fail
        for the same reason.
        """
        try:
            size = os.path.getsize(voice_file)
        except OSError as e:
            logger.warning(
                f"[DashScopeVoice] cannot read size of {voice_file}: {e}"
            )
            return
        if size > MAX_FILE_BYTES:
            logger.warning(
                f"[DashScopeVoice] audio file {size}B exceeds {MAX_FILE_BYTES}B; "
                f"qwen3-asr-flash may reject it"
            )

    @staticmethod
    def _ensure_compatible_format(voice_file: str) -> str:
        """Convert AMR/SILK to mp3 since qwen3-asr-flash doesn't accept them.

        Other formats (mp3/wav/m4a/aac/opus/webm) are passed through.

        Args:
            voice_file: Path of the incoming audio file.

        Returns:
            Path of the mp3 written next to the input when a conversion was
            performed, otherwise ``voice_file`` unchanged. A failed
            conversion is logged and the original path is returned so the
            request is still attempted.
        """
        # The extension check is case-insensitive.
        if not voice_file.lower().endswith(DashScopeVoice._CONVERT_EXTENSIONS):
            return voice_file
        try:
            mp3_file = os.path.splitext(voice_file)[0] + ".mp3"
            audio_convert.any_to_mp3(voice_file, mp3_file)
            return mp3_file
        except Exception as e:
            # Best effort: fall back to the untouched file.
            logger.warning(
                f"[DashScopeVoice] convert {voice_file} to mp3 failed: {e}; "
                f"submitting original file"
            )
            return voice_file

    @staticmethod
    def _extract_text(response) -> Optional[str]:
        """Pull the recognized text out of MultiModalConversation response.

        Successful shape (result_format="message"):
          response.output.choices[0].message.content -> list of {"text": "..."}
          or in some SDK versions a plain string.

        Args:
            response: Object returned by ``MultiModalConversation.call``.

        Returns:
            The stripped transcript, or ``None`` when the status code is not
            200, the response has an unexpected shape, or the transcript is
            empty.
        """
        try:
            if getattr(response, "status_code", 200) != 200:
                return None
            choices = response.output.get("choices") or []
            if not choices:
                return None
            message = choices[0].get("message") or {}
            content = message.get("content")
            if isinstance(content, str):
                return content.strip() or None
            if isinstance(content, list):
                parts = []
                for item in content:
                    piece = item.get("text") if isinstance(item, dict) else item
                    # Skip non-string entries (e.g. a null "text") so a
                    # single odd item cannot discard the whole transcript.
                    if isinstance(piece, str):
                        parts.append(piece)
                return "".join(parts).strip() or None
            return None
        except Exception:
            # Any unexpected response shape is treated as "no text"; the
            # caller logs the raw response in that case.
            return None