mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-06-02 00:57:41 +08:00
136 lines
5.2 KiB
Python
136 lines
5.2 KiB
Python
# encoding:utf-8
|
|
"""
|
|
DashScope (Aliyun Bailian) voice service.

ASR : qwen3-asr-flash via dashscope.MultiModalConversation
TTS : not yet implemented (see CosyVoice / qwen3-tts)

Why MultiModalConversation instead of the OpenAI-compatible endpoint:
- The SDK is already a project dependency (used by chat/vision).
- The native API accepts local file:// paths at up to 100 QPS without an
  OSS round-trip, which is what we need for the "send a short voice
  message" flow. Public URLs and Base64 also work.
|
|
"""
|
|
import os
|
|
from typing import Optional
|
|
|
|
import dashscope
|
|
from dashscope import MultiModalConversation
|
|
|
|
from bridge.reply import Reply, ReplyType
|
|
from common.log import logger
|
|
from config import conf
|
|
from voice import audio_convert
|
|
from voice.voice import Voice
|
|
|
|
|
|
# Model used when the "voice_to_text_model" config entry is unset or empty.
DEFAULT_ASR_MODEL = "qwen3-asr-flash"

# Hard cap of qwen3-asr-flash for a single file in a synchronous call.
# Longer audio needs qwen3-asr-flash-filetrans, which is async-only and
# out of scope here.
MAX_DURATION_SECONDS = 5 * 60

# Size above which voiceToText logs a warning before submitting the file
# (10 MiB).
MAX_FILE_BYTES = 10 << 20
|
|
|
|
|
|
class DashScopeVoice(Voice):
    """Voice service backed by DashScope (Aliyun Bailian).

    Speech-to-text submits a local audio file to a qwen3-asr model through
    ``dashscope.MultiModalConversation``. Text-to-speech is not implemented
    yet and always answers with an error reply.
    """

    def __init__(self):
        # api_key is applied per-call (chat bot does the same) so a live
        # config change via the web console takes effect without restart.
        pass

    def voiceToText(self, voice_file: str):
        """Transcribe a local audio file and return the result as a Reply.

        Args:
            voice_file: Path of the audio file. AMR/SILK files are converted
                to mp3 first; other formats are submitted unchanged.

        Returns:
            ``Reply(ReplyType.TEXT, text)`` when the response contains text.
            ``Reply(ReplyType.ERROR, message)`` when the API key is not
            configured, when no text can be extracted from the response, or
            when any exception is raised while processing (the exception is
            logged and not propagated).
        """
        try:
            # Check the key first so an unconfigured install does not pay for
            # an audio conversion that could never be submitted.
            api_key = conf().get("dashscope_api_key", "")
            if not api_key:
                logger.error("[DashScopeVoice] dashscope_api_key is not configured")
                return Reply(ReplyType.ERROR, "未配置 DashScope API key")
            dashscope.api_key = api_key

            voice_file = self._ensure_compatible_format(voice_file)
            self._warn_if_oversized(voice_file)

            model = conf().get("voice_to_text_model") or DEFAULT_ASR_MODEL
            # The audio is passed as a file:// URI built from the absolute path.
            file_uri = f"file://{os.path.abspath(voice_file)}"
            messages = [
                {"role": "user", "content": [{"audio": file_uri}]},
            ]
            response = MultiModalConversation.call(
                model=model,
                messages=messages,
                result_format="message",
                asr_options={"enable_itn": False, "enable_lid": True},
            )

            text = self._extract_text(response)
            if text is None:
                logger.error(f"[DashScopeVoice] voiceToText failed: {response}")
                return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")

            logger.info(f"[DashScopeVoice] voiceToText model={model} text={text}")
            return Reply(ReplyType.TEXT, text)
        except Exception as e:
            # Boundary handler: any failure becomes a user-facing error reply.
            logger.exception(f"[DashScopeVoice] voiceToText exception: {e}")
            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")

    def textToVoice(self, text: str):
        """Return an error Reply; speech synthesis is not wired up yet."""
        # TTS will be added in a follow-up commit (qwen3-tts / cosyvoice).
        return Reply(ReplyType.ERROR, "DashScope 语音合成尚未接入")

    @staticmethod
    def _warn_if_oversized(voice_file: str) -> None:
        """Log a warning when ``voice_file`` is larger than MAX_FILE_BYTES.

        The check is advisory only: the file is still submitted, and a file
        whose size cannot be read is skipped without raising.
        """
        try:
            size = os.path.getsize(voice_file)
        except OSError as e:
            logger.debug(f"[DashScopeVoice] cannot read size of {voice_file}: {e}")
            return
        if size > MAX_FILE_BYTES:
            logger.warning(
                f"[DashScopeVoice] audio file {size}B exceeds {MAX_FILE_BYTES}B; "
                "qwen3-asr-flash may reject it"
            )

    @staticmethod
    def _ensure_compatible_format(voice_file: str) -> str:
        """Convert AMR/SILK to mp3 since qwen3-asr-flash doesn't accept them.

        Other formats (mp3/wav/m4a/aac/opus/webm) are passed through.

        Args:
            voice_file: Path of the input audio file.

        Returns:
            Path of the mp3 written next to the input (same base name) when a
            conversion ran successfully; otherwise ``voice_file`` unchanged,
            including when the conversion raised (a warning is logged).
        """
        # Extension match is case-insensitive.
        if not voice_file.lower().endswith((".amr", ".silk", ".slk")):
            return voice_file

        mp3_file = os.path.splitext(voice_file)[0] + ".mp3"
        try:
            audio_convert.any_to_mp3(voice_file, mp3_file)
        except Exception as e:
            # Best effort: fall back to the original file on any failure.
            logger.warning(
                f"[DashScopeVoice] convert {voice_file} to mp3 failed: {e}; "
                "submitting original file"
            )
            return voice_file
        return mp3_file

    @staticmethod
    def _extract_text(response) -> Optional[str]:
        """Pull the recognized text out of a MultiModalConversation response.

        Successful shape (result_format="message"):
            response.output.choices[0].message.content -> list of {"text": "..."}
        or in some SDK versions a plain string.

        Args:
            response: Object returned by ``MultiModalConversation.call``.

        Returns:
            The stripped text, or ``None`` when the status code is not 200,
            the expected fields are missing, or the text is empty.
        """
        try:
            # A response without a status_code attribute is treated as OK.
            if getattr(response, "status_code", 200) != 200:
                return None
            choices = response.output.get("choices") or []
            if not choices:
                return None
            message = choices[0].get("message") or {}
            content = message.get("content")
        except Exception:
            # Unexpected response shape: report "no text" instead of raising.
            return None

        if isinstance(content, str):
            return content.strip() or None
        if isinstance(content, list):
            candidates = (
                part.get("text") if isinstance(part, dict) else part
                for part in content
            )
            # Join only real strings so one malformed part (e.g. text=None)
            # cannot discard the text carried by the other parts.
            text = "".join(c for c in candidates if isinstance(c, str)).strip()
            return text or None
        return None
|