feat(voice): add dashscope & zhipu ASR, in-page mic input

This commit is contained in:
zhayujie
2026-05-20 22:36:37 +08:00
parent fff7326209
commit 2b90f377e6
9 changed files with 786 additions and 34 deletions

View File

View File

@@ -0,0 +1,135 @@
# encoding:utf-8
"""
DashScope (Aliyun Bailian) voice service.
ASR : qwen3-asr-flash via dashscope.MultiModalConversation
TTS : not yet implemented (see CosyVoice / qwen3-tts)
Why MultiModalConversation instead of the OpenAI-compatible endpoint:
- SDK is already a project dep (used by chat/vision)
- Native API accepts local file:// paths up to 100 QPS without an OSS
round-trip, which is what we need for the "send a short voice
message" flow. Public URLs / Base64 also work.
"""
import os
from typing import Optional
import dashscope
from dashscope import MultiModalConversation
from bridge.reply import Reply, ReplyType
from common.log import logger
from config import conf
from voice import audio_convert
from voice.voice import Voice
# Model name used when the `voice_to_text_model` config value is unset or empty.
DEFAULT_ASR_MODEL = "qwen3-asr-flash"
# qwen3-asr-flash hard cap (single file, sync call). Longer audio needs
# qwen3-asr-flash-filetrans which is async-only and out of scope here.
# NOTE(review): this constant is not referenced in the visible part of this
# module; audio duration is not checked before submission.
MAX_DURATION_SECONDS = 300
# Size threshold in bytes (10 MiB). voiceToText only logs a warning when the
# file is larger; the file is still submitted.
# NOTE(review): presumably mirrors the service's per-file limit — confirm.
MAX_FILE_BYTES = 10 * 1024 * 1024
class DashScopeVoice(Voice):
    """Voice backend that transcribes audio through DashScope.

    Speech-to-text is done with ``MultiModalConversation.call``; text-to-speech
    is not implemented and always yields an error reply.
    """

    def __init__(self):
        # api_key is applied per-call (chat bot does the same) so a live
        # config change via the web console takes effect without restart.
        pass

    def voiceToText(self, voice_file: str):
        """Transcribe a local audio file.

        Args:
            voice_file: Path to the audio file. AMR/SILK files are transcoded
                to mp3 first (see ``_ensure_compatible_format``).

        Returns:
            ``Reply(ReplyType.TEXT, text)`` on success, otherwise a
            ``Reply(ReplyType.ERROR, message)``. Exceptions are logged and
            converted into an error reply; nothing is raised to the caller.
        """
        try:
            # Validate configuration before touching the audio, so a missing
            # key does not trigger a pointless transcode and stray mp3 file.
            api_key = conf().get("dashscope_api_key", "")
            if not api_key:
                logger.error("[DashScopeVoice] dashscope_api_key is not configured")
                return Reply(ReplyType.ERROR, "未配置 DashScope API key")

            voice_file = self._ensure_compatible_format(voice_file)

            # Advisory size check only: an oversized or unreadable file is
            # still submitted and the service decides whether to accept it.
            try:
                size = os.path.getsize(voice_file)
                if size > MAX_FILE_BYTES:
                    logger.warning(
                        f"[DashScopeVoice] audio file {size}B exceeds {MAX_FILE_BYTES}B; "
                        f"qwen3-asr-flash may reject it"
                    )
            except OSError:
                pass

            # The key is set on the SDK module right before the call.
            dashscope.api_key = api_key
            model = conf().get("voice_to_text_model") or DEFAULT_ASR_MODEL

            abs_path = os.path.abspath(voice_file)
            file_uri = f"file://{abs_path}"
            messages = [
                {"role": "user", "content": [{"audio": file_uri}]},
            ]
            response = MultiModalConversation.call(
                model=model,
                messages=messages,
                result_format="message",
                asr_options={"enable_itn": False, "enable_lid": True},
            )

            text = self._extract_text(response)
            if text is None:
                logger.error(f"[DashScopeVoice] voiceToText failed: {response}")
                return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")

            logger.info(f"[DashScopeVoice] voiceToText model={model} text={text}")
            return Reply(ReplyType.TEXT, text)
        except Exception as e:
            # Top-level boundary: never let a recognition failure propagate.
            logger.exception(f"[DashScopeVoice] voiceToText exception: {e}")
            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")

    def textToVoice(self, text: str):
        """Return an error reply; speech synthesis is not implemented here."""
        # TTS will be added in a follow-up commit (qwen3-tts / cosyvoice).
        return Reply(ReplyType.ERROR, "DashScope 语音合成尚未接入")

    @staticmethod
    def _ensure_compatible_format(voice_file: str) -> str:
        """Convert AMR/SILK to mp3 since qwen3-asr-flash doesn't accept them.

        Other formats (mp3/wav/m4a/aac/opus/webm) are passed through.

        Args:
            voice_file: Path to the input audio file.

        Returns:
            Path of the mp3 written next to the input when a conversion
            happened, otherwise ``voice_file`` unchanged. If the conversion
            raises, a warning is logged and the original path is returned.
        """
        # The extension match is case-insensitive.
        if not voice_file.lower().endswith((".amr", ".silk", ".slk")):
            return voice_file
        try:
            mp3_file = os.path.splitext(voice_file)[0] + ".mp3"
            audio_convert.any_to_mp3(voice_file, mp3_file)
            return mp3_file
        except Exception as e:
            # Best effort: fall back to the original file.
            logger.warning(
                f"[DashScopeVoice] convert {voice_file} to mp3 failed: {e}; "
                f"submitting original file"
            )
            return voice_file

    @staticmethod
    def _extract_text(response) -> Optional[str]:
        """Pull the recognized text out of MultiModalConversation response.

        Successful shape (result_format="message"):
            response.output.choices[0].message.content -> list of {"text": "..."}
        or in some SDK versions a plain string.

        Args:
            response: Object returned by ``MultiModalConversation.call``.

        Returns:
            The stripped transcript, or ``None`` when the status code is not
            200, the response has an unexpected shape, or the text is empty.
        """
        try:
            if getattr(response, "status_code", 200) != 200:
                return None
            choices = response.output.get("choices") or []
            if not choices:
                return None
            content = (choices[0].get("message") or {}).get("content")
        except (AttributeError, IndexError, KeyError, TypeError):
            # Unexpected response shape; the caller logs the raw response.
            return None

        if isinstance(content, str):
            return content.strip() or None
        if isinstance(content, list):
            parts = []
            for item in content:
                if isinstance(item, dict):
                    item = item.get("text")
                # Skip missing or non-string fragments so that one malformed
                # item does not discard the rest of the transcript.
                if isinstance(item, str):
                    parts.append(item)
            return "".join(parts).strip() or None
        return None

View File

@@ -58,4 +58,12 @@ def create_voice(voice_type):
from voice.minimax.minimax_voice import MinimaxVoice
return MinimaxVoice()
elif voice_type == "dashscope":
from voice.dashscope.dashscope_voice import DashScopeVoice
return DashScopeVoice()
elif voice_type == "zhipu" or voice_type == "zhipuai":
from voice.zhipuai.zhipuai_voice import ZhipuAIVoice
return ZhipuAIVoice()
raise RuntimeError

View File

View File

@@ -0,0 +1,102 @@
# encoding:utf-8
"""
ZhipuAI (BigModel) voice service.
ASR : glm-asr-2512 via the OpenAI-compatible /audio/transcriptions endpoint.
TTS : not yet implemented.
Endpoint accepts multipart/form-data with `model`, `file`, and `stream`.
File size <= 25MB, duration <= 30s per request.
"""
import os
import requests
from bridge.reply import Reply, ReplyType
from common.log import logger
from config import conf
from voice import audio_convert
from voice.voice import Voice
# Model name used when the `voice_to_text_model` config value is unset or empty.
DEFAULT_ASR_MODEL = "glm-asr-2512"
# Base URL used when `zhipu_ai_api_base` is unset or empty; voiceToText appends
# "/audio/transcriptions" to it.
DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"
# Size threshold in bytes (25 MiB). voiceToText only logs a warning when the
# file is larger; the file is still uploaded.
MAX_FILE_BYTES = 25 * 1024 * 1024
# Passed as `timeout=` to requests.post, which reads a 2-tuple as
# (connect timeout, read timeout) in seconds.
REQUEST_TIMEOUT = (5, 60)
class ZhipuAIVoice(Voice):
    """Voice backend that transcribes audio through the ZhipuAI HTTP API.

    Speech-to-text posts the file as multipart/form-data to
    ``{api_base}/audio/transcriptions``; text-to-speech is not implemented and
    always yields an error reply.
    """

    def __init__(self):
        # api_key / base read per-call so live config edits take effect.
        pass

    def voiceToText(self, voice_file: str):
        """Transcribe a local audio file.

        Args:
            voice_file: Path to the audio file. Anything that is not .mp3/.wav
                is transcoded to mp3 first (see ``_ensure_compatible_format``).

        Returns:
            ``Reply(ReplyType.TEXT, text)`` on success, otherwise a
            ``Reply(ReplyType.ERROR, message)``. Exceptions are logged and
            converted into an error reply; nothing is raised to the caller.
        """
        try:
            # Validate configuration before touching the audio, so a missing
            # key does not trigger a pointless transcode and stray mp3 file.
            api_key = conf().get("zhipu_ai_api_key", "")
            if not api_key:
                logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
                return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")

            voice_file = self._ensure_compatible_format(voice_file)

            # Advisory size check only: an oversized or unreadable file is
            # still uploaded and the service decides whether to accept it.
            try:
                size = os.path.getsize(voice_file)
                if size > MAX_FILE_BYTES:
                    logger.warning(
                        f"[ZhipuAIVoice] audio file {size}B exceeds {MAX_FILE_BYTES}B; "
                        f"glm-asr-2512 may reject it"
                    )
            except OSError:
                pass

            api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
            url = f"{api_base}/audio/transcriptions"
            model = conf().get("voice_to_text_model") or DEFAULT_ASR_MODEL

            # Keep the file handle open for the duration of the upload only.
            with open(voice_file, "rb") as f:
                files = {"file": (os.path.basename(voice_file), f)}
                data = {"model": model, "stream": "false"}
                headers = {"Authorization": f"Bearer {api_key}"}
                response = requests.post(
                    url, headers=headers, files=files, data=data, timeout=REQUEST_TIMEOUT
                )

            if response.status_code != 200:
                logger.error(
                    f"[ZhipuAIVoice] voiceToText failed: status={response.status_code} "
                    f"body={response.text[:500]}"
                )
                return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")

            payload = response.json()
            # Tolerate a non-object body or a non-string "text" field instead
            # of relying on an AttributeError reaching the generic handler.
            raw_text = payload.get("text") if isinstance(payload, dict) else None
            text = raw_text.strip() if isinstance(raw_text, str) else ""
            if not text:
                logger.error(f"[ZhipuAIVoice] voiceToText empty text: {payload}")
                return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")

            logger.info(f"[ZhipuAIVoice] voiceToText model={model} text={text}")
            return Reply(ReplyType.TEXT, text)
        except Exception as e:
            # Top-level boundary: never let a recognition failure propagate.
            logger.exception(f"[ZhipuAIVoice] voiceToText exception: {e}")
            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")

    def textToVoice(self, text: str):
        """Return an error reply; speech synthesis is not implemented here."""
        return Reply(ReplyType.ERROR, "ZhipuAI 语音合成尚未接入")

    @staticmethod
    def _ensure_compatible_format(voice_file: str) -> str:
        """Return a path to an .mp3/.wav version of ``voice_file``.

        Args:
            voice_file: Path to the input audio file.

        Returns:
            ``voice_file`` unchanged when it already ends in .mp3 or .wav
            (case-insensitive); otherwise the path of an mp3 written next to
            it. If the conversion raises, a warning is logged and the
            original path is returned.
        """
        # glm-asr-2512 only accepts .wav / .mp3 — convert everything else
        # (webm from the browser mic, m4a/amr/silk from chat channels, etc).
        if voice_file.lower().endswith((".mp3", ".wav")):
            return voice_file
        try:
            mp3_file = os.path.splitext(voice_file)[0] + ".mp3"
            audio_convert.any_to_mp3(voice_file, mp3_file)
            return mp3_file
        except Exception as e:
            # Best effort: fall back to the original file.
            logger.warning(
                f"[ZhipuAIVoice] convert {voice_file} to mp3 failed: {e}; "
                f"submitting original file"
            )
            return voice_file