feat(voice): rework TTS/ASR stack and unify tool/skill config schema

This commit is contained in:
zhayujie
2026-05-21 16:00:54 +08:00
parent 2b90f377e6
commit b8333e351c
31 changed files with 1551 additions and 335 deletions

View File

@@ -1,20 +1,13 @@
# encoding:utf-8
"""
DashScope (Aliyun Bailian) voice service.
ASR : qwen3-asr-flash via dashscope.MultiModalConversation
TTS : not yet implemented (see CosyVoice / qwen3-tts)
Why MultiModalConversation instead of the OpenAI-compatible endpoint:
- SDK is already a project dep (used by chat/vision)
- Native API accepts local file:// paths up to 100 QPS without an OSS
round-trip, which is what we need for the "send a short voice
message" flow. Public URLs / Base64 also work.
"""
"""DashScope voice: qwen3-asr-flash (ASR) + qwen3-tts-flash (TTS)
via dashscope.MultiModalConversation."""
import datetime
import os
import random
from typing import Optional
import dashscope
import requests
from dashscope import MultiModalConversation
from bridge.reply import Reply, ReplyType
@@ -25,16 +18,14 @@ from voice.voice import Voice
DEFAULT_ASR_MODEL = "qwen3-asr-flash"
# qwen3-asr-flash hard cap (single file, sync call). Longer audio needs
# qwen3-asr-flash-filetrans which is async-only and out of scope here.
DEFAULT_TTS_MODEL = "qwen3-tts-flash"
DEFAULT_TTS_VOICE = "Cherry"
MAX_DURATION_SECONDS = 300
MAX_FILE_BYTES = 10 * 1024 * 1024
class DashScopeVoice(Voice):
def __init__(self):
    """Create the voice service; no state is captured at construction."""
    # api_key is applied per-call (chat bot does the same) so a live
    # config change via the web console takes effect without restart.
    pass
def voiceToText(self, voice_file: str):
@@ -83,14 +74,72 @@ class DashScopeVoice(Voice):
return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
def textToVoice(self, text: str):
    """Synthesize ``text`` to speech via DashScope and return a voice reply.

    The API key, model and voice are read from config on every call. The
    service answers with an audio URL, which is downloaded to a local file;
    that file path becomes the reply content.

    :param text: text to synthesize.
    :return: ``Reply(ReplyType.VOICE, <local file path>)`` on success,
        otherwise ``Reply(ReplyType.ERROR, <message>)``. Exceptions are
        logged and converted to an error reply, never raised.
    """
    try:
        api_key = conf().get("dashscope_api_key", "")
        if not api_key:
            logger.error("[DashScopeVoice] dashscope_api_key is not configured")
            return Reply(ReplyType.ERROR, "未配置 DashScope API key")
        # Set the SDK-wide key as well as passing it explicitly below.
        dashscope.api_key = api_key
        model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
        voice = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE
        response = MultiModalConversation.call(
            model=model,
            api_key=api_key,
            text=text,
            voice=voice,
            stream=False,
        )
        url = self._extract_audio_url(response)
        if not url:
            logger.error(f"[DashScopeVoice] textToVoice failed: {response}")
            return Reply(ReplyType.ERROR, "语音合成失败")
        # The helper logs its own failure reason and returns None.
        local_path = self._download_audio(url)
        if not local_path:
            return Reply(ReplyType.ERROR, "语音合成失败")
        logger.info(f"[DashScopeVoice] textToVoice model={model} voice={voice} file={local_path}")
        return Reply(ReplyType.VOICE, local_path)
    except Exception as e:
        logger.exception(f"[DashScopeVoice] textToVoice exception: {e}")
        return Reply(ReplyType.ERROR, "语音合成失败")
@staticmethod
def _extract_audio_url(response) -> Optional[str]:
    """Return the audio URL carried by a TTS response, or ``None``.

    ``None`` is returned when the response reports a non-200
    ``status_code``, has no ``output``, has no usable ``audio`` entry, or
    when reading any of those fields raises.
    """
    try:
        status = getattr(response, "status_code", 200)
        if status != 200:
            return None
        audio = response.output.get("audio") if response.output else None
        if not isinstance(audio, dict):
            # Attribute-style payload (or None): read ``url`` if present.
            return getattr(audio, "url", None)
        # Mapping payload: an empty URL is normalized to None.
        return audio.get("url") or None
    except Exception:
        return None
@staticmethod
def _download_audio(url: str) -> Optional[str]:
    """Download the audio at ``url`` into ``<cwd>/tmp`` and return its path.

    The file extension is taken from the URL path (query string ignored)
    and falls back to ``.wav`` when it is missing or not one of
    mp3/wav/m4a/aac/opus. Any failure is logged and reported as ``None``.
    """
    try:
        out_dir = os.path.join(os.getcwd(), "tmp")
        os.makedirs(out_dir, exist_ok=True)
        stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        # Drop the query string before looking at the extension.
        url_path = url.split("?", 1)[0]
        suffix = os.path.splitext(url_path)[1].lower() or ".wav"
        if suffix not in (".mp3", ".wav", ".m4a", ".aac", ".opus"):
            suffix = ".wav"
        # Timestamp plus a random number keeps names distinct within a second.
        file_name = f"dashscope_tts_{stamp}_{random.randint(0, 9999)}{suffix}"
        target = os.path.join(out_dir, file_name)
        download = requests.get(url, timeout=60)
        download.raise_for_status()
        with open(target, "wb") as out:
            out.write(download.content)
        return target
    except Exception as e:
        logger.error(f"[DashScopeVoice] download audio failed: {e}")
        return None
@staticmethod
def _ensure_compatible_format(voice_file: str) -> str:
"""Convert AMR/SILK to mp3 since qwen3-asr-flash doesn't accept them.
Other formats (mp3/wav/m4a/aac/opus/webm) are passed through.
"""
# qwen3-asr-flash doesn't accept AMR/SILK; mp3/wav/m4a/aac/opus pass through.
lower = voice_file.lower()
if lower.endswith(".amr") or lower.endswith(".silk") or lower.endswith(".slk"):
try:
@@ -98,20 +147,11 @@ class DashScopeVoice(Voice):
audio_convert.any_to_mp3(voice_file, mp3_file)
return mp3_file
except Exception as e:
logger.warning(
f"[DashScopeVoice] convert {voice_file} to mp3 failed: {e}; "
f"submitting original file"
)
logger.warning(f"[DashScopeVoice] mp3 convert failed: {e}")
return voice_file
@staticmethod
def _extract_text(response) -> Optional[str]:
"""Pull the recognized text out of MultiModalConversation response.
Successful shape (result_format="message"):
response.output.choices[0].message.content -> list of {"text": "..."}
or in some SDK versions a plain string.
"""
try:
if getattr(response, "status_code", 200) != 200:
return None

View File

@@ -1,16 +1,18 @@
"""
google voice service
"""
"""LinkAI voice: Whisper ASR + multi-vendor TTS (OpenAI / Doubao / Baidu)
proxied via https://docs.link-ai.tech/platform/api/voice-speech."""
import datetime
import os
import random
import requests
from voice import audio_convert
from bridge.reply import Reply, ReplyType
from common import const
from common.log import logger
from config import conf
from voice import audio_convert
from voice.voice import Voice
from common import const
import os
import datetime
class LinkAIVoice(Voice):
def __init__(self):
@@ -21,63 +23,67 @@ class LinkAIVoice(Voice):
try:
url = conf().get("linkai_api_base", "https://api.link-ai.tech") + "/v1/audio/transcriptions"
headers = {"Authorization": "Bearer " + conf().get("linkai_api_key")}
model = None
if not conf().get("text_to_voice") or conf().get("voice_to_text") == "openai":
model = const.WHISPER_1
# Pin whisper-1: gateway ignores any other ASR model id.
model = const.WHISPER_1
if voice_file.endswith(".amr"):
try:
mp3_file = os.path.splitext(voice_file)[0] + ".mp3"
audio_convert.any_to_mp3(voice_file, mp3_file)
voice_file = mp3_file
except Exception as e:
logger.warn(f"[LinkVoice] amr file transfer failed, directly send amr voice file: {format(e)}")
file = open(voice_file, "rb")
file_body = {
"file": file
}
data = {
"model": model
}
res = requests.post(url, files=file_body, headers=headers, data=data, timeout=(5, 60))
if res.status_code == 200:
text = res.json().get("text")
else:
res_json = res.json()
logger.error(f"[LinkVoice] voiceToText error, status_code={res.status_code}, msg={res_json.get('message')}")
logger.warning(f"[LinkVoice] amr file transfer failed, directly send amr voice file: {e}")
with open(voice_file, "rb") as file:
res = requests.post(
url,
files={"file": file},
headers=headers,
data={"model": model},
timeout=(5, 60),
)
if res.status_code != 200:
msg = ""
try:
msg = res.json().get("message", "")
except Exception:
pass
logger.error(f"[LinkVoice] voiceToText error, status_code={res.status_code}, msg={msg}")
return None
reply = Reply(ReplyType.TEXT, text)
text = res.json().get("text")
logger.info(f"[LinkVoice] voiceToText success, text={text}, file name={voice_file}")
return Reply(ReplyType.TEXT, text)
except Exception as e:
logger.error(e)
return None
return reply
def textToVoice(self, text):
    """Synthesize ``text`` through the LinkAI ``/v1/audio/speech`` endpoint.

    The returned audio bytes are written to an mp3 file under ``tmp/``.

    :param text: text to synthesize.
    :return: ``Reply(ReplyType.VOICE, <file path>)`` on success; ``None``
        on a non-200 response or any exception (both are logged).
    """
    try:
        url = conf().get("linkai_api_base", "https://api.link-ai.tech") + "/v1/audio/speech"
        headers = {"Authorization": "Bearer " + conf().get("linkai_api_key")}
        # Gateway routes by `model` (tts-1 / doubao / baidu) + `voice` from
        # that engine's catalog. `app_code` is optional workspace override.
        data = {
            "input": text,
            "voice": conf().get("tts_voice_id"),
            "app_code": conf().get("linkai_app_code"),
        }
        # Send `model` only when configured; otherwise the choice is left
        # to the gateway (presumably its default engine — confirm in docs).
        model = conf().get("text_to_voice_model")
        if model:
            data["model"] = model
        res = requests.post(url, headers=headers, json=data, timeout=(5, 120))
        if res.status_code != 200:
            # The error body is not guaranteed to be JSON; fall back to the
            # raw text so the log still carries a diagnostic.
            try:
                msg = res.json().get("message", "")
            except Exception:
                msg = res.text[:500]
            logger.error(f"[LinkVoice] textToVoice error, status_code={res.status_code}, msg={msg}")
            return None
        tmp_file_name = "tmp/" + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + str(random.randint(0, 1000)) + ".mp3"
        # The tmp/ directory may not exist on a fresh checkout.
        os.makedirs(os.path.dirname(tmp_file_name), exist_ok=True)
        with open(tmp_file_name, 'wb') as f:
            f.write(res.content)
        logger.info(f"[LinkVoice] textToVoice success, input={text}, voice_id={data.get('voice')}")
        return Reply(ReplyType.VOICE, tmp_file_name)
    except Exception as e:
        logger.error(e)
        return None

View File

@@ -1,8 +1,7 @@
# encoding:utf-8
"""
MiniMax TTS voice service
"""
"""MiniMax TTS via /v1/t2a_v2 (SSE stream, hex-encoded mp3 chunks)."""
import datetime
import json
import random
import requests
@@ -12,24 +11,12 @@ from config import conf
from voice.voice import Voice
MINIMAX_TTS_VOICES = [
"English_Graceful_Lady",
"English_Insightful_Speaker",
"English_radiant_girl",
"English_Persuasive_Man",
"English_Lucky_Robot",
"English_expressive_narrator",
"Chinese_Warm_Woman",
"Chinese_Gentle_Man",
]
class MinimaxVoice(Voice):
def __init__(self):
self.api_key = conf().get("minimax_api_key")
self.api_base = conf().get("minimax_api_base") or "https://api.minimax.io"
# Strip trailing /v1 if present so we can always append /v1/t2a_v2
self.api_base = self.api_base.rstrip("/")
# Mainland endpoint matches `sk-api-0-...` keys; override via
# `minimax_api_base` for international (api.minimax.io) workspaces.
self.api_base = (conf().get("minimax_api_base") or "https://api.minimaxi.com").rstrip("/")
if self.api_base.endswith("/v1"):
self.api_base = self.api_base[:-3]
@@ -68,12 +55,14 @@ class MinimaxVoice(Voice):
response = requests.post(url, headers=headers, json=payload, stream=True, timeout=60)
response.raise_for_status()
# Parse SSE stream and collect hex-encoded audio chunks
# MiniMax returns HTTP 200 even on errors; capture base_resp for diagnostics.
audio_chunks = []
buffer = ""
last_base_resp = None
event_count = 0
for raw in response.iter_lines():
if not raw:
continue
event_count += 1
line = raw.decode("utf-8") if isinstance(raw, bytes) else raw
if not line.startswith("data:"):
continue
@@ -81,16 +70,31 @@ class MinimaxVoice(Voice):
if not json_str or json_str == "[DONE]":
continue
try:
import json
event_data = json.loads(json_str)
audio_hex = event_data.get("data", {}).get("audio")
if audio_hex:
audio_chunks.append(bytes.fromhex(audio_hex))
except Exception:
continue
base_resp = event_data.get("base_resp") or {}
if base_resp:
last_base_resp = base_resp
audio_hex = (event_data.get("data") or {}).get("audio")
if audio_hex:
try:
audio_chunks.append(bytes.fromhex(audio_hex))
except Exception as e:
logger.warning(f"[MINIMAX] skip bad audio hex chunk: {e}")
if not audio_chunks:
logger.error("[MINIMAX] TTS returned no audio data")
ct = response.headers.get("Content-Type", "")
if last_base_resp and last_base_resp.get("status_code") not in (None, 0):
logger.error(
f"[MINIMAX] TTS failed: status_code={last_base_resp.get('status_code')}, "
f"status_msg={last_base_resp.get('status_msg')}, model={model}, voice_id={voice_id}"
)
else:
logger.error(
f"[MINIMAX] TTS returned no audio data, model={model}, voice_id={voice_id}, "
f"url={url}, http={response.status_code}, content_type={ct!r}, events={event_count}"
)
return Reply(ReplyType.ERROR, "语音合成失败,未获取到音频数据")
audio_data = b"".join(audio_chunks)

View File

@@ -31,7 +31,8 @@ class OpenaiVoice(Voice):
"file": file,
}
data = {
"model": "whisper-1",
# Override via `voice_to_text_model` (e.g. fall back to whisper-1).
"model": conf().get("voice_to_text_model") or "gpt-4o-mini-transcribe",
}
response = requests.post(url, headers=headers, files=files, data=data)
response_data = response.json()

View File

@@ -1,14 +1,8 @@
# encoding:utf-8
"""
ZhipuAI (BigModel) voice service.
ASR : glm-asr-2512 via the OpenAI-compatible /audio/transcriptions endpoint.
TTS : not yet implemented.
Endpoint accepts multipart/form-data with `model`, `file`, and `stream`.
File size <= 25MB, duration <= 30s per request.
"""
"""ZhipuAI voice: glm-asr-2512 (ASR) + glm-tts (TTS) via BigModel REST API."""
import datetime
import os
import random
import requests
@@ -20,6 +14,8 @@ from voice.voice import Voice
DEFAULT_ASR_MODEL = "glm-asr-2512"
DEFAULT_TTS_MODEL = "glm-tts"
DEFAULT_TTS_VOICE = "tongtong"
DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"
MAX_FILE_BYTES = 25 * 1024 * 1024
REQUEST_TIMEOUT = (5, 60)
@@ -27,7 +23,6 @@ REQUEST_TIMEOUT = (5, 60)
class ZhipuAIVoice(Voice):
def __init__(self):
    """Create the voice service; no state is captured at construction."""
    # api_key / base read per-call so live config edits take effect.
    pass
def voiceToText(self, voice_file: str):
@@ -81,12 +76,91 @@ class ZhipuAIVoice(Voice):
return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
def textToVoice(self, text: str):
    """Synthesize ``text`` to speech via the ZhipuAI ``/audio/speech`` endpoint.

    API key, base URL, model and voice are read from config on every call.
    The audio bytes are written under ``tmp/``, with the extension chosen
    from the payload's magic bytes (``wav`` when unrecognized).

    :param text: text to synthesize.
    :return: ``Reply(ReplyType.VOICE, <file path>)`` on success, otherwise
        ``Reply(ReplyType.ERROR, <message>)``. Exceptions are logged and
        converted to an error reply, never raised.
    """
    try:
        api_key = conf().get("zhipu_ai_api_key", "")
        if not api_key:
            logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
            return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")
        api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
        url = f"{api_base}/audio/speech"
        model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
        voice_id = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE
        payload = {
            "model": model,
            "input": text,
            "voice": voice_id,
            "response_format": "wav",
            "speed": 1.0,
            "volume": 1.0,
        }
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        response = requests.post(
            url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            logger.error(
                f"[ZhipuAIVoice] textToVoice failed: status={response.status_code} "
                f"body={response.text[:500]} model={model} voice={voice_id}"
            )
            return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
        # Some errors come back as JSON / SSE with HTTP 200.
        ct = response.headers.get("Content-Type", "")
        if "application/json" in ct or "text/event-stream" in ct:
            try:
                err = response.json()
            except Exception:
                # Not parseable as JSON: keep a truncated raw body for the log.
                err = {"raw": response.text[:500]}
            logger.error(
                f"[ZhipuAIVoice] textToVoice unexpected text response "
                f"(content_type={ct}): {err}"
            )
            return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
        audio_bytes = response.content
        # Trust the bytes over the requested format when naming the file.
        ext = self._sniff_audio_ext(audio_bytes) or "wav"
        file_name = (
            "tmp/" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            + str(random.randint(0, 1000)) + "." + ext
        )
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, "wb") as f:
            f.write(audio_bytes)
        logger.info(
            f"[ZhipuAIVoice] textToVoice model={model} voice={voice_id} "
            f"file={file_name} bytes={len(audio_bytes)} ext={ext}"
        )
        return Reply(ReplyType.VOICE, file_name)
    except Exception as e:
        logger.exception(f"[ZhipuAIVoice] textToVoice exception: {e}")
        return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
@staticmethod
def _sniff_audio_ext(data: bytes) -> str:
    """Guess the audio container from the payload's leading magic bytes.

    Returns ``"wav"``, ``"mp3"``, ``"ogg"`` or ``"flac"``; returns ``""``
    when the payload is shorter than 12 bytes or matches no signature.
    """
    if len(data) < 12:
        return ""
    header = data[:12]
    # WAV needs two markers: the RIFF chunk id and the WAVE form type.
    if header[:4] == b"RIFF" and header[8:12] == b"WAVE":
        return "wav"
    # Prefix signatures: an ID3 tag or one of three MPEG frame-sync
    # byte pairs for mp3, then the Ogg and FLAC stream markers.
    signatures = (
        (b"ID3", "mp3"),
        (b"\xff\xfb", "mp3"),
        (b"\xff\xf3", "mp3"),
        (b"\xff\xf2", "mp3"),
        (b"OggS", "ogg"),
        (b"fLaC", "flac"),
    )
    for magic, ext in signatures:
        if header[:len(magic)] == magic:
            return ext
    return ""
@staticmethod
def _ensure_compatible_format(voice_file: str) -> str:
# glm-asr-2512 only accepts .wav / .mp3 — convert everything else
# (webm from the browser mic, m4a/amr/silk from chat channels, etc).
# glm-asr-2512 only accepts .wav / .mp3
lower = voice_file.lower()
if lower.endswith(".mp3") or lower.endswith(".wav"):
return voice_file
@@ -95,8 +169,5 @@ class ZhipuAIVoice(Voice):
audio_convert.any_to_mp3(voice_file, mp3_file)
return mp3_file
except Exception as e:
logger.warning(
f"[ZhipuAIVoice] convert {voice_file} to mp3 failed: {e}; "
f"submitting original file"
)
logger.warning(f"[ZhipuAIVoice] mp3 convert failed: {e}")
return voice_file