mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-06-02 00:57:41 +08:00
feat(voice): rework TTS/ASR stack and unify tool/skill config schema
This commit is contained in:
@@ -1,14 +1,8 @@
|
||||
# encoding:utf-8
|
||||
"""
|
||||
ZhipuAI (BigModel) voice service.
|
||||
|
||||
ASR : glm-asr-2512 via the OpenAI-compatible /audio/transcriptions endpoint.
|
||||
TTS : not yet implemented.
|
||||
|
||||
Endpoint accepts multipart/form-data with `model`, `file`, and `stream`.
|
||||
File size <= 25MB, duration <= 30s per request.
|
||||
"""
|
||||
"""ZhipuAI voice: glm-asr-2512 (ASR) + glm-tts (TTS) via BigModel REST API."""
|
||||
import datetime
|
||||
import os
|
||||
import random
|
||||
|
||||
import requests
|
||||
|
||||
@@ -20,6 +14,8 @@ from voice.voice import Voice
|
||||
|
||||
|
||||
DEFAULT_ASR_MODEL = "glm-asr-2512"
|
||||
DEFAULT_TTS_MODEL = "glm-tts"
|
||||
DEFAULT_TTS_VOICE = "tongtong"
|
||||
DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"
|
||||
MAX_FILE_BYTES = 25 * 1024 * 1024
|
||||
REQUEST_TIMEOUT = (5, 60)
|
||||
@@ -27,7 +23,6 @@ REQUEST_TIMEOUT = (5, 60)
|
||||
|
||||
class ZhipuAIVoice(Voice):
|
||||
def __init__(self):
|
||||
# api_key / base read per-call so live config edits take effect.
|
||||
pass
|
||||
|
||||
def voiceToText(self, voice_file: str):
|
||||
@@ -81,12 +76,91 @@ class ZhipuAIVoice(Voice):
|
||||
return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
|
||||
|
||||
def textToVoice(self, text: str):
|
||||
return Reply(ReplyType.ERROR, "ZhipuAI 语音合成尚未接入")
|
||||
try:
|
||||
api_key = conf().get("zhipu_ai_api_key", "")
|
||||
if not api_key:
|
||||
logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
|
||||
return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")
|
||||
|
||||
api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
|
||||
url = f"{api_base}/audio/speech"
|
||||
model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
|
||||
voice_id = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE
|
||||
|
||||
payload = {
|
||||
"model": model,
|
||||
"input": text,
|
||||
"voice": voice_id,
|
||||
"response_format": "wav",
|
||||
"speed": 1.0,
|
||||
"volume": 1.0,
|
||||
}
|
||||
headers = {
|
||||
"Authorization": f"Bearer {api_key}",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
response = requests.post(
|
||||
url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
logger.error(
|
||||
f"[ZhipuAIVoice] textToVoice failed: status={response.status_code} "
|
||||
f"body={response.text[:500]} model={model} voice={voice_id}"
|
||||
)
|
||||
return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
|
||||
|
||||
# Some errors come back as JSON / SSE with HTTP 200.
|
||||
ct = response.headers.get("Content-Type", "")
|
||||
if "application/json" in ct or "text/event-stream" in ct:
|
||||
try:
|
||||
err = response.json()
|
||||
except Exception:
|
||||
err = {"raw": response.text[:500]}
|
||||
logger.error(
|
||||
f"[ZhipuAIVoice] textToVoice unexpected text response "
|
||||
f"(content_type={ct}): {err}"
|
||||
)
|
||||
return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
|
||||
|
||||
audio_bytes = response.content
|
||||
ext = self._sniff_audio_ext(audio_bytes) or "wav"
|
||||
|
||||
file_name = (
|
||||
"tmp/" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
|
||||
+ str(random.randint(0, 1000)) + "." + ext
|
||||
)
|
||||
os.makedirs(os.path.dirname(file_name), exist_ok=True)
|
||||
with open(file_name, "wb") as f:
|
||||
f.write(audio_bytes)
|
||||
logger.info(
|
||||
f"[ZhipuAIVoice] textToVoice model={model} voice={voice_id} "
|
||||
f"file={file_name} bytes={len(audio_bytes)} ext={ext}"
|
||||
)
|
||||
return Reply(ReplyType.VOICE, file_name)
|
||||
except Exception as e:
|
||||
logger.exception(f"[ZhipuAIVoice] textToVoice exception: {e}")
|
||||
return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
|
||||
|
||||
@staticmethod
|
||||
def _sniff_audio_ext(data: bytes) -> str:
|
||||
"""Detect audio container by magic bytes; returns '' on unknown."""
|
||||
if len(data) < 12:
|
||||
return ""
|
||||
head = data[:12]
|
||||
if head[:4] == b"RIFF" and head[8:12] == b"WAVE":
|
||||
return "wav"
|
||||
if head[:3] == b"ID3" or head[:2] == b"\xff\xfb" or head[:2] == b"\xff\xf3" or head[:2] == b"\xff\xf2":
|
||||
return "mp3"
|
||||
if head[:4] == b"OggS":
|
||||
return "ogg"
|
||||
if head[:4] == b"fLaC":
|
||||
return "flac"
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def _ensure_compatible_format(voice_file: str) -> str:
|
||||
# glm-asr-2512 only accepts .wav / .mp3 — convert everything else
|
||||
# (webm from the browser mic, m4a/amr/silk from chat channels, etc).
|
||||
# glm-asr-2512 only accepts .wav / .mp3
|
||||
lower = voice_file.lower()
|
||||
if lower.endswith(".mp3") or lower.endswith(".wav"):
|
||||
return voice_file
|
||||
@@ -95,8 +169,5 @@ class ZhipuAIVoice(Voice):
|
||||
audio_convert.any_to_mp3(voice_file, mp3_file)
|
||||
return mp3_file
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"[ZhipuAIVoice] convert {voice_file} to mp3 failed: {e}; "
|
||||
f"submitting original file"
|
||||
)
|
||||
logger.warning(f"[ZhipuAIVoice] mp3 convert failed: {e}")
|
||||
return voice_file
|
||||
|
||||
Reference in New Issue
Block a user