mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-06-02 00:57:41 +08:00
174 lines
6.8 KiB
Python
174 lines
6.8 KiB
Python
# encoding:utf-8
|
|
"""ZhipuAI voice: glm-asr-2512 (ASR) + glm-tts (TTS) via BigModel REST API."""
|
|
import datetime
|
|
import os
|
|
import random
|
|
|
|
import requests
|
|
|
|
from bridge.reply import Reply, ReplyType
|
|
from common.log import logger
|
|
from config import conf
|
|
from voice import audio_convert
|
|
from voice.voice import Voice
|
|
|
|
|
|
DEFAULT_ASR_MODEL = "glm-asr-2512"
|
|
DEFAULT_TTS_MODEL = "glm-tts"
|
|
DEFAULT_TTS_VOICE = "tongtong"
|
|
DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"
|
|
MAX_FILE_BYTES = 25 * 1024 * 1024
|
|
REQUEST_TIMEOUT = (5, 60)
|
|
|
|
|
|
class ZhipuAIVoice(Voice):
|
|
def __init__(self):
|
|
pass
|
|
|
|
def voiceToText(self, voice_file: str):
|
|
try:
|
|
voice_file = self._ensure_compatible_format(voice_file)
|
|
|
|
try:
|
|
size = os.path.getsize(voice_file)
|
|
if size > MAX_FILE_BYTES:
|
|
logger.warning(
|
|
f"[ZhipuAIVoice] audio file {size}B exceeds {MAX_FILE_BYTES}B; "
|
|
f"glm-asr-2512 may reject it"
|
|
)
|
|
except OSError:
|
|
pass
|
|
|
|
api_key = conf().get("zhipu_ai_api_key", "")
|
|
if not api_key:
|
|
logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
|
|
return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")
|
|
|
|
api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
|
|
url = f"{api_base}/audio/transcriptions"
|
|
model = conf().get("voice_to_text_model") or DEFAULT_ASR_MODEL
|
|
|
|
with open(voice_file, "rb") as f:
|
|
files = {"file": (os.path.basename(voice_file), f)}
|
|
data = {"model": model, "stream": "false"}
|
|
headers = {"Authorization": f"Bearer {api_key}"}
|
|
response = requests.post(
|
|
url, headers=headers, files=files, data=data, timeout=REQUEST_TIMEOUT
|
|
)
|
|
|
|
if response.status_code != 200:
|
|
logger.error(
|
|
f"[ZhipuAIVoice] voiceToText failed: status={response.status_code} "
|
|
f"body={response.text[:500]}"
|
|
)
|
|
return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
|
|
|
|
payload = response.json()
|
|
text = (payload.get("text") or "").strip()
|
|
if not text:
|
|
logger.error(f"[ZhipuAIVoice] voiceToText empty text: {payload}")
|
|
return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
|
|
|
|
logger.info(f"[ZhipuAIVoice] voiceToText model={model} text={text}")
|
|
return Reply(ReplyType.TEXT, text)
|
|
except Exception as e:
|
|
logger.exception(f"[ZhipuAIVoice] voiceToText exception: {e}")
|
|
return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
|
|
|
|
def textToVoice(self, text: str):
|
|
try:
|
|
api_key = conf().get("zhipu_ai_api_key", "")
|
|
if not api_key:
|
|
logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
|
|
return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")
|
|
|
|
api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
|
|
url = f"{api_base}/audio/speech"
|
|
model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
|
|
voice_id = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE
|
|
|
|
payload = {
|
|
"model": model,
|
|
"input": text,
|
|
"voice": voice_id,
|
|
"response_format": "wav",
|
|
"speed": 1.0,
|
|
"volume": 1.0,
|
|
}
|
|
headers = {
|
|
"Authorization": f"Bearer {api_key}",
|
|
"Content-Type": "application/json",
|
|
}
|
|
response = requests.post(
|
|
url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
|
|
)
|
|
|
|
if response.status_code != 200:
|
|
logger.error(
|
|
f"[ZhipuAIVoice] textToVoice failed: status={response.status_code} "
|
|
f"body={response.text[:500]} model={model} voice={voice_id}"
|
|
)
|
|
return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
|
|
|
|
# Some errors come back as JSON / SSE with HTTP 200.
|
|
ct = response.headers.get("Content-Type", "")
|
|
if "application/json" in ct or "text/event-stream" in ct:
|
|
try:
|
|
err = response.json()
|
|
except Exception:
|
|
err = {"raw": response.text[:500]}
|
|
logger.error(
|
|
f"[ZhipuAIVoice] textToVoice unexpected text response "
|
|
f"(content_type={ct}): {err}"
|
|
)
|
|
return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
|
|
|
|
audio_bytes = response.content
|
|
ext = self._sniff_audio_ext(audio_bytes) or "wav"
|
|
|
|
file_name = (
|
|
"tmp/" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
|
|
+ str(random.randint(0, 1000)) + "." + ext
|
|
)
|
|
os.makedirs(os.path.dirname(file_name), exist_ok=True)
|
|
with open(file_name, "wb") as f:
|
|
f.write(audio_bytes)
|
|
logger.info(
|
|
f"[ZhipuAIVoice] textToVoice model={model} voice={voice_id} "
|
|
f"file={file_name} bytes={len(audio_bytes)} ext={ext}"
|
|
)
|
|
return Reply(ReplyType.VOICE, file_name)
|
|
except Exception as e:
|
|
logger.exception(f"[ZhipuAIVoice] textToVoice exception: {e}")
|
|
return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
|
|
|
|
@staticmethod
|
|
def _sniff_audio_ext(data: bytes) -> str:
|
|
"""Detect audio container by magic bytes; returns '' on unknown."""
|
|
if len(data) < 12:
|
|
return ""
|
|
head = data[:12]
|
|
if head[:4] == b"RIFF" and head[8:12] == b"WAVE":
|
|
return "wav"
|
|
if head[:3] == b"ID3" or head[:2] == b"\xff\xfb" or head[:2] == b"\xff\xf3" or head[:2] == b"\xff\xf2":
|
|
return "mp3"
|
|
if head[:4] == b"OggS":
|
|
return "ogg"
|
|
if head[:4] == b"fLaC":
|
|
return "flac"
|
|
return ""
|
|
|
|
@staticmethod
|
|
def _ensure_compatible_format(voice_file: str) -> str:
|
|
# glm-asr-2512 only accepts .wav / .mp3
|
|
lower = voice_file.lower()
|
|
if lower.endswith(".mp3") or lower.endswith(".wav"):
|
|
return voice_file
|
|
try:
|
|
mp3_file = os.path.splitext(voice_file)[0] + ".mp3"
|
|
audio_convert.any_to_mp3(voice_file, mp3_file)
|
|
return mp3_file
|
|
except Exception as e:
|
|
logger.warning(f"[ZhipuAIVoice] mp3 convert failed: {e}")
|
|
return voice_file
|