mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-06-02 00:57:41 +08:00
feat(voice): add dashscope & zhipu ASR, in-page mic input
This commit is contained in:
102
voice/zhipuai/zhipuai_voice.py
Normal file
102
voice/zhipuai/zhipuai_voice.py
Normal file
@@ -0,0 +1,102 @@
|
||||
# encoding:utf-8
|
||||
"""
|
||||
ZhipuAI (BigModel) voice service.
|
||||
|
||||
ASR : glm-asr-2512 via the OpenAI-compatible /audio/transcriptions endpoint.
|
||||
TTS : not yet implemented.
|
||||
|
||||
Endpoint accepts multipart/form-data with `model`, `file`, and `stream`.
|
||||
File size <= 25MB, duration <= 30s per request.
|
||||
"""
|
||||
import os
|
||||
|
||||
import requests
|
||||
|
||||
from bridge.reply import Reply, ReplyType
|
||||
from common.log import logger
|
||||
from config import conf
|
||||
from voice import audio_convert
|
||||
from voice.voice import Voice
|
||||
|
||||
|
||||
# ASR model used when `voice_to_text_model` is absent/empty in the config.
DEFAULT_ASR_MODEL = "glm-asr-2512"

# API root used when `zhipu_ai_api_base` is absent/empty in the config.
DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"

# Upload size (25 MiB) above which a warning is logged before submitting.
MAX_FILE_BYTES = 25 * 1024 * 1024

# Handed to `requests.post(timeout=...)` as (connect, read) seconds.
REQUEST_TIMEOUT = (5, 60)
|
||||
|
||||
|
||||
class ZhipuAIVoice(Voice):
    """ZhipuAI (BigModel) voice backend.

    Speech-to-text posts the audio file as multipart/form-data to
    ``{api_base}/audio/transcriptions``. Text-to-speech is not implemented
    and always answers with an error Reply.
    """

    # Single user-facing message shared by every transcription failure path.
    _ASR_FAILED_MESSAGE = "我暂时还无法听清您的语音,请稍后再试吧~"

    # Extensions submitted as-is; everything else is converted to mp3 first.
    _NATIVE_EXTENSIONS = (".mp3", ".wav")

    def __init__(self):
        # api_key / base read per-call so live config edits take effect.
        pass

    def voiceToText(self, voice_file: str):
        """Transcribe ``voice_file`` and return the result as a Reply.

        Args:
            voice_file: Path of the audio file to transcribe. Files that are
                not .mp3/.wav are converted to mp3 before upload.

        Returns:
            ``Reply(ReplyType.TEXT, text)`` on success, otherwise a
            ``Reply(ReplyType.ERROR, ...)``. Every exception is caught and
            logged, so this method does not raise.
        """
        # Tracks the file actually uploaded so `finally` can tell whether a
        # temporary mp3 was produced by the conversion step.
        upload_file = voice_file
        try:
            # Validate configuration first: no point transcoding audio when
            # the request cannot be authenticated anyway.
            api_key = conf().get("zhipu_ai_api_key", "")
            if not api_key:
                logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
                return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")

            api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
            url = f"{api_base}/audio/transcriptions"
            model = conf().get("voice_to_text_model") or DEFAULT_ASR_MODEL

            upload_file = self._ensure_compatible_format(voice_file)
            self._warn_if_oversized(upload_file)

            # Keep the handle open for the duration of the request: requests
            # streams the multipart body from it.
            with open(upload_file, "rb") as f:
                response = requests.post(
                    url,
                    headers={"Authorization": f"Bearer {api_key}"},
                    files={"file": (os.path.basename(upload_file), f)},
                    data={"model": model, "stream": "false"},
                    timeout=REQUEST_TIMEOUT,
                )

            if response.status_code != 200:
                logger.error(
                    f"[ZhipuAIVoice] voiceToText failed: status={response.status_code} "
                    f"body={response.text[:500]}"
                )
                return Reply(ReplyType.ERROR, self._ASR_FAILED_MESSAGE)

            try:
                payload = response.json()
            except ValueError:
                # A 200 with a non-JSON body (proxy page, truncated reply...).
                logger.error(
                    f"[ZhipuAIVoice] voiceToText non-JSON response: "
                    f"body={response.text[:500]}"
                )
                return Reply(ReplyType.ERROR, self._ASR_FAILED_MESSAGE)

            # Only a JSON object with a string `text` field counts as a result.
            raw_text = payload.get("text") if isinstance(payload, dict) else None
            text = raw_text.strip() if isinstance(raw_text, str) else ""
            if not text:
                logger.error(f"[ZhipuAIVoice] voiceToText empty text: {payload}")
                return Reply(ReplyType.ERROR, self._ASR_FAILED_MESSAGE)

            logger.info(f"[ZhipuAIVoice] voiceToText model={model} text={text}")
            return Reply(ReplyType.TEXT, text)
        except Exception as e:
            # Boundary handler: callers always get a Reply, never an exception.
            logger.exception(f"[ZhipuAIVoice] voiceToText exception: {e}")
            return Reply(ReplyType.ERROR, self._ASR_FAILED_MESSAGE)
        finally:
            self._remove_converted_file(voice_file, upload_file)

    def textToVoice(self, text: str):
        """Speech synthesis is not wired up; always return an error Reply."""
        return Reply(ReplyType.ERROR, "ZhipuAI 语音合成尚未接入")

    @staticmethod
    def _ensure_compatible_format(voice_file: str) -> str:
        """Return a path to audio in a format the ASR endpoint accepts.

        .mp3/.wav inputs (case-insensitive) are returned unchanged. Anything
        else is converted to an mp3 next to the input file. If the conversion
        fails, a warning is logged and the original path is returned so the
        request is still attempted.
        """
        # glm-asr-2512 only accepts .wav / .mp3 — convert everything else
        # (webm from the browser mic, m4a/amr/silk from chat channels, etc).
        if voice_file.lower().endswith(ZhipuAIVoice._NATIVE_EXTENSIONS):
            return voice_file
        try:
            mp3_file = os.path.splitext(voice_file)[0] + ".mp3"
            audio_convert.any_to_mp3(voice_file, mp3_file)
            return mp3_file
        except Exception as e:
            logger.warning(
                f"[ZhipuAIVoice] convert {voice_file} to mp3 failed: {e}; "
                f"submitting original file"
            )
            return voice_file

    @staticmethod
    def _warn_if_oversized(path: str) -> None:
        """Log a warning when ``path`` is larger than MAX_FILE_BYTES.

        Purely advisory: the file is still submitted, and a failure to stat
        it is only logged (a missing file surfaces when it is opened).
        """
        try:
            size = os.path.getsize(path)
        except OSError as e:
            logger.debug(f"[ZhipuAIVoice] cannot stat {path}: {e}")
            return
        if size > MAX_FILE_BYTES:
            logger.warning(
                f"[ZhipuAIVoice] audio file {size}B exceeds {MAX_FILE_BYTES}B; "
                f"glm-asr-2512 may reject it"
            )

    @staticmethod
    def _remove_converted_file(original: str, submitted: str) -> None:
        """Best-effort removal of the temporary mp3 created for the upload.

        Does nothing when no conversion took place (``submitted`` is the
        caller's own file), so the caller's audio is never deleted here.
        """
        if submitted == original:
            return
        try:
            os.remove(submitted)
        except OSError as e:
            logger.debug(f"[ZhipuAIVoice] could not remove temp file {submitted}: {e}")
|
||||
Reference in New Issue
Block a user