mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-06-02 00:57:41 +08:00
feat(voice): rework TTS/ASR stack and unify tool/skill config schema
This commit is contained in:
@@ -1,20 +1,13 @@
|
||||
# encoding:utf-8
|
||||
"""
|
||||
DashScope (Aliyun Bailian) voice service.
|
||||
|
||||
ASR : qwen3-asr-flash via dashscope.MultiModalConversation
|
||||
TTS : not yet implemented (see CosyVoice / qwen3-tts)
|
||||
|
||||
Why MultiModalConversation instead of the OpenAI-compatible endpoint:
|
||||
- SDK is already a project dep (used by chat/vision)
|
||||
- Native API accepts local file:// paths up to 100 QPS without an OSS
|
||||
round-trip, which is what we need for the "send a short voice
|
||||
message" flow. Public URLs / Base64 also work.
|
||||
"""
|
||||
"""DashScope voice: qwen3-asr-flash (ASR) + qwen3-tts-flash (TTS)
|
||||
via dashscope.MultiModalConversation."""
|
||||
import datetime
|
||||
import os
|
||||
import random
|
||||
from typing import Optional
|
||||
|
||||
import dashscope
|
||||
import requests
|
||||
from dashscope import MultiModalConversation
|
||||
|
||||
from bridge.reply import Reply, ReplyType
|
||||
@@ -25,16 +18,14 @@ from voice.voice import Voice
|
||||
|
||||
|
||||
DEFAULT_ASR_MODEL = "qwen3-asr-flash"
|
||||
# qwen3-asr-flash hard cap (single file, sync call). Longer audio needs
|
||||
# qwen3-asr-flash-filetrans which is async-only and out of scope here.
|
||||
DEFAULT_TTS_MODEL = "qwen3-tts-flash"
|
||||
DEFAULT_TTS_VOICE = "Cherry"
|
||||
MAX_DURATION_SECONDS = 300
|
||||
MAX_FILE_BYTES = 10 * 1024 * 1024
|
||||
|
||||
|
||||
class DashScopeVoice(Voice):
|
||||
def __init__(self):
|
||||
# api_key is applied per-call (chat bot does the same) so a live
|
||||
# config change via the web console takes effect without restart.
|
||||
pass
|
||||
|
||||
def voiceToText(self, voice_file: str):
|
||||
@@ -83,14 +74,72 @@ class DashScopeVoice(Voice):
|
||||
return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
|
||||
|
||||
def textToVoice(self, text: str):
|
||||
# TTS will be added in a follow-up commit (qwen3-tts / cosyvoice).
|
||||
return Reply(ReplyType.ERROR, "DashScope 语音合成尚未接入")
|
||||
try:
|
||||
api_key = conf().get("dashscope_api_key", "")
|
||||
if not api_key:
|
||||
logger.error("[DashScopeVoice] dashscope_api_key is not configured")
|
||||
return Reply(ReplyType.ERROR, "未配置 DashScope API key")
|
||||
dashscope.api_key = api_key
|
||||
|
||||
model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
|
||||
voice = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE
|
||||
response = MultiModalConversation.call(
|
||||
model=model,
|
||||
api_key=api_key,
|
||||
text=text,
|
||||
voice=voice,
|
||||
stream=False,
|
||||
)
|
||||
|
||||
url = self._extract_audio_url(response)
|
||||
if not url:
|
||||
logger.error(f"[DashScopeVoice] textToVoice failed: {response}")
|
||||
return Reply(ReplyType.ERROR, "语音合成失败")
|
||||
|
||||
local_path = self._download_audio(url)
|
||||
if not local_path:
|
||||
return Reply(ReplyType.ERROR, "语音合成失败")
|
||||
|
||||
logger.info(f"[DashScopeVoice] textToVoice model={model} voice={voice} file={local_path}")
|
||||
return Reply(ReplyType.VOICE, local_path)
|
||||
except Exception as e:
|
||||
logger.exception(f"[DashScopeVoice] textToVoice exception: {e}")
|
||||
return Reply(ReplyType.ERROR, "语音合成失败")
|
||||
|
||||
@staticmethod
|
||||
def _extract_audio_url(response) -> Optional[str]:
    """Pull the synthesized-audio URL out of a TTS call response.

    Reads ``response.output["audio"]`` and accepts either a plain dict or an
    attribute-style object exposing ``url``.

    Returns None when the response carries a non-200 ``status_code``, when
    there is no output / audio entry, or when the response has an unexpected
    shape. Any exception is swallowed on purpose so the caller only needs to
    test the result for falsiness.
    """
    try:
        status = getattr(response, "status_code", 200)
        if status != 200:
            return None
        audio = response.output.get("audio") if response.output else None
        if not isinstance(audio, dict):
            # Attribute-style payload; a missing payload (None) yields None.
            return getattr(audio, "url", None)
        # Collapse an empty or absent dict entry to None.
        return audio.get("url") or None
    except Exception:
        return None
|
||||
|
||||
@staticmethod
|
||||
def _download_audio(url: str) -> Optional[str]:
    """Download the audio at *url* into ``<cwd>/tmp`` and return the local path.

    The file is named ``dashscope_tts_<timestamp>_<random><ext>``. The
    extension comes from the URL with its query string stripped, lower-cased;
    it falls back to ``.wav`` when absent or not one of the recognised audio
    suffixes.

    Returns None (after logging the error) if anything fails, including a
    non-2xx HTTP status.
    """
    known_exts = (".mp3", ".wav", ".m4a", ".aac", ".opus")
    try:
        out_dir = os.path.join(os.getcwd(), "tmp")
        os.makedirs(out_dir, exist_ok=True)

        stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        # Signed URLs carry their parameters after '?'; ignore them when
        # looking for the file suffix.
        path_part = url.split("?", 1)[0]
        suffix = os.path.splitext(path_part)[1].lower()
        if suffix not in known_exts:
            suffix = ".wav"

        file_name = f"dashscope_tts_{stamp}_{random.randint(0, 9999)}{suffix}"
        dst = os.path.join(out_dir, file_name)

        resp = requests.get(url, timeout=60)
        resp.raise_for_status()
        with open(dst, "wb") as out:
            out.write(resp.content)
        return dst
    except Exception as e:
        logger.error(f"[DashScopeVoice] download audio failed: {e}")
        return None
|
||||
|
||||
@staticmethod
|
||||
def _ensure_compatible_format(voice_file: str) -> str:
|
||||
"""Convert AMR/SILK to mp3 since qwen3-asr-flash doesn't accept them.
|
||||
Other formats (mp3/wav/m4a/aac/opus/webm) are passed through.
|
||||
"""
|
||||
# qwen3-asr-flash doesn't accept AMR/SILK; mp3/wav/m4a/aac/opus pass through.
|
||||
lower = voice_file.lower()
|
||||
if lower.endswith(".amr") or lower.endswith(".silk") or lower.endswith(".slk"):
|
||||
try:
|
||||
@@ -98,20 +147,11 @@ class DashScopeVoice(Voice):
|
||||
audio_convert.any_to_mp3(voice_file, mp3_file)
|
||||
return mp3_file
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"[DashScopeVoice] convert {voice_file} to mp3 failed: {e}; "
|
||||
f"submitting original file"
|
||||
)
|
||||
logger.warning(f"[DashScopeVoice] mp3 convert failed: {e}")
|
||||
return voice_file
|
||||
|
||||
@staticmethod
|
||||
def _extract_text(response) -> Optional[str]:
|
||||
"""Pull the recognized text out of MultiModalConversation response.
|
||||
|
||||
Successful shape (result_format="message"):
|
||||
response.output.choices[0].message.content -> list of {"text": "..."}
|
||||
or in some SDK versions a plain string.
|
||||
"""
|
||||
try:
|
||||
if getattr(response, "status_code", 200) != 200:
|
||||
return None
|
||||
|
||||
@@ -1,16 +1,18 @@
|
||||
"""
|
||||
google voice service
|
||||
"""
|
||||
"""LinkAI voice: Whisper ASR + multi-vendor TTS (OpenAI / Doubao / Baidu)
|
||||
proxied via https://docs.link-ai.tech/platform/api/voice-speech."""
|
||||
import datetime
|
||||
import os
|
||||
import random
|
||||
|
||||
import requests
|
||||
from voice import audio_convert
|
||||
|
||||
from bridge.reply import Reply, ReplyType
|
||||
from common import const
|
||||
from common.log import logger
|
||||
from config import conf
|
||||
from voice import audio_convert
|
||||
from voice.voice import Voice
|
||||
from common import const
|
||||
import os
|
||||
import datetime
|
||||
|
||||
|
||||
class LinkAIVoice(Voice):
|
||||
def __init__(self):
|
||||
@@ -21,63 +23,67 @@ class LinkAIVoice(Voice):
|
||||
try:
|
||||
url = conf().get("linkai_api_base", "https://api.link-ai.tech") + "/v1/audio/transcriptions"
|
||||
headers = {"Authorization": "Bearer " + conf().get("linkai_api_key")}
|
||||
model = None
|
||||
if not conf().get("text_to_voice") or conf().get("voice_to_text") == "openai":
|
||||
model = const.WHISPER_1
|
||||
# Pin whisper-1: gateway ignores any other ASR model id.
|
||||
model = const.WHISPER_1
|
||||
if voice_file.endswith(".amr"):
|
||||
try:
|
||||
mp3_file = os.path.splitext(voice_file)[0] + ".mp3"
|
||||
audio_convert.any_to_mp3(voice_file, mp3_file)
|
||||
voice_file = mp3_file
|
||||
except Exception as e:
|
||||
logger.warn(f"[LinkVoice] amr file transfer failed, directly send amr voice file: {format(e)}")
|
||||
file = open(voice_file, "rb")
|
||||
file_body = {
|
||||
"file": file
|
||||
}
|
||||
data = {
|
||||
"model": model
|
||||
}
|
||||
res = requests.post(url, files=file_body, headers=headers, data=data, timeout=(5, 60))
|
||||
if res.status_code == 200:
|
||||
text = res.json().get("text")
|
||||
else:
|
||||
res_json = res.json()
|
||||
logger.error(f"[LinkVoice] voiceToText error, status_code={res.status_code}, msg={res_json.get('message')}")
|
||||
logger.warning(f"[LinkVoice] amr file transfer failed, directly send amr voice file: {e}")
|
||||
with open(voice_file, "rb") as file:
|
||||
res = requests.post(
|
||||
url,
|
||||
files={"file": file},
|
||||
headers=headers,
|
||||
data={"model": model},
|
||||
timeout=(5, 60),
|
||||
)
|
||||
if res.status_code != 200:
|
||||
msg = ""
|
||||
try:
|
||||
msg = res.json().get("message", "")
|
||||
except Exception:
|
||||
pass
|
||||
logger.error(f"[LinkVoice] voiceToText error, status_code={res.status_code}, msg={msg}")
|
||||
return None
|
||||
reply = Reply(ReplyType.TEXT, text)
|
||||
text = res.json().get("text")
|
||||
logger.info(f"[LinkVoice] voiceToText success, text={text}, file name={voice_file}")
|
||||
return Reply(ReplyType.TEXT, text)
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
return None
|
||||
return reply
|
||||
|
||||
def textToVoice(self, text):
|
||||
try:
|
||||
url = conf().get("linkai_api_base", "https://api.link-ai.tech") + "/v1/audio/speech"
|
||||
headers = {"Authorization": "Bearer " + conf().get("linkai_api_key")}
|
||||
model = const.TTS_1
|
||||
if not conf().get("text_to_voice") or conf().get("text_to_voice") in ["openai", const.TTS_1, const.TTS_1_HD]:
|
||||
model = conf().get("text_to_voice_model") or const.TTS_1
|
||||
# Gateway routes by `model` (tts-1 / doubao / baidu) + `voice` from
|
||||
# that engine's catalog. `app_code` is optional workspace override.
|
||||
data = {
|
||||
"model": model,
|
||||
"input": text,
|
||||
"voice": conf().get("tts_voice_id"),
|
||||
"app_code": conf().get("linkai_app_code")
|
||||
"app_code": conf().get("linkai_app_code"),
|
||||
}
|
||||
model = conf().get("text_to_voice_model")
|
||||
if model:
|
||||
data["model"] = model
|
||||
res = requests.post(url, headers=headers, json=data, timeout=(5, 120))
|
||||
if res.status_code == 200:
|
||||
tmp_file_name = "tmp/" + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + str(random.randint(0, 1000)) + ".mp3"
|
||||
with open(tmp_file_name, 'wb') as f:
|
||||
f.write(res.content)
|
||||
reply = Reply(ReplyType.VOICE, tmp_file_name)
|
||||
logger.info(f"[LinkVoice] textToVoice success, input={text}, model={model}, voice_id={data.get('voice')}")
|
||||
return reply
|
||||
else:
|
||||
res_json = res.json()
|
||||
logger.error(f"[LinkVoice] textToVoice error, status_code={res.status_code}, msg={res_json.get('message')}")
|
||||
if res.status_code != 200:
|
||||
msg = ""
|
||||
try:
|
||||
msg = res.json().get("message", "")
|
||||
except Exception:
|
||||
pass
|
||||
logger.error(f"[LinkVoice] textToVoice error, status_code={res.status_code}, msg={msg}")
|
||||
return None
|
||||
tmp_file_name = "tmp/" + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + str(random.randint(0, 1000)) + ".mp3"
|
||||
os.makedirs(os.path.dirname(tmp_file_name), exist_ok=True)
|
||||
with open(tmp_file_name, 'wb') as f:
|
||||
f.write(res.content)
|
||||
logger.info(f"[LinkVoice] textToVoice success, input={text}, voice_id={data.get('voice')}")
|
||||
return Reply(ReplyType.VOICE, tmp_file_name)
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
# reply = Reply(ReplyType.ERROR, "遇到了一点小问题,请稍后再问我吧")
|
||||
return None
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
# encoding:utf-8
|
||||
"""
|
||||
MiniMax TTS voice service
|
||||
"""
|
||||
"""MiniMax TTS via /v1/t2a_v2 (SSE stream, hex-encoded mp3 chunks)."""
|
||||
import datetime
|
||||
import json
|
||||
import random
|
||||
import requests
|
||||
|
||||
@@ -12,24 +11,12 @@ from config import conf
|
||||
from voice.voice import Voice
|
||||
|
||||
|
||||
MINIMAX_TTS_VOICES = [
|
||||
"English_Graceful_Lady",
|
||||
"English_Insightful_Speaker",
|
||||
"English_radiant_girl",
|
||||
"English_Persuasive_Man",
|
||||
"English_Lucky_Robot",
|
||||
"English_expressive_narrator",
|
||||
"Chinese_Warm_Woman",
|
||||
"Chinese_Gentle_Man",
|
||||
]
|
||||
|
||||
|
||||
class MinimaxVoice(Voice):
|
||||
def __init__(self):
|
||||
self.api_key = conf().get("minimax_api_key")
|
||||
self.api_base = conf().get("minimax_api_base") or "https://api.minimax.io"
|
||||
# Strip trailing /v1 if present so we can always append /v1/t2a_v2
|
||||
self.api_base = self.api_base.rstrip("/")
|
||||
# Mainland endpoint matches `sk-api-0-...` keys; override via
|
||||
# `minimax_api_base` for international (api.minimax.io) workspaces.
|
||||
self.api_base = (conf().get("minimax_api_base") or "https://api.minimaxi.com").rstrip("/")
|
||||
if self.api_base.endswith("/v1"):
|
||||
self.api_base = self.api_base[:-3]
|
||||
|
||||
@@ -68,12 +55,14 @@ class MinimaxVoice(Voice):
|
||||
response = requests.post(url, headers=headers, json=payload, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
|
||||
# Parse SSE stream and collect hex-encoded audio chunks
|
||||
# MiniMax returns HTTP 200 even on errors; capture base_resp for diagnostics.
|
||||
audio_chunks = []
|
||||
buffer = ""
|
||||
last_base_resp = None
|
||||
event_count = 0
|
||||
for raw in response.iter_lines():
|
||||
if not raw:
|
||||
continue
|
||||
event_count += 1
|
||||
line = raw.decode("utf-8") if isinstance(raw, bytes) else raw
|
||||
if not line.startswith("data:"):
|
||||
continue
|
||||
@@ -81,16 +70,31 @@ class MinimaxVoice(Voice):
|
||||
if not json_str or json_str == "[DONE]":
|
||||
continue
|
||||
try:
|
||||
import json
|
||||
event_data = json.loads(json_str)
|
||||
audio_hex = event_data.get("data", {}).get("audio")
|
||||
if audio_hex:
|
||||
audio_chunks.append(bytes.fromhex(audio_hex))
|
||||
except Exception:
|
||||
continue
|
||||
base_resp = event_data.get("base_resp") or {}
|
||||
if base_resp:
|
||||
last_base_resp = base_resp
|
||||
audio_hex = (event_data.get("data") or {}).get("audio")
|
||||
if audio_hex:
|
||||
try:
|
||||
audio_chunks.append(bytes.fromhex(audio_hex))
|
||||
except Exception as e:
|
||||
logger.warning(f"[MINIMAX] skip bad audio hex chunk: {e}")
|
||||
|
||||
if not audio_chunks:
|
||||
logger.error("[MINIMAX] TTS returned no audio data")
|
||||
ct = response.headers.get("Content-Type", "")
|
||||
if last_base_resp and last_base_resp.get("status_code") not in (None, 0):
|
||||
logger.error(
|
||||
f"[MINIMAX] TTS failed: status_code={last_base_resp.get('status_code')}, "
|
||||
f"status_msg={last_base_resp.get('status_msg')}, model={model}, voice_id={voice_id}"
|
||||
)
|
||||
else:
|
||||
logger.error(
|
||||
f"[MINIMAX] TTS returned no audio data, model={model}, voice_id={voice_id}, "
|
||||
f"url={url}, http={response.status_code}, content_type={ct!r}, events={event_count}"
|
||||
)
|
||||
return Reply(ReplyType.ERROR, "语音合成失败,未获取到音频数据")
|
||||
|
||||
audio_data = b"".join(audio_chunks)
|
||||
|
||||
@@ -31,7 +31,8 @@ class OpenaiVoice(Voice):
|
||||
"file": file,
|
||||
}
|
||||
data = {
|
||||
"model": "whisper-1",
|
||||
# Override via `voice_to_text_model` (e.g. fall back to whisper-1).
|
||||
"model": conf().get("voice_to_text_model") or "gpt-4o-mini-transcribe",
|
||||
}
|
||||
response = requests.post(url, headers=headers, files=files, data=data)
|
||||
response_data = response.json()
|
||||
|
||||
@@ -1,14 +1,8 @@
|
||||
# encoding:utf-8
|
||||
"""
|
||||
ZhipuAI (BigModel) voice service.
|
||||
|
||||
ASR : glm-asr-2512 via the OpenAI-compatible /audio/transcriptions endpoint.
|
||||
TTS : not yet implemented.
|
||||
|
||||
Endpoint accepts multipart/form-data with `model`, `file`, and `stream`.
|
||||
File size <= 25MB, duration <= 30s per request.
|
||||
"""
|
||||
"""ZhipuAI voice: glm-asr-2512 (ASR) + glm-tts (TTS) via BigModel REST API."""
|
||||
import datetime
|
||||
import os
|
||||
import random
|
||||
|
||||
import requests
|
||||
|
||||
@@ -20,6 +14,8 @@ from voice.voice import Voice
|
||||
|
||||
|
||||
DEFAULT_ASR_MODEL = "glm-asr-2512"
|
||||
DEFAULT_TTS_MODEL = "glm-tts"
|
||||
DEFAULT_TTS_VOICE = "tongtong"
|
||||
DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"
|
||||
MAX_FILE_BYTES = 25 * 1024 * 1024
|
||||
REQUEST_TIMEOUT = (5, 60)
|
||||
@@ -27,7 +23,6 @@ REQUEST_TIMEOUT = (5, 60)
|
||||
|
||||
class ZhipuAIVoice(Voice):
|
||||
def __init__(self):
|
||||
# api_key / base read per-call so live config edits take effect.
|
||||
pass
|
||||
|
||||
def voiceToText(self, voice_file: str):
|
||||
@@ -81,12 +76,91 @@ class ZhipuAIVoice(Voice):
|
||||
return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
|
||||
|
||||
def textToVoice(self, text: str):
|
||||
return Reply(ReplyType.ERROR, "ZhipuAI 语音合成尚未接入")
|
||||
try:
|
||||
api_key = conf().get("zhipu_ai_api_key", "")
|
||||
if not api_key:
|
||||
logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
|
||||
return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")
|
||||
|
||||
api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
|
||||
url = f"{api_base}/audio/speech"
|
||||
model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
|
||||
voice_id = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE
|
||||
|
||||
payload = {
|
||||
"model": model,
|
||||
"input": text,
|
||||
"voice": voice_id,
|
||||
"response_format": "wav",
|
||||
"speed": 1.0,
|
||||
"volume": 1.0,
|
||||
}
|
||||
headers = {
|
||||
"Authorization": f"Bearer {api_key}",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
response = requests.post(
|
||||
url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
logger.error(
|
||||
f"[ZhipuAIVoice] textToVoice failed: status={response.status_code} "
|
||||
f"body={response.text[:500]} model={model} voice={voice_id}"
|
||||
)
|
||||
return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
|
||||
|
||||
# Some errors come back as JSON / SSE with HTTP 200.
|
||||
ct = response.headers.get("Content-Type", "")
|
||||
if "application/json" in ct or "text/event-stream" in ct:
|
||||
try:
|
||||
err = response.json()
|
||||
except Exception:
|
||||
err = {"raw": response.text[:500]}
|
||||
logger.error(
|
||||
f"[ZhipuAIVoice] textToVoice unexpected text response "
|
||||
f"(content_type={ct}): {err}"
|
||||
)
|
||||
return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
|
||||
|
||||
audio_bytes = response.content
|
||||
ext = self._sniff_audio_ext(audio_bytes) or "wav"
|
||||
|
||||
file_name = (
|
||||
"tmp/" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
|
||||
+ str(random.randint(0, 1000)) + "." + ext
|
||||
)
|
||||
os.makedirs(os.path.dirname(file_name), exist_ok=True)
|
||||
with open(file_name, "wb") as f:
|
||||
f.write(audio_bytes)
|
||||
logger.info(
|
||||
f"[ZhipuAIVoice] textToVoice model={model} voice={voice_id} "
|
||||
f"file={file_name} bytes={len(audio_bytes)} ext={ext}"
|
||||
)
|
||||
return Reply(ReplyType.VOICE, file_name)
|
||||
except Exception as e:
|
||||
logger.exception(f"[ZhipuAIVoice] textToVoice exception: {e}")
|
||||
return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
|
||||
|
||||
@staticmethod
|
||||
def _sniff_audio_ext(data: bytes) -> str:
    """Detect the audio container from the leading magic bytes.

    Args:
        data: Raw audio payload.

    Returns:
        "wav", "mp3", "ogg" or "flac"; "" when the format is not recognised
        or fewer than 12 bytes are available.
    """
    if len(data) < 12:
        return ""
    head = bytes(data[:12])
    if head[:4] == b"RIFF" and head[8:12] == b"WAVE":
        return "wav"
    # MP3 either starts with an ID3v2 tag or directly with a Layer III frame
    # header: 11 sync bits, 2 version bits, layer bits '01', 1 protection bit.
    mp3_frame_sync = (
        b"\xff\xfb", b"\xff\xfa",  # MPEG-1   (without / with CRC)
        b"\xff\xf3", b"\xff\xf2",  # MPEG-2   (without / with CRC)
        b"\xff\xe3", b"\xff\xe2",  # MPEG-2.5 (without / with CRC)
    )
    if head.startswith(b"ID3") or head.startswith(mp3_frame_sync):
        return "mp3"
    if head.startswith(b"OggS"):
        return "ogg"
    if head.startswith(b"fLaC"):
        return "flac"
    return ""
|
||||
|
||||
@staticmethod
|
||||
def _ensure_compatible_format(voice_file: str) -> str:
|
||||
# glm-asr-2512 only accepts .wav / .mp3 — convert everything else
|
||||
# (webm from the browser mic, m4a/amr/silk from chat channels, etc).
|
||||
# glm-asr-2512 only accepts .wav / .mp3
|
||||
lower = voice_file.lower()
|
||||
if lower.endswith(".mp3") or lower.endswith(".wav"):
|
||||
return voice_file
|
||||
@@ -95,8 +169,5 @@ class ZhipuAIVoice(Voice):
|
||||
audio_convert.any_to_mp3(voice_file, mp3_file)
|
||||
return mp3_file
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"[ZhipuAIVoice] convert {voice_file} to mp3 failed: {e}; "
|
||||
f"submitting original file"
|
||||
)
|
||||
logger.warning(f"[ZhipuAIVoice] mp3 convert failed: {e}")
|
||||
return voice_file
|
||||
|
||||
Reference in New Issue
Block a user