Files
chatgpt-on-wechat/voice/mimo/mimo_voice.py
2026-05-28 10:49:52 +08:00

110 lines
4.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# encoding:utf-8
"""
小米 MiMo TTS - 基于 mimo-v2.5-tts 模型的语音合成。
通过 /chat/completions 接口实现assistant 消息内容为待合成文本,
audio 字段指定预置音色(如 冰糖/茉莉/苏打/Mia/Chloe 等),返回 base64
编码的音频字节。
文档https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5
注意MiMo 不提供 ASR 端点,因此 voiceToText 不实现。
"""
import base64
import datetime
import os
import random
import requests
from bridge.reply import Reply, ReplyType
from common.log import logger
from config import conf
from voice.voice import Voice
# Fallback API root used when the `mimo_api_base` config entry is unset or empty.
DEFAULT_API_BASE = "https://api.xiaomimimo.com/v1"
# Fallback model name used when the `text_to_voice_model` config entry is unset or empty.
DEFAULT_TTS_MODEL = "mimo-v2.5-tts"
# Fallback preset voice used when the `tts_voice_id` config entry is unset or empty.
DEFAULT_TTS_VOICE = "冰糖" # Default voice: the de facto default on the China cluster
# Passed as `timeout=` to requests.post; requests reads a 2-tuple as
# (connect timeout, read timeout) in seconds.
REQUEST_TIMEOUT = (5, 120)
class MimoVoice(Voice):
    """Voice provider that synthesizes speech through the Xiaomi MiMo API.

    Only text-to-speech is implemented. ``voiceToText`` always answers with an
    error reply because MiMo exposes no speech-recognition endpoint.
    """

    def __init__(self):
        pass

    def voiceToText(self, voice_file: str):
        """Report that speech recognition is not available.

        Args:
            voice_file: Path of the audio file to transcribe (ignored).

        Returns:
            A ``Reply`` of type ``ReplyType.ERROR``.
        """
        # MiMo has no standalone ASR endpoint; configure another provider
        # (e.g. openai / zhipu / dashscope) for voice-to-text instead.
        logger.warning("[MimoVoice] voiceToText is not supported by MiMo API")
        return Reply(ReplyType.ERROR, "MiMo 暂不支持语音识别,请配置其他 voice_to_text provider")

    def textToVoice(self, text: str):
        """Synthesize ``text`` into a WAV file via ``/chat/completions``.

        Reads ``mimo_api_key``, ``mimo_api_base``, ``text_to_voice_model`` and
        ``tts_voice_id`` from the config, falling back to the module defaults
        for everything except the API key.

        Args:
            text: The text to synthesize.

        Returns:
            ``Reply(ReplyType.VOICE, path)`` with the path of the written WAV
            file on success, otherwise ``Reply(ReplyType.ERROR, message)``.
            This method never raises; unexpected exceptions are logged and
            turned into an error reply.
        """
        try:
            # Nothing to synthesize: fail fast instead of spending an API call.
            if not text or not text.strip():
                logger.warning("[MimoVoice] textToVoice called with empty text")
                return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")

            api_key = conf().get("mimo_api_key", "")
            if not api_key:
                logger.error("[MimoVoice] mimo_api_key is not configured")
                return Reply(ReplyType.ERROR, "未配置 MiMo API key")

            api_base = (conf().get("mimo_api_base") or DEFAULT_API_BASE).rstrip("/")
            model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
            voice_id = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE

            # The text to synthesize must be sent as the assistant message; an
            # optional user message could carry style instructions.
            payload = {
                "model": model,
                "messages": [
                    {"role": "assistant", "content": text},
                ],
                "audio": {
                    "format": "wav",
                    "voice": voice_id,
                },
            }
            headers = {
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            }
            url = f"{api_base}/chat/completions"

            response = requests.post(url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT)
            if response.status_code != 200:
                logger.error(
                    f"[MimoVoice] textToVoice failed: status={response.status_code} "
                    f"body={response.text[:500]} model={model} voice={voice_id}"
                )
                return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")

            data = response.json()

            # Only a non-empty `error` value signals failure; a key that is
            # present but null must not hide an otherwise valid audio payload.
            err = data.get("error")
            if err:
                msg = err.get("message", str(err)) if isinstance(err, dict) else str(err)
                logger.error(f"[MimoVoice] textToVoice api error: {msg}")
                return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")

            # Walk choices[0].message.audio.data, tolerating missing or null
            # intermediate fields.
            message = (data.get("choices") or [{}])[0].get("message", {}) or {}
            audio_obj = message.get("audio") or {}
            audio_b64 = audio_obj.get("data")
            if not audio_b64:
                logger.error(f"[MimoVoice] textToVoice empty audio in response: {data}")
                return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")

            try:
                audio_bytes = base64.b64decode(audio_b64)
            except (ValueError, TypeError) as e:
                # binascii.Error (malformed base64) is a ValueError subclass;
                # TypeError covers a payload that is neither str nor bytes.
                logger.error(f"[MimoVoice] base64 decode failed: {e}")
                return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")

            # b64decode silently drops non-alphabet characters, so a garbage
            # payload can decode to nothing; do not hand back an empty file.
            if not audio_bytes:
                logger.error("[MimoVoice] textToVoice decoded audio is empty")
                return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")

            # Second-resolution timestamp plus 32 random bits, so concurrent
            # requests within the same second do not overwrite each other.
            file_name = (
                "tmp/" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
                + os.urandom(4).hex() + ".wav"
            )
            os.makedirs(os.path.dirname(file_name), exist_ok=True)
            with open(file_name, "wb") as f:
                f.write(audio_bytes)

            logger.info(
                f"[MimoVoice] textToVoice model={model} voice={voice_id} "
                f"file={file_name} bytes={len(audio_bytes)}"
            )
            return Reply(ReplyType.VOICE, file_name)
        except Exception as e:
            # Top-level boundary: network errors, invalid JSON, unexpected
            # response shapes and file-system errors all end up here.
            logger.exception(f"[MimoVoice] textToVoice exception: {e}")
            return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")