mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-06-02 00:57:41 +08:00
feat(voice): enable TTS on Weixin / DingTalk / WeCom Bot with text-then-voice delivery
- Clear NOT_SUPPORT_REPLYTYPE on weixin, wecom_bot, dingtalk so TTS replies are actually synthesized for these channels. - Wire desire_rtype=VOICE in weixin and wecom_bot _compose_context so the always_reply_voice / voice_reply_voice toggles take effect. - DingTalk: send native sampleAudio (mediaId + duration). The media API only accepts ogg/amr, so convert TTS mp3/wav to amr on the fly. - WeCom Bot: send native voice msgtype via ws (respond + active push), converting TTS audio to amr before upload. - Weixin (ilink): no outbound voice item, deliver TTS as a file attachment. - chat_channel: when a TEXT reply is converted to VOICE, stash original text in context["voice_reply_text"] and send a text bubble before the voice reply. Skipped for feishu_streamed and wechatcom_app, which already render text alongside the voice.
This commit is contained in:
@@ -270,6 +270,8 @@ class ChatChannel(Channel):
|
||||
if reply.type == ReplyType.TEXT:
|
||||
reply_text = reply.content
|
||||
if desire_rtype == ReplyType.VOICE and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE:
|
||||
# Preserve original text for the "text-then-voice" pattern in _send_reply.
|
||||
context["voice_reply_text"] = reply.content
|
||||
reply = super().build_text_to_voice(reply.content)
|
||||
return self._decorate_reply(context, reply)
|
||||
if context.get("isgroup", False):
|
||||
@@ -317,6 +319,15 @@ class ChatChannel(Channel):
|
||||
# 短暂延迟后发送图片
|
||||
time.sleep(0.3)
|
||||
self._send(reply, context)
|
||||
# Send text bubble before voice, unless channel already streamed
|
||||
# the text (feishu) or natively renders STT under the voice (wechatcom).
|
||||
elif reply.type == ReplyType.VOICE and context.get("voice_reply_text") \
|
||||
and not context.get("feishu_streamed") \
|
||||
and context.get("channel_type") not in ("wechatcom_app",):
|
||||
text_reply = Reply(ReplyType.TEXT, context.get("voice_reply_text"))
|
||||
self._send(text_reply, context)
|
||||
time.sleep(0.3)
|
||||
self._send(reply, context)
|
||||
else:
|
||||
self._send(reply, context)
|
||||
|
||||
|
||||
@@ -86,6 +86,8 @@ def _check(func):
|
||||
|
||||
@singleton
|
||||
class DingTalkChanel(ChatChannel, dingtalk_stream.ChatbotHandler):
|
||||
NOT_SUPPORT_REPLYTYPE = []
|
||||
|
||||
dingtalk_client_id = conf().get('dingtalk_client_id')
|
||||
dingtalk_client_secret = conf().get('dingtalk_client_secret')
|
||||
|
||||
@@ -870,6 +872,48 @@ class DingTalkChanel(ChatChannel, dingtalk_stream.ChatbotHandler):
|
||||
self.reply_text("抱歉,文件上传失败", incoming_message)
|
||||
return
|
||||
|
||||
# Native sampleAudio. Upload only accepts ogg/amr, so convert TTS mp3/wav to amr.
|
||||
elif reply.type == ReplyType.VOICE:
|
||||
logger.info(f"[DingTalk] Sending voice: {reply.content}")
|
||||
access_token = self.get_access_token()
|
||||
if not access_token:
|
||||
logger.error("[DingTalk] Cannot get access token for voice")
|
||||
self.reply_text("抱歉,语音发送失败(无法获取token)", incoming_message)
|
||||
return
|
||||
|
||||
voice_path = reply.content
|
||||
if voice_path.startswith("file://"):
|
||||
voice_path = voice_path[7:]
|
||||
|
||||
amr_path = voice_path
|
||||
duration_ms = 0
|
||||
if not voice_path.lower().endswith((".amr", ".ogg")):
|
||||
try:
|
||||
from voice.audio_convert import any_to_amr
|
||||
amr_path = os.path.splitext(voice_path)[0] + ".amr"
|
||||
duration_ms = int(any_to_amr(voice_path, amr_path) or 0)
|
||||
except Exception as e:
|
||||
logger.error(f"[DingTalk] Failed to convert voice to amr: {e}")
|
||||
self.reply_text("抱歉,语音转码失败", incoming_message)
|
||||
return
|
||||
|
||||
media_id = self.upload_media(amr_path, media_type="voice")
|
||||
if not media_id:
|
||||
logger.error("[DingTalk] Failed to upload voice media")
|
||||
self.reply_text("抱歉,语音上传失败", incoming_message)
|
||||
return
|
||||
|
||||
msg_param = {
|
||||
"mediaId": media_id,
|
||||
"duration": str(duration_ms or 1000),
|
||||
}
|
||||
success = self._send_file_message(
|
||||
access_token, incoming_message, "sampleAudio", msg_param, isgroup
|
||||
)
|
||||
if not success:
|
||||
self.reply_text("抱歉,语音发送失败", incoming_message)
|
||||
return
|
||||
|
||||
# 处理文本消息
|
||||
elif reply.type == ReplyType.TEXT:
|
||||
logger.info(f"[DingTalk] Sending text message, length={len(reply.content)}")
|
||||
|
||||
@@ -81,6 +81,8 @@ def _loads_wecom_ws_json(raw):
|
||||
@singleton
|
||||
class WecomBotChannel(ChatChannel):
|
||||
|
||||
NOT_SUPPORT_REPLYTYPE = []
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.bot_id = ""
|
||||
@@ -472,6 +474,8 @@ class WecomBotChannel(ChatChannel):
|
||||
else:
|
||||
context.type = ContextType.TEXT
|
||||
context.content = content.strip()
|
||||
if "desire_rtype" not in context and conf().get("always_reply_voice"):
|
||||
context["desire_rtype"] = ReplyType.VOICE
|
||||
|
||||
return context
|
||||
|
||||
@@ -498,6 +502,8 @@ class WecomBotChannel(ChatChannel):
|
||||
self._send_file(reply.content, receiver, is_group, req_id)
|
||||
elif reply.type == ReplyType.VIDEO or reply.type == ReplyType.VIDEO_URL:
|
||||
self._send_file(reply.content, receiver, is_group, req_id, media_type="video")
|
||||
elif reply.type == ReplyType.VOICE:
|
||||
self._send_voice(reply.content, receiver, is_group, req_id)
|
||||
else:
|
||||
logger.warning(f"[WecomBot] Unsupported reply type: {reply.type}, falling back to text")
|
||||
self._send_text(str(reply.content), receiver, is_group, req_id)
|
||||
@@ -730,6 +736,65 @@ class WecomBotChannel(ChatChannel):
|
||||
},
|
||||
})
|
||||
|
||||
def _send_voice(self, voice_path: str, receiver: str, is_group: bool, req_id: str = None):
|
||||
"""Send native voice reply. WeCom voice media must be amr."""
|
||||
local_path = voice_path
|
||||
if local_path.startswith("file://"):
|
||||
local_path = local_path[7:]
|
||||
|
||||
if local_path.startswith(("http://", "https://")):
|
||||
try:
|
||||
resp = requests.get(local_path, timeout=60)
|
||||
resp.raise_for_status()
|
||||
ext = os.path.splitext(local_path)[1] or ".mp3"
|
||||
tmp_path = f"/tmp/wecom_voice_{uuid.uuid4().hex[:8]}{ext}"
|
||||
with open(tmp_path, "wb") as f:
|
||||
f.write(resp.content)
|
||||
local_path = tmp_path
|
||||
except Exception as e:
|
||||
logger.error(f"[WecomBot] Failed to download voice for sending: {e}")
|
||||
return
|
||||
|
||||
if not os.path.exists(local_path):
|
||||
logger.error(f"[WecomBot] Voice file not found: {local_path}")
|
||||
return
|
||||
|
||||
amr_path = local_path
|
||||
if not local_path.lower().endswith(".amr"):
|
||||
try:
|
||||
from voice.audio_convert import any_to_amr
|
||||
amr_path = os.path.splitext(local_path)[0] + ".amr"
|
||||
any_to_amr(local_path, amr_path)
|
||||
except Exception as e:
|
||||
logger.error(f"[WecomBot] Failed to convert voice to amr: {e}")
|
||||
return
|
||||
|
||||
media_id = self._upload_media(amr_path, "voice")
|
||||
if not media_id:
|
||||
logger.error("[WecomBot] Failed to upload voice media")
|
||||
return
|
||||
|
||||
if req_id:
|
||||
self._ws_send({
|
||||
"cmd": "aibot_respond_msg",
|
||||
"headers": {"req_id": req_id},
|
||||
"body": {
|
||||
"msgtype": "voice",
|
||||
"voice": {"media_id": media_id},
|
||||
},
|
||||
})
|
||||
else:
|
||||
self._ws_send({
|
||||
"cmd": "aibot_send_msg",
|
||||
"headers": {"req_id": self._gen_req_id()},
|
||||
"body": {
|
||||
"chatid": receiver,
|
||||
"chat_type": 2 if is_group else 1,
|
||||
"msgtype": "voice",
|
||||
"voice": {"media_id": media_id},
|
||||
},
|
||||
})
|
||||
|
||||
def _active_send_markdown(self, content: str, receiver: str, is_group: bool):
|
||||
"""Proactively send markdown message (for scheduled tasks, no req_id)."""
|
||||
self._ws_send({
|
||||
|
||||
@@ -60,6 +60,9 @@ def _save_credentials(cred_path: str, data: dict):
|
||||
@singleton
|
||||
class WeixinChannel(ChatChannel):
|
||||
|
||||
# ilink bot protocol has no outbound voice item; deliver TTS as a file.
|
||||
NOT_SUPPORT_REPLYTYPE = []
|
||||
|
||||
LOGIN_STATUS_IDLE = "idle"
|
||||
LOGIN_STATUS_WAITING = "waiting_scan"
|
||||
LOGIN_STATUS_SCANNED = "scanned"
|
||||
@@ -464,6 +467,14 @@ class WeixinChannel(ChatChannel):
|
||||
else:
|
||||
context.type = ContextType.TEXT
|
||||
context.content = content.strip()
|
||||
if "desire_rtype" not in context and conf().get("always_reply_voice"):
|
||||
context["desire_rtype"] = ReplyType.VOICE
|
||||
|
||||
elif ctype == ContextType.VOICE:
|
||||
if "desire_rtype" not in context and (
|
||||
conf().get("voice_reply_voice") or conf().get("always_reply_voice")
|
||||
):
|
||||
context["desire_rtype"] = ReplyType.VOICE
|
||||
|
||||
return context
|
||||
|
||||
@@ -486,6 +497,9 @@ class WeixinChannel(ChatChannel):
|
||||
self._send_file(reply.content, receiver, context_token)
|
||||
elif reply.type in (ReplyType.VIDEO, ReplyType.VIDEO_URL):
|
||||
self._send_video(reply.content, receiver, context_token)
|
||||
elif reply.type == ReplyType.VOICE:
|
||||
# ilink has no outbound voice item; deliver TTS as a file attachment.
|
||||
self._send_file(reply.content, receiver, context_token)
|
||||
else:
|
||||
logger.warning(f"[Weixin] Unsupported reply type: {reply.type}, fallback to text")
|
||||
self._send_text(str(reply.content), receiver, context_token)
|
||||
|
||||
Reference in New Issue
Block a user