From bca97a1d145a42ebf2f6f86e1db42cb0114e6f7f Mon Sep 17 00:00:00 2001 From: zhayujie Date: Thu, 21 May 2026 17:29:26 +0800 Subject: [PATCH] feat(voice): enable TTS on Weixin / DingTalk / WeCom Bot with text-then-voice delivery - Clear NOT_SUPPORT_REPLYTYPE on weixin, wecom_bot, dingtalk so TTS replies are actually synthesized for these channels. - Wire desire_rtype=VOICE in weixin and wecom_bot _compose_context so the always_reply_voice / voice_reply_voice toggles take effect. - DingTalk: send native sampleAudio (mediaId + duration). The media API only accepts ogg/amr, so convert TTS mp3/wav to amr on the fly. - WeCom Bot: send native voice msgtype via ws (respond + active push), converting TTS audio to amr before upload. - Weixin (ilink): no outbound voice item, deliver TTS as a file attachment. - chat_channel: when a TEXT reply is converted to VOICE, stash original text in context["voice_reply_text"] and send a text bubble before the voice reply. Skipped for feishu_streamed and wechatcom_app, which already render text alongside the voice. --- channel/chat_channel.py | 11 +++++ channel/dingtalk/dingtalk_channel.py | 44 +++++++++++++++++ channel/wecom_bot/wecom_bot_channel.py | 65 ++++++++++++++++++++++++++ channel/weixin/weixin_channel.py | 14 ++++++ 4 files changed, 134 insertions(+) diff --git a/channel/chat_channel.py b/channel/chat_channel.py index 760bf860..c38dd7c8 100644 --- a/channel/chat_channel.py +++ b/channel/chat_channel.py @@ -270,6 +270,8 @@ class ChatChannel(Channel): if reply.type == ReplyType.TEXT: reply_text = reply.content if desire_rtype == ReplyType.VOICE and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE: + # Preserve original text for the "text-then-voice" pattern in _send_reply. 
+ context["voice_reply_text"] = reply.content reply = super().build_text_to_voice(reply.content) return self._decorate_reply(context, reply) if context.get("isgroup", False): @@ -317,6 +319,15 @@ class ChatChannel(Channel): # 短暂延迟后发送图片 time.sleep(0.3) self._send(reply, context) + # Send text bubble before voice, unless channel already streamed + # the text (feishu) or natively renders STT under the voice (wechatcom). + elif reply.type == ReplyType.VOICE and context.get("voice_reply_text") \ + and not context.get("feishu_streamed") \ + and context.get("channel_type") not in ("wechatcom_app",): + text_reply = Reply(ReplyType.TEXT, context.get("voice_reply_text")) + self._send(text_reply, context) + time.sleep(0.3) + self._send(reply, context) else: self._send(reply, context) diff --git a/channel/dingtalk/dingtalk_channel.py b/channel/dingtalk/dingtalk_channel.py index d572e35d..b1ae86c2 100644 --- a/channel/dingtalk/dingtalk_channel.py +++ b/channel/dingtalk/dingtalk_channel.py @@ -86,6 +86,8 @@ def _check(func): @singleton class DingTalkChanel(ChatChannel, dingtalk_stream.ChatbotHandler): + NOT_SUPPORT_REPLYTYPE = [] + dingtalk_client_id = conf().get('dingtalk_client_id') dingtalk_client_secret = conf().get('dingtalk_client_secret') @@ -870,6 +872,48 @@ class DingTalkChanel(ChatChannel, dingtalk_stream.ChatbotHandler): self.reply_text("抱歉,文件上传失败", incoming_message) return + # Native sampleAudio. Upload only accepts ogg/amr, so convert TTS mp3/wav to amr. 
+ elif reply.type == ReplyType.VOICE: + logger.info(f"[DingTalk] Sending voice: {reply.content}") + access_token = self.get_access_token() + if not access_token: + logger.error("[DingTalk] Cannot get access token for voice") + self.reply_text("抱歉,语音发送失败(无法获取token)", incoming_message) + return + + voice_path = reply.content + if voice_path.startswith("file://"): + voice_path = voice_path[7:] + + amr_path = voice_path + duration_ms = 0 + if not voice_path.lower().endswith((".amr", ".ogg")): + try: + from voice.audio_convert import any_to_amr + amr_path = os.path.splitext(voice_path)[0] + ".amr" + duration_ms = int(any_to_amr(voice_path, amr_path) or 0) + except Exception as e: + logger.error(f"[DingTalk] Failed to convert voice to amr: {e}") + self.reply_text("抱歉,语音转码失败", incoming_message) + return + + media_id = self.upload_media(amr_path, media_type="voice") + if not media_id: + logger.error("[DingTalk] Failed to upload voice media") + self.reply_text("抱歉,语音上传失败", incoming_message) + return + + msg_param = { + "mediaId": media_id, + "duration": str(duration_ms or 1000), + } + success = self._send_file_message( + access_token, incoming_message, "sampleAudio", msg_param, isgroup + ) + if not success: + self.reply_text("抱歉,语音发送失败", incoming_message) + return + # 处理文本消息 elif reply.type == ReplyType.TEXT: logger.info(f"[DingTalk] Sending text message, length={len(reply.content)}") diff --git a/channel/wecom_bot/wecom_bot_channel.py b/channel/wecom_bot/wecom_bot_channel.py index 7aaca56b..0fe4500b 100644 --- a/channel/wecom_bot/wecom_bot_channel.py +++ b/channel/wecom_bot/wecom_bot_channel.py @@ -81,6 +81,8 @@ def _loads_wecom_ws_json(raw): @singleton class WecomBotChannel(ChatChannel): + NOT_SUPPORT_REPLYTYPE = [] + def __init__(self): super().__init__() self.bot_id = "" @@ -472,6 +474,8 @@ class WecomBotChannel(ChatChannel): else: context.type = ContextType.TEXT context.content = content.strip() + if "desire_rtype" not in context and conf().get("always_reply_voice"): + 
context["desire_rtype"] = ReplyType.VOICE return context @@ -498,6 +502,8 @@ class WecomBotChannel(ChatChannel): self._send_file(reply.content, receiver, is_group, req_id) elif reply.type == ReplyType.VIDEO or reply.type == ReplyType.VIDEO_URL: self._send_file(reply.content, receiver, is_group, req_id, media_type="video") + elif reply.type == ReplyType.VOICE: + self._send_voice(reply.content, receiver, is_group, req_id) else: logger.warning(f"[WecomBot] Unsupported reply type: {reply.type}, falling back to text") self._send_text(str(reply.content), receiver, is_group, req_id) @@ -730,6 +736,65 @@ class WecomBotChannel(ChatChannel): }, })
+    def _send_voice(self, voice_path: str, receiver: str, is_group: bool, req_id: str = None):
+        """Send a native voice reply; WeCom voice media must be amr.
+
+        ``voice_path`` may be a local path, a ``file://`` path or an http(s) URL.
+        With ``req_id`` the voice answers that request; otherwise it is pushed to
+        ``receiver``. Failures are logged and dropped; files created here are removed.
+        """
+        from urllib.parse import urlparse
+
+        local_path = voice_path[7:] if voice_path.startswith("file://") else voice_path
+        created = []  # files written by this call; removed in the finally below
+        try:
+            if local_path.startswith(("http://", "https://")):
+                try:
+                    resp = requests.get(local_path, timeout=60)
+                    resp.raise_for_status()
+                    # Extension from the URL path only, so "?query" never ends up in it.
+                    ext = os.path.splitext(urlparse(local_path).path)[1] or ".mp3"
+                    local_path = f"/tmp/wecom_voice_{uuid.uuid4().hex[:8]}{ext}"
+                    created.append(local_path)
+                    with open(local_path, "wb") as f:
+                        f.write(resp.content)
+                except Exception as e:
+                    logger.error(f"[WecomBot] Failed to download voice for sending: {e}")
+                    return
+
+            if not os.path.exists(local_path):
+                logger.error(f"[WecomBot] Voice file not found: {local_path}")
+                return
+
+            amr_path = local_path
+            if not local_path.lower().endswith(".amr"):
+                try:
+                    from voice.audio_convert import any_to_amr
+                    amr_path = os.path.splitext(local_path)[0] + ".amr"
+                    created.append(amr_path)
+                    any_to_amr(local_path, amr_path)
+                except Exception as e:
+                    logger.error(f"[WecomBot] Failed to convert voice to amr: {e}")
+                    return
+
+            media_id = self._upload_media(amr_path, "voice")
+            if not media_id:
+                logger.error("[WecomBot] Failed to upload voice media")
+                return
+
+            body = {"msgtype": "voice", "voice": {"media_id": media_id}}
+            cmd = "aibot_respond_msg"
+            if not req_id:  # no request to answer -> active push to the chat
+                cmd, req_id = "aibot_send_msg", self._gen_req_id()
+                body = {"chatid": receiver, "chat_type": 2 if is_group else 1, **body}
+            self._ws_send({"cmd": cmd, "headers": {"req_id": req_id}, "body": body})
+        finally:
+            for path in created:
+                try:
+                    os.remove(path)
+                except OSError:
+                    pass  # best-effort cleanup; file may never have been written
+
 def _active_send_markdown(self, content: str, receiver: str, is_group: bool): """Proactively send markdown message (for scheduled tasks, no req_id).""" self._ws_send({ diff --git a/channel/weixin/weixin_channel.py b/channel/weixin/weixin_channel.py index dba9060f..61f5cbb1 100644 --- a/channel/weixin/weixin_channel.py +++ b/channel/weixin/weixin_channel.py @@ -60,6 +60,9 @@ def _save_credentials(cred_path: str, data: dict): @singleton class WeixinChannel(ChatChannel): + # ilink bot protocol has no outbound voice item; deliver TTS as a file. + NOT_SUPPORT_REPLYTYPE = [] + LOGIN_STATUS_IDLE = "idle" LOGIN_STATUS_WAITING = "waiting_scan" LOGIN_STATUS_SCANNED = "scanned" @@ -464,6 +467,14 @@ class WeixinChannel(ChatChannel): else: context.type = ContextType.TEXT context.content = content.strip() + if "desire_rtype" not in context and conf().get("always_reply_voice"): + context["desire_rtype"] = ReplyType.VOICE + + elif ctype == ContextType.VOICE: + if "desire_rtype" not in context and ( + conf().get("voice_reply_voice") or conf().get("always_reply_voice") + ): + context["desire_rtype"] = ReplyType.VOICE return context @@ -486,6 +497,9 @@ class WeixinChannel(ChatChannel): self._send_file(reply.content, receiver, context_token) elif reply.type in (ReplyType.VIDEO, ReplyType.VIDEO_URL): self._send_video(reply.content, receiver, context_token) + elif reply.type == ReplyType.VOICE: + # ilink has no outbound voice item; deliver TTS as a file attachment. + self._send_file(reply.content, receiver, context_token) else: logger.warning(f"[Weixin] Unsupported reply type: {reply.type}, fallback to text") self._send_text(str(reply.content), receiver, context_token)