From bca97a1d145a42ebf2f6f86e1db42cb0114e6f7f Mon Sep 17 00:00:00 2001
From: zhayujie <yjzha1996@163.com>
Date: Thu, 21 May 2026 17:29:26 +0800
Subject: [PATCH] feat(voice): enable TTS on Weixin / DingTalk / WeCom Bot with
 text-then-voice delivery

- Clear NOT_SUPPORT_REPLYTYPE on weixin, wecom_bot, dingtalk so TTS replies
  are actually synthesized for these channels.
- Wire desire_rtype=VOICE in weixin and wecom_bot _compose_context so the
  always_reply_voice / voice_reply_voice toggles take effect.
- DingTalk: send native sampleAudio (mediaId + duration). The media API
  only accepts ogg/amr, so convert TTS mp3/wav to amr on the fly.
- WeCom Bot: send native voice msgtype via ws (respond + active push),
  converting TTS audio to amr before upload.
- Weixin (ilink): no outbound voice item, deliver TTS as a file attachment.
- chat_channel: when a TEXT reply is converted to VOICE, stash original
  text in context["voice_reply_text"] and send a text bubble before the
  voice reply. Skipped for feishu_streamed and wechatcom_app, which
  already render text alongside the voice.
---
 channel/chat_channel.py                | 11 +++++
 channel/dingtalk/dingtalk_channel.py   | 44 +++++++++++++++++
 channel/wecom_bot/wecom_bot_channel.py | 65 ++++++++++++++++++++++++++
 channel/weixin/weixin_channel.py       | 14 ++++++
 4 files changed, 134 insertions(+)

diff --git a/channel/chat_channel.py b/channel/chat_channel.py
index 760bf860..c38dd7c8 100644
--- a/channel/chat_channel.py
+++ b/channel/chat_channel.py
@@ -270,6 +270,8 @@ class ChatChannel(Channel):
                 if reply.type == ReplyType.TEXT:
                     reply_text = reply.content
                     if desire_rtype == ReplyType.VOICE and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE:
+                        # Preserve original text for the "text-then-voice" pattern in _send_reply.
+                        context["voice_reply_text"] = reply.content
                         reply = super().build_text_to_voice(reply.content)
                         return self._decorate_reply(context, reply)
                     if context.get("isgroup", False):
@@ -317,6 +319,15 @@ class ChatChannel(Channel):
                     # 短暂延迟后发送图片
                     time.sleep(0.3)
                     self._send(reply, context)
+                # Send text bubble before voice, unless channel already streamed
+                # the text (feishu) or natively renders STT under the voice (wechatcom).
+                elif reply.type == ReplyType.VOICE and context.get("voice_reply_text") \
+                        and not context.get("feishu_streamed") \
+                        and context.get("channel_type") not in ("wechatcom_app",):
+                    text_reply = Reply(ReplyType.TEXT, context.get("voice_reply_text"))
+                    self._send(text_reply, context)
+                    time.sleep(0.3)
+                    self._send(reply, context)
                 else:
                     self._send(reply, context)
     
diff --git a/channel/dingtalk/dingtalk_channel.py b/channel/dingtalk/dingtalk_channel.py
index d572e35d..b1ae86c2 100644
--- a/channel/dingtalk/dingtalk_channel.py
+++ b/channel/dingtalk/dingtalk_channel.py
@@ -86,6 +86,8 @@ def _check(func):
 
 @singleton
 class DingTalkChanel(ChatChannel, dingtalk_stream.ChatbotHandler):
+    NOT_SUPPORT_REPLYTYPE = []
+
     dingtalk_client_id = conf().get('dingtalk_client_id')
     dingtalk_client_secret = conf().get('dingtalk_client_secret')
 
@@ -870,6 +872,48 @@ class DingTalkChanel(ChatChannel, dingtalk_stream.ChatbotHandler):
                     self.reply_text("抱歉，文件上传失败", incoming_message)
             return
         
+        # Native sampleAudio. Upload only accepts ogg/amr, so convert TTS mp3/wav to amr.
+        elif reply.type == ReplyType.VOICE:
+            logger.info(f"[DingTalk] Sending voice: {reply.content}")
+            access_token = self.get_access_token()
+            if not access_token:
+                logger.error("[DingTalk] Cannot get access token for voice")
+                self.reply_text("抱歉，语音发送失败（无法获取token）", incoming_message)
+                return
+
+            voice_path = reply.content
+            if voice_path.startswith("file://"):
+                voice_path = voice_path[7:]
+
+            amr_path = voice_path
+            duration_ms = 0
+            if not voice_path.lower().endswith((".amr", ".ogg")):
+                try:
+                    from voice.audio_convert import any_to_amr
+                    amr_path = os.path.splitext(voice_path)[0] + ".amr"
+                    duration_ms = int(any_to_amr(voice_path, amr_path) or 0)
+                except Exception as e:
+                    logger.error(f"[DingTalk] Failed to convert voice to amr: {e}")
+                    self.reply_text("抱歉，语音转码失败", incoming_message)
+                    return
+
+            media_id = self.upload_media(amr_path, media_type="voice")
+            if not media_id:
+                logger.error("[DingTalk] Failed to upload voice media")
+                self.reply_text("抱歉，语音上传失败", incoming_message)
+                return
+
+            msg_param = {
+                "mediaId": media_id,
+                "duration": str(duration_ms or 1000),
+            }
+            success = self._send_file_message(
+                access_token, incoming_message, "sampleAudio", msg_param, isgroup
+            )
+            if not success:
+                self.reply_text("抱歉，语音发送失败", incoming_message)
+            return
+
         # 处理文本消息
         elif reply.type == ReplyType.TEXT:
             logger.info(f"[DingTalk] Sending text message, length={len(reply.content)}")
diff --git a/channel/wecom_bot/wecom_bot_channel.py b/channel/wecom_bot/wecom_bot_channel.py
index 7aaca56b..0fe4500b 100644
--- a/channel/wecom_bot/wecom_bot_channel.py
+++ b/channel/wecom_bot/wecom_bot_channel.py
@@ -81,6 +81,8 @@ def _loads_wecom_ws_json(raw):
 @singleton
 class WecomBotChannel(ChatChannel):
 
+    NOT_SUPPORT_REPLYTYPE = []
+
     def __init__(self):
         super().__init__()
         self.bot_id = ""
@@ -472,6 +474,8 @@ class WecomBotChannel(ChatChannel):
             else:
                 context.type = ContextType.TEXT
             context.content = content.strip()
+            if "desire_rtype" not in context and conf().get("always_reply_voice"):
+                context["desire_rtype"] = ReplyType.VOICE
 
         return context
 
@@ -498,6 +502,8 @@ class WecomBotChannel(ChatChannel):
             self._send_file(reply.content, receiver, is_group, req_id)
         elif reply.type == ReplyType.VIDEO or reply.type == ReplyType.VIDEO_URL:
             self._send_file(reply.content, receiver, is_group, req_id, media_type="video")
+        elif reply.type == ReplyType.VOICE:
+            self._send_voice(reply.content, receiver, is_group, req_id)
         else:
             logger.warning(f"[WecomBot] Unsupported reply type: {reply.type}, falling back to text")
             self._send_text(str(reply.content), receiver, is_group, req_id)
@@ -730,6 +736,65 @@ class WecomBotChannel(ChatChannel):
                 },
             })
 
+    def _send_voice(self, voice_path: str, receiver: str, is_group: bool, req_id: str = None):
+        """Send native voice reply. WeCom voice media must be amr."""
+        local_path = voice_path
+        if local_path.startswith("file://"):
+            local_path = local_path[7:]
+
+        if local_path.startswith(("http://", "https://")):
+            try:
+                resp = requests.get(local_path, timeout=60)
+                resp.raise_for_status()
+                ext = os.path.splitext(local_path)[1] or ".mp3"
+                tmp_path = f"/tmp/wecom_voice_{uuid.uuid4().hex[:8]}{ext}"
+                with open(tmp_path, "wb") as f:
+                    f.write(resp.content)
+                local_path = tmp_path
+            except Exception as e:
+                logger.error(f"[WecomBot] Failed to download voice for sending: {e}")
+                return
+
+        if not os.path.exists(local_path):
+            logger.error(f"[WecomBot] Voice file not found: {local_path}")
+            return
+
+        amr_path = local_path
+        if not local_path.lower().endswith(".amr"):
+            try:
+                from voice.audio_convert import any_to_amr
+                amr_path = os.path.splitext(local_path)[0] + ".amr"
+                any_to_amr(local_path, amr_path)
+            except Exception as e:
+                logger.error(f"[WecomBot] Failed to convert voice to amr: {e}")
+                return
+
+        media_id = self._upload_media(amr_path, "voice")
+        if not media_id:
+            logger.error("[WecomBot] Failed to upload voice media")
+            return
+
+        if req_id:
+            self._ws_send({
+                "cmd": "aibot_respond_msg",
+                "headers": {"req_id": req_id},
+                "body": {
+                    "msgtype": "voice",
+                    "voice": {"media_id": media_id},
+                },
+            })
+        else:
+            self._ws_send({
+                "cmd": "aibot_send_msg",
+                "headers": {"req_id": self._gen_req_id()},
+                "body": {
+                    "chatid": receiver,
+                    "chat_type": 2 if is_group else 1,
+                    "msgtype": "voice",
+                    "voice": {"media_id": media_id},
+                },
+            })
+
     def _active_send_markdown(self, content: str, receiver: str, is_group: bool):
         """Proactively send markdown message (for scheduled tasks, no req_id)."""
         self._ws_send({
diff --git a/channel/weixin/weixin_channel.py b/channel/weixin/weixin_channel.py
index dba9060f..61f5cbb1 100644
--- a/channel/weixin/weixin_channel.py
+++ b/channel/weixin/weixin_channel.py
@@ -60,6 +60,9 @@ def _save_credentials(cred_path: str, data: dict):
 @singleton
 class WeixinChannel(ChatChannel):
 
+    # ilink bot protocol has no outbound voice item; deliver TTS as a file.
+    NOT_SUPPORT_REPLYTYPE = []
+
     LOGIN_STATUS_IDLE = "idle"
     LOGIN_STATUS_WAITING = "waiting_scan"
     LOGIN_STATUS_SCANNED = "scanned"
@@ -464,6 +467,14 @@ class WeixinChannel(ChatChannel):
             else:
                 context.type = ContextType.TEXT
             context.content = content.strip()
+            if "desire_rtype" not in context and conf().get("always_reply_voice"):
+                context["desire_rtype"] = ReplyType.VOICE
+
+        elif ctype == ContextType.VOICE:
+            if "desire_rtype" not in context and (
+                conf().get("voice_reply_voice") or conf().get("always_reply_voice")
+            ):
+                context["desire_rtype"] = ReplyType.VOICE
 
         return context
 
@@ -486,6 +497,9 @@ class WeixinChannel(ChatChannel):
             self._send_file(reply.content, receiver, context_token)
         elif reply.type in (ReplyType.VIDEO, ReplyType.VIDEO_URL):
             self._send_video(reply.content, receiver, context_token)
+        elif reply.type == ReplyType.VOICE:
+            # ilink has no outbound voice item; deliver TTS as a file attachment.
+            self._send_file(reply.content, receiver, context_token)
         else:
             logger.warning(f"[Weixin] Unsupported reply type: {reply.type}, fallback to text")
             self._send_text(str(reply.content), receiver, context_token)