feat(voice): enable TTS on Weixin / DingTalk / WeCom Bot with text-then-voice delivery

- Clear NOT_SUPPORT_REPLYTYPE on weixin, wecom_bot, dingtalk so TTS replies
  are actually synthesized for these channels.
- Wire desire_rtype=VOICE in weixin and wecom_bot _compose_context so the
  always_reply_voice / voice_reply_voice toggles take effect.
- DingTalk: send native sampleAudio (mediaId + duration). The media API
  only accepts ogg/amr, so convert TTS mp3/wav to amr on the fly.
- WeCom Bot: send native voice msgtype via ws (respond + active push),
  converting TTS audio to amr before upload.
- Weixin (ilink): no outbound voice item, deliver TTS as a file attachment.
- chat_channel: when a TEXT reply is converted to VOICE, stash original
  text in context["voice_reply_text"] and send a text bubble before the
  voice reply. Skipped for feishu_streamed and wechatcom_app, which
  already render text alongside the voice.
This commit is contained in:
zhayujie
2026-05-21 17:29:26 +08:00
parent b8333e351c
commit bca97a1d14
4 changed files with 134 additions and 0 deletions

View File

@@ -270,6 +270,8 @@ class ChatChannel(Channel):
if reply.type == ReplyType.TEXT:
reply_text = reply.content
if desire_rtype == ReplyType.VOICE and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE:
# Preserve original text for the "text-then-voice" pattern in _send_reply.
context["voice_reply_text"] = reply.content
reply = super().build_text_to_voice(reply.content)
return self._decorate_reply(context, reply)
if context.get("isgroup", False):
@@ -317,6 +319,15 @@ class ChatChannel(Channel):
# 短暂延迟后发送图片
time.sleep(0.3)
self._send(reply, context)
# Send text bubble before voice, unless channel already streamed
# the text (feishu) or natively renders STT under the voice (wechatcom).
elif reply.type == ReplyType.VOICE and context.get("voice_reply_text") \
and not context.get("feishu_streamed") \
and context.get("channel_type") not in ("wechatcom_app",):
text_reply = Reply(ReplyType.TEXT, context.get("voice_reply_text"))
self._send(text_reply, context)
time.sleep(0.3)
self._send(reply, context)
else:
self._send(reply, context)

View File

@@ -86,6 +86,8 @@ def _check(func):
@singleton
class DingTalkChanel(ChatChannel, dingtalk_stream.ChatbotHandler):
NOT_SUPPORT_REPLYTYPE = []
dingtalk_client_id = conf().get('dingtalk_client_id')
dingtalk_client_secret = conf().get('dingtalk_client_secret')
@@ -870,6 +872,48 @@ class DingTalkChanel(ChatChannel, dingtalk_stream.ChatbotHandler):
self.reply_text("抱歉,文件上传失败", incoming_message)
return
# Native sampleAudio. Upload only accepts ogg/amr, so convert TTS mp3/wav to amr.
elif reply.type == ReplyType.VOICE:
logger.info(f"[DingTalk] Sending voice: {reply.content}")
access_token = self.get_access_token()
if not access_token:
logger.error("[DingTalk] Cannot get access token for voice")
self.reply_text("抱歉语音发送失败无法获取token", incoming_message)
return
voice_path = reply.content
if voice_path.startswith("file://"):
voice_path = voice_path[7:]
amr_path = voice_path
duration_ms = 0
if not voice_path.lower().endswith((".amr", ".ogg")):
try:
from voice.audio_convert import any_to_amr
amr_path = os.path.splitext(voice_path)[0] + ".amr"
duration_ms = int(any_to_amr(voice_path, amr_path) or 0)
except Exception as e:
logger.error(f"[DingTalk] Failed to convert voice to amr: {e}")
self.reply_text("抱歉,语音转码失败", incoming_message)
return
media_id = self.upload_media(amr_path, media_type="voice")
if not media_id:
logger.error("[DingTalk] Failed to upload voice media")
self.reply_text("抱歉,语音上传失败", incoming_message)
return
msg_param = {
"mediaId": media_id,
"duration": str(duration_ms or 1000),
}
success = self._send_file_message(
access_token, incoming_message, "sampleAudio", msg_param, isgroup
)
if not success:
self.reply_text("抱歉,语音发送失败", incoming_message)
return
# 处理文本消息
elif reply.type == ReplyType.TEXT:
logger.info(f"[DingTalk] Sending text message, length={len(reply.content)}")

View File

@@ -81,6 +81,8 @@ def _loads_wecom_ws_json(raw):
@singleton
class WecomBotChannel(ChatChannel):
NOT_SUPPORT_REPLYTYPE = []
def __init__(self):
super().__init__()
self.bot_id = ""
@@ -472,6 +474,8 @@ class WecomBotChannel(ChatChannel):
else:
context.type = ContextType.TEXT
context.content = content.strip()
if "desire_rtype" not in context and conf().get("always_reply_voice"):
context["desire_rtype"] = ReplyType.VOICE
return context
@@ -498,6 +502,8 @@ class WecomBotChannel(ChatChannel):
self._send_file(reply.content, receiver, is_group, req_id)
elif reply.type == ReplyType.VIDEO or reply.type == ReplyType.VIDEO_URL:
self._send_file(reply.content, receiver, is_group, req_id, media_type="video")
elif reply.type == ReplyType.VOICE:
self._send_voice(reply.content, receiver, is_group, req_id)
else:
logger.warning(f"[WecomBot] Unsupported reply type: {reply.type}, falling back to text")
self._send_text(str(reply.content), receiver, is_group, req_id)
@@ -730,6 +736,65 @@ class WecomBotChannel(ChatChannel):
},
})
def _send_voice(self, voice_path: str, receiver: str, is_group: bool, req_id: str = None):
"""Send native voice reply. WeCom voice media must be amr."""
local_path = voice_path
if local_path.startswith("file://"):
local_path = local_path[7:]
if local_path.startswith(("http://", "https://")):
try:
resp = requests.get(local_path, timeout=60)
resp.raise_for_status()
ext = os.path.splitext(local_path)[1] or ".mp3"
tmp_path = f"/tmp/wecom_voice_{uuid.uuid4().hex[:8]}{ext}"
with open(tmp_path, "wb") as f:
f.write(resp.content)
local_path = tmp_path
except Exception as e:
logger.error(f"[WecomBot] Failed to download voice for sending: {e}")
return
if not os.path.exists(local_path):
logger.error(f"[WecomBot] Voice file not found: {local_path}")
return
amr_path = local_path
if not local_path.lower().endswith(".amr"):
try:
from voice.audio_convert import any_to_amr
amr_path = os.path.splitext(local_path)[0] + ".amr"
any_to_amr(local_path, amr_path)
except Exception as e:
logger.error(f"[WecomBot] Failed to convert voice to amr: {e}")
return
media_id = self._upload_media(amr_path, "voice")
if not media_id:
logger.error("[WecomBot] Failed to upload voice media")
return
if req_id:
self._ws_send({
"cmd": "aibot_respond_msg",
"headers": {"req_id": req_id},
"body": {
"msgtype": "voice",
"voice": {"media_id": media_id},
},
})
else:
self._ws_send({
"cmd": "aibot_send_msg",
"headers": {"req_id": self._gen_req_id()},
"body": {
"chatid": receiver,
"chat_type": 2 if is_group else 1,
"msgtype": "voice",
"voice": {"media_id": media_id},
},
})
def _active_send_markdown(self, content: str, receiver: str, is_group: bool):
"""Proactively send markdown message (for scheduled tasks, no req_id)."""
self._ws_send({

View File

@@ -60,6 +60,9 @@ def _save_credentials(cred_path: str, data: dict):
@singleton
class WeixinChannel(ChatChannel):
# ilink bot protocol has no outbound voice item; deliver TTS as a file.
NOT_SUPPORT_REPLYTYPE = []
LOGIN_STATUS_IDLE = "idle"
LOGIN_STATUS_WAITING = "waiting_scan"
LOGIN_STATUS_SCANNED = "scanned"
@@ -464,6 +467,14 @@ class WeixinChannel(ChatChannel):
else:
context.type = ContextType.TEXT
context.content = content.strip()
if "desire_rtype" not in context and conf().get("always_reply_voice"):
context["desire_rtype"] = ReplyType.VOICE
elif ctype == ContextType.VOICE:
if "desire_rtype" not in context and (
conf().get("voice_reply_voice") or conf().get("always_reply_voice")
):
context["desire_rtype"] = ReplyType.VOICE
return context
@@ -486,6 +497,9 @@ class WeixinChannel(ChatChannel):
self._send_file(reply.content, receiver, context_token)
elif reply.type in (ReplyType.VIDEO, ReplyType.VIDEO_URL):
self._send_video(reply.content, receiver, context_token)
elif reply.type == ReplyType.VOICE:
# ilink has no outbound voice item; deliver TTS as a file attachment.
self._send_file(reply.content, receiver, context_token)
else:
logger.warning(f"[Weixin] Unsupported reply type: {reply.type}, fallback to text")
self._send_text(str(reply.content), receiver, context_token)