diff --git a/bridge/bridge.py b/bridge/bridge.py
index 753e394a..c0cb62e4 100644
--- a/bridge/bridge.py
+++ b/bridge/bridge.py
@@ -14,7 +14,9 @@ class Bridge(object):
def __init__(self):
self.btype = {
"chat": const.OPENAI,
- "voice_to_text": conf().get("voice_to_text", "openai"),
+ # Empty `voice_to_text` (the default in new configs) triggers
+ # the auto-pick below — see _auto_pick_voice_to_text for order.
+ "voice_to_text": conf().get("voice_to_text") or self._auto_pick_voice_to_text(),
"text_to_voice": conf().get("text_to_voice", "google"),
"translate": conf().get("translate", "baidu"),
}
@@ -84,6 +86,46 @@ class Bridge(object):
self.chat_bots = {}
self._agent_bridge = None
+    def refresh_voice(self):
+        """Reload the voice providers from config and drop the cached bots.
+
+        Recomputes the ``voice_to_text`` / ``text_to_voice`` entries of
+        ``self.btype`` from the current config and removes both keys from
+        ``self.bots`` so the next lookup uses the new provider. The web
+        console calls this after voice settings are edited; agent_bridge
+        and agent state are left untouched.
+        """
+        settings = conf()
+        pinned_v2t = settings.get("voice_to_text")
+        pinned_t2v = settings.get("text_to_voice")
+
+        # An unset/empty ASR provider is resolved from the configured api keys.
+        new_v2t = pinned_v2t or self._auto_pick_voice_to_text()
+        new_t2v = settings.get("text_to_voice", "google")
+
+        # With LinkAI enabled, unset or OpenAI-family providers are routed
+        # through LinkAI instead.
+        linkai_enabled = settings.get("use_linkai") and settings.get("linkai_api_key")
+        if linkai_enabled:
+            if not pinned_v2t or pinned_v2t == "openai":
+                new_v2t = const.LINKAI
+            if not pinned_t2v or pinned_t2v in ("openai", const.TTS_1, const.TTS_1_HD):
+                new_t2v = const.LINKAI
+
+        self.btype.update(voice_to_text=new_v2t, text_to_voice=new_t2v)
+        for cached in ("voice_to_text", "text_to_voice"):
+            self.bots.pop(cached, None)
+        logger.info(f"[Bridge] voice refreshed: voice_to_text={new_v2t}, text_to_voice={new_t2v}")
+
+    @staticmethod
+    def _auto_pick_voice_to_text() -> str:
+        """Pick an ASR provider from the configured api keys.
+
+        Used when ``voice_to_text`` is unset. Providers are probed in the
+        order openai -> dashscope -> zhipu -> linkai and the first one
+        whose api key looks real wins.
+
+        Returns:
+            The provider id, or ``"openai"`` when no key is configured so
+            the original "missing key" error is preserved.
+        """
+        placeholders = ("YOUR API KEY", "YOUR_API_KEY")
+        settings = conf()
+
+        def has(k: str) -> bool:
+            raw = settings.get(k)
+            # A non-string value (e.g. a number or null in a hand-edited
+            # config) cannot be a usable key; treat it as unset instead of
+            # raising AttributeError on .strip() during Bridge construction.
+            if not isinstance(raw, str):
+                return False
+            v = raw.strip()
+            return v != "" and v not in placeholders
+
+        for key, provider in (
+            ("open_ai_api_key", "openai"),
+            ("dashscope_api_key", "dashscope"),
+            ("zhipu_ai_api_key", "zhipu"),
+            ("linkai_api_key", "linkai"),
+        ):
+            if has(key):
+                return provider
+        return "openai"
+
# 模型对应的接口
def get_bot(self, typename):
if self.bots.get(typename) is None:
diff --git a/channel/web/chat.html b/channel/web/chat.html
index 31705d66..ba68e0f4 100644
--- a/channel/web/chat.html
+++ b/channel/web/chat.html
@@ -422,15 +422,24 @@
-
+
+
+
+
+
+
{
+ const elapsed = Date.now() - recordStartedAt;
+ const minMs = 350;
+ if (elapsed < minMs) {
+ // Give the recorder a moment to capture at least one chunk
+ // before we tell it to stop.
+ setTimeout(() => stop(), minMs - elapsed);
+ } else {
+ stop();
+ }
+ };
+
+ const stop = () => {
+ if (mediaRecorder && mediaRecorder.state !== 'inactive') {
+ mediaRecorder.stop();
+ }
+ };
+
+ micBtn.addEventListener('click', () => {
+ if (recording) {
+ stopWithMinDuration();
+ } else {
+ start();
+ }
+ });
+
+ setIdle();
+})();
+
// Smart auto-scroll: pause when user scrolls up, resume when near bottom
let _autoScrollEnabled = true;
const _SCROLL_THRESHOLD = 80; // px from bottom to re-enable auto-scroll
@@ -1250,6 +1449,87 @@ document.querySelectorAll('.example-card').forEach(card => {
});
});
+// Voice-message variant of sendMessage(): renders a playable audio bubble
+// with the ASR caption, then dispatches the recognised text to /message
+// through the same SSE/loading flow as a typed message.
+function sendVoiceMessage(text, audioUrl) {
+    // Sends an already-transcribed voice message: renders the voice bubble,
+    // shows the loading indicator, then POSTs the text to /message.
+    //   text     - recognised caption; blank/whitespace-only input is ignored
+    //   audioUrl - forwarded to addUserVoiceMessage for the playable bubble
+    text = (text || '').trim();
+    if (!text) return;
+
+    inputHistory.push(text);
+    historyIdx = -1;
+    historySavedDraft = '';
+
+    const ws = document.getElementById('welcome-screen');
+    const isFirstMessage = !!ws;
+    if (ws) ws.remove();
+
+    const titleInfo = isFirstMessage ? { sid: sessionId, userMsg: text } : null;
+    const timestamp = new Date();
+    addUserVoiceMessage(audioUrl, text, timestamp);
+    const loadingEl = addLoadingIndicator();
+
+    const body = {
+        session_id: sessionId,
+        message: text,
+        stream: true,
+        timestamp: timestamp.toISOString(),
+    };
+
+    const MAX_RETRIES = 2;
+    const RETRY_DELAY_MS = 1000;
+
+    // Drop the loading indicator and show the generic send error.
+    function failSend() {
+        loadingEl.remove();
+        addBotMessage(t('error_send'), new Date());
+    }
+
+    // Runs once the server has answered; must never trigger a re-POST.
+    function handleResponse(data) {
+        if (data.status !== 'success') {
+            failSend();
+            return;
+        }
+        if (data.stream) {
+            startSSE(data.request_id, loadingEl, timestamp, titleInfo);
+        } else {
+            loadingContainers[data.request_id] = loadingEl;
+        }
+    }
+
+    function postWithRetry(attempt) {
+        fetch('/message', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify(body)
+        })
+        .then(r => r.json())
+        .then(
+            handleResponse,
+            // Rejection handler passed as the second argument so it only
+            // sees fetch / JSON failures. A trailing .catch() would also
+            // catch exceptions thrown by handleResponse and re-send a
+            // message the server has already accepted.
+            () => {
+                if (attempt < MAX_RETRIES) {
+                    setTimeout(() => postWithRetry(attempt + 1), RETRY_DELAY_MS * (attempt + 1));
+                    return;
+                }
+                failSend();
+            }
+        )
+        .catch(err => {
+            // handleResponse itself failed: report once, do not retry.
+            console.error('sendVoiceMessage: response handling failed', err);
+            failSend();
+        });
+    }
+    postWithRetry(0);
+}
+
+// Append a right-aligned user bubble for a voice message to messagesDiv,
+// then force-enable auto-scroll and scroll the chat to the bottom.
+//   audioUrl  - presumably the source of the playable audio element; the
+//               template markup is not visible in this view.
+//               NOTE(review): confirm it is escaped before interpolation.
+//   caption   - ASR text; passed through escapeHtml() and omitted when empty
+//   timestamp - rendered via formatTime()
+function addUserVoiceMessage(audioUrl, caption, timestamp) {
+ const el = document.createElement('div');
+ el.className = 'flex justify-end px-4 sm:px-6 py-3';
+ // Voice-message bubble: playable on top, ASR caption beneath.
+ // The bubble keeps the same primary tint as a normal user message so
+ // it visually slots into the conversation flow.
+ el.innerHTML = `
+
+
+
+ ${caption ? `
${escapeHtml(caption)}
` : ''}
+
+
${formatTime(timestamp)}
+
+ `;
+ messagesDiv.appendChild(el);
+ _autoScrollEnabled = true;
+ scrollChatToBottom(true);
+}
+
function sendMessage() {
const text = chatInput.value.trim();
if (!text && pendingAttachments.length === 0) return;
@@ -2573,7 +2853,12 @@ let cfgProviderValue = '';
let cfgModelValue = '';
// --- Custom dropdown helper ---
-function initDropdown(el, options, selectedValue, onChange) {
+function initDropdown(el, options, selectedValue, onChange, opts) {
+ // opts.placeholder: when set AND selectedValue is empty, render that text
+ // in a dim style instead of auto-selecting options[0]. Useful for
+ // "pick or empty" capabilities (asr / embedding) where we want the
+ // user to make an explicit choice.
+ opts = opts || {};
const textEl = el.querySelector('.cfg-dropdown-text');
const menuEl = el.querySelector('.cfg-dropdown-menu');
const selEl = el.querySelector('.cfg-dropdown-selected');
@@ -2615,8 +2900,20 @@ function initDropdown(el, options, selectedValue, onChange) {
menuEl.appendChild(item);
});
const sel = options.find(o => o.value === el._ddValue);
- textEl.textContent = sel ? sel.label : (options[0] ? options[0].label : '--');
- if (!sel && options[0]) el._ddValue = options[0].value;
+ if (sel) {
+ textEl.textContent = sel.label;
+ textEl.classList.remove('text-slate-400', 'dark:text-slate-500');
+ } else if (opts.placeholder && !el._ddValue) {
+ // No selection yet — show the placeholder in muted style.
+ // Do NOT write a fallback value, so the dropdown stays
+ // "unsaved" until the user explicitly picks.
+ textEl.textContent = opts.placeholder;
+ textEl.classList.add('text-slate-400', 'dark:text-slate-500');
+ } else {
+ textEl.textContent = options[0] ? options[0].label : '--';
+ textEl.classList.remove('text-slate-400', 'dark:text-slate-500');
+ if (options[0]) el._ddValue = options[0].value;
+ }
}
render();
@@ -3566,21 +3863,27 @@ function renderCapabilityBody(def, cap, body) {
// For auto-capable capabilities, an "auto" strategy means the user has
// not pinned a vendor; we honor that by selecting the empty-string
// sentinel rather than the resolved fallback provider name.
- // `suggested_provider` is a UI-only preselect for embedding when nothing
- // is pinned yet — purely cosmetic, not persisted until the user saves.
+ // `suggested_provider` is a UI-only preselect (used by embedding & ASR)
+ // when the user has not pinned a vendor yet — purely cosmetic, not
+ // persisted until the user clicks Save.
+ // For "pick or empty" capabilities (no current, no suggestion), we leave
+ // the dropdown unselected and show a muted placeholder so the user is
+ // nudged to pick explicitly.
+ const noSelectionAndNoHint = !cap.current_provider && !cap.suggested_provider;
const initialProviderValue = pendingProvider
? pendingProvider
: ((cap.strategy === 'auto' && capabilitySupportsAuto(def.id))
? ''
: (cap.current_provider
|| cap.suggested_provider
- || (ddOpts[0] && ddOpts[0].value)
+ || (noSelectionAndNoHint ? '' : (ddOpts[0] && ddOpts[0].value))
|| ''));
initDropdown(
provDd,
ddOpts,
initialProviderValue,
- (value) => onCapabilityProviderChange(def, value, body)
+ (value) => onCapabilityProviderChange(def, value, body),
+ noSelectionAndNoHint ? { placeholder: t('models_pick_provider') } : null
);
decorateCapabilityProviderDropdown(def, provDd, providerOpts);
diff --git a/channel/web/web_channel.py b/channel/web/web_channel.py
index 5e4630fe..513e0210 100644
--- a/channel/web/web_channel.py
+++ b/channel/web/web_channel.py
@@ -1,10 +1,11 @@
+import datetime
import hashlib
import hmac
-import time
import json
import logging
import mimetypes
import os
+import random
import threading
import time
import uuid
@@ -340,6 +341,10 @@ class WebChannel(ChatChannel):
# Use a single-element list as a mutable counter accessible from closure.
reasoning_chars_sent = [0]
reasoning_capped_notified = [False]
+ # Captures the first error message emitted by agent_stream so the
+ # subsequent agent_end handler can skip its "empty final_response"
+ # fallback (which would otherwise overwrite the real error).
+ streamed_error: List[str] = []
def on_event(event: dict):
if request_id not in self.sse_queues:
@@ -398,6 +403,25 @@ class WebChannel(ChatChannel):
if tool_calls:
q.put({"type": "message_end", "has_tool_calls": True})
+ elif event_type == "error":
+ # Agent raised an exception (LLM 401/timeout/etc). Surface the
+ # real message instead of letting the empty-response fallback
+ # below hide it as "(模型未返回任何内容)".
+ err_msg = data.get("error") or "unknown error"
+ logger.warning(
+ f"[WebChannel] agent_stream emitted error for "
+ f"request {request_id}: {err_msg}"
+ )
+ # Remember it so the agent_end handler below knows not to
+ # rewrite the message into a generic empty-response notice.
+ streamed_error.append(err_msg)
+ q.put({
+ "type": "done",
+ "content": f"❌ {err_msg}",
+ "request_id": request_id,
+ "timestamp": time.time(),
+ })
+
elif event_type == "agent_end":
# Safety net: if the agent finishes with an empty final_response,
# chat_channel skips _send_reply (because reply.content is empty),
@@ -406,16 +430,21 @@ class WebChannel(ChatChannel):
# here so the frontend always gets closure.
final_response = data.get("final_response", "")
if not final_response or not str(final_response).strip():
- logger.warning(
- f"[WebChannel] agent_end with empty final_response for "
- f"request {request_id}, sending fallback done"
- )
- q.put({
- "type": "done",
- "content": "(模型未返回任何内容,请重试或换一种方式描述你的需求)",
- "request_id": request_id,
- "timestamp": time.time(),
- })
+ if streamed_error:
+ # Error was already surfaced via the `error` event
+ # handler above; nothing more to do here.
+ pass
+ else:
+ logger.warning(
+ f"[WebChannel] agent_end with empty final_response for "
+ f"request {request_id}, sending fallback done"
+ )
+ q.put({
+ "type": "done",
+ "content": "(模型未返回任何内容,请重试或换一种方式描述你的需求)",
+ "request_id": request_id,
+ "timestamp": time.time(),
+ })
elif event_type == "file_to_send":
file_path = data.get("path", "")
@@ -432,6 +461,39 @@ class WebChannel(ChatChannel):
return on_event
+    @staticmethod
+    def _cleanup_stale_voice_recordings(max_age_seconds: int = 3600) -> None:
+        """Remove old ``voice_input_*`` files from the upload directory.
+
+        Run once at startup. Web mic recordings are stored in the upload
+        directory so the browser can replay them in the conversation
+        bubble, but they are not persisted to history and nothing else
+        deletes them, so without this sweep they pile up.
+
+        Args:
+            max_age_seconds: Files whose mtime is older than this many
+                seconds are deleted.
+
+        Best-effort: per-file OSErrors are skipped and any other failure
+        is logged as a warning instead of propagating.
+        """
+        try:
+            directory = _get_upload_dir()
+            if not os.path.isdir(directory):
+                return
+
+            started_at = time.time()
+            deleted = 0
+            recordings = [n for n in os.listdir(directory) if n.startswith("voice_input_")]
+            for entry in recordings:
+                path = os.path.join(directory, entry)
+                try:
+                    is_stale = (
+                        os.path.isfile(path)
+                        and started_at - os.path.getmtime(path) > max_age_seconds
+                    )
+                    if is_stale:
+                        os.remove(path)
+                        deleted += 1
+                except OSError:
+                    # File vanished or is not removable; move on to the next.
+                    continue
+
+            if deleted:
+                logger.info(f"[WebChannel] cleaned up {deleted} stale voice recording(s) from {directory}")
+        except Exception as e:
+            logger.warning(f"[WebChannel] voice cleanup failed: {e}")
+
def upload_file(self):
"""Handle file or directory upload via multipart/form-data."""
try:
@@ -703,6 +765,8 @@ class WebChannel(ChatChannel):
port = conf().get("web_port", 9899)
is_public_bind = host in ("0.0.0.0", "::")
+ self._cleanup_stale_voice_recordings()
+
# 打印可用渠道类型提示
logger.info(
"[WebChannel] 全部可用通道如下,可修改 config.json 配置文件中的 channel_type 字段进行切换,多个通道用逗号分隔:")
@@ -746,6 +810,7 @@ class WebChannel(ChatChannel):
'/upload', 'UploadHandler',
'/uploads/(.*)', 'UploadsHandler',
'/api/file', 'FileServeHandler',
+ '/api/voice/asr', 'VoiceAsrHandler',
'/poll', 'PollHandler',
'/stream', 'StreamHandler',
'/chat', 'ChatHandler',
@@ -870,6 +935,68 @@ class UploadHandler:
return WebChannel().upload_file()
+class VoiceAsrHandler:
+    """
+    Accept a short audio recording from the web console mic button,
+    save it under uploads/ so the browser can replay it, then run it
+    through the currently configured ASR provider.
+
+    Returns {status, text, audio_url} on success — the frontend renders
+    a voice-message bubble with the playable audio and the transcribed
+    caption. Failures return {status: "error", message} and, when the
+    recording was saved, its audio_url.
+    """
+
+    # Extensions kept as-is on the saved file; anything else becomes .webm.
+    _ALLOWED_EXTS = (".webm", ".ogg", ".opus", ".mp4", ".m4a", ".mp3", ".wav")
+
+    def POST(self):
+        _require_auth()
+        web.header('Content-Type', 'application/json; charset=utf-8')
+
+        try:
+            params = _raw_web_input()
+            file_obj = params.get("file")
+            if file_obj is None:
+                return json.dumps({"status": "error", "message": "no audio file"})
+
+            filename = getattr(file_obj, "filename", "") or "recording.webm"
+            ext = os.path.splitext(filename)[1].lower() or ".webm"
+            if ext not in self._ALLOWED_EXTS:
+                ext = ".webm"
+
+            payload = file_obj.file.read() if hasattr(file_obj, "file") else file_obj.value
+            if not payload:
+                # Nothing was recorded: skip the disk write and the ASR call.
+                return json.dumps({"status": "error", "message": "empty audio file"})
+
+            upload_dir = _get_upload_dir()
+            os.makedirs(upload_dir, exist_ok=True)
+            ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+            # uuid4 instead of a 0-9999 random suffix: two uploads in the
+            # same second could otherwise share a name, and "wb" would
+            # silently overwrite the earlier recording. The "voice_input_"
+            # prefix is what the startup cleanup matches on.
+            saved_name = f"voice_input_{ts}_{uuid.uuid4().hex}{ext}"
+            saved_path = os.path.join(upload_dir, saved_name)
+            with open(saved_path, "wb") as f:
+                f.write(payload)
+
+            audio_url = f"/uploads/{saved_name}"
+
+            from bridge.bridge import Bridge
+            reply = Bridge().fetch_voice_to_text(saved_path)
+            if reply is None:
+                return json.dumps({
+                    "status": "error",
+                    "message": "ASR returned no reply",
+                    "audio_url": audio_url,
+                })
+
+            from bridge.reply import ReplyType
+            if reply.type == ReplyType.TEXT:
+                return json.dumps({
+                    "status": "success",
+                    "text": reply.content or "",
+                    "audio_url": audio_url,
+                })
+            return json.dumps({
+                "status": "error",
+                "message": reply.content or "ASR failed",
+                "audio_url": audio_url,
+            })
+        except Exception as e:
+            logger.exception(f"[VoiceAsrHandler] failed: {e}")
+            return json.dumps({"status": "error", "message": str(e)})
+
+
class UploadsHandler:
def GET(self, file_name):
_require_auth()
@@ -1232,7 +1359,7 @@ class ModelsHandler:
# Capability -> editable flag, current-value resolver, and supported provider
# ids drawn from ConfigHandler.PROVIDER_MODELS where applicable.
- _ASR_PROVIDERS = ["openai", "linkai", "baidu", "ali", "xunfei", "azure", "google"]
+ _ASR_PROVIDERS = ["openai", "dashscope", "zhipu", "linkai"]
_TTS_PROVIDERS = ["openai", "linkai", "minimax", "baidu", "ali", "xunfei", "azure", "google", "elevenlabs", "edge", "pytts"]
_EMBEDDING_PROVIDERS = ["openai", "dashscope", "doubao", "zhipu", "linkai"]
@@ -1502,10 +1629,23 @@ class ModelsHandler:
@classmethod
def _asr_capability(cls, local_config: dict) -> dict:
- provider_id = (local_config.get("voice_to_text") or "openai").strip().lower()
+ # "Pick or empty" — when voice_to_text is unset we don't show a
+ # current selection. `suggested_provider` previews which vendor
+ # the bridge auto-picker would land on (purely a UX hint, NOT
+ # persisted). Once the user saves a vendor, we lock onto it.
+ # NOTE(review): `explicit` is returned verbatim, so a config that
+ # still pins a provider no longer listed in _ASR_PROVIDERS yields
+ # a current_provider absent from `providers` — confirm the
+ # frontend dropdown handles that case.
+ explicit = (local_config.get("voice_to_text") or "").strip().lower()
+ suggested = ""
+ if not explicit:
+ for pid in cls._ASR_PROVIDERS:
+ meta = ConfigHandler.PROVIDER_MODELS.get(pid) or {}
+ key_field = meta.get("api_key_field")
+ if key_field and cls._is_real_key(local_config.get(key_field, "")):
+ suggested = pid
+ break
return {
"editable": True,
- "current_provider": provider_id,
+ "current_provider": explicit,
+ "suggested_provider": suggested,
"current_model": "",
"providers": cls._ASR_PROVIDERS,
}
@@ -1897,6 +2037,10 @@ class ModelsHandler:
file_cfg[key] = value
self._write_file_config(file_cfg)
logger.info(f"[ModelsHandler] {key} set: {value!r}")
+ # Bridge caches voice_to_text routing + bot instance; refresh it
+ # so the change takes effect on the next voice request.
+ if key in ("voice_to_text", "text_to_voice"):
+ self._refresh_voice_routing()
return json.dumps({"status": "success", key: value})
def _set_tts(self, provider_id: str, model: str) -> str:
@@ -1910,8 +2054,17 @@ class ModelsHandler:
file_cfg["text_to_voice_model"] = model
self._write_file_config(file_cfg)
logger.info(f"[ModelsHandler] tts updated: provider={provider_id!r} model={model!r}")
+ self._refresh_voice_routing()
return json.dumps({"status": "success", "provider": provider_id, "model": model})
+    @staticmethod
+    def _refresh_voice_routing() -> None:
+        """Ask the Bridge to reload its voice providers.
+
+        Best-effort: a failure is logged as a warning and never raised,
+        so the config save that triggered it still reports success.
+        """
+        try:
+            from bridge.bridge import Bridge
+
+            bridge = Bridge()
+            bridge.refresh_voice()
+        except Exception as exc:
+            logger.warning(f"[ModelsHandler] Bridge voice refresh failed: {exc}")
+
def _set_embedding(self, provider_id: str, model: str) -> str:
# provider_id="" + model="" means "switch back to legacy auto mode".
local_config = conf()
@@ -1926,9 +2079,9 @@ class ModelsHandler:
file_cfg["embedding_model"] = ""
self._write_file_config(file_cfg)
logger.info(f"[ModelsHandler] embedding updated: provider={provider_id!r} model={model!r}")
- # The agent's MemoryManager picks the new provider on next process
- # restart; the index dim may now mismatch so a rebuild is needed.
- # The frontend surfaces this via a confirm + post-save dialog.
+ # The next /memory rebuild-index command hot-swaps the provider onto
+ # the running MemoryManager (see plugins/cow_cli). The dim may have
+ # changed, so the frontend prompts the user to rebuild.
return json.dumps({"status": "success", "provider": provider_id, "model": model})
@staticmethod
diff --git a/voice/dashscope/__init__.py b/voice/dashscope/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/voice/dashscope/dashscope_voice.py b/voice/dashscope/dashscope_voice.py
new file mode 100644
index 00000000..089b933f
--- /dev/null
+++ b/voice/dashscope/dashscope_voice.py
@@ -0,0 +1,135 @@
+# encoding:utf-8
+"""
+DashScope (Aliyun Bailian) voice service.
+
+ASR : qwen3-asr-flash via dashscope.MultiModalConversation
+TTS : not yet implemented (see CosyVoice / qwen3-tts)
+
+Why MultiModalConversation instead of the OpenAI-compatible endpoint:
+ - SDK is already a project dep (used by chat/vision)
+ - Native API accepts local file:// paths up to 100 QPS without an OSS
+ round-trip, which is what we need for the "send a short voice
+ message" flow. Public URLs / Base64 also work.
+"""
+import os
+from typing import Optional
+
+import dashscope
+from dashscope import MultiModalConversation
+
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from config import conf
+from voice import audio_convert
+from voice.voice import Voice
+
+
+# Model id used when the `voice_to_text_model` config key is unset.
+DEFAULT_ASR_MODEL = "qwen3-asr-flash"
+# qwen3-asr-flash hard cap (single file, sync call). Longer audio needs
+# qwen3-asr-flash-filetrans which is async-only and out of scope here.
+# NOTE(review): not referenced anywhere in this module — duration is
+# documented here but not enforced.
+MAX_DURATION_SECONDS = 300
+# Size above which voiceToText logs a warning; the file is still submitted.
+MAX_FILE_BYTES = 10 * 1024 * 1024
+
+
+class DashScopeVoice(Voice):
+    """Voice provider backed by DashScope: ASR is implemented, TTS is a stub."""
+
+    def __init__(self):
+        # No state is cached: the api key is looked up on every call (the
+        # chat bot does the same) so web-console config edits apply
+        # without a restart.
+        pass
+
+    def voiceToText(self, voice_file: str):
+        """Transcribe ``voice_file`` and return a ``Reply``.
+
+        Returns a TEXT reply carrying the transcript, or an ERROR reply
+        when the api key is missing, no text can be extracted from the
+        response, or any exception is raised along the way.
+        """
+        try:
+            voice_file = self._ensure_compatible_format(voice_file)
+            self._warn_if_oversized(voice_file)
+
+            api_key = conf().get("dashscope_api_key", "")
+            if not api_key:
+                logger.error("[DashScopeVoice] dashscope_api_key is not configured")
+                return Reply(ReplyType.ERROR, "未配置 DashScope API key")
+            dashscope.api_key = api_key
+
+            model = conf().get("voice_to_text_model") or DEFAULT_ASR_MODEL
+            # The audio is referenced by a local file:// URI.
+            file_uri = "file://" + os.path.abspath(voice_file)
+            response = MultiModalConversation.call(
+                model=model,
+                messages=[{"role": "user", "content": [{"audio": file_uri}]}],
+                result_format="message",
+                asr_options={"enable_itn": False, "enable_lid": True},
+            )
+
+            text = self._extract_text(response)
+            if text is not None:
+                logger.info(f"[DashScopeVoice] voiceToText model={model} text={text}")
+                return Reply(ReplyType.TEXT, text)
+
+            logger.error(f"[DashScopeVoice] voiceToText failed: {response}")
+            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
+        except Exception as e:
+            logger.exception(f"[DashScopeVoice] voiceToText exception: {e}")
+            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
+
+    def textToVoice(self, text: str):
+        """Always return an ERROR reply: TTS is not wired up for DashScope yet."""
+        return Reply(ReplyType.ERROR, "DashScope 语音合成尚未接入")
+
+    @staticmethod
+    def _warn_if_oversized(path: str) -> None:
+        """Log a warning when ``path`` is larger than MAX_FILE_BYTES."""
+        try:
+            size = os.path.getsize(path)
+        except OSError:
+            # The size check is advisory only; skip it if stat fails.
+            return
+        if size > MAX_FILE_BYTES:
+            logger.warning(
+                f"[DashScopeVoice] audio file {size}B exceeds {MAX_FILE_BYTES}B; "
+                f"qwen3-asr-flash may reject it"
+            )
+
+    @staticmethod
+    def _ensure_compatible_format(voice_file: str) -> str:
+        """Return a path to submit, converting AMR/SILK input to mp3.
+
+        Files with any other extension are returned unchanged. If the
+        conversion raises, a warning is logged and the original path is
+        returned.
+        """
+        if not voice_file.lower().endswith((".amr", ".silk", ".slk")):
+            return voice_file
+        try:
+            converted = os.path.splitext(voice_file)[0] + ".mp3"
+            audio_convert.any_to_mp3(voice_file, converted)
+            return converted
+        except Exception as e:
+            logger.warning(
+                f"[DashScopeVoice] convert {voice_file} to mp3 failed: {e}; "
+                f"submitting original file"
+            )
+            return voice_file
+
+    @staticmethod
+    def _extract_text(response) -> Optional[str]:
+        """Return the recognised text from a call response, or None.
+
+        Reads ``response.output["choices"][0]["message"]["content"]``,
+        which may be a plain string or a list whose items are
+        ``{"text": ...}`` dicts or strings. None is returned for a
+        non-200 ``status_code``, missing choices, blank text, an
+        unexpected content type, or any error while reading the response.
+        """
+        try:
+            if getattr(response, "status_code", 200) != 200:
+                return None
+            choices = response.output.get("choices") or []
+            if not choices:
+                return None
+            content = choices[0].get("message", {}).get("content")
+            if isinstance(content, str):
+                return content.strip() or None
+            if not isinstance(content, list):
+                return None
+            pieces = [
+                item["text"] if isinstance(item, dict) else item
+                for item in content
+                if (isinstance(item, dict) and "text" in item) or isinstance(item, str)
+            ]
+            return "".join(pieces).strip() or None
+        except Exception:
+            return None
diff --git a/voice/factory.py b/voice/factory.py
index abe7ba57..3be60bbf 100644
--- a/voice/factory.py
+++ b/voice/factory.py
@@ -58,4 +58,12 @@ def create_voice(voice_type):
from voice.minimax.minimax_voice import MinimaxVoice
return MinimaxVoice()
+ elif voice_type == "dashscope":
+ from voice.dashscope.dashscope_voice import DashScopeVoice
+
+ return DashScopeVoice()
+ elif voice_type == "zhipu" or voice_type == "zhipuai":
+ from voice.zhipuai.zhipuai_voice import ZhipuAIVoice
+
+ return ZhipuAIVoice()
raise RuntimeError
diff --git a/voice/zhipuai/__init__.py b/voice/zhipuai/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/voice/zhipuai/zhipuai_voice.py b/voice/zhipuai/zhipuai_voice.py
new file mode 100644
index 00000000..8d824275
--- /dev/null
+++ b/voice/zhipuai/zhipuai_voice.py
@@ -0,0 +1,102 @@
+# encoding:utf-8
+"""
+ZhipuAI (BigModel) voice service.
+
+ASR : glm-asr-2512 via the OpenAI-compatible /audio/transcriptions endpoint.
+TTS : not yet implemented.
+
+Endpoint accepts multipart/form-data with `model`, `file`, and `stream`.
+File size <= 25MB, duration <= 30s per request.
+"""
+import os
+
+import requests
+
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from config import conf
+from voice import audio_convert
+from voice.voice import Voice
+
+
+# Model id used when the `voice_to_text_model` config key is unset.
+DEFAULT_ASR_MODEL = "glm-asr-2512"
+# API root used when the `zhipu_ai_api_base` config key is unset.
+DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"
+# Size above which voiceToText logs a warning; the file is still submitted.
+MAX_FILE_BYTES = 25 * 1024 * 1024
+# Passed as `timeout=` to requests.post: (connect, read) seconds.
+REQUEST_TIMEOUT = (5, 60)
+
+
+class ZhipuAIVoice(Voice):
+    """Voice provider backed by ZhipuAI: ASR is implemented, TTS is a stub."""
+
+    def __init__(self):
+        # Nothing is cached: api key and base URL are read on every call
+        # so live config edits take effect.
+        pass
+
+    def voiceToText(self, voice_file: str):
+        """Transcribe ``voice_file`` and return a ``Reply``.
+
+        Posts the audio as multipart/form-data to
+        ``<api_base>/audio/transcriptions``. Returns a TEXT reply with
+        the transcript, or an ERROR reply when the api key is missing,
+        the HTTP status is not 200, the response has no text, or any
+        exception is raised.
+        """
+        try:
+            voice_file = self._ensure_compatible_format(voice_file)
+            self._warn_if_oversized(voice_file)
+
+            api_key = conf().get("zhipu_ai_api_key", "")
+            if not api_key:
+                logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
+                return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")
+
+            api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
+            url = f"{api_base}/audio/transcriptions"
+            model = conf().get("voice_to_text_model") or DEFAULT_ASR_MODEL
+
+            # The upload must complete while the file handle is still open.
+            with open(voice_file, "rb") as audio:
+                response = requests.post(
+                    url,
+                    headers={"Authorization": f"Bearer {api_key}"},
+                    files={"file": (os.path.basename(voice_file), audio)},
+                    data={"model": model, "stream": "false"},
+                    timeout=REQUEST_TIMEOUT,
+                )
+
+            if response.status_code != 200:
+                logger.error(
+                    f"[ZhipuAIVoice] voiceToText failed: status={response.status_code} "
+                    f"body={response.text[:500]}"
+                )
+                return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
+
+            payload = response.json()
+            text = (payload.get("text") or "").strip()
+            if text:
+                logger.info(f"[ZhipuAIVoice] voiceToText model={model} text={text}")
+                return Reply(ReplyType.TEXT, text)
+
+            logger.error(f"[ZhipuAIVoice] voiceToText empty text: {payload}")
+            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
+        except Exception as e:
+            logger.exception(f"[ZhipuAIVoice] voiceToText exception: {e}")
+            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
+
+    def textToVoice(self, text: str):
+        """Always return an ERROR reply: TTS is not wired up for ZhipuAI yet."""
+        return Reply(ReplyType.ERROR, "ZhipuAI 语音合成尚未接入")
+
+    @staticmethod
+    def _warn_if_oversized(path: str) -> None:
+        """Log a warning when ``path`` is larger than MAX_FILE_BYTES."""
+        try:
+            size = os.path.getsize(path)
+        except OSError:
+            # The size check is advisory only; skip it if stat fails.
+            return
+        if size > MAX_FILE_BYTES:
+            logger.warning(
+                f"[ZhipuAIVoice] audio file {size}B exceeds {MAX_FILE_BYTES}B; "
+                f"glm-asr-2512 may reject it"
+            )
+
+    @staticmethod
+    def _ensure_compatible_format(voice_file: str) -> str:
+        """Return a path to submit, converting anything but .mp3/.wav to mp3.
+
+        glm-asr-2512 only accepts .wav / .mp3, so webm from the browser
+        mic and m4a/amr/silk from chat channels are converted first. If
+        the conversion raises, a warning is logged and the original path
+        is returned.
+        """
+        if voice_file.lower().endswith((".mp3", ".wav")):
+            return voice_file
+        try:
+            converted = os.path.splitext(voice_file)[0] + ".mp3"
+            audio_convert.any_to_mp3(voice_file, converted)
+            return converted
+        except Exception as e:
+            logger.warning(
+                f"[ZhipuAIVoice] convert {voice_file} to mp3 failed: {e}; "
+                f"submitting original file"
+            )
+            return voice_file