feat(voice): add dashscope & zhipu ASR, in-page mic input

This commit is contained in:
zhayujie
2026-05-20 22:36:37 +08:00
parent fff7326209
commit 2b90f377e6
9 changed files with 786 additions and 34 deletions

View File

@@ -14,7 +14,9 @@ class Bridge(object):
def __init__(self):
self.btype = {
"chat": const.OPENAI,
"voice_to_text": conf().get("voice_to_text", "openai"),
# Empty `voice_to_text` (the default in new configs) triggers
# the auto-pick below — see _auto_pick_voice_to_text for order.
"voice_to_text": conf().get("voice_to_text") or self._auto_pick_voice_to_text(),
"text_to_voice": conf().get("text_to_voice", "google"),
"translate": conf().get("translate", "baidu"),
}
@@ -84,6 +86,46 @@ class Bridge(object):
self.chat_bots = {}
self._agent_bridge = None
def refresh_voice(self):
    """Reload the voice providers from config and invalidate cached voice bots.

    Recomputes the ``voice_to_text`` / ``text_to_voice`` entries of
    ``self.btype`` from the current configuration and removes the matching
    entries from ``self.bots`` so the next lookup builds a bot for the new
    provider. Only the two voice entries are modified here.
    """
    cfg = conf()
    pinned_asr = cfg.get("voice_to_text")
    pinned_tts = cfg.get("text_to_voice")

    # An empty ASR setting falls back to the key-based auto-pick.
    asr_provider = pinned_asr or self._auto_pick_voice_to_text()
    tts_provider = cfg.get("text_to_voice", "google")

    # With LinkAI enabled and keyed, unset or OpenAI-flavoured voice
    # settings are routed through LinkAI instead.
    linkai_active = cfg.get("use_linkai") and cfg.get("linkai_api_key")
    if linkai_active:
        if not pinned_asr or pinned_asr == "openai":
            asr_provider = const.LINKAI
        if not pinned_tts or pinned_tts in ("openai", const.TTS_1, const.TTS_1_HD):
            tts_provider = const.LINKAI

    self.btype["voice_to_text"] = asr_provider
    self.btype["text_to_voice"] = tts_provider
    for capability in ("voice_to_text", "text_to_voice"):
        # Drop the cached instance; a missing entry is fine.
        self.bots.pop(capability, None)
    logger.info(f"[Bridge] voice refreshed: voice_to_text={asr_provider}, text_to_voice={tts_provider}")
@staticmethod
def _auto_pick_voice_to_text() -> str:
    """Choose an ASR provider from whichever API key is configured.

    Used when ``voice_to_text`` is unset. Keys are probed in the order
    openai, dashscope, zhipu, linkai and the first one holding a real
    value wins. Returns ``"openai"`` when no key qualifies.
    """
    placeholders = ("YOUR API KEY", "YOUR_API_KEY")
    probe_order = (
        ("open_ai_api_key", "openai"),
        ("dashscope_api_key", "dashscope"),
        ("zhipu_ai_api_key", "zhipu"),
        ("linkai_api_key", "linkai"),
    )

    def is_configured(conf_key: str) -> bool:
        # Blank values and the template placeholders count as "not set".
        value = (conf().get(conf_key) or "").strip()
        return bool(value) and value not in placeholders

    return next(
        (provider for conf_key, provider in probe_order if is_configured(conf_key)),
        "openai",
    )
# 模型对应的接口
def get_bot(self, typename):
if self.bots.get(typename) is None:

View File

@@ -422,8 +422,9 @@
</button>
</div>
<div id="slash-menu" class="slash-menu hidden"></div>
<div class="flex-1 min-w-0 relative flex items-center">
<textarea id="chat-input"
class="flex-1 min-w-0 px-4 py-[10px] rounded-xl border border-slate-200 dark:border-slate-600
class="w-full pl-4 pr-11 py-[10px] rounded-xl border border-slate-200 dark:border-slate-600
bg-slate-50 dark:bg-white/5 text-slate-800 dark:text-slate-100
placeholder:text-slate-400 dark:placeholder:text-slate-500
focus:outline-none focus:ring-0 focus:border-primary-600
@@ -431,6 +432,14 @@
rows="1"
data-i18n-placeholder="input_placeholder"
placeholder="输入消息,或输入 / 使用指令"></textarea>
<button id="mic-btn" type="button"
class="absolute right-2 top-1/2 -translate-y-1/2 w-8 h-8 flex items-center justify-center rounded-lg
text-slate-400 hover:text-primary-500 hover:bg-primary-50 dark:hover:bg-primary-900/20
cursor-pointer transition-colors duration-150"
data-i18n-title="mic_idle_title" title="点击录音 / 再按一次结束">
<i class="fas fa-microphone text-sm"></i>
</button>
</div>
<button id="send-btn"
class="flex-shrink-0 w-10 h-10 flex items-center justify-center rounded-lg
bg-primary-400 text-white hover:bg-primary-500

View File

@@ -59,6 +59,7 @@ const I18N = {
models_embedding_saved_title: '向量模型已更新',
models_embedding_saved_msg: '请在聊天框输入 /memory rebuild-index 重建索引。',
models_embedding_saved_ok: '去执行',
models_pick_provider: '待选择',
models_clear_confirm_title: '清除厂商凭据',
models_clear_confirm_msg: '确认清除该厂商的 API Key 与 Base URL 吗?相关能力将不再可用。',
cancel: '取消',
@@ -153,6 +154,12 @@ const I18N = {
tip_clear_context: '清除上下文',
tip_attach: '添加附件',
attach_menu_file: '上传文件',
mic_idle_title: '点击录音 / 再按一次结束',
mic_recording_title: '录音中,再次点击结束',
mic_busy_title: '识别中…',
mic_permission_denied: '无法访问麦克风,请检查浏览器权限',
mic_too_short: '录音太短,请重试',
mic_error: '语音识别失败',
attach_menu_folder: '上传文件夹',
confirm_yes: '确认',
confirm_cancel: '取消',
@@ -207,6 +214,7 @@ const I18N = {
models_embedding_saved_title: 'Embedding model updated',
models_embedding_saved_msg: 'Send /memory rebuild-index in the chat to rebuild the index.',
models_embedding_saved_ok: 'Go',
models_pick_provider: 'Pick a provider',
models_clear_confirm_title: 'Clear vendor credentials',
models_clear_confirm_msg: 'Remove this vendor\'s API Key and Base URL? Capabilities relying on it will stop working.',
cancel: 'Cancel',
@@ -301,6 +309,12 @@ const I18N = {
tip_clear_context: 'Clear Context',
tip_attach: 'Add Attachment',
attach_menu_file: 'Upload File',
mic_idle_title: 'Click to record, click again to stop',
mic_recording_title: 'Recording, click to stop',
mic_busy_title: 'Transcribing…',
mic_permission_denied: 'Cannot access microphone — check browser permissions',
mic_too_short: 'Recording too short, please retry',
mic_error: 'Speech recognition failed',
attach_menu_folder: 'Upload Folder',
confirm_yes: 'Confirm',
confirm_cancel: 'Cancel',
@@ -707,6 +721,191 @@ if (!supportsDirectoryUpload && attachFolderOption) {
attachFolderOption.classList.add('hidden');
}
// ---------------- Mic button: in-page voice input via the configured ASR provider ----------------
// Click once to start recording, click again to stop. The recording is then
// POSTed to /api/voice/asr and, on success, handed to sendVoiceMessage().
(function setupMicButton() {
    const micBtn = document.getElementById('mic-btn');
    if (!micBtn) return;
    // No recording support in this browser: hide the button rather than
    // offer a control that can never work.
    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia ||
        typeof window.MediaRecorder === 'undefined') {
        micBtn.style.display = 'none';
        return;
    }

    let mediaRecorder = null;
    let stream = null;
    let chunks = [];
    let recording = false;
    // True while start() is still awaiting getUserMedia / building the
    // recorder. Guards against a second click opening a second stream,
    // which would overwrite `stream` and leave the first one live.
    let starting = false;
    let recordStartedAt = 0;

    // Every class any visual state may add. Each transition clears all of
    // them first so no tint from a previous state (notably the busy
    // `text-primary-500`) lingers on the button.
    const STATE_CLASSES = ['text-slate-400', 'text-red-500', 'animate-pulse', 'text-primary-500'];
    const applyVisualState = (classes, iconClass, titleKey) => {
        micBtn.classList.remove(...STATE_CLASSES);
        micBtn.classList.add(...classes);
        micBtn.querySelector('i').className = iconClass;
        micBtn.title = t(titleKey);
    };

    const setIdle = () => {
        recording = false;
        applyVisualState(['text-slate-400'], 'fas fa-microphone text-sm', 'mic_idle_title');
    };

    const setRecording = () => {
        recording = true;
        applyVisualState(['text-red-500', 'animate-pulse'], 'fas fa-stop text-sm', 'mic_recording_title');
    };

    // Busy deliberately leaves `recording` untouched: it stays true until
    // setIdle(), so clicks during the upload fall into the (no-op) stop path.
    const setBusy = () => {
        applyVisualState(['text-primary-500'], 'fas fa-spinner fa-spin text-sm', 'mic_busy_title');
    };

    // First container/codec the browser reports as supported, or '' to let
    // MediaRecorder pick its own default.
    const pickMimeType = () => {
        const candidates = [
            'audio/webm;codecs=opus',
            'audio/webm',
            'audio/ogg;codecs=opus',
            'audio/mp4',
        ];
        for (const m of candidates) {
            if (window.MediaRecorder.isTypeSupported && MediaRecorder.isTypeSupported(m)) {
                return m;
            }
        }
        return '';
    };

    // Release the microphone.
    const stopStream = () => {
        if (stream) {
            // `track`, not `t`: avoid shadowing the i18n helper t().
            stream.getTracks().forEach(track => track.stop());
            stream = null;
        }
    };

    let _micTipTimer = null;
    let _micTipRemoveTimer = null;
    const flashError = (msg) => {
        console.warn('[mic]', msg);
        // Pop a small bubble above the mic so the user actually notices it.
        // The tip is appended to the button's parent element and positioned
        // absolutely against it.
        const wrapper = micBtn.parentElement;
        if (!wrapper) return;
        // Cancel both the pending fade and the pending removal. Otherwise a
        // removal scheduled for the previous message would delete the tip
        // that is being reused for this one.
        if (_micTipTimer) { clearTimeout(_micTipTimer); _micTipTimer = null; }
        if (_micTipRemoveTimer) { clearTimeout(_micTipRemoveTimer); _micTipRemoveTimer = null; }
        let tip = wrapper.querySelector('.mic-tip');
        if (!tip) {
            tip = document.createElement('div');
            tip.className = 'mic-tip absolute right-1 bottom-full mb-2 px-2 py-1 rounded-md '
                + 'text-xs text-white bg-slate-800/90 dark:bg-slate-700/90 shadow-md '
                + 'pointer-events-none whitespace-nowrap z-10';
            wrapper.appendChild(tip);
        }
        tip.textContent = msg;
        tip.style.opacity = '1';
        _micTipTimer = setTimeout(() => {
            _micTipTimer = null;
            tip.style.opacity = '0';
            tip.style.transition = 'opacity 200ms';
            _micTipRemoveTimer = setTimeout(() => {
                _micTipRemoveTimer = null;
                tip.remove();
            }, 250);
        }, 2000);
    };

    // Send the recording to the ASR endpoint and forward the transcript.
    const upload = async (blob, ext) => {
        setBusy();
        const fd = new FormData();
        fd.append('file', blob, `recording.${ext}`);
        try {
            const resp = await fetch('/api/voice/asr', { method: 'POST', body: fd });
            const data = await resp.json();
            if (data.status === 'success' && data.text) {
                // Voice-message UX: drop the recording into the conversation
                // as a playable bubble with the caption underneath, then
                // dispatch the recognised text through the regular send path.
                sendVoiceMessage(data.text, data.audio_url);
            } else {
                flashError(data.message || t('mic_error'));
            }
        } catch (e) {
            flashError(t('mic_error') + ': ' + e.message);
        } finally {
            setIdle();
        }
    };

    const start = async () => {
        if (starting) return;
        starting = true;
        try {
            try {
                stream = await navigator.mediaDevices.getUserMedia({ audio: true });
            } catch (e) {
                flashError(t('mic_permission_denied'));
                return;
            }
            chunks = [];
            const mimeType = pickMimeType();
            try {
                mediaRecorder = mimeType
                    ? new MediaRecorder(stream, { mimeType })
                    : new MediaRecorder(stream);
            } catch (e) {
                stopStream();
                flashError(t('mic_error') + ': ' + e.message);
                return;
            }
            mediaRecorder.ondataavailable = (ev) => {
                if (ev.data && ev.data.size > 0) chunks.push(ev.data);
            };
            mediaRecorder.onstop = () => {
                stopStream();
                const blob = new Blob(chunks, { type: mediaRecorder.mimeType || 'audio/webm' });
                // Map mime -> extension so the server picks the right file suffix.
                const mt = (mediaRecorder.mimeType || 'audio/webm').split(';')[0];
                const extMap = {
                    'audio/webm': 'webm', 'audio/ogg': 'ogg',
                    'audio/mp4': 'm4a', 'audio/mpeg': 'mp3',
                };
                const ext = extMap[mt] || 'webm';
                // Blobs under 256 bytes are treated as an accidental tap
                // (too small to hold usable audio) and are not uploaded.
                if (blob.size < 256) {
                    setIdle();
                    flashError(t('mic_too_short'));
                    return;
                }
                upload(blob, ext);
            };
            // timeslice=250ms: force the recorder to flush a chunk every 250ms.
            // Without it some browsers wait for stop() before producing any data,
            // which loses the audio on very short taps.
            mediaRecorder.start(250);
            recordStartedAt = Date.now();
            setRecording();
        } finally {
            starting = false;
        }
    };

    const stop = () => {
        if (mediaRecorder && mediaRecorder.state !== 'inactive') {
            mediaRecorder.stop();
        }
    };

    const stopWithMinDuration = () => {
        const elapsed = Date.now() - recordStartedAt;
        const minMs = 350;
        if (elapsed < minMs) {
            // Give the recorder a moment to capture at least one chunk
            // before we tell it to stop.
            setTimeout(() => stop(), minMs - elapsed);
        } else {
            stop();
        }
    };

    micBtn.addEventListener('click', () => {
        if (recording) {
            stopWithMinDuration();
        } else {
            start();
        }
    });

    setIdle();
})();
// Smart auto-scroll: pause when user scrolls up, resume when near bottom
let _autoScrollEnabled = true;
const _SCROLL_THRESHOLD = 80; // px from bottom to re-enable auto-scroll
@@ -1250,6 +1449,87 @@ document.querySelectorAll('.example-card').forEach(card => {
});
});
// Voice-message counterpart of sendMessage(): shows the recording as a
// playable bubble captioned with the ASR text, then POSTs that text to
// /message and follows the usual loading / SSE handling.
function sendVoiceMessage(text, audioUrl) {
    const caption = (text || '').trim();
    if (!caption) return;

    // Record the transcript in the input history and reset its cursor.
    inputHistory.push(caption);
    historyIdx = -1;
    historySavedDraft = '';

    // The welcome screen is only present before the first message.
    const welcome = document.getElementById('welcome-screen');
    const titleInfo = welcome ? { sid: sessionId, userMsg: caption } : null;
    if (welcome) welcome.remove();

    const sentAt = new Date();
    addUserVoiceMessage(audioUrl, caption, sentAt);
    const loadingEl = addLoadingIndicator();

    const payload = JSON.stringify({
        session_id: sessionId,
        message: caption,
        stream: true,
        timestamp: sentAt.toISOString(),
    });

    const MAX_RETRIES = 2;
    const RETRY_DELAY_MS = 1000;

    // Replace the loading indicator with the generic send-error message.
    const showSendError = () => {
        loadingEl.remove();
        addBotMessage(t('error_send'), new Date());
    };

    const handleResponse = (data) => {
        if (data.status !== 'success') {
            showSendError();
            return;
        }
        if (data.stream) {
            startSSE(data.request_id, loadingEl, sentAt, titleInfo);
        } else {
            loadingContainers[data.request_id] = loadingEl;
        }
    };

    const postWithRetry = (attempt) => {
        fetch('/message', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: payload
        })
            .then(resp => resp.json())
            .then(handleResponse)
            .catch(() => {
                // Linear back-off: 1s, then 2s, before giving up.
                if (attempt < MAX_RETRIES) {
                    setTimeout(() => postWithRetry(attempt + 1), RETRY_DELAY_MS * (attempt + 1));
                    return;
                }
                showSendError();
            });
    };

    postWithRetry(0);
}
// Append a right-aligned voice-message bubble to the conversation:
// a playable <audio> element on top, the ASR caption beneath, and the
// formatted timestamp under the bubble.
//   audioUrl  - URL of the recording; when empty the player is omitted
//   caption   - recognised text, HTML-escaped before insertion
//   timestamp - value passed to formatTime() for the time label
function addUserVoiceMessage(audioUrl, caption, timestamp) {
    const el = document.createElement('div');
    el.className = 'flex justify-end px-4 sm:px-6 py-3';
    // The markup below contains no caller-supplied URL: the audio source is
    // assigned through the DOM property afterwards, so a URL containing
    // quotes or markup cannot break out of the attribute.
    el.innerHTML = `
        <div class="max-w-[75%] sm:max-w-[60%]">
            <div class="bg-slate-100 dark:bg-white/10 text-slate-700 dark:text-slate-200 rounded-2xl px-3 py-2 msg-content user-bubble">
                <audio controls preload="metadata"
                       class="block w-[260px] max-w-full h-9"></audio>
                ${caption ? `<div class="text-xs mt-1.5 leading-snug text-slate-500 dark:text-slate-400 whitespace-pre-wrap break-words">${escapeHtml(caption)}</div>` : ''}
            </div>
            <div class="text-xs text-slate-400 dark:text-slate-500 mt-1.5 text-right">${formatTime(timestamp)}</div>
        </div>
    `;
    const audioEl = el.querySelector('audio');
    if (audioUrl) {
        audioEl.src = audioUrl;
    } else {
        // No recording URL: drop the player instead of rendering a broken
        // control pointing at "undefined".
        audioEl.remove();
    }
    messagesDiv.appendChild(el);
    // A freshly sent message always snaps the view back to the bottom.
    _autoScrollEnabled = true;
    scrollChatToBottom(true);
}
function sendMessage() {
const text = chatInput.value.trim();
if (!text && pendingAttachments.length === 0) return;
@@ -2573,7 +2853,12 @@ let cfgProviderValue = '';
let cfgModelValue = '';
// --- Custom dropdown helper ---
function initDropdown(el, options, selectedValue, onChange) {
function initDropdown(el, options, selectedValue, onChange, opts) {
// opts.placeholder: when set AND selectedValue is empty, render that text
// in a dim style instead of auto-selecting options[0]. Useful for
// "pick or empty" capabilities (asr / embedding) where we want the
// user to make an explicit choice.
opts = opts || {};
const textEl = el.querySelector('.cfg-dropdown-text');
const menuEl = el.querySelector('.cfg-dropdown-menu');
const selEl = el.querySelector('.cfg-dropdown-selected');
@@ -2615,8 +2900,20 @@ function initDropdown(el, options, selectedValue, onChange) {
menuEl.appendChild(item);
});
const sel = options.find(o => o.value === el._ddValue);
textEl.textContent = sel ? sel.label : (options[0] ? options[0].label : '--');
if (!sel && options[0]) el._ddValue = options[0].value;
if (sel) {
textEl.textContent = sel.label;
textEl.classList.remove('text-slate-400', 'dark:text-slate-500');
} else if (opts.placeholder && !el._ddValue) {
// No selection yet — show the placeholder in muted style.
// Do NOT write a fallback value, so the dropdown stays
// "unsaved" until the user explicitly picks.
textEl.textContent = opts.placeholder;
textEl.classList.add('text-slate-400', 'dark:text-slate-500');
} else {
textEl.textContent = options[0] ? options[0].label : '--';
textEl.classList.remove('text-slate-400', 'dark:text-slate-500');
if (options[0]) el._ddValue = options[0].value;
}
}
render();
@@ -3566,21 +3863,27 @@ function renderCapabilityBody(def, cap, body) {
// For auto-capable capabilities, an "auto" strategy means the user has
// not pinned a vendor; we honor that by selecting the empty-string
// sentinel rather than the resolved fallback provider name.
// `suggested_provider` is a UI-only preselect for embedding when nothing
// is pinned yet — purely cosmetic, not persisted until the user saves.
// `suggested_provider` is a UI-only preselect (used by embedding & ASR)
// when the user has not pinned a vendor yet — purely cosmetic, not
// persisted until the user clicks Save.
// For "pick or empty" capabilities (no current, no suggestion), we leave
// the dropdown unselected and show a muted placeholder so the user is
// nudged to pick explicitly.
const noSelectionAndNoHint = !cap.current_provider && !cap.suggested_provider;
const initialProviderValue = pendingProvider
? pendingProvider
: ((cap.strategy === 'auto' && capabilitySupportsAuto(def.id))
? ''
: (cap.current_provider
|| cap.suggested_provider
|| (ddOpts[0] && ddOpts[0].value)
|| (noSelectionAndNoHint ? '' : (ddOpts[0] && ddOpts[0].value))
|| ''));
initDropdown(
provDd,
ddOpts,
initialProviderValue,
(value) => onCapabilityProviderChange(def, value, body)
(value) => onCapabilityProviderChange(def, value, body),
noSelectionAndNoHint ? { placeholder: t('models_pick_provider') } : null
);
decorateCapabilityProviderDropdown(def, provDd, providerOpts);

View File

@@ -1,10 +1,11 @@
import datetime
import hashlib
import hmac
import time
import json
import logging
import mimetypes
import os
import random
import threading
import time
import uuid
@@ -340,6 +341,10 @@ class WebChannel(ChatChannel):
# Use a single-element list as a mutable counter accessible from closure.
reasoning_chars_sent = [0]
reasoning_capped_notified = [False]
# Captures the first error message emitted by agent_stream so the
# subsequent agent_end handler can skip its "empty final_response"
# fallback (which would otherwise overwrite the real error).
streamed_error: List[str] = []
def on_event(event: dict):
if request_id not in self.sse_queues:
@@ -398,6 +403,25 @@ class WebChannel(ChatChannel):
if tool_calls:
q.put({"type": "message_end", "has_tool_calls": True})
elif event_type == "error":
# Agent raised an exception (LLM 401/timeout/etc). Surface the
# real message instead of letting the empty-response fallback
# below hide it as "(模型未返回任何内容)".
err_msg = data.get("error") or "unknown error"
logger.warning(
f"[WebChannel] agent_stream emitted error for "
f"request {request_id}: {err_msg}"
)
# Remember it so the agent_end handler below knows not to
# rewrite the message into a generic empty-response notice.
streamed_error.append(err_msg)
q.put({
"type": "done",
"content": f"{err_msg}",
"request_id": request_id,
"timestamp": time.time(),
})
elif event_type == "agent_end":
# Safety net: if the agent finishes with an empty final_response,
# chat_channel skips _send_reply (because reply.content is empty),
@@ -406,6 +430,11 @@ class WebChannel(ChatChannel):
# here so the frontend always gets closure.
final_response = data.get("final_response", "")
if not final_response or not str(final_response).strip():
if streamed_error:
# Error was already surfaced via the `error` event
# handler above; nothing more to do here.
pass
else:
logger.warning(
f"[WebChannel] agent_end with empty final_response for "
f"request {request_id}, sending fallback done"
@@ -432,6 +461,39 @@ class WebChannel(ChatChannel):
return on_event
@staticmethod
def _cleanup_stale_voice_recordings(max_age_seconds: int = 3600) -> None:
    """Remove old ``voice_input_*`` files from the upload directory.

    Only regular files whose name starts with ``voice_input_`` and whose
    modification time is more than ``max_age_seconds`` in the past are
    deleted. Per-file OS errors are skipped; any other failure is logged
    as a warning and swallowed, so this never raises.
    """
    try:
        directory = _get_upload_dir()
        if not os.path.isdir(directory):
            return

        started = time.time()
        candidates = [
            entry for entry in os.listdir(directory)
            if entry.startswith("voice_input_")
        ]

        deleted = 0
        for entry in candidates:
            path = os.path.join(directory, entry)
            try:
                is_stale = (
                    os.path.isfile(path)
                    and started - os.path.getmtime(path) > max_age_seconds
                )
                if is_stale:
                    os.remove(path)
                    deleted += 1
            except OSError:
                # The file vanished or is not removable; move on.
                continue

        if deleted:
            logger.info(f"[WebChannel] cleaned up {deleted} stale voice recording(s) from {directory}")
    except Exception as exc:
        logger.warning(f"[WebChannel] voice cleanup failed: {exc}")
def upload_file(self):
"""Handle file or directory upload via multipart/form-data."""
try:
@@ -703,6 +765,8 @@ class WebChannel(ChatChannel):
port = conf().get("web_port", 9899)
is_public_bind = host in ("0.0.0.0", "::")
self._cleanup_stale_voice_recordings()
# 打印可用渠道类型提示
logger.info(
"[WebChannel] 全部可用通道如下,可修改 config.json 配置文件中的 channel_type 字段进行切换,多个通道用逗号分隔:")
@@ -746,6 +810,7 @@ class WebChannel(ChatChannel):
'/upload', 'UploadHandler',
'/uploads/(.*)', 'UploadsHandler',
'/api/file', 'FileServeHandler',
'/api/voice/asr', 'VoiceAsrHandler',
'/poll', 'PollHandler',
'/stream', 'StreamHandler',
'/chat', 'ChatHandler',
@@ -870,6 +935,68 @@ class UploadHandler:
return WebChannel().upload_file()
class VoiceAsrHandler:
    """
    Accept a short audio recording from the web console mic button,
    save it under the upload directory so the browser can replay it,
    then run it through ``Bridge().fetch_voice_to_text``.

    Responses are JSON strings:
      success -> {"status": "success", "text": ..., "audio_url": ...}
      failure -> {"status": "error", "message": ...} (plus "audio_url"
                 when the recording had already been saved)
    """

    # Extensions kept as uploaded; anything else is stored as ``.webm``.
    _ALLOWED_EXTS = (".webm", ".ogg", ".opus", ".mp4", ".m4a", ".mp3", ".wav")

    def POST(self):
        _require_auth()
        web.header('Content-Type', 'application/json; charset=utf-8')
        try:
            params = _raw_web_input()
            file_obj = params.get("file")
            if file_obj is None:
                return json.dumps({"status": "error", "message": "no audio file"})

            # Only the extension of the client-supplied name is used, and
            # only when it is on the allow-list.
            filename = getattr(file_obj, "filename", "") or "recording.webm"
            ext = os.path.splitext(filename)[1].lower() or ".webm"
            if ext not in self._ALLOWED_EXTS:
                ext = ".webm"

            audio_bytes = file_obj.file.read() if hasattr(file_obj, "file") else file_obj.value
            if not audio_bytes:
                # Nothing to transcribe; don't leave a zero-byte file behind.
                return json.dumps({"status": "error", "message": "empty audio file"})

            upload_dir = _get_upload_dir()
            os.makedirs(upload_dir, exist_ok=True)
            ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            # The timestamp has one-second resolution, so the suffix must be
            # unique by itself: a uuid4 keeps concurrent uploads from
            # overwriting each other. The "voice_input_" prefix is what the
            # startup cleanup matches on.
            saved_name = f"voice_input_{ts}_{uuid.uuid4().hex}{ext}"
            saved_path = os.path.join(upload_dir, saved_name)
            with open(saved_path, "wb") as f:
                f.write(audio_bytes)

            audio_url = f"/uploads/{saved_name}"

            from bridge.bridge import Bridge
            reply = Bridge().fetch_voice_to_text(saved_path)
            if reply is None:
                return json.dumps({
                    "status": "error",
                    "message": "ASR returned no reply",
                    "audio_url": audio_url,
                })

            from bridge.reply import ReplyType
            if reply.type == ReplyType.TEXT:
                return json.dumps({
                    "status": "success",
                    "text": reply.content or "",
                    "audio_url": audio_url,
                })
            # Any non-text reply is reported as a failure with its content
            # as the message.
            return json.dumps({
                "status": "error",
                "message": reply.content or "ASR failed",
                "audio_url": audio_url,
            })
        except Exception as e:
            logger.exception(f"[VoiceAsrHandler] failed: {e}")
            return json.dumps({"status": "error", "message": str(e)})
class UploadsHandler:
def GET(self, file_name):
_require_auth()
@@ -1232,7 +1359,7 @@ class ModelsHandler:
# Capability -> editable flag, current-value resolver, and supported provider
# ids drawn from ConfigHandler.PROVIDER_MODELS where applicable.
_ASR_PROVIDERS = ["openai", "linkai", "baidu", "ali", "xunfei", "azure", "google"]
_ASR_PROVIDERS = ["openai", "dashscope", "zhipu", "linkai"]
_TTS_PROVIDERS = ["openai", "linkai", "minimax", "baidu", "ali", "xunfei", "azure", "google", "elevenlabs", "edge", "pytts"]
_EMBEDDING_PROVIDERS = ["openai", "dashscope", "doubao", "zhipu", "linkai"]
@@ -1502,10 +1629,23 @@ class ModelsHandler:
@classmethod
def _asr_capability(cls, local_config: dict) -> dict:
    """Describe the speech-to-text capability for the models UI.

    ``current_provider`` is the explicitly configured ``voice_to_text``
    value (trimmed, lower-cased), or ``""`` when unset. In the unset case
    ``suggested_provider`` names the first entry of ``_ASR_PROVIDERS``
    whose API key passes ``_is_real_key``; it is a display hint only and
    nothing is written back to the configuration here.
    """
    explicit = (local_config.get("voice_to_text") or "").strip().lower()
    suggested = ""
    if not explicit:
        for pid in cls._ASR_PROVIDERS:
            meta = ConfigHandler.PROVIDER_MODELS.get(pid) or {}
            key_field = meta.get("api_key_field")
            if key_field and cls._is_real_key(local_config.get(key_field, "")):
                suggested = pid
                break
    # A single "current_provider" entry: the former defaulted-to-"openai"
    # value was shadowed by this key anyway and has been dropped.
    return {
        "editable": True,
        "current_provider": explicit,
        "suggested_provider": suggested,
        "current_model": "",
        "providers": cls._ASR_PROVIDERS,
    }
@@ -1897,6 +2037,10 @@ class ModelsHandler:
file_cfg[key] = value
self._write_file_config(file_cfg)
logger.info(f"[ModelsHandler] {key} set: {value!r}")
# Bridge caches voice_to_text routing + bot instance; refresh it
# so the change takes effect on the next voice request.
if key in ("voice_to_text", "text_to_voice"):
self._refresh_voice_routing()
return json.dumps({"status": "success", key: value})
def _set_tts(self, provider_id: str, model: str) -> str:
@@ -1910,8 +2054,17 @@ class ModelsHandler:
file_cfg["text_to_voice_model"] = model
self._write_file_config(file_cfg)
logger.info(f"[ModelsHandler] tts updated: provider={provider_id!r} model={model!r}")
self._refresh_voice_routing()
return json.dumps({"status": "success", "provider": provider_id, "model": model})
@staticmethod
def _refresh_voice_routing() -> None:
    """Ask the Bridge to re-read its voice settings; failures are only logged."""
    try:
        # Imported here rather than at module level, as in the original.
        from bridge.bridge import Bridge as _Bridge

        bridge = _Bridge()
        bridge.refresh_voice()
    except Exception as exc:
        logger.warning(f"[ModelsHandler] Bridge voice refresh failed: {exc}")
def _set_embedding(self, provider_id: str, model: str) -> str:
# provider_id="" + model="" means "switch back to legacy auto mode".
local_config = conf()
@@ -1926,9 +2079,9 @@ class ModelsHandler:
file_cfg["embedding_model"] = ""
self._write_file_config(file_cfg)
logger.info(f"[ModelsHandler] embedding updated: provider={provider_id!r} model={model!r}")
# The agent's MemoryManager picks the new provider on next process
# restart; the index dim may now mismatch so a rebuild is needed.
# The frontend surfaces this via a confirm + post-save dialog.
# The next /memory rebuild-index command hot-swaps the provider onto
# the running MemoryManager (see plugins/cow_cli). The dim may have
# changed, so the frontend prompts the user to rebuild.
return json.dumps({"status": "success", "provider": provider_id, "model": model})
@staticmethod

View File

View File

@@ -0,0 +1,135 @@
# encoding:utf-8
"""
DashScope (Aliyun Bailian) voice service.
ASR : qwen3-asr-flash via dashscope.MultiModalConversation
TTS : not yet implemented (see CosyVoice / qwen3-tts)
Why MultiModalConversation instead of the OpenAI-compatible endpoint:
- SDK is already a project dep (used by chat/vision)
- Native API accepts local file:// paths up to 100 QPS without an OSS
round-trip, which is what we need for the "send a short voice
message" flow. Public URLs / Base64 also work.
"""
import os
from typing import Optional
import dashscope
from dashscope import MultiModalConversation
from bridge.reply import Reply, ReplyType
from common.log import logger
from config import conf
from voice import audio_convert
from voice.voice import Voice
DEFAULT_ASR_MODEL = "qwen3-asr-flash"
# qwen3-asr-flash hard cap (single file, sync call). Longer audio needs
# qwen3-asr-flash-filetrans which is async-only and out of scope here.
MAX_DURATION_SECONDS = 300
MAX_FILE_BYTES = 10 * 1024 * 1024
class DashScopeVoice(Voice):
    """Speech-to-text through ``dashscope.MultiModalConversation``.

    Text-to-speech is not implemented and always returns an error reply.
    """

    def __init__(self):
        # Nothing is cached on the instance: the API key and model are read
        # from config on every call.
        pass

    def voiceToText(self, voice_file: str):
        """Transcribe ``voice_file`` and return a TEXT reply, or an ERROR reply.

        Never raises: any exception is logged and turned into an ERROR reply.
        """
        failure_text = "我暂时还无法听清您的语音,请稍后再试吧~"
        try:
            voice_file = self._ensure_compatible_format(voice_file)

            # Oversized files are only warned about; the request still goes out.
            try:
                file_bytes = os.path.getsize(voice_file)
            except OSError:
                file_bytes = None
            if file_bytes is not None and file_bytes > MAX_FILE_BYTES:
                logger.warning(
                    f"[DashScopeVoice] audio file {file_bytes}B exceeds {MAX_FILE_BYTES}B; "
                    f"qwen3-asr-flash may reject it"
                )

            api_key = conf().get("dashscope_api_key", "")
            if not api_key:
                logger.error("[DashScopeVoice] dashscope_api_key is not configured")
                return Reply(ReplyType.ERROR, "未配置 DashScope API key")
            dashscope.api_key = api_key

            model = conf().get("voice_to_text_model") or DEFAULT_ASR_MODEL

            # The audio is handed over as a file:// URI built from the
            # absolute local path.
            audio_uri = f"file://{os.path.abspath(voice_file)}"
            response = MultiModalConversation.call(
                model=model,
                messages=[{"role": "user", "content": [{"audio": audio_uri}]}],
                result_format="message",
                asr_options={"enable_itn": False, "enable_lid": True},
            )

            text = self._extract_text(response)
            if text is not None:
                logger.info(f"[DashScopeVoice] voiceToText model={model} text={text}")
                return Reply(ReplyType.TEXT, text)

            logger.error(f"[DashScopeVoice] voiceToText failed: {response}")
            return Reply(ReplyType.ERROR, failure_text)
        except Exception as exc:
            logger.exception(f"[DashScopeVoice] voiceToText exception: {exc}")
            return Reply(ReplyType.ERROR, failure_text)

    def textToVoice(self, text: str):
        """Not supported yet; always returns an ERROR reply."""
        return Reply(ReplyType.ERROR, "DashScope 语音合成尚未接入")

    @staticmethod
    def _ensure_compatible_format(voice_file: str) -> str:
        """Return a path to submit: ``.amr`` / ``.silk`` / ``.slk`` inputs are
        converted to an ``.mp3`` next to the original; every other file is
        returned unchanged. If the conversion fails the original path is
        returned and a warning is logged.
        """
        if not voice_file.lower().endswith((".amr", ".silk", ".slk")):
            return voice_file
        try:
            converted = os.path.splitext(voice_file)[0] + ".mp3"
            audio_convert.any_to_mp3(voice_file, converted)
            return converted
        except Exception as exc:
            logger.warning(
                f"[DashScopeVoice] convert {voice_file} to mp3 failed: {exc}; "
                f"submitting original file"
            )
            return voice_file

    @staticmethod
    def _extract_text(response) -> Optional[str]:
        """Return the transcript held in ``response``, or ``None``.

        Reads ``response.output["choices"][0]["message"]["content"]``, which
        may be a plain string or a list whose items are ``{"text": ...}``
        dicts or strings. A non-200 ``status_code``, a missing or empty
        result, or any unexpected shape yields ``None``.
        """
        try:
            if getattr(response, "status_code", 200) != 200:
                return None

            choices = response.output.get("choices")
            if not choices:
                return None

            content = choices[0].get("message", {}).get("content")
            if isinstance(content, str):
                return content.strip() or None
            if not isinstance(content, list):
                return None

            fragments = [
                piece["text"] if isinstance(piece, dict) else piece
                for piece in content
                if (isinstance(piece, dict) and "text" in piece) or isinstance(piece, str)
            ]
            return "".join(fragments).strip() or None
        except Exception:
            return None

View File

@@ -58,4 +58,12 @@ def create_voice(voice_type):
from voice.minimax.minimax_voice import MinimaxVoice
return MinimaxVoice()
elif voice_type == "dashscope":
from voice.dashscope.dashscope_voice import DashScopeVoice
return DashScopeVoice()
elif voice_type == "zhipu" or voice_type == "zhipuai":
from voice.zhipuai.zhipuai_voice import ZhipuAIVoice
return ZhipuAIVoice()
raise RuntimeError

View File

View File

@@ -0,0 +1,102 @@
# encoding:utf-8
"""
ZhipuAI (BigModel) voice service.
ASR : glm-asr-2512 via the OpenAI-compatible /audio/transcriptions endpoint.
TTS : not yet implemented.
Endpoint accepts multipart/form-data with `model`, `file`, and `stream`.
File size <= 25MB, duration <= 30s per request.
"""
import os
import requests
from bridge.reply import Reply, ReplyType
from common.log import logger
from config import conf
from voice import audio_convert
from voice.voice import Voice
DEFAULT_ASR_MODEL = "glm-asr-2512"
DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"
MAX_FILE_BYTES = 25 * 1024 * 1024
REQUEST_TIMEOUT = (5, 60)
class ZhipuAIVoice(Voice):
    """Speech-to-text via an HTTP POST to ``<api_base>/audio/transcriptions``.

    Text-to-speech is not implemented and always returns an error reply.
    """

    def __init__(self):
        # No state is kept: key, base URL and model are read from config on
        # every call.
        pass

    def voiceToText(self, voice_file: str):
        """Transcribe ``voice_file`` and return a TEXT reply, or an ERROR reply.

        Never raises: any exception is logged and turned into an ERROR reply.
        """
        failure_text = "我暂时还无法听清您的语音,请稍后再试吧~"
        try:
            voice_file = self._ensure_compatible_format(voice_file)

            # Oversized files are only warned about; the request still goes out.
            try:
                file_bytes = os.path.getsize(voice_file)
            except OSError:
                file_bytes = None
            if file_bytes is not None and file_bytes > MAX_FILE_BYTES:
                logger.warning(
                    f"[ZhipuAIVoice] audio file {file_bytes}B exceeds {MAX_FILE_BYTES}B; "
                    f"glm-asr-2512 may reject it"
                )

            api_key = conf().get("zhipu_ai_api_key", "")
            if not api_key:
                logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
                return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")

            api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
            endpoint = f"{api_base}/audio/transcriptions"
            model = conf().get("voice_to_text_model") or DEFAULT_ASR_MODEL

            # Multipart upload: the audio goes in `file`, options in form fields.
            with open(voice_file, "rb") as audio:
                response = requests.post(
                    endpoint,
                    headers={"Authorization": f"Bearer {api_key}"},
                    files={"file": (os.path.basename(voice_file), audio)},
                    data={"model": model, "stream": "false"},
                    timeout=REQUEST_TIMEOUT,
                )

            if response.status_code != 200:
                logger.error(
                    f"[ZhipuAIVoice] voiceToText failed: status={response.status_code} "
                    f"body={response.text[:500]}"
                )
                return Reply(ReplyType.ERROR, failure_text)

            payload = response.json()
            text = (payload.get("text") or "").strip()
            if text:
                logger.info(f"[ZhipuAIVoice] voiceToText model={model} text={text}")
                return Reply(ReplyType.TEXT, text)

            logger.error(f"[ZhipuAIVoice] voiceToText empty text: {payload}")
            return Reply(ReplyType.ERROR, failure_text)
        except Exception as exc:
            logger.exception(f"[ZhipuAIVoice] voiceToText exception: {exc}")
            return Reply(ReplyType.ERROR, failure_text)

    def textToVoice(self, text: str):
        """Not supported yet; always returns an ERROR reply."""
        return Reply(ReplyType.ERROR, "ZhipuAI 语音合成尚未接入")

    @staticmethod
    def _ensure_compatible_format(voice_file: str) -> str:
        """Return a path to submit: ``.mp3`` / ``.wav`` files are returned
        unchanged; anything else is converted to an ``.mp3`` next to the
        original. If the conversion fails the original path is returned and
        a warning is logged.
        """
        if voice_file.lower().endswith((".mp3", ".wav")):
            return voice_file
        try:
            converted = os.path.splitext(voice_file)[0] + ".mp3"
            audio_convert.any_to_mp3(voice_file, converted)
            return converted
        except Exception as exc:
            logger.warning(
                f"[ZhipuAIVoice] convert {voice_file} to mp3 failed: {exc}; "
                f"submitting original file"
            )
            return voice_file