mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-06-02 00:57:41 +08:00
feat(voice): add dashscope & zhipu ASR, in-page mic input
This commit is contained in:
@@ -14,7 +14,9 @@ class Bridge(object):
|
||||
def __init__(self):
|
||||
self.btype = {
|
||||
"chat": const.OPENAI,
|
||||
"voice_to_text": conf().get("voice_to_text", "openai"),
|
||||
# Empty `voice_to_text` (the default in new configs) triggers
|
||||
# the auto-pick below — see _auto_pick_voice_to_text for order.
|
||||
"voice_to_text": conf().get("voice_to_text") or self._auto_pick_voice_to_text(),
|
||||
"text_to_voice": conf().get("text_to_voice", "google"),
|
||||
"translate": conf().get("translate", "baidu"),
|
||||
}
|
||||
@@ -84,6 +86,46 @@ class Bridge(object):
|
||||
self.chat_bots = {}
|
||||
self._agent_bridge = None
|
||||
|
||||
def refresh_voice(self):
    """Reload the voice providers from config and invalidate cached voice bots.

    Recomputes the ``voice_to_text`` / ``text_to_voice`` entries of
    ``self.btype`` and removes the matching entries from ``self.bots`` so
    the next lookup builds a bot for the new provider. The agent bridge
    is left untouched.
    """
    cfg = conf()
    configured_v2t = cfg.get("voice_to_text")
    configured_t2v = cfg.get("text_to_voice")

    # An empty voice_to_text falls back to the key-based auto pick.
    v2t_provider = configured_v2t or self._auto_pick_voice_to_text()
    t2v_provider = cfg.get("text_to_voice", "google")

    # With LinkAI enabled, unset or OpenAI-flavoured settings are routed
    # to LinkAI instead.
    if cfg.get("use_linkai") and cfg.get("linkai_api_key"):
        if not configured_v2t or configured_v2t in ("openai",):
            v2t_provider = const.LINKAI
        if not configured_t2v or configured_t2v in ("openai", const.TTS_1, const.TTS_1_HD):
            t2v_provider = const.LINKAI

    resolved = (
        ("voice_to_text", v2t_provider),
        ("text_to_voice", t2v_provider),
    )
    for capability, provider in resolved:
        self.btype[capability] = provider
    for capability, _ in resolved:
        self.bots.pop(capability, None)
    logger.info(f"[Bridge] voice refreshed: voice_to_text={v2t_provider}, text_to_voice={t2v_provider}")
|
||||
|
||||
@staticmethod
def _auto_pick_voice_to_text() -> str:
    """Choose an ASR provider from whichever API key is configured.

    Keys are checked in the order openai, dashscope, zhipu, linkai; the
    first one holding a non-blank, non-placeholder value wins. When no
    key qualifies the result is ``"openai"``.
    """
    placeholders = ("YOUR API KEY", "YOUR_API_KEY")
    priority = (
        ("open_ai_api_key", "openai"),
        ("dashscope_api_key", "dashscope"),
        ("zhipu_ai_api_key", "zhipu"),
        ("linkai_api_key", "linkai"),
    )
    for config_key, provider in priority:
        candidate = (conf().get(config_key) or "").strip()
        if candidate and candidate not in placeholders:
            return provider
    return "openai"
|
||||
|
||||
# 模型对应的接口
|
||||
def get_bot(self, typename):
|
||||
if self.bots.get(typename) is None:
|
||||
|
||||
@@ -422,8 +422,9 @@
|
||||
</button>
|
||||
</div>
|
||||
<div id="slash-menu" class="slash-menu hidden"></div>
|
||||
<div class="flex-1 min-w-0 relative flex items-center">
|
||||
<textarea id="chat-input"
|
||||
class="flex-1 min-w-0 px-4 py-[10px] rounded-xl border border-slate-200 dark:border-slate-600
|
||||
class="w-full pl-4 pr-11 py-[10px] rounded-xl border border-slate-200 dark:border-slate-600
|
||||
bg-slate-50 dark:bg-white/5 text-slate-800 dark:text-slate-100
|
||||
placeholder:text-slate-400 dark:placeholder:text-slate-500
|
||||
focus:outline-none focus:ring-0 focus:border-primary-600
|
||||
@@ -431,6 +432,14 @@
|
||||
rows="1"
|
||||
data-i18n-placeholder="input_placeholder"
|
||||
placeholder="输入消息,或输入 / 使用指令"></textarea>
|
||||
<button id="mic-btn" type="button"
|
||||
class="absolute right-2 top-1/2 -translate-y-1/2 w-8 h-8 flex items-center justify-center rounded-lg
|
||||
text-slate-400 hover:text-primary-500 hover:bg-primary-50 dark:hover:bg-primary-900/20
|
||||
cursor-pointer transition-colors duration-150"
|
||||
data-i18n-title="mic_idle_title" title="点击录音 / 再按一次结束">
|
||||
<i class="fas fa-microphone text-sm"></i>
|
||||
</button>
|
||||
</div>
|
||||
<button id="send-btn"
|
||||
class="flex-shrink-0 w-10 h-10 flex items-center justify-center rounded-lg
|
||||
bg-primary-400 text-white hover:bg-primary-500
|
||||
|
||||
@@ -59,6 +59,7 @@ const I18N = {
|
||||
models_embedding_saved_title: '向量模型已更新',
|
||||
models_embedding_saved_msg: '请在聊天框输入 /memory rebuild-index 重建索引。',
|
||||
models_embedding_saved_ok: '去执行',
|
||||
models_pick_provider: '待选择',
|
||||
models_clear_confirm_title: '清除厂商凭据',
|
||||
models_clear_confirm_msg: '确认清除该厂商的 API Key 与 Base URL 吗?相关能力将不再可用。',
|
||||
cancel: '取消',
|
||||
@@ -153,6 +154,12 @@ const I18N = {
|
||||
tip_clear_context: '清除上下文',
|
||||
tip_attach: '添加附件',
|
||||
attach_menu_file: '上传文件',
|
||||
mic_idle_title: '点击录音 / 再按一次结束',
|
||||
mic_recording_title: '录音中,再次点击结束',
|
||||
mic_busy_title: '识别中…',
|
||||
mic_permission_denied: '无法访问麦克风,请检查浏览器权限',
|
||||
mic_too_short: '录音太短,请重试',
|
||||
mic_error: '语音识别失败',
|
||||
attach_menu_folder: '上传文件夹',
|
||||
confirm_yes: '确认',
|
||||
confirm_cancel: '取消',
|
||||
@@ -207,6 +214,7 @@ const I18N = {
|
||||
models_embedding_saved_title: 'Embedding model updated',
|
||||
models_embedding_saved_msg: 'Send /memory rebuild-index in the chat to rebuild the index.',
|
||||
models_embedding_saved_ok: 'Go',
|
||||
models_pick_provider: 'Pick a provider',
|
||||
models_clear_confirm_title: 'Clear vendor credentials',
|
||||
models_clear_confirm_msg: 'Remove this vendor\'s API Key and Base URL? Capabilities relying on it will stop working.',
|
||||
cancel: 'Cancel',
|
||||
@@ -301,6 +309,12 @@ const I18N = {
|
||||
tip_clear_context: 'Clear Context',
|
||||
tip_attach: 'Add Attachment',
|
||||
attach_menu_file: 'Upload File',
|
||||
mic_idle_title: 'Click to record, click again to stop',
|
||||
mic_recording_title: 'Recording, click to stop',
|
||||
mic_busy_title: 'Transcribing…',
|
||||
mic_permission_denied: 'Cannot access microphone — check browser permissions',
|
||||
mic_too_short: 'Recording too short, please retry',
|
||||
mic_error: 'Speech recognition failed',
|
||||
attach_menu_folder: 'Upload Folder',
|
||||
confirm_yes: 'Confirm',
|
||||
confirm_cancel: 'Cancel',
|
||||
@@ -707,6 +721,191 @@ if (!supportsDirectoryUpload && attachFolderOption) {
|
||||
attachFolderOption.classList.add('hidden');
|
||||
}
|
||||
|
||||
// ---------------- Mic button: in-page voice input via the configured ASR provider ----------------
|
||||
(function setupMicButton() {
    const micBtn = document.getElementById('mic-btn');
    if (!micBtn) return;
    // Without getUserMedia or MediaRecorder there is nothing to offer: hide the button.
    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia ||
        typeof window.MediaRecorder === 'undefined') {
        micBtn.style.display = 'none';
        return;
    }

    let mediaRecorder = null;
    let stream = null;
    let chunks = [];
    let recording = false;
    // True while start() is awaiting the permission prompt, so a second
    // click cannot open (and leak) a second stream.
    let starting = false;
    let recordStartedAt = 0;

    // Every class any state may add. Each transition clears all of them
    // first so no tint from a previous state (e.g. busy) is left behind.
    const STATE_CLASSES = ['text-slate-400', 'text-red-500', 'animate-pulse', 'text-primary-500'];

    const applyState = (classes, iconClass, titleKey) => {
        micBtn.classList.remove(...STATE_CLASSES);
        micBtn.classList.add(...classes);
        micBtn.querySelector('i').className = iconClass;
        micBtn.title = t(titleKey);
    };

    const setIdle = () => {
        recording = false;
        applyState(['text-slate-400'], 'fas fa-microphone text-sm', 'mic_idle_title');
    };
    const setRecording = () => {
        recording = true;
        applyState(['text-red-500', 'animate-pulse'], 'fas fa-stop text-sm', 'mic_recording_title');
    };
    // Note: `recording` stays true while busy, so clicks during the upload
    // are routed to stop(), which is a no-op on an inactive recorder.
    const setBusy = () => {
        applyState(['text-primary-500'], 'fas fa-spinner fa-spin text-sm', 'mic_busy_title');
    };

    // Return the first container/codec the browser can record, or '' to
    // let MediaRecorder choose its default.
    const pickMimeType = () => {
        const candidates = [
            'audio/webm;codecs=opus',
            'audio/webm',
            'audio/ogg;codecs=opus',
            'audio/mp4',
        ];
        for (const m of candidates) {
            if (window.MediaRecorder.isTypeSupported && MediaRecorder.isTypeSupported(m)) {
                return m;
            }
        }
        return '';
    };

    // Release the microphone.
    const stopStream = () => {
        if (stream) {
            stream.getTracks().forEach(track => track.stop());
            stream = null;
        }
    };

    let tipHideTimer = null;
    let tipRemoveTimer = null;
    // Show a short-lived bubble above the mic, attached to the button's
    // parent element.
    const flashError = (msg) => {
        console.warn('[mic]', msg);
        const wrapper = micBtn.parentElement;
        if (!wrapper) return;
        let tip = wrapper.querySelector('.mic-tip');
        if (!tip) {
            tip = document.createElement('div');
            tip.className = 'mic-tip absolute right-1 bottom-full mb-2 px-2 py-1 rounded-md '
                + 'text-xs text-white bg-slate-800/90 dark:bg-slate-700/90 shadow-md '
                + 'pointer-events-none whitespace-nowrap z-10';
            wrapper.appendChild(tip);
        }
        // Cancel both pending timers: a removal scheduled by an earlier
        // message must not delete the tip we are re-showing now.
        clearTimeout(tipHideTimer);
        clearTimeout(tipRemoveTimer);
        tip.textContent = msg;
        tip.style.opacity = '1';
        tipHideTimer = setTimeout(() => {
            tip.style.transition = 'opacity 200ms';
            tip.style.opacity = '0';
            tipRemoveTimer = setTimeout(() => tip.remove(), 250);
        }, 2000);
    };

    // POST the recording to the ASR endpoint and hand the transcript to
    // the voice-message send path.
    const upload = async (blob, ext) => {
        setBusy();
        const fd = new FormData();
        fd.append('file', blob, `recording.${ext}`);
        try {
            const resp = await fetch('/api/voice/asr', { method: 'POST', body: fd });
            const data = await resp.json();
            if (data.status === 'success' && data.text) {
                sendVoiceMessage(data.text, data.audio_url);
            } else {
                flashError(data.message || t('mic_error'));
            }
        } catch (e) {
            flashError(t('mic_error') + ': ' + e.message);
        } finally {
            setIdle();
        }
    };

    const start = async () => {
        if (starting || recording) return;
        starting = true;
        try {
            try {
                stream = await navigator.mediaDevices.getUserMedia({ audio: true });
            } catch (e) {
                flashError(t('mic_permission_denied'));
                return;
            }
            chunks = [];
            const mimeType = pickMimeType();
            let recorder;
            try {
                recorder = mimeType
                    ? new MediaRecorder(stream, { mimeType })
                    : new MediaRecorder(stream);
            } catch (e) {
                stopStream();
                flashError(t('mic_error') + ': ' + e.message);
                return;
            }
            mediaRecorder = recorder;
            recorder.ondataavailable = (ev) => {
                if (ev.data && ev.data.size > 0) chunks.push(ev.data);
            };
            recorder.onstop = () => {
                stopStream();
                const recordedType = recorder.mimeType || 'audio/webm';
                const blob = new Blob(chunks, { type: recordedType });
                // Map mime -> extension for the uploaded file name.
                const extMap = {
                    'audio/webm': 'webm', 'audio/ogg': 'ogg',
                    'audio/mp4': 'm4a', 'audio/mpeg': 'mp3',
                };
                const ext = extMap[recordedType.split(';')[0]] || 'webm';
                // Below 256 bytes the recording is treated as an accidental tap.
                if (blob.size < 256) {
                    setIdle();
                    flashError(t('mic_too_short'));
                    return;
                }
                upload(blob, ext);
            };
            // timeslice=250ms: ask the recorder to emit a chunk every 250ms
            // instead of only on stop().
            recorder.start(250);
            recordStartedAt = Date.now();
            setRecording();
        } finally {
            starting = false;
        }
    };

    const stop = () => {
        if (mediaRecorder && mediaRecorder.state !== 'inactive') {
            mediaRecorder.stop();
        }
    };

    // Delay the stop so a very quick tap still records for at least minMs.
    const stopWithMinDuration = () => {
        const elapsed = Date.now() - recordStartedAt;
        const minMs = 350;
        if (elapsed < minMs) {
            setTimeout(() => stop(), minMs - elapsed);
        } else {
            stop();
        }
    };

    micBtn.addEventListener('click', () => {
        if (recording) {
            stopWithMinDuration();
        } else {
            start();
        }
    });

    setIdle();
})();
|
||||
|
||||
// Smart auto-scroll: pause when user scrolls up, resume when near bottom
|
||||
let _autoScrollEnabled = true;
|
||||
const _SCROLL_THRESHOLD = 80; // px from bottom to re-enable auto-scroll
|
||||
@@ -1250,6 +1449,87 @@ document.querySelectorAll('.example-card').forEach(card => {
|
||||
});
|
||||
});
|
||||
|
||||
// Voice-message variant of sendMessage(): renders a playable audio bubble
|
||||
// with the ASR caption, then dispatches the recognised text to /message
|
||||
// through the same SSE/loading flow as a typed message.
|
||||
function sendVoiceMessage(text, audioUrl) {
    const transcript = (text || '').trim();
    if (!transcript) return;

    inputHistory.push(transcript);
    historyIdx = -1;
    historySavedDraft = '';

    // The welcome screen only exists before the first message; its
    // presence decides whether title info is passed along to startSSE.
    const welcome = document.getElementById('welcome-screen');
    const firstMessage = !!welcome;
    if (welcome) welcome.remove();
    const titleInfo = firstMessage ? { sid: sessionId, userMsg: transcript } : null;

    const sentAt = new Date();
    addUserVoiceMessage(audioUrl, transcript, sentAt);
    const loadingEl = addLoadingIndicator();

    const payload = JSON.stringify({
        session_id: sessionId,
        message: transcript,
        stream: true,
        timestamp: sentAt.toISOString(),
    });

    const MAX_RETRIES = 2;
    const RETRY_DELAY_MS = 1000;

    // Replace the loading indicator with the generic send-error message.
    const showSendFailure = () => {
        loadingEl.remove();
        addBotMessage(t('error_send'), new Date());
    };

    const handleReply = (data) => {
        if (data.status !== 'success') {
            showSendFailure();
            return;
        }
        if (data.stream) {
            startSSE(data.request_id, loadingEl, sentAt, titleInfo);
        } else {
            loadingContainers[data.request_id] = loadingEl;
        }
    };

    // Any rejection in the chain is retried with a linearly growing delay
    // until MAX_RETRIES is exhausted.
    const attemptPost = (attempt) => {
        fetch('/message', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: payload
        })
        .then(resp => resp.json())
        .then(handleReply)
        .catch(() => {
            if (attempt < MAX_RETRIES) {
                setTimeout(() => attemptPost(attempt + 1), RETRY_DELAY_MS * (attempt + 1));
                return;
            }
            showSendFailure();
        });
    };
    attemptPost(0);
}
|
||||
|
||||
function addUserVoiceMessage(audioUrl, caption, timestamp) {
    // Append a user-side voice bubble: an <audio> player (when a URL is
    // available) with the escaped caption beneath it, then scroll to it.
    const el = document.createElement('div');
    el.className = 'flex justify-end px-4 sm:px-6 py-3';

    const captionHtml = caption
        ? `<div class="text-xs mt-1.5 leading-snug text-slate-500 dark:text-slate-400 whitespace-pre-wrap break-words">${escapeHtml(caption)}</div>`
        : '';
    el.innerHTML = `
        <div class="max-w-[75%] sm:max-w-[60%]">
            <div class="bg-slate-100 dark:bg-white/10 text-slate-700 dark:text-slate-200 rounded-2xl px-3 py-2 msg-content user-bubble">
                ${captionHtml}
            </div>
            <div class="text-xs text-slate-400 dark:text-slate-500 mt-1.5 text-right">${formatTime(timestamp)}</div>
        </div>
    `;

    // Build the player through the DOM rather than interpolating the URL
    // into markup, so the URL can never break out of the src attribute.
    // With no URL the player is omitted instead of pointing at "undefined".
    if (audioUrl) {
        const audio = document.createElement('audio');
        audio.controls = true;
        audio.preload = 'metadata';
        audio.className = 'block w-[260px] max-w-full h-9';
        audio.src = audioUrl;
        el.querySelector('.user-bubble').prepend(audio);
    }

    messagesDiv.appendChild(el);
    _autoScrollEnabled = true;
    scrollChatToBottom(true);
}
|
||||
|
||||
function sendMessage() {
|
||||
const text = chatInput.value.trim();
|
||||
if (!text && pendingAttachments.length === 0) return;
|
||||
@@ -2573,7 +2853,12 @@ let cfgProviderValue = '';
|
||||
let cfgModelValue = '';
|
||||
|
||||
// --- Custom dropdown helper ---
|
||||
function initDropdown(el, options, selectedValue, onChange) {
|
||||
function initDropdown(el, options, selectedValue, onChange, opts) {
|
||||
// opts.placeholder: when set AND selectedValue is empty, render that text
|
||||
// in a dim style instead of auto-selecting options[0]. Useful for
|
||||
// "pick or empty" capabilities (asr / embedding) where we want the
|
||||
// user to make an explicit choice.
|
||||
opts = opts || {};
|
||||
const textEl = el.querySelector('.cfg-dropdown-text');
|
||||
const menuEl = el.querySelector('.cfg-dropdown-menu');
|
||||
const selEl = el.querySelector('.cfg-dropdown-selected');
|
||||
@@ -2615,8 +2900,20 @@ function initDropdown(el, options, selectedValue, onChange) {
|
||||
menuEl.appendChild(item);
|
||||
});
|
||||
const sel = options.find(o => o.value === el._ddValue);
|
||||
textEl.textContent = sel ? sel.label : (options[0] ? options[0].label : '--');
|
||||
if (!sel && options[0]) el._ddValue = options[0].value;
|
||||
if (sel) {
|
||||
textEl.textContent = sel.label;
|
||||
textEl.classList.remove('text-slate-400', 'dark:text-slate-500');
|
||||
} else if (opts.placeholder && !el._ddValue) {
|
||||
// No selection yet — show the placeholder in muted style.
|
||||
// Do NOT write a fallback value, so the dropdown stays
|
||||
// "unsaved" until the user explicitly picks.
|
||||
textEl.textContent = opts.placeholder;
|
||||
textEl.classList.add('text-slate-400', 'dark:text-slate-500');
|
||||
} else {
|
||||
textEl.textContent = options[0] ? options[0].label : '--';
|
||||
textEl.classList.remove('text-slate-400', 'dark:text-slate-500');
|
||||
if (options[0]) el._ddValue = options[0].value;
|
||||
}
|
||||
}
|
||||
|
||||
render();
|
||||
@@ -3566,21 +3863,27 @@ function renderCapabilityBody(def, cap, body) {
|
||||
// For auto-capable capabilities, an "auto" strategy means the user has
|
||||
// not pinned a vendor; we honor that by selecting the empty-string
|
||||
// sentinel rather than the resolved fallback provider name.
|
||||
// `suggested_provider` is a UI-only preselect for embedding when nothing
|
||||
// is pinned yet — purely cosmetic, not persisted until the user saves.
|
||||
// `suggested_provider` is a UI-only preselect (used by embedding & ASR)
|
||||
// when the user has not pinned a vendor yet — purely cosmetic, not
|
||||
// persisted until the user clicks Save.
|
||||
// For "pick or empty" capabilities (no current, no suggestion), we leave
|
||||
// the dropdown unselected and show a muted placeholder so the user is
|
||||
// nudged to pick explicitly.
|
||||
const noSelectionAndNoHint = !cap.current_provider && !cap.suggested_provider;
|
||||
const initialProviderValue = pendingProvider
|
||||
? pendingProvider
|
||||
: ((cap.strategy === 'auto' && capabilitySupportsAuto(def.id))
|
||||
? ''
|
||||
: (cap.current_provider
|
||||
|| cap.suggested_provider
|
||||
|| (ddOpts[0] && ddOpts[0].value)
|
||||
|| (noSelectionAndNoHint ? '' : (ddOpts[0] && ddOpts[0].value))
|
||||
|| ''));
|
||||
initDropdown(
|
||||
provDd,
|
||||
ddOpts,
|
||||
initialProviderValue,
|
||||
(value) => onCapabilityProviderChange(def, value, body)
|
||||
(value) => onCapabilityProviderChange(def, value, body),
|
||||
noSelectionAndNoHint ? { placeholder: t('models_pick_provider') } : null
|
||||
);
|
||||
decorateCapabilityProviderDropdown(def, provDd, providerOpts);
|
||||
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
import datetime
|
||||
import hashlib
|
||||
import hmac
|
||||
import time
|
||||
import json
|
||||
import logging
|
||||
import mimetypes
|
||||
import os
|
||||
import random
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
@@ -340,6 +341,10 @@ class WebChannel(ChatChannel):
|
||||
# Use a single-element list as a mutable counter accessible from closure.
|
||||
reasoning_chars_sent = [0]
|
||||
reasoning_capped_notified = [False]
|
||||
# Captures the first error message emitted by agent_stream so the
|
||||
# subsequent agent_end handler can skip its "empty final_response"
|
||||
# fallback (which would otherwise overwrite the real error).
|
||||
streamed_error: List[str] = []
|
||||
|
||||
def on_event(event: dict):
|
||||
if request_id not in self.sse_queues:
|
||||
@@ -398,6 +403,25 @@ class WebChannel(ChatChannel):
|
||||
if tool_calls:
|
||||
q.put({"type": "message_end", "has_tool_calls": True})
|
||||
|
||||
elif event_type == "error":
|
||||
# Agent raised an exception (LLM 401/timeout/etc). Surface the
|
||||
# real message instead of letting the empty-response fallback
|
||||
# below hide it as "(模型未返回任何内容)".
|
||||
err_msg = data.get("error") or "unknown error"
|
||||
logger.warning(
|
||||
f"[WebChannel] agent_stream emitted error for "
|
||||
f"request {request_id}: {err_msg}"
|
||||
)
|
||||
# Remember it so the agent_end handler below knows not to
|
||||
# rewrite the message into a generic empty-response notice.
|
||||
streamed_error.append(err_msg)
|
||||
q.put({
|
||||
"type": "done",
|
||||
"content": f"❌ {err_msg}",
|
||||
"request_id": request_id,
|
||||
"timestamp": time.time(),
|
||||
})
|
||||
|
||||
elif event_type == "agent_end":
|
||||
# Safety net: if the agent finishes with an empty final_response,
|
||||
# chat_channel skips _send_reply (because reply.content is empty),
|
||||
@@ -406,6 +430,11 @@ class WebChannel(ChatChannel):
|
||||
# here so the frontend always gets closure.
|
||||
final_response = data.get("final_response", "")
|
||||
if not final_response or not str(final_response).strip():
|
||||
if streamed_error:
|
||||
# Error was already surfaced via the `error` event
|
||||
# handler above; nothing more to do here.
|
||||
pass
|
||||
else:
|
||||
logger.warning(
|
||||
f"[WebChannel] agent_end with empty final_response for "
|
||||
f"request {request_id}, sending fallback done"
|
||||
@@ -432,6 +461,39 @@ class WebChannel(ChatChannel):
|
||||
|
||||
return on_event
|
||||
|
||||
@staticmethod
def _cleanup_stale_voice_recordings(max_age_seconds: int = 3600) -> None:
    """Remove old ``voice_input_*`` files from the upload directory.

    A file is deleted when its modification time is more than
    ``max_age_seconds`` in the past. Per-file ``OSError``s are skipped and
    any other failure is only logged, so this never raises.
    """
    try:
        upload_dir = _get_upload_dir()
        if not os.path.isdir(upload_dir):
            return

        started_at = time.time()
        deleted = 0
        recordings = [
            entry for entry in os.listdir(upload_dir)
            if entry.startswith("voice_input_")
        ]
        for entry in recordings:
            path = os.path.join(upload_dir, entry)
            try:
                if os.path.isfile(path) and started_at - os.path.getmtime(path) > max_age_seconds:
                    os.remove(path)
                    deleted += 1
            except OSError:
                # Best effort: leave this file alone and move on.
                continue

        if deleted:
            logger.info(f"[WebChannel] cleaned up {deleted} stale voice recording(s) from {upload_dir}")
    except Exception as e:
        logger.warning(f"[WebChannel] voice cleanup failed: {e}")
|
||||
|
||||
def upload_file(self):
|
||||
"""Handle file or directory upload via multipart/form-data."""
|
||||
try:
|
||||
@@ -703,6 +765,8 @@ class WebChannel(ChatChannel):
|
||||
port = conf().get("web_port", 9899)
|
||||
is_public_bind = host in ("0.0.0.0", "::")
|
||||
|
||||
self._cleanup_stale_voice_recordings()
|
||||
|
||||
# 打印可用渠道类型提示
|
||||
logger.info(
|
||||
"[WebChannel] 全部可用通道如下,可修改 config.json 配置文件中的 channel_type 字段进行切换,多个通道用逗号分隔:")
|
||||
@@ -746,6 +810,7 @@ class WebChannel(ChatChannel):
|
||||
'/upload', 'UploadHandler',
|
||||
'/uploads/(.*)', 'UploadsHandler',
|
||||
'/api/file', 'FileServeHandler',
|
||||
'/api/voice/asr', 'VoiceAsrHandler',
|
||||
'/poll', 'PollHandler',
|
||||
'/stream', 'StreamHandler',
|
||||
'/chat', 'ChatHandler',
|
||||
@@ -870,6 +935,68 @@ class UploadHandler:
|
||||
return WebChannel().upload_file()
|
||||
|
||||
|
||||
class VoiceAsrHandler:
    """
    Accept a short audio recording from the web console mic button,
    save it under the upload directory so the browser can replay it,
    then run it through ``Bridge().fetch_voice_to_text``.

    Returns a JSON string: ``{status, text, audio_url}`` on success, or
    ``{status: "error", message[, audio_url]}`` on failure.
    """

    # Suffixes kept as-is on disk; anything else is stored as .webm.
    _ALLOWED_EXTS = (".webm", ".ogg", ".opus", ".mp4", ".m4a", ".mp3", ".wav")

    @staticmethod
    def _discard(path) -> None:
        """Best-effort removal of a recording that no response refers to."""
        if not path:
            return
        try:
            os.remove(path)
        except OSError:
            # Never written, or already gone — nothing left to clean up.
            pass

    def POST(self):
        _require_auth()
        web.header('Content-Type', 'application/json; charset=utf-8')

        saved_path = None
        try:
            params = _raw_web_input()
            file_obj = params.get("file")
            if file_obj is None:
                return json.dumps({"status": "error", "message": "no audio file"})

            filename = getattr(file_obj, "filename", "") or "recording.webm"
            ext = os.path.splitext(filename)[1].lower() or ".webm"
            if ext not in self._ALLOWED_EXTS:
                ext = ".webm"

            upload_dir = _get_upload_dir()
            os.makedirs(upload_dir, exist_ok=True)
            ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            # A uuid suffix keeps names unique even for uploads landing in
            # the same second. The "voice_input_" prefix is what the
            # stale-recording cleanup matches on, so it must stay.
            saved_name = f"voice_input_{ts}_{uuid.uuid4().hex[:8]}{ext}"
            saved_path = os.path.join(upload_dir, saved_name)
            with open(saved_path, "wb") as f:
                f.write(file_obj.file.read() if hasattr(file_obj, "file") else file_obj.value)

            audio_url = f"/uploads/{saved_name}"

            from bridge.bridge import Bridge
            reply = Bridge().fetch_voice_to_text(saved_path)
            if reply is None:
                return json.dumps({
                    "status": "error",
                    "message": "ASR returned no reply",
                    "audio_url": audio_url,
                })

            from bridge.reply import ReplyType
            if reply.type == ReplyType.TEXT:
                return json.dumps({
                    "status": "success",
                    "text": reply.content or "",
                    "audio_url": audio_url,
                })
            return json.dumps({
                "status": "error",
                "message": reply.content or "ASR failed",
                "audio_url": audio_url,
            })
        except Exception as e:
            logger.exception(f"[VoiceAsrHandler] failed: {e}")
            # This response carries no audio_url, so the client cannot
            # replay the file; drop it rather than leave an orphan behind.
            self._discard(saved_path)
            return json.dumps({"status": "error", "message": str(e)})
|
||||
|
||||
|
||||
class UploadsHandler:
|
||||
def GET(self, file_name):
|
||||
_require_auth()
|
||||
@@ -1232,7 +1359,7 @@ class ModelsHandler:
|
||||
|
||||
# Capability -> editable flag, current-value resolver, and supported provider
|
||||
# ids drawn from ConfigHandler.PROVIDER_MODELS where applicable.
|
||||
_ASR_PROVIDERS = ["openai", "linkai", "baidu", "ali", "xunfei", "azure", "google"]
|
||||
_ASR_PROVIDERS = ["openai", "dashscope", "zhipu", "linkai"]
|
||||
_TTS_PROVIDERS = ["openai", "linkai", "minimax", "baidu", "ali", "xunfei", "azure", "google", "elevenlabs", "edge", "pytts"]
|
||||
_EMBEDDING_PROVIDERS = ["openai", "dashscope", "doubao", "zhipu", "linkai"]
|
||||
|
||||
@@ -1502,10 +1629,23 @@ class ModelsHandler:
|
||||
|
||||
@classmethod
def _asr_capability(cls, local_config: dict) -> dict:
    """Describe the ASR capability for the models page.

    ``current_provider`` is the normalised ``voice_to_text`` value and is
    empty when none is set. In that case ``suggested_provider`` names the
    first entry of ``_ASR_PROVIDERS`` whose API key field holds a real
    key; it is a hint only and nothing is written back to the config.
    """
    explicit = (local_config.get("voice_to_text") or "").strip().lower()
    suggested = ""
    if not explicit:
        for pid in cls._ASR_PROVIDERS:
            meta = ConfigHandler.PROVIDER_MODELS.get(pid) or {}
            key_field = meta.get("api_key_field")
            if key_field and cls._is_real_key(local_config.get(key_field, "")):
                suggested = pid
                break
    return {
        "editable": True,
        "current_provider": explicit,
        "suggested_provider": suggested,
        "current_model": "",
        "providers": cls._ASR_PROVIDERS,
    }
|
||||
@@ -1897,6 +2037,10 @@ class ModelsHandler:
|
||||
file_cfg[key] = value
|
||||
self._write_file_config(file_cfg)
|
||||
logger.info(f"[ModelsHandler] {key} set: {value!r}")
|
||||
# Bridge caches voice_to_text routing + bot instance; refresh it
|
||||
# so the change takes effect on the next voice request.
|
||||
if key in ("voice_to_text", "text_to_voice"):
|
||||
self._refresh_voice_routing()
|
||||
return json.dumps({"status": "success", key: value})
|
||||
|
||||
def _set_tts(self, provider_id: str, model: str) -> str:
|
||||
@@ -1910,8 +2054,17 @@ class ModelsHandler:
|
||||
file_cfg["text_to_voice_model"] = model
|
||||
self._write_file_config(file_cfg)
|
||||
logger.info(f"[ModelsHandler] tts updated: provider={provider_id!r} model={model!r}")
|
||||
self._refresh_voice_routing()
|
||||
return json.dumps({"status": "success", "provider": provider_id, "model": model})
|
||||
|
||||
@staticmethod
def _refresh_voice_routing() -> None:
    """Ask the Bridge to reload its voice providers; failures are only logged."""
    try:
        # Imported here rather than at module top, matching the other
        # Bridge call sites in this file.
        from bridge.bridge import Bridge
        bridge = Bridge()
        bridge.refresh_voice()
    except Exception as e:
        logger.warning(f"[ModelsHandler] Bridge voice refresh failed: {e}")
|
||||
|
||||
def _set_embedding(self, provider_id: str, model: str) -> str:
|
||||
# provider_id="" + model="" means "switch back to legacy auto mode".
|
||||
local_config = conf()
|
||||
@@ -1926,9 +2079,9 @@ class ModelsHandler:
|
||||
file_cfg["embedding_model"] = ""
|
||||
self._write_file_config(file_cfg)
|
||||
logger.info(f"[ModelsHandler] embedding updated: provider={provider_id!r} model={model!r}")
|
||||
# The agent's MemoryManager picks the new provider on next process
|
||||
# restart; the index dim may now mismatch so a rebuild is needed.
|
||||
# The frontend surfaces this via a confirm + post-save dialog.
|
||||
# The next /memory rebuild-index command hot-swaps the provider onto
|
||||
# the running MemoryManager (see plugins/cow_cli). The dim may have
|
||||
# changed, so the frontend prompts the user to rebuild.
|
||||
return json.dumps({"status": "success", "provider": provider_id, "model": model})
|
||||
|
||||
@staticmethod
|
||||
|
||||
0
voice/dashscope/__init__.py
Normal file
0
voice/dashscope/__init__.py
Normal file
135
voice/dashscope/dashscope_voice.py
Normal file
135
voice/dashscope/dashscope_voice.py
Normal file
@@ -0,0 +1,135 @@
|
||||
# encoding:utf-8
|
||||
"""
|
||||
DashScope (Aliyun Bailian) voice service.
|
||||
|
||||
ASR : qwen3-asr-flash via dashscope.MultiModalConversation
|
||||
TTS : not yet implemented (see CosyVoice / qwen3-tts)
|
||||
|
||||
Why MultiModalConversation instead of the OpenAI-compatible endpoint:
|
||||
- SDK is already a project dep (used by chat/vision)
|
||||
- Native API accepts local file:// paths up to 100 QPS without an OSS
|
||||
round-trip, which is what we need for the "send a short voice
|
||||
message" flow. Public URLs / Base64 also work.
|
||||
"""
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
import dashscope
|
||||
from dashscope import MultiModalConversation
|
||||
|
||||
from bridge.reply import Reply, ReplyType
|
||||
from common.log import logger
|
||||
from config import conf
|
||||
from voice import audio_convert
|
||||
from voice.voice import Voice
|
||||
|
||||
|
||||
DEFAULT_ASR_MODEL = "qwen3-asr-flash"
|
||||
# qwen3-asr-flash hard cap (single file, sync call). Longer audio needs
|
||||
# qwen3-asr-flash-filetrans which is async-only and out of scope here.
|
||||
MAX_DURATION_SECONDS = 300
|
||||
MAX_FILE_BYTES = 10 * 1024 * 1024
|
||||
|
||||
|
||||
class DashScopeVoice(Voice):
    """Voice provider backed by DashScope's ``MultiModalConversation`` API.

    Only speech-to-text is implemented here; ``textToVoice`` always returns
    an error reply.
    """

    def __init__(self):
        # api_key is applied per-call (chat bot does the same) so a live
        # config change via the web console takes effect without restart.
        pass

    def voiceToText(self, voice_file: str):
        """Transcribe a local audio file.

        Args:
            voice_file: Path to the audio file. AMR/SILK inputs are converted
                to mp3 first; other formats are submitted unchanged.

        Returns:
            ``Reply(ReplyType.TEXT, text)`` on success, otherwise a
            ``Reply(ReplyType.ERROR, ...)``. Exceptions are logged and turned
            into an error reply rather than propagated.
        """
        try:
            # Check credentials before touching the audio so a missing key
            # does not cost a pointless transcoding pass.
            api_key = conf().get("dashscope_api_key", "")
            if not api_key:
                logger.error("[DashScopeVoice] dashscope_api_key is not configured")
                return Reply(ReplyType.ERROR, "未配置 DashScope API key")
            dashscope.api_key = api_key

            voice_file = self._ensure_compatible_format(voice_file)

            # The size probe is advisory only: an oversized or unreadable file
            # is still submitted and the service decides whether to accept it.
            try:
                size = os.path.getsize(voice_file)
            except OSError as e:
                logger.warning(f"[DashScopeVoice] cannot stat {voice_file}: {e}")
            else:
                if size > MAX_FILE_BYTES:
                    logger.warning(
                        f"[DashScopeVoice] audio file {size}B exceeds {MAX_FILE_BYTES}B; "
                        f"qwen3-asr-flash may reject it"
                    )

            model = conf().get("voice_to_text_model") or DEFAULT_ASR_MODEL
            abs_path = os.path.abspath(voice_file)
            file_uri = f"file://{abs_path}"

            messages = [
                {"role": "user", "content": [{"audio": file_uri}]},
            ]
            response = MultiModalConversation.call(
                model=model,
                messages=messages,
                result_format="message",
                asr_options={"enable_itn": False, "enable_lid": True},
            )

            text = self._extract_text(response)
            if text is None:
                logger.error(f"[DashScopeVoice] voiceToText failed: {response}")
                return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")

            logger.info(f"[DashScopeVoice] voiceToText model={model} text={text}")
            return Reply(ReplyType.TEXT, text)
        except Exception as e:
            logger.exception(f"[DashScopeVoice] voiceToText exception: {e}")
            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")

    def textToVoice(self, text: str):
        """Return an error reply; speech synthesis is not implemented."""
        # TTS will be added in a follow-up commit (qwen3-tts / cosyvoice).
        return Reply(ReplyType.ERROR, "DashScope 语音合成尚未接入")

    @staticmethod
    def _ensure_compatible_format(voice_file: str) -> str:
        """Convert AMR/SILK to mp3 since qwen3-asr-flash doesn't accept them.

        Other formats (mp3/wav/m4a/aac/opus/webm) are passed through.

        Returns:
            Path of the converted mp3 next to the input, or the original path
            when no conversion is needed or the conversion fails.
        """
        if voice_file.lower().endswith((".amr", ".silk", ".slk")):
            try:
                mp3_file = os.path.splitext(voice_file)[0] + ".mp3"
                audio_convert.any_to_mp3(voice_file, mp3_file)
                return mp3_file
            except Exception as e:
                # Best effort: fall through and submit the original file.
                logger.warning(
                    f"[DashScopeVoice] convert {voice_file} to mp3 failed: {e}; "
                    f"submitting original file"
                )
        return voice_file

    @staticmethod
    def _extract_text(response) -> Optional[str]:
        """Pull the recognized text out of MultiModalConversation response.

        Successful shape (result_format="message"):
            response.output.choices[0].message.content -> list of {"text": "..."}
        or in some SDK versions a plain string.

        Returns:
            The stripped transcript, or ``None`` when the status is not 200,
            the response has an unexpected shape, or no text was recognized.
        """
        try:
            if getattr(response, "status_code", 200) != 200:
                return None
            choices = response.output.get("choices") or []
            if not choices:
                return None
            content = choices[0].get("message", {}).get("content")
        except Exception:
            # Any unexpected response shape is treated as "no text".
            return None

        if isinstance(content, str):
            return content.strip() or None
        if isinstance(content, list):
            parts = []
            for item in content:
                if isinstance(item, dict):
                    item = item.get("text")
                # Skip null / non-string fragments instead of letting one bad
                # item discard the rest of the transcript.
                if isinstance(item, str):
                    parts.append(item)
            return "".join(parts).strip() or None
        return None
|
||||
@@ -58,4 +58,12 @@ def create_voice(voice_type):
|
||||
from voice.minimax.minimax_voice import MinimaxVoice
|
||||
|
||||
return MinimaxVoice()
|
||||
elif voice_type == "dashscope":
|
||||
from voice.dashscope.dashscope_voice import DashScopeVoice
|
||||
|
||||
return DashScopeVoice()
|
||||
elif voice_type == "zhipu" or voice_type == "zhipuai":
|
||||
from voice.zhipuai.zhipuai_voice import ZhipuAIVoice
|
||||
|
||||
return ZhipuAIVoice()
|
||||
raise RuntimeError
|
||||
|
||||
0
voice/zhipuai/__init__.py
Normal file
0
voice/zhipuai/__init__.py
Normal file
102
voice/zhipuai/zhipuai_voice.py
Normal file
102
voice/zhipuai/zhipuai_voice.py
Normal file
@@ -0,0 +1,102 @@
|
||||
# encoding:utf-8
|
||||
"""
|
||||
ZhipuAI (BigModel) voice service.
|
||||
|
||||
ASR : glm-asr-2512 via the OpenAI-compatible /audio/transcriptions endpoint.
|
||||
TTS : not yet implemented.
|
||||
|
||||
Endpoint accepts multipart/form-data with `model`, `file`, and `stream`.
|
||||
File size <= 25MB, duration <= 30s per request.
|
||||
"""
|
||||
import os
|
||||
|
||||
import requests
|
||||
|
||||
from bridge.reply import Reply, ReplyType
|
||||
from common.log import logger
|
||||
from config import conf
|
||||
from voice import audio_convert
|
||||
from voice.voice import Voice
|
||||
|
||||
|
||||
DEFAULT_ASR_MODEL = "glm-asr-2512"
|
||||
DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"
|
||||
MAX_FILE_BYTES = 25 * 1024 * 1024
|
||||
REQUEST_TIMEOUT = (5, 60)
|
||||
|
||||
|
||||
class ZhipuAIVoice(Voice):
    """Voice provider that posts audio to ZhipuAI's ``/audio/transcriptions``.

    Only speech-to-text is implemented here; ``textToVoice`` always returns
    an error reply.
    """

    def __init__(self):
        # api_key / base read per-call so live config edits take effect.
        pass

    def voiceToText(self, voice_file: str):
        """Transcribe a local audio file.

        Args:
            voice_file: Path to the audio file. Anything that is not
                ``.mp3`` / ``.wav`` is converted to mp3 first.

        Returns:
            ``Reply(ReplyType.TEXT, text)`` on success, otherwise a
            ``Reply(ReplyType.ERROR, ...)``. Exceptions are logged and turned
            into an error reply rather than propagated.
        """
        try:
            # Check credentials before touching the audio so a missing key
            # does not cost a pointless transcoding pass.
            api_key = conf().get("zhipu_ai_api_key", "")
            if not api_key:
                logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
                return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")

            voice_file = self._ensure_compatible_format(voice_file)

            # The size probe is advisory only: an oversized file is still
            # submitted and the service decides whether to accept it.
            try:
                size = os.path.getsize(voice_file)
            except OSError as e:
                logger.warning(f"[ZhipuAIVoice] cannot stat {voice_file}: {e}")
            else:
                if size > MAX_FILE_BYTES:
                    logger.warning(
                        f"[ZhipuAIVoice] audio file {size}B exceeds {MAX_FILE_BYTES}B; "
                        f"glm-asr-2512 may reject it"
                    )

            api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
            url = f"{api_base}/audio/transcriptions"
            model = conf().get("voice_to_text_model") or DEFAULT_ASR_MODEL

            # The file handle must stay open while requests streams the
            # multipart body, so the POST happens inside the ``with``.
            with open(voice_file, "rb") as f:
                files = {"file": (os.path.basename(voice_file), f)}
                data = {"model": model, "stream": "false"}
                headers = {"Authorization": f"Bearer {api_key}"}
                response = requests.post(
                    url, headers=headers, files=files, data=data, timeout=REQUEST_TIMEOUT
                )

            if response.status_code != 200:
                logger.error(
                    f"[ZhipuAIVoice] voiceToText failed: status={response.status_code} "
                    f"body={response.text[:500]}"
                )
                return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")

            payload = response.json()
            # Tolerate a non-object body or a non-string "text" field: both
            # take the "empty text" path below, which logs the payload.
            raw_text = payload.get("text") if isinstance(payload, dict) else None
            text = raw_text.strip() if isinstance(raw_text, str) else ""
            if not text:
                logger.error(f"[ZhipuAIVoice] voiceToText empty text: {payload}")
                return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")

            logger.info(f"[ZhipuAIVoice] voiceToText model={model} text={text}")
            return Reply(ReplyType.TEXT, text)
        except Exception as e:
            logger.exception(f"[ZhipuAIVoice] voiceToText exception: {e}")
            return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")

    def textToVoice(self, text: str):
        """Return an error reply; speech synthesis is not implemented."""
        return Reply(ReplyType.ERROR, "ZhipuAI 语音合成尚未接入")

    @staticmethod
    def _ensure_compatible_format(voice_file: str) -> str:
        """Return a path to an mp3/wav version of ``voice_file``.

        Returns:
            The input path when it already ends in ``.mp3`` / ``.wav``, the
            path of a converted mp3 next to the input otherwise, or the
            original path when the conversion fails.
        """
        # glm-asr-2512 only accepts .wav / .mp3 — convert everything else
        # (webm from the browser mic, m4a/amr/silk from chat channels, etc).
        if voice_file.lower().endswith((".mp3", ".wav")):
            return voice_file
        try:
            mp3_file = os.path.splitext(voice_file)[0] + ".mp3"
            audio_convert.any_to_mp3(voice_file, mp3_file)
            return mp3_file
        except Exception as e:
            # Best effort: submit the original file and let the service decide.
            logger.warning(
                f"[ZhipuAIVoice] convert {voice_file} to mp3 failed: {e}; "
                f"submitting original file"
            )
            return voice_file
|
||||
Reference in New Issue
Block a user