feat(voice): rework TTS/ASR stack and unify tool/skill config schema

This commit is contained in:
zhayujie
2026-05-21 16:00:54 +08:00
parent 2b90f377e6
commit b8333e351c
31 changed files with 1551 additions and 335 deletions

View File

@@ -44,6 +44,7 @@ CREATE TABLE IF NOT EXISTS messages (
role TEXT NOT NULL, role TEXT NOT NULL,
content TEXT NOT NULL, content TEXT NOT NULL,
created_at INTEGER NOT NULL, created_at INTEGER NOT NULL,
extras TEXT NOT NULL DEFAULT '',
UNIQUE (session_id, seq) UNIQUE (session_id, seq)
); );
@@ -67,6 +68,12 @@ _MIGRATION_ADD_CONTEXT_START_SEQ = """
ALTER TABLE sessions ADD COLUMN context_start_seq INTEGER NOT NULL DEFAULT 0; ALTER TABLE sessions ADD COLUMN context_start_seq INTEGER NOT NULL DEFAULT 0;
""" """
# Generic JSON sidecar for per-message attachments (TTS audio URL, future use).
# Always optional — readers must tolerate missing column / empty / invalid JSON.
_MIGRATION_ADD_MSG_EXTRAS = """
ALTER TABLE messages ADD COLUMN extras TEXT NOT NULL DEFAULT '';
"""
DEFAULT_MAX_AGE_DAYS: int = 30 DEFAULT_MAX_AGE_DAYS: int = 30
@@ -169,20 +176,26 @@ def _group_into_display_turns(
cur_rest: List[tuple] = [] cur_rest: List[tuple] = []
started = False started = False
for role, raw_content, created_at in rows: for role, raw_content, created_at, raw_extras in rows:
try: try:
content = json.loads(raw_content) content = json.loads(raw_content)
except Exception: except Exception:
content = raw_content content = raw_content
try:
extras = json.loads(raw_extras) if raw_extras else {}
if not isinstance(extras, dict):
extras = {}
except Exception:
extras = {}
if role == "user" and _is_visible_user_message(content): if role == "user" and _is_visible_user_message(content):
if started: if started:
groups.append((cur_user, cur_rest)) groups.append((cur_user, cur_rest))
cur_user = (content, created_at) cur_user = (content, created_at, extras)
cur_rest = [] cur_rest = []
started = True started = True
else: else:
cur_rest.append((role, content, created_at)) cur_rest.append((role, content, created_at, extras))
if started: if started:
groups.append((cur_user, cur_rest)) groups.append((cur_user, cur_rest))
@@ -195,7 +208,7 @@ def _group_into_display_turns(
for user_row, rest in groups: for user_row, rest in groups:
# User turn # User turn
if user_row: if user_row:
content, created_at = user_row content, created_at, _u_extras = user_row
text = _extract_display_text(content) text = _extract_display_text(content)
if text: if text:
turns.append({"role": "user", "content": text, "created_at": created_at}) turns.append({"role": "user", "content": text, "created_at": created_at})
@@ -206,8 +219,11 @@ def _group_into_display_turns(
tool_results: Dict[str, str] = {} tool_results: Dict[str, str] = {}
final_text = "" final_text = ""
final_ts: Optional[int] = None final_ts: Optional[int] = None
merged_extras: Dict[str, Any] = {}
for role, content, created_at in rest: for role, content, created_at, extras in rest:
if role == "assistant" and isinstance(extras, dict):
merged_extras.update(extras)
if role == "user": if role == "user":
tool_results.update(_extract_tool_results(content)) tool_results.update(_extract_tool_results(content))
elif role == "assistant": elif role == "assistant":
@@ -256,6 +272,8 @@ def _group_into_display_turns(
"steps": steps, "steps": steps,
"created_at": final_ts or (user_row[1] if user_row else 0), "created_at": final_ts or (user_row[1] if user_row else 0),
} }
if merged_extras:
turn["extras"] = merged_extras
turns.append(turn) turns.append(turn)
return turns return turns
@@ -411,13 +429,15 @@ class ConversationStore:
content = json.dumps( content = json.dumps(
msg.get("content", ""), ensure_ascii=False msg.get("content", ""), ensure_ascii=False
) )
extras_obj = msg.get("extras") or {}
extras = json.dumps(extras_obj, ensure_ascii=False) if extras_obj else ""
conn.execute( conn.execute(
""" """
INSERT OR IGNORE INTO messages INSERT OR IGNORE INTO messages
(session_id, seq, role, content, created_at) (session_id, seq, role, content, created_at, extras)
VALUES (?, ?, ?, ?, ?) VALUES (?, ?, ?, ?, ?, ?)
""", """,
(session_id, next_seq, role, content, now), (session_id, next_seq, role, content, now, extras),
) )
next_seq += 1 next_seq += 1
@@ -651,6 +671,55 @@ class ConversationStore:
logger.info(f"[ConversationStore] Pruned {deleted} expired sessions") logger.info(f"[ConversationStore] Pruned {deleted} expired sessions")
return deleted return deleted
def attach_extras_to_last_assistant(
    self,
    session_id: str,
    extras: Dict[str, Any],
) -> Optional[int]:
    """
    Shallow-merge ``extras`` into the newest assistant message of a session.

    Intended for post-processing steps (e.g. TTS) that annotate a bot reply
    after it has already been persisted, for instance with an audio URL.
    Keys already stored on the message are kept unless ``extras`` overrides
    them; a stored value that is empty, malformed, or not a JSON object is
    treated as an empty dict.

    Args:
        session_id: Session whose latest assistant message is annotated.
        extras: Key/value pairs to merge; a falsy value is a no-op.

    Returns:
        The ``seq`` of the updated message, or ``None`` when ``extras`` is
        empty, the session has no assistant message, or the update failed
        (failures are logged as warnings, never raised).
    """
    if not extras:
        return None

    def _stored_as_dict(raw_value):
        # Tolerate every shape the sidecar column may hold.
        try:
            decoded = json.loads(raw_value) if raw_value else {}
        except Exception:
            return {}
        return decoded if isinstance(decoded, dict) else {}

    with self._lock:
        conn = self._connect()
        try:
            latest = conn.execute(
                """
                SELECT seq, extras FROM messages
                WHERE session_id = ? AND role = 'assistant'
                ORDER BY seq DESC LIMIT 1
                """,
                (session_id,),
            ).fetchone()
            if not latest:
                return None

            target_seq, stored = latest
            merged = _stored_as_dict(stored)
            merged.update(extras)
            payload = json.dumps(merged, ensure_ascii=False)

            conn.execute(
                "UPDATE messages SET extras = ? WHERE session_id = ? AND seq = ?",
                (payload, session_id, target_seq),
            )
            conn.commit()
            return target_seq
        except Exception as e:
            logger.warning(f"[ConversationStore] attach_extras failed: {e}")
            return None
        finally:
            # Connections are per-call; always release, even on failure.
            conn.close()
def load_history_page( def load_history_page(
self, self,
session_id: str, session_id: str,
@@ -698,7 +767,22 @@ class ConversationStore:
).fetchone() ).fetchone()
ctx_start = ctx_row[0] if ctx_row else 0 ctx_start = ctx_row[0] if ctx_row else 0
# extras column is added by migration; tolerate older DBs that
# might miss it by substituting an empty string for the missing column.
try:
rows = conn.execute( rows = conn.execute(
"""
SELECT seq, role, content, created_at, extras
FROM messages
WHERE session_id = ?
ORDER BY seq ASC
""",
(session_id,),
).fetchall()
except sqlite3.OperationalError:
rows = [
(seq, role, content, created_at, "")
for (seq, role, content, created_at) in conn.execute(
""" """
SELECT seq, role, content, created_at SELECT seq, role, content, created_at
FROM messages FROM messages
@@ -707,6 +791,7 @@ class ConversationStore:
""", """,
(session_id,), (session_id,),
).fetchall() ).fetchall()
]
finally: finally:
conn.close() conn.close()
@@ -719,13 +804,16 @@ class ConversationStore:
include_thinking = False include_thinking = False
# Strip seq for display grouping, but record max seq per visible user group # Strip seq for display grouping, but record max seq per visible user group
plain_rows = [(role, content, created_at) for _seq, role, content, created_at in rows] plain_rows = [
(role, content, created_at, extras_raw)
for _seq, role, content, created_at, extras_raw in rows
]
visible = _group_into_display_turns(plain_rows, include_thinking=include_thinking) visible = _group_into_display_turns(plain_rows, include_thinking=include_thinking)
# Build a mapping: find the seq of each visible user message to annotate context boundary. # Build a mapping: find the seq of each visible user message to annotate context boundary.
# Walk through rows to find visible user message seqs in order. # Walk through rows to find visible user message seqs in order.
visible_user_seqs: List[int] = [] visible_user_seqs: List[int] = []
for seq, role, raw_content, _ts in rows: for seq, role, raw_content, _ts, _extras in rows:
if role != "user": if role != "user":
continue continue
try: try:
@@ -911,6 +999,18 @@ class ConversationStore:
except Exception as e: except Exception as e:
logger.warning(f"[ConversationStore] Migration (context_start_seq) failed: {e}") logger.warning(f"[ConversationStore] Migration (context_start_seq) failed: {e}")
msg_cols = {
row[1]
for row in conn.execute("PRAGMA table_info(messages)").fetchall()
}
if "extras" not in msg_cols:
try:
conn.execute(_MIGRATION_ADD_MSG_EXTRAS)
conn.commit()
logger.info("[ConversationStore] Migrated: added messages.extras column")
except Exception as e:
logger.warning(f"[ConversationStore] Migration (extras) failed: {e}")
def _connect(self) -> sqlite3.Connection: def _connect(self) -> sqlite3.Connection:
conn = sqlite3.connect(str(self._db_path), timeout=10) conn = sqlite3.connect(str(self._db_path), timeout=10)
conn.execute("PRAGMA journal_mode=WAL") conn.execute("PRAGMA journal_mode=WAL")

View File

@@ -3,7 +3,7 @@ Vision tool - Analyze images using Vision API.
Supports local files (auto base64-encoded) and HTTP URLs. Supports local files (auto base64-encoded) and HTTP URLs.
Provider resolution: Provider resolution:
- tool.vision.model (if set) means "prefer this model first; fall back to - tools.vision.model (if set) means "prefer this model first; fall back to
other configured providers if it fails". The model name is mapped to its other configured providers if it fails". The model name is mapped to its
native provider (e.g. doubao-* → Doubao, kimi-* → Moonshot, gpt-* → native provider (e.g. doubao-* → Doubao, kimi-* → Moonshot, gpt-* →
OpenAI/LinkAI). That provider is tried first, then the standard auto OpenAI/LinkAI). That provider is tried first, then the standard auto
@@ -60,7 +60,7 @@ _DISCOVERABLE_MODELS = [
] ]
# Model name prefix → discoverable provider display_name. # Model name prefix → discoverable provider display_name.
# Used to auto-route tool.vision.model to its native provider. # Used to auto-route tools.vision.model to its native provider.
# Matched case-insensitively; longest prefix wins. # Matched case-insensitively; longest prefix wins.
_MODEL_PREFIX_TO_PROVIDER = [ _MODEL_PREFIX_TO_PROVIDER = [
("doubao-", "Doubao"), ("doubao-", "Doubao"),
@@ -154,7 +154,7 @@ class Vision(BaseTool):
# Default model is only used as a last-resort placeholder for providers # Default model is only used as a last-resort placeholder for providers
# whose VisionProvider.model_override is None (e.g. raw OpenAI provider # whose VisionProvider.model_override is None (e.g. raw OpenAI provider
# when the user did not configure tool.vision.model). # when the user did not configure tools.vision.model).
return self._call_with_fallback(providers, DEFAULT_MODEL, question, image_content) return self._call_with_fallback(providers, DEFAULT_MODEL, question, image_content)
def _call_with_fallback(self, providers: List[VisionProvider], model: str, def _call_with_fallback(self, providers: List[VisionProvider], model: str,
@@ -193,12 +193,12 @@ class Vision(BaseTool):
""" """
Build an ordered list of providers to try. Build an ordered list of providers to try.
Semantics of `tool.vision.model`: Semantics of `tools.vision.model`:
"Prefer this model first; fall back to other configured providers "Prefer this model first; fall back to other configured providers
if it fails." if it fails."
Order: Order:
1. The provider that natively serves `tool.vision.model` (if any 1. The provider that natively serves `tools.vision.model` (if any
and its API key is configured) — using the user-specified model and its API key is configured) — using the user-specified model
name verbatim. name verbatim.
2. Auto-discovery chain as fallback: 2. Auto-discovery chain as fallback:
@@ -213,7 +213,7 @@ class Vision(BaseTool):
user_model = self._resolve_user_vision_model() user_model = self._resolve_user_vision_model()
providers: List[VisionProvider] = [] providers: List[VisionProvider] = []
# Step 1: preferred provider derived from tool.vision.model # Step 1: preferred provider derived from tools.vision.model
if user_model: if user_model:
preferred = self._route_by_model_name(user_model) preferred = self._route_by_model_name(user_model)
if preferred: if preferred:
@@ -251,11 +251,11 @@ class Vision(BaseTool):
@staticmethod @staticmethod
def _resolve_user_vision_model() -> Optional[str]: def _resolve_user_vision_model() -> Optional[str]:
"""Read tool.vision.model from config; return None if unset/blank.""" """Read tools.vision.model (singular ``tool`` kept as runtime fallback)."""
tool_conf = conf().get("tool", {}) tools_conf = conf().get("tools") or conf().get("tool") or {}
if not isinstance(tool_conf, dict): if not isinstance(tools_conf, dict):
return None return None
vision_conf = tool_conf.get("vision", {}) vision_conf = tools_conf.get("vision", {})
if not isinstance(vision_conf, dict): if not isinstance(vision_conf, dict):
return None return None
m = vision_conf.get("model") m = vision_conf.get("model")
@@ -303,7 +303,7 @@ class Vision(BaseTool):
self._append_provider(providers, lambda: self._build_linkai_provider(user_model)) self._append_provider(providers, lambda: self._build_linkai_provider(user_model))
if providers: if providers:
return providers return providers
logger.warning(f"[Vision] tool.vision.model='{user_model}' looks like an OpenAI " logger.warning(f"[Vision] tools.vision.model='{user_model}' looks like an OpenAI "
f"model but neither OPENAI_API_KEY nor LINKAI_API_KEY is configured.") f"model but neither OPENAI_API_KEY nor LINKAI_API_KEY is configured.")
return None # fall through to auto return None # fall through to auto
@@ -317,7 +317,7 @@ class Vision(BaseTool):
continue continue
api_key = conf().get(config_key, "") api_key = conf().get(config_key, "")
if not api_key or not api_key.strip(): if not api_key or not api_key.strip():
logger.warning(f"[Vision] tool.vision.model='{user_model}' routes to " logger.warning(f"[Vision] tools.vision.model='{user_model}' routes to "
f"'{display_name}' but '{config_key}' is not configured. " f"'{display_name}' but '{config_key}' is not configured. "
f"Falling back to auto-discovery.") f"Falling back to auto-discovery.")
return None # fall through to auto return None # fall through to auto
@@ -452,8 +452,8 @@ class Vision(BaseTool):
if not self._main_bot_supports_vision(bot): if not self._main_bot_supports_vision(bot):
return None return None
# Use the configured main model name; do NOT inject tool.vision.model # Use the configured main model name; do NOT inject tools.vision.model
# here, because by the time we reach this branch the tool.vision.model # here, because by the time we reach this branch the tools.vision.model
# routing has already been attempted (and either matched the main bot # routing has already been attempted (and either matched the main bot
# or failed to find a provider). # or failed to find a provider).
main_model_name = conf().get("model") or None main_model_name = conf().get("model") or None

View File

@@ -171,7 +171,13 @@ class ChatChannel(Channel):
if "desire_rtype" not in context and conf().get("always_reply_voice") and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE: if "desire_rtype" not in context and conf().get("always_reply_voice") and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE:
context["desire_rtype"] = ReplyType.VOICE context["desire_rtype"] = ReplyType.VOICE
elif context.type == ContextType.VOICE: elif context.type == ContextType.VOICE:
if "desire_rtype" not in context and conf().get("voice_reply_voice") and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE: # Voice input replies with voice when either voice_reply_voice
# (mirror voice) or the global always_reply_voice toggle is on.
if (
"desire_rtype" not in context
and (conf().get("voice_reply_voice") or conf().get("always_reply_voice"))
and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE
):
context["desire_rtype"] = ReplyType.VOICE context["desire_rtype"] = ReplyType.VOICE
return context return context

View File

@@ -1515,10 +1515,16 @@ class FeiShuChanel(ChatChannel):
else: else:
context.type = ContextType.TEXT context.type = ContextType.TEXT
context.content = content.strip() context.content = content.strip()
# Text input opts into voice replies only when the always-on toggle is set.
if "desire_rtype" not in context and conf().get("always_reply_voice"):
context["desire_rtype"] = ReplyType.VOICE
elif context.type == ContextType.VOICE: elif context.type == ContextType.VOICE:
# 2.语音请求 # 2.语音请求: voice input replies with voice if either
if "desire_rtype" not in context and conf().get("voice_reply_voice"): # voice_reply_voice (mirror reply) or always_reply_voice is on.
if "desire_rtype" not in context and (
conf().get("voice_reply_voice") or conf().get("always_reply_voice")
):
context["desire_rtype"] = ReplyType.VOICE context["desire_rtype"] = ReplyType.VOICE
return context return context

View File

@@ -1294,3 +1294,76 @@
overflow: hidden; overflow: hidden;
min-height: 2.5em; /* ~2 lines at text-sm leading-relaxed */ min-height: 2.5em; /* ~2 lines at text-sm leading-relaxed */
} }
/* --------------------------------------------------------------------
 * Voice pill — compact custom audio player used by mic uploads and TTS
 * replies. Replaces the bulky native <audio controls> with a play/pause
 * icon + thin progress bar + duration counter so it blends into chat
 * bubbles without the chrome-grey browser default look.
 *
 * Parts, in layout order:
 *   .voice-pill        pill-shaped flex container
 *   .voice-pill-btn    round play/pause button
 *   .voice-pill-track  progress rail containing .voice-pill-fill
 *   .voice-pill-time   time counter
 *   audio              the real media element, kept hidden
 * ------------------------------------------------------------------ */
/* Container: inline flex row, fully rounded, capped at 240px wide. */
.voice-pill {
display: inline-flex;
align-items: center;
gap: 8px;
padding: 6px 10px;
border-radius: 999px;
background: rgba(15, 23, 42, 0.05);
color: rgb(71, 85, 105);
font-size: 12px;
line-height: 1;
max-width: 240px;
user-select: none;
cursor: default;
}
/* Dark-theme colours for the container (ancestor carries `.dark`). */
.dark .voice-pill {
background: rgba(255, 255, 255, 0.08);
color: rgb(203, 213, 225);
}
/* Dimmed while the element carries data-loading="1". */
.voice-pill[data-loading="1"] {
opacity: 0.65;
}
/* Round 22px button in the primary colour; never shrinks in the flex row. */
.voice-pill-btn {
width: 22px;
height: 22px;
border-radius: 999px;
display: inline-flex;
align-items: center;
justify-content: center;
background: var(--color-primary-500, #2563eb);
color: #fff;
flex-shrink: 0;
cursor: pointer;
transition: transform 0.1s ease;
}
/* Slight grow on hover. */
.voice-pill-btn:hover { transform: scale(1.05); }
/* Icon size, plus a per-state horizontal offset keyed off data-state
 * (presumably to optically centre the play triangle — confirm visually). */
.voice-pill-btn i { font-size: 9px; margin-left: 1px; }
.voice-pill-btn[data-state="play"] i { margin-left: 2px; }
.voice-pill-btn[data-state="pause"] i { margin-left: 0; }
/* Progress rail: takes the remaining width, at least 70px; clips the fill. */
.voice-pill-track {
flex: 1;
height: 3px;
border-radius: 999px;
background: rgba(100, 116, 139, 0.25);
overflow: hidden;
min-width: 70px;
}
/* Dark-theme rail colour. */
.dark .voice-pill-track {
background: rgba(148, 163, 184, 0.25);
}
/* Progress fill: starts at 0% width; width changes animate linearly. */
.voice-pill-fill {
height: 100%;
width: 0%;
background: var(--color-primary-500, #2563eb);
border-radius: inherit;
transition: width 0.1s linear;
}
/* Time counter: tabular digits and a minimum width keep it from jittering
 * as the digits change. */
.voice-pill-time {
font-variant-numeric: tabular-nums;
font-size: 11px;
color: inherit;
opacity: 0.75;
flex-shrink: 0;
min-width: 28px;
text-align: right;
}
/* The underlying <audio> element is never shown. */
.voice-pill audio { display: none; }

View File

@@ -25,6 +25,7 @@ const I18N = {
models_add_vendor: '添加厂商', models_add_vendor: '添加厂商',
models_provider: '厂商', models_provider: '厂商',
models_model: '模型', models_model: '模型',
models_voice: '音色',
models_configured: '已配置', models_configured: '已配置',
models_not_configured: '未配置', models_not_configured: '未配置',
models_pick_to_configure: '选择以配置', models_pick_to_configure: '选择以配置',
@@ -160,6 +161,11 @@ const I18N = {
mic_permission_denied: '无法访问麦克风,请检查浏览器权限', mic_permission_denied: '无法访问麦克风,请检查浏览器权限',
mic_too_short: '录音太短,请重试', mic_too_short: '录音太短,请重试',
mic_error: '语音识别失败', mic_error: '语音识别失败',
speak_msg: '朗读这段回复',
voice_reply_mode_label: '语音回复策略',
voice_reply_off: '关闭',
voice_reply_if_voice: '仅语音问/语音答',
voice_reply_always: '总是语音回复',
attach_menu_folder: '上传文件夹', attach_menu_folder: '上传文件夹',
confirm_yes: '确认', confirm_yes: '确认',
confirm_cancel: '取消', confirm_cancel: '取消',
@@ -180,6 +186,7 @@ const I18N = {
models_add_vendor: 'Add Vendor', models_add_vendor: 'Add Vendor',
models_provider: 'Provider', models_provider: 'Provider',
models_model: 'Model', models_model: 'Model',
models_voice: 'Voice',
models_configured: 'configured', models_configured: 'configured',
models_not_configured: 'not configured', models_not_configured: 'not configured',
models_pick_to_configure: 'pick to configure', models_pick_to_configure: 'pick to configure',
@@ -315,6 +322,11 @@ const I18N = {
mic_permission_denied: 'Cannot access microphone — check browser permissions', mic_permission_denied: 'Cannot access microphone — check browser permissions',
mic_too_short: 'Recording too short, please retry', mic_too_short: 'Recording too short, please retry',
mic_error: 'Speech recognition failed', mic_error: 'Speech recognition failed',
speak_msg: 'Read this reply aloud',
voice_reply_mode_label: 'Voice reply policy',
voice_reply_off: 'Off',
voice_reply_if_voice: 'Voice only if voice input',
voice_reply_always: 'Always reply with voice',
attach_menu_folder: 'Upload Folder', attach_menu_folder: 'Upload Folder',
confirm_yes: 'Confirm', confirm_yes: 'Confirm',
confirm_cancel: 'Cancel', confirm_cancel: 'Cancel',
@@ -1474,6 +1486,7 @@ function sendVoiceMessage(text, audioUrl) {
message: text, message: text,
stream: true, stream: true,
timestamp: timestamp.toISOString(), timestamp: timestamp.toISOString(),
is_voice: true,
}; };
const MAX_RETRIES = 2; const MAX_RETRIES = 2;
@@ -1512,19 +1525,19 @@ function sendVoiceMessage(text, audioUrl) {
function addUserVoiceMessage(audioUrl, caption, timestamp) { function addUserVoiceMessage(audioUrl, caption, timestamp) {
const el = document.createElement('div'); const el = document.createElement('div');
el.className = 'flex justify-end px-4 sm:px-6 py-3'; el.className = 'flex justify-end px-4 sm:px-6 py-3';
// Voice-message bubble: playable <audio> on top, ASR caption beneath. // Voice-message bubble: compact voice pill on top, ASR caption beneath.
// The bubble keeps the same primary tint as a normal user message so // The bubble keeps the same primary tint as a normal user message so
// it visually slots into the conversation flow. // it visually slots into the conversation flow.
el.innerHTML = ` el.innerHTML = `
<div class="max-w-[75%] sm:max-w-[60%]"> <div class="max-w-[75%] sm:max-w-[60%]">
<div class="bg-slate-100 dark:bg-white/10 text-slate-700 dark:text-slate-200 rounded-2xl px-3 py-2 msg-content user-bubble"> <div class="bg-slate-100 dark:bg-white/10 text-slate-700 dark:text-slate-200 rounded-2xl px-3 py-2 msg-content user-bubble">
<audio controls preload="metadata" src="${audioUrl}" <div class="user-voice-slot"></div>
class="block w-[260px] max-w-full h-9"></audio>
${caption ? `<div class="text-xs mt-1.5 leading-snug text-slate-500 dark:text-slate-400 whitespace-pre-wrap break-words">${escapeHtml(caption)}</div>` : ''} ${caption ? `<div class="text-xs mt-1.5 leading-snug text-slate-500 dark:text-slate-400 whitespace-pre-wrap break-words">${escapeHtml(caption)}</div>` : ''}
</div> </div>
<div class="text-xs text-slate-400 dark:text-slate-500 mt-1.5 text-right">${formatTime(timestamp)}</div> <div class="text-xs text-slate-400 dark:text-slate-500 mt-1.5 text-right">${formatTime(timestamp)}</div>
</div> </div>
`; `;
el.querySelector('.user-voice-slot').appendChild(renderVoicePill(audioUrl));
messagesDiv.appendChild(el); messagesDiv.appendChild(el);
_autoScrollEnabled = true; _autoScrollEnabled = true;
scrollChatToBottom(true); scrollChatToBottom(true);
@@ -1639,12 +1652,16 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
<div class="agent-steps"></div> <div class="agent-steps"></div>
<div class="answer-content sse-streaming"></div> <div class="answer-content sse-streaming"></div>
<div class="media-content"></div> <div class="media-content"></div>
<div class="bot-audio-slot"></div>
</div> </div>
<div class="flex items-center gap-2 mt-1.5"> <div class="flex items-center gap-2 mt-1.5">
<span class="text-xs text-slate-400 dark:text-slate-500">${formatTime(timestamp)}</span> <span class="text-xs text-slate-400 dark:text-slate-500">${formatTime(timestamp)}</span>
<button class="copy-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${currentLang === 'zh' ? '复制' : 'Copy'}" style="display:none"> <button class="copy-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${currentLang === 'zh' ? '复制' : 'Copy'}" style="display:none">
<i class="fas fa-copy"></i> <i class="fas fa-copy"></i>
</button> </button>
<button class="speak-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${t('speak_msg')}" style="display:none;">
<i class="fas fa-volume-up"></i>
</button>
</div> </div>
</div> </div>
`; `;
@@ -1856,11 +1873,12 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
scrollChatToBottom(); scrollChatToBottom();
} else if (item.type === 'done') { } else if (item.type === 'done') {
// Don't close the stream yet: the backend keeps it open
// for a short tail to deliver async attachments such as
// TTS audio (`voice_attach`). It will close the stream on
// its own via onerror once the tail expires.
done = true; done = true;
es.close();
delete activeStreams[requestId];
// item.content may be empty when "done" is only a stream-close signal after media.
const finalText = item.content || accumulatedText; const finalText = item.content || accumulatedText;
if (!botEl && finalText) { if (!botEl && finalText) {
@@ -1874,6 +1892,7 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
if (copyBtn && finalText) copyBtn.style.display = ''; if (copyBtn && finalText) copyBtn.style.display = '';
applyHighlighting(botEl); applyHighlighting(botEl);
} }
renderBotSpeakerButton(botEl, finalText);
scrollChatToBottom(); scrollChatToBottom();
if (titleInfo) { if (titleInfo) {
@@ -1883,6 +1902,15 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
loadSessionList(); loadSessionList();
} }
} else if (item.type === 'voice_attach') {
// TTS finished — attach a playable audio element to the
// current bot bubble. The stream closes right after.
if (botEl && item.url) {
attachAudioToBotBubble(botEl, item.url, { autoplay: true });
}
es.close();
delete activeStreams[requestId];
} else if (item.type === 'error') { } else if (item.type === 'error') {
done = true; done = true;
es.close(); es.close();
@@ -1896,7 +1924,10 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
es.close(); es.close();
delete activeStreams[requestId]; delete activeStreams[requestId];
if (done) return; if (done) {
// Normal close after the post-done tail expired; nothing to do.
return;
}
if (currentReasoningEl) { if (currentReasoningEl) {
finalizeThinking(currentReasoningEl, reasoningStartTime, reasoningText); finalizeThinking(currentReasoningEl, reasoningStartTime, reasoningText);
@@ -2187,21 +2218,174 @@ function createBotMessageEl(content, timestamp, requestId, msg) {
<div class="bg-white dark:bg-[#1A1A1A] border border-slate-200 dark:border-white/10 rounded-2xl px-4 py-3 text-sm leading-relaxed msg-content text-slate-700 dark:text-slate-200"> <div class="bg-white dark:bg-[#1A1A1A] border border-slate-200 dark:border-white/10 rounded-2xl px-4 py-3 text-sm leading-relaxed msg-content text-slate-700 dark:text-slate-200">
${stepsHtml ? `<div class="agent-steps">${stepsHtml}</div>` : ''} ${stepsHtml ? `<div class="agent-steps">${stepsHtml}</div>` : ''}
<div class="answer-content">${renderMarkdown(displayContent)}</div> <div class="answer-content">${renderMarkdown(displayContent)}</div>
<div class="bot-audio-slot"></div>
</div> </div>
<div class="flex items-center gap-2 mt-1.5"> <div class="flex items-center gap-2 mt-1.5">
<span class="text-xs text-slate-400 dark:text-slate-500">${formatTime(timestamp)}</span> <span class="text-xs text-slate-400 dark:text-slate-500">${formatTime(timestamp)}</span>
<button class="copy-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${currentLang === 'zh' ? '复制' : 'Copy'}"> <button class="copy-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${currentLang === 'zh' ? '复制' : 'Copy'}">
<i class="fas fa-copy"></i> <i class="fas fa-copy"></i>
</button> </button>
<button class="speak-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${t('speak_msg')}" style="display:none;">
<i class="fas fa-volume-up"></i>
</button>
</div> </div>
</div> </div>
`; `;
el.querySelector('.answer-content').dataset.rawMd = displayContent; el.querySelector('.answer-content').dataset.rawMd = displayContent;
// Existing TTS attachment (history replay): mount the player up-front.
const existingAudio = msg && msg.extras && msg.extras.audio && msg.extras.audio.url;
if (existingAudio) {
attachAudioToBotBubble(el, existingAudio, { autoplay: false });
}
renderBotSpeakerButton(el, displayContent);
applyHighlighting(el); applyHighlighting(el);
bindChatKnowledgeLinks(el); bindChatKnowledgeLinks(el);
return el; return el;
} }
// Mount a voice pill for `audioUrl` in the bubble's `.bot-audio-slot`,
// replacing anything the slot held before, and hide the bubble's manual
// "read aloud" button. Shared by live TTS pushes and history replay.
// `opts.autoplay` (optional) is forwarded to the pill. Never throws: any
// error is swallowed.
function attachAudioToBotBubble(botEl, audioUrl, opts) {
    try {
        const slot = (botEl && audioUrl) ? botEl.querySelector('.bot-audio-slot') : null;
        if (!slot) return;

        const autoplay = Boolean(opts && opts.autoplay);
        slot.innerHTML = '';
        slot.style.marginTop = '6px';
        slot.appendChild(renderVoicePill(audioUrl, { autoplay }));

        // Audio is now available, so the on-demand TTS button is redundant.
        const speakBtn = botEl.querySelector('.speak-msg-btn');
        if (speakBtn) {
            speakBtn.style.display = 'none';
        }
    } catch (_) { /* silent */ }
}
// Build a compact play/pause + progress + time pill that wraps a hidden
// <audio>. Returns the detached root element; safe to embed anywhere.
//
//   audioUrl       URL of the clip to play.
//   opts.autoplay  (optional) try to start playback as soon as the clip
//                  can play; a blocked autoplay is ignored silently.
//
// The counter shows the total duration once metadata is loaded and the
// remaining time while playing.
function renderVoicePill(audioUrl, opts) {
    opts = opts || {};
    const wrap = document.createElement('div');
    wrap.className = 'voice-pill';
    // Static markup only. The URL is deliberately NOT interpolated into this
    // template: a value containing `"` or `<` would break out of the
    // attribute and inject markup. It is assigned as a property below.
    wrap.innerHTML = `
        <button type="button" class="voice-pill-btn" data-state="play" aria-label="play">
            <i class="fas fa-play"></i>
        </button>
        <div class="voice-pill-track"><div class="voice-pill-fill"></div></div>
        <span class="voice-pill-time">0:00</span>
        <audio preload="metadata"></audio>
    `;
    const btn = wrap.querySelector('.voice-pill-btn');
    const fill = wrap.querySelector('.voice-pill-fill');
    const timeEl = wrap.querySelector('.voice-pill-time');
    const audio = wrap.querySelector('audio');

    // Seconds -> "m:ss"; non-finite or negative input renders as 0:00.
    const fmt = (s) => {
        if (!isFinite(s) || s < 0) s = 0;
        const m = Math.floor(s / 60);
        const r = Math.floor(s % 60);
        return `${m}:${r < 10 ? '0' : ''}${r}`;
    };
    // Keep the button's icon, data-state and aria-label in sync.
    const setIcon = (state) => {
        btn.dataset.state = state;
        btn.querySelector('i').className = state === 'pause' ? 'fas fa-pause' : 'fas fa-play';
        btn.setAttribute('aria-label', state === 'pause' ? 'pause' : 'play');
    };

    audio.addEventListener('loadedmetadata', () => {
        if (audio.duration && isFinite(audio.duration)) timeEl.textContent = fmt(audio.duration);
    });
    audio.addEventListener('timeupdate', () => {
        const dur = audio.duration || 0;
        if (dur > 0) {
            fill.style.width = `${Math.min(100, (audio.currentTime / dur) * 100)}%`;
            timeEl.textContent = fmt(dur - audio.currentTime);
        }
    });
    audio.addEventListener('ended', () => {
        // Rewind the visuals so the pill is ready for another play.
        setIcon('play');
        fill.style.width = '0%';
        timeEl.textContent = fmt(audio.duration || 0);
    });
    // Drive the icon from media events so it also tracks autoplay and
    // playback that stops on its own.
    audio.addEventListener('play', () => setIcon('pause'));
    audio.addEventListener('pause', () => setIcon('play'));

    btn.addEventListener('click', (e) => {
        // Don't let the click reach handlers on the surrounding bubble.
        e.stopPropagation();
        if (audio.paused) {
            audio.play().catch(() => {});
        } else {
            audio.pause();
        }
    });

    // Set the source via the DOM property (no HTML parsing of the URL), and
    // only after the listeners are attached so no media event is missed.
    audio.src = audioUrl;

    if (opts.autoplay) {
        // Autoplay may be blocked by the browser; fall back silently and
        // let the user tap the play button.
        const tryPlay = () => audio.play().catch(() => {});
        if (audio.readyState >= 2) tryPlay();
        else audio.addEventListener('canplay', tryPlay, { once: true });
    }
    return wrap;
}
// Reveal the bubble's manual "read aloud" button and wire it to on-demand
// TTS. Does nothing when the text is blank, the button is missing, or the
// bubble already has audio mounted. The button only appears after
// `_isTtsReady()` resolves truthy, so it is never offered when no TTS
// provider is reported.
function renderBotSpeakerButton(botEl, text) {
    if (!botEl) return;
    if (!text || !text.trim()) return;

    const speakBtn = botEl.querySelector('.speak-msg-btn');
    if (!speakBtn) return;

    const alreadyHasAudio = botEl.querySelector('.bot-audio-slot audio');
    if (alreadyHasAudio) return;

    _isTtsReady().then((ready) => {
        if (ready) {
            speakBtn.style.display = '';
            speakBtn.onclick = () => _triggerManualTts(speakBtn, botEl, text);
        }
    });
}
// Memoized result of the TTS capability probe and the time it was started.
let _ttsReadyPromise = null;
let _ttsReadyTs = 0;

// Resolve to true when /api/models reports a current or suggested TTS
// provider, false otherwise (including when the request fails).
function _isTtsReady() {
    // Reuse the in-flight/settled probe for 30s so rendering many bubbles
    // does not hit /api/models once per bubble.
    const cached = _ttsReadyPromise;
    if (cached && Date.now() - _ttsReadyTs < 30000) return cached;

    const hasProvider = (payload) => {
        const ttsCap = payload && payload.capabilities && payload.capabilities.tts;
        return ttsCap
            ? Boolean(ttsCap.current_provider || ttsCap.suggested_provider)
            : false;
    };

    _ttsReadyTs = Date.now();
    _ttsReadyPromise = fetch('/api/models')
        .then(resp => resp.json())
        .then(hasProvider)
        .catch(() => false);
    return _ttsReadyPromise;
}
// Request speech for `text` from /api/voice/tts and, on success, attach the
// returned audio to `botEl` with autoplay. `btn.dataset.busy` guards against
// overlapping requests; the button icon shows a spinner while one is running.
function _triggerManualTts(btn, botEl, text) {
    if (btn.dataset.busy === '1') return;
    btn.dataset.busy = '1';

    const iconEl = btn.querySelector('i');
    const idleClass = iconEl ? iconEl.className : '';
    if (iconEl) iconEl.className = 'fas fa-spinner fa-spin';

    // Runs on success and failure alike: clear the guard, restore the icon.
    const restore = () => {
        btn.dataset.busy = '0';
        if (iconEl) iconEl.className = idleClass || 'fas fa-volume-up';
    };
    const request = {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text, session_id: sessionId }),
    };

    fetch('/api/voice/tts', request)
        .then(resp => resp.json())
        .then(result => {
            const ok = result && result.status === 'success' && result.audio_url;
            if (ok) attachAudioToBotBubble(botEl, result.audio_url, { autoplay: true });
        })
        .catch(() => {})
        .finally(restore);
}
function addUserMessage(content, timestamp, attachments) { function addUserMessage(content, timestamp, attachments) {
const el = createUserMessageEl(content, timestamp, attachments); const el = createUserMessageEl(content, timestamp, attachments);
messagesDiv.appendChild(el); messagesDiv.appendChild(el);
@@ -3842,14 +4026,39 @@ function renderCapabilityBody(def, cap, body) {
body.innerHTML = providerHtml + modelHtml + dimHtml + footer; body.innerHTML = providerHtml + modelHtml + dimHtml + footer;
// The body subtree is detached from `document` at this moment (the parent // TTS: mount reply-mode above provider; defer off-mode toggle to the end.
// wrap is not yet appended), so we must scope lookups to `body` rather if (def.id === 'tts') {
// than calling document.getElementById, which would return null and crash renderVoiceReplyMode(body, cap.reply_mode || 'off', { skipVisibilityToggle: true });
// initDropdown's internal querySelector. // Voice-timbre picker depends on provider+model; rebuilt by callbacks.
const modelWrap = body.querySelector(`#cap-${def.id}-model-wrap`);
if (modelWrap) {
const voiceWrap = document.createElement('div');
voiceWrap.id = `cap-${def.id}-voice-wrap`;
voiceWrap.innerHTML = `
<label class="block text-sm font-medium text-slate-600 dark:text-slate-400 mb-1.5">${t('models_voice')}</label>
<div id="cap-${def.id}-voice" class="cfg-dropdown" tabindex="0">
<div class="cfg-dropdown-selected">
<span class="cfg-dropdown-text">--</span>
<i class="fas fa-chevron-down cfg-dropdown-arrow"></i>
</div>
<div class="cfg-dropdown-menu"></div>
</div>
<div id="cap-${def.id}-voice-custom-wrap" class="hidden mt-2">
<input id="cap-${def.id}-voice-custom" type="text"
class="w-full px-3 py-2 text-sm rounded-md border border-slate-200 dark:border-slate-700
bg-white dark:bg-slate-800 text-slate-700 dark:text-slate-200
placeholder:text-slate-400 dark:placeholder:text-slate-500
focus:outline-none focus:ring-2 focus:ring-primary-500"
placeholder="voice id" />
</div>
`;
modelWrap.parentNode.insertBefore(voiceWrap, modelWrap.nextSibling);
}
}
// `body` is still detached from `document`; scope lookups locally.
const provDd = body.querySelector(`#cap-${def.id}-provider`); const provDd = body.querySelector(`#cap-${def.id}-provider`);
// initDropdown's option shape is {value, label}; we strip our private // Strip private fields before handing to the generic initDropdown helper.
// _configured/_tracked fields before handing it over so the helper stays
// generic, then re-attach status decorations afterwards.
const ddOpts = providerOpts.map(o => ({ value: o.value, label: o.label })); const ddOpts = providerOpts.map(o => ({ value: o.value, label: o.label }));
let pendingProvider = null; let pendingProvider = null;
@@ -3860,15 +4069,9 @@ function renderCapabilityBody(def, cap, body) {
pendingCapabilitySelection = null; pendingCapabilitySelection = null;
} }
// For auto-capable capabilities, an "auto" strategy means the user has // Auto strategy => leave empty sentinel selected. `suggested_provider`
// not pinned a vendor; we honor that by selecting the empty-string // is a UI-only preselect (not persisted until the user clicks Save).
// sentinel rather than the resolved fallback provider name. // No current + no suggestion => leave unselected with a placeholder.
// `suggested_provider` is a UI-only preselect (used by embedding & ASR)
// when the user has not pinned a vendor yet — purely cosmetic, not
// persisted until the user clicks Save.
// For "pick or empty" capabilities (no current, no suggestion), we leave
// the dropdown unselected and show a muted placeholder so the user is
// nudged to pick explicitly.
const noSelectionAndNoHint = !cap.current_provider && !cap.suggested_provider; const noSelectionAndNoHint = !cap.current_provider && !cap.suggested_provider;
const initialProviderValue = pendingProvider const initialProviderValue = pendingProvider
? pendingProvider ? pendingProvider
@@ -3889,20 +4092,82 @@ function renderCapabilityBody(def, cap, body) {
if (def.needsModel) { if (def.needsModel) {
rebuildCapabilityModelDropdown(def, initialProviderValue, cap.current_model || '', body); rebuildCapabilityModelDropdown(def, initialProviderValue, cap.current_model || '', body);
// Hide the model picker entirely while the capability is in `auto` // Hide model picker in auto mode — fallback hint below covers it.
// mode — there is nothing useful to pin, and the fallback hint
// below explains what'll actually run.
setCapabilityModelPickerVisible(def, initialProviderValue !== '' || !capabilitySupportsAuto(def.id), body); setCapabilityModelPickerVisible(def, initialProviderValue !== '' || !capabilitySupportsAuto(def.id), body);
} }
if (def.id === 'tts') {
rebuildCapabilityVoiceDropdown(
initialProviderValue,
cap.current_voice || '',
body,
cap.current_model || ''
);
}
// Inject auto/router-pending hint banners before the action footer. // Inject auto/router-pending hint banners before the action footer.
renderCapabilityHints(def, cap, body, initialProviderValue); renderCapabilityHints(def, cap, body, initialProviderValue);
if (def.id === 'tts') {
_setTtsConfigVisible(body, (cap.reply_mode || 'off') !== 'off');
}
} }
// TTS reply-policy dropdown (off / voice_if_voice / always). Persists on
// change. When off, hides the rest of the TTS card.
//
// host:        container the dropdown is prepended to.
// currentMode: initially selected mode; anything unrecognized becomes 'off'.
// options:     optional; `skipVisibilityToggle` suppresses the initial
//              show/hide pass over the host's other children.
function renderVoiceReplyMode(host, currentMode, options) {
    const settings = options || {};
    const MODES = ['off', 'voice_if_voice', 'always'];
    const choices = [
        { value: 'off', label: t('voice_reply_off') },
        { value: 'voice_if_voice', label: t('voice_reply_if_voice') },
        { value: 'always', label: t('voice_reply_always') },
    ];

    const container = document.createElement('div');
    container.id = 'voice-reply-mode-wrap';
    container.innerHTML = `
        <label class="block text-sm font-medium text-slate-600 dark:text-slate-400 mb-1.5">${t('voice_reply_mode_label')}</label>
        <div id="voice-reply-mode-dd" class="cfg-dropdown" tabindex="0">
            <div class="cfg-dropdown-selected">
                <span class="cfg-dropdown-text">--</span>
                <i class="fas fa-chevron-down cfg-dropdown-arrow"></i>
            </div>
            <div class="cfg-dropdown-menu"></div>
        </div>
    `;
    host.prepend(container);

    const dropdown = container.querySelector('#voice-reply-mode-dd');
    const startMode = MODES.includes(currentMode) ? currentMode : 'off';
    if (!settings.skipVisibilityToggle) {
        _setTtsConfigVisible(host, startMode !== 'off');
    }

    // Save the chosen mode; failures are swallowed, success drops the cached
    // TTS-readiness probe so the next bubble re-checks it.
    const persistMode = (mode) => {
        fetch('/api/models', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ action: 'set_voice_reply_mode', mode }),
        })
            .then(resp => resp.json())
            .then(result => {
                if (result && result.status === 'success') {
                    _ttsReadyPromise = null; // force re-probe on next bubble
                }
            })
            .catch(() => {});
    };

    initDropdown(dropdown, choices, startMode, (mode) => {
        if (!MODES.includes(mode)) return;
        _setTtsConfigVisible(host, mode !== 'off');
        persistMode(mode);
    });
}
// Show/hide everything in the TTS card below the reply-mode dropdown.
// The reply-mode wrapper (#voice-reply-mode-wrap) is skipped and keeps its
// current visibility; every other direct child gets `hidden` toggled.
function _setTtsConfigVisible(host, visible) {
    if (!host) return;
    const hide = !visible;
    for (const node of Array.from(host.children)) {
        if (node.id !== 'voice-reply-mode-wrap') {
            node.classList.toggle('hidden', hide);
        }
    }
}
// Toggle wrapper visibility instead of re-rendering so dropdown state survives.
function setCapabilityModelPickerVisible(def, visible, scope) { function setCapabilityModelPickerVisible(def, visible, scope) {
const root = scope || document; const root = scope || document;
const wrap = root.querySelector(`#cap-${def.id}-model-wrap`); const wrap = root.querySelector(`#cap-${def.id}-model-wrap`);
@@ -4135,7 +4400,7 @@ function rebuildCapabilityModelDropdown(def, providerId, selectedModel, scope) {
initDropdown(el, opts, initialValue, (value) => { initDropdown(el, opts, initialValue, (value) => {
const customWrap = document.getElementById(`cap-${def.id}-model-custom-wrap`); const customWrap = document.getElementById(`cap-${def.id}-model-custom-wrap`);
if (!customWrap) return; if (customWrap) {
if (value === '__custom__') { if (value === '__custom__') {
customWrap.classList.remove('hidden'); customWrap.classList.remove('hidden');
const input = document.getElementById(`cap-${def.id}-model-custom`); const input = document.getElementById(`cap-${def.id}-model-custom`);
@@ -4143,6 +4408,14 @@ function rebuildCapabilityModelDropdown(def, providerId, selectedModel, scope) {
} else { } else {
customWrap.classList.add('hidden'); customWrap.classList.add('hidden');
} }
}
// TTS voice catalog may be scoped per engine model (aggregating
// gateways). Rebuild the voice picker whenever the model changes.
if (def.id === 'tts') {
const provDd = document.getElementById('cap-tts-provider');
const provId = provDd ? getDropdownValue(provDd) : '';
rebuildCapabilityVoiceDropdown(provId, '', null, value);
}
}); });
const customWrap = root.querySelector(`#cap-${def.id}-model-custom-wrap`); const customWrap = root.querySelector(`#cap-${def.id}-model-custom-wrap`);
@@ -4157,22 +4430,93 @@ function rebuildCapabilityModelDropdown(def, providerId, selectedModel, scope) {
} }
} }
// TTS-only: rebuild the voice timbre picker against the provider's
// curated voice list. Hidden when the resolved list is empty (no provider,
// or no voices known for the provider / engine model).
//
// Each voice entry may be:
//   - a bare string (code = label)
//   - {value, label, hint?} so a friendly name can be shown while the raw
//     API code is what gets selected.
//
// providerId:    provider whose voices are listed.
// selectedVoice: voice to preselect; a value outside the catalog selects the
//                "custom" option and pre-fills the free-text input.
// scope:         root for element lookups; defaults to `document`.
// modelId:       engine model used when the provider's voices are keyed by
//                model; falls back to the current #cap-tts-model value.
function rebuildCapabilityVoiceDropdown(providerId, selectedVoice, scope, modelId) {
    const root = scope || document;
    const wrap = root.querySelector('#cap-tts-voice-wrap');
    const el = root.querySelector('#cap-tts-voice');
    if (!wrap || !el) return;

    const ttsCap = modelsState.capabilities.tts || {};
    const catalog = ttsCap.provider_voices || {};
    let voices = (providerId && catalog[providerId]) || [];

    // Some providers (gateways) scope voices by engine model id.
    if (voices && !Array.isArray(voices) && typeof voices === 'object') {
        let activeModel = modelId;
        if (!activeModel) {
            const modelDd = root.querySelector('#cap-tts-model');
            activeModel = modelDd ? getDropdownValue(modelDd) : '';
        }
        voices = (activeModel && voices[activeModel]) || [];
    }

    if (!voices || voices.length === 0) {
        wrap.classList.add('hidden');
        return;
    }
    wrap.classList.remove('hidden');

    // Friendly name as the label, raw API code as the secondary hint.
    // The option value is always the raw code.
    const toOption = (entry) => {
        if (typeof entry === 'string') return { value: entry, label: entry };
        const code = entry.value;
        const friendly = entry.hint || entry.label || code;
        return { value: code, label: friendly, hint: friendly === code ? '' : code };
    };
    const opts = voices.map(toOption);
    const codes = opts.map(opt => opt.value);
    opts.push({ value: '__custom__', label: currentLang === 'zh' ? '自定义...' : 'Custom...' });

    // Off-catalog values route through the custom branch.
    const offCatalog = Boolean(selectedVoice) && !codes.includes(selectedVoice);
    let initial;
    if (offCatalog) {
        initial = '__custom__';
    } else {
        initial = selectedVoice || codes[0];
    }

    // Show the free-text input only for the custom option. `overwrite`
    // distinguishes the initial render (always reset the text) from a user
    // selection (keep whatever was already typed).
    const syncCustomInput = (picked, overwrite) => {
        const customWrap = root.querySelector('#cap-tts-voice-custom-wrap');
        if (!customWrap) return;
        if (picked !== '__custom__') {
            customWrap.classList.add('hidden');
            return;
        }
        customWrap.classList.remove('hidden');
        const input = root.querySelector('#cap-tts-voice-custom');
        if (input && (overwrite || !input.value)) {
            input.value = offCatalog ? selectedVoice : '';
        }
    };

    initDropdown(el, opts, initial, (value) => {
        syncCustomInput(value, false);
    });
    syncCustomInput(initial, true);
}
function onCapabilityProviderChange(def, providerId, scope) { function onCapabilityProviderChange(def, providerId, scope) {
if (def.needsModel) { if (def.needsModel) {
// For capabilities that support `auto`, switching to the empty // Empty sentinel hides the model picker (capability is in auto mode).
// sentinel hides the model picker entirely so the card reads as
// "we'll figure it out"; switching back to a real vendor re-runs
// the rebuild against the capability-scoped model list.
const isAuto = providerId === '' && capabilitySupportsAuto(def.id); const isAuto = providerId === '' && capabilitySupportsAuto(def.id);
if (!isAuto) { if (!isAuto) {
rebuildCapabilityModelDropdown(def, providerId, '', scope); rebuildCapabilityModelDropdown(def, providerId, '', scope);
} }
setCapabilityModelPickerVisible(def, !isAuto, scope); setCapabilityModelPickerVisible(def, !isAuto, scope);
} }
// Refresh the auto-hint so it disappears once the user pins a vendor if (def.id === 'tts') {
// and reappears when they swing back to "auto". renderCapabilityHints rebuildCapabilityVoiceDropdown(providerId, '', scope);
// now writes directly into the footer's hint slot, so we just call it }
// again — no need to clean up stale DOM nodes.
const body = scope || document.querySelector(`[data-cap-body="${def.id}"]`); const body = scope || document.querySelector(`[data-cap-body="${def.id}"]`);
if (body) { if (body) {
const cap = modelsState.capabilities[def.id] || {}; const cap = modelsState.capabilities[def.id] || {};
@@ -4202,6 +4546,16 @@ function saveCapability(capId) {
// the backend treats this as "fall back to the runtime chain". // the backend treats this as "fall back to the runtime chain".
const isAuto = provider === '' && capabilitySupportsAuto(capId); const isAuto = provider === '' && capabilitySupportsAuto(capId);
const model = isAuto ? '' : getCapabilityModelValue(def); const model = isAuto ? '' : getCapabilityModelValue(def);
// TTS carries an extra voice timbre (supports free-text custom ids).
let voice = '';
if (capId === 'tts' && !isAuto) {
const voiceDd = document.getElementById(`cap-${capId}-voice`);
voice = voiceDd ? getDropdownValue(voiceDd) : '';
if (voice === '__custom__') {
const input = document.getElementById(`cap-${capId}-voice-custom`);
voice = input ? input.value.trim() : '';
}
}
// Embedding changes invalidate any pre-existing vector index because // Embedding changes invalidate any pre-existing vector index because
// dimensions / vendor differ. Gate the save behind a confirm, and on // dimensions / vendor differ. Gate the save behind a confirm, and on
@@ -4243,19 +4597,19 @@ function saveCapability(capId) {
return; return;
} }
} }
_persistCapability(capId, provider, model); _persistCapability(capId, provider, model, undefined, { voice });
} }
function _persistCapability(capId, provider, model, onAfterSuccess) { function _persistCapability(capId, provider, model, onAfterSuccess, extras) {
const payload = { action: 'set_capability', capability: capId, provider_id: provider, model: model };
if (extras && extras.voice !== undefined) payload.voice = extras.voice;
fetch('/api/models', { fetch('/api/models', {
method: 'POST', method: 'POST',
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ action: 'set_capability', capability: capId, provider_id: provider, model: model }), body: JSON.stringify(payload),
}).then(r => r.json()).then(data => { }).then(r => r.json()).then(data => {
if (data.status === 'success') { if (data.status === 'success') {
// Show "Saved" first, then refresh — loadModelsView would // Flash "Saved" before reload so the status survives the rebuild.
// otherwise rebuild the card and wipe the status span before
// the user can register the confirmation.
showStatus(`cap-${capId}-status`, 'models_save_success', false); showStatus(`cap-${capId}-status`, 'models_save_success', false);
setTimeout(() => { setTimeout(() => {
loadModelsView({ preserveScroll: true }); loadModelsView({ preserveScroll: true });

View File

@@ -6,6 +6,7 @@ import logging
import mimetypes import mimetypes
import os import os
import random import random
import shutil
import threading import threading
import time import time
import uuid import uuid
@@ -295,6 +296,12 @@ class WebChannel(ChatChannel):
"timestamp": time.time() "timestamp": time.time()
}) })
logger.debug(f"SSE done sent for request {request_id}") logger.debug(f"SSE done sent for request {request_id}")
# Auto-trigger TTS once the bot finishes its text reply. The
# synthesis runs in the background so the chat stream is never
# blocked; the resulting audio URL is pushed via a follow-up
# `voice_attach` SSE event and persisted to messages.extras.
if reply.type == ReplyType.TEXT and content.strip():
self._maybe_dispatch_auto_tts(request_id, session_id, content, context)
return return
# Fallback: polling mode # Fallback: polling mode
@@ -461,16 +468,133 @@ class WebChannel(ChatChannel):
return on_event return on_event
# ------------------------------------------------------------------
# TTS auto-dispatch
# ------------------------------------------------------------------
@staticmethod
def _resolve_voice_reply_mode() -> str:
    """Return the TTS auto-reply policy: ``"always"``, ``"voice_if_voice"`` or ``"off"``.

    The policy is derived from two config flags, checked in priority order:
    ``always_reply_voice`` wins over ``voice_reply_voice``; when neither is
    truthy the policy is ``"off"``. The web UI presents this pair as a
    single three-state picker.
    """
    flag_to_mode = (
        ("always_reply_voice", "always"),
        ("voice_reply_voice", "voice_if_voice"),
    )
    for flag, mode in flag_to_mode:
        if conf().get(flag, False):
            return mode
    return "off"
# Vendors probed, in this order, by _tts_provider_ready when no provider is
# pinned via `text_to_voice`; the first one with a usable API key counts.
# Mirror of ModelsHandler._TTS_PROVIDERS. zhipu is intentionally omitted
# from the UI (GLM-TTS prelude beep); pinning it in config.json still works.
_TTS_PROVIDERS_SUGGEST_ORDER = ["openai", "minimax", "dashscope", "linkai"]
@classmethod
def _tts_provider_ready(cls) -> bool:
    """Report whether some TTS provider can be used.

    Returns True when ``text_to_voice`` is set to a non-blank value, or when
    any vendor in ``_TTS_PROVIDERS_SUGGEST_ORDER`` has a non-blank,
    non-placeholder value in its configured API-key field.
    """
    pinned = (conf().get("text_to_voice") or "").strip()
    if pinned:
        return True

    # Template values shipped in sample configs do not count as real keys.
    placeholders = ("YOUR API KEY", "YOUR_API_KEY")
    for provider_id in cls._TTS_PROVIDERS_SUGGEST_ORDER:
        provider_meta = ConfigHandler.PROVIDER_MODELS.get(provider_id) or {}
        key_field = provider_meta.get("api_key_field")
        if key_field:
            api_key = (conf().get(key_field) or "").strip()
            if api_key and api_key not in placeholders:
                return True
    return False
def _maybe_dispatch_auto_tts(
    self,
    request_id: str,
    session_id: str,
    text: str,
    context: dict,
) -> None:
    """Start background TTS for a finished text reply when the policy allows it.

    Nothing is started when the reply mode is ``"off"``, when the mode is
    ``"voice_if_voice"`` and ``context`` has no truthy ``is_voice_input``, or
    when no TTS provider is ready. Otherwise ``_synthesize_tts_async`` runs
    on a daemon thread. Any exception is logged at debug level and swallowed.
    """
    try:
        policy = self._resolve_voice_reply_mode()
        blocked_by_policy = policy == "off" or (
            policy == "voice_if_voice" and not context.get("is_voice_input")
        )
        # Provider readiness is only probed once the policy says "speak".
        if blocked_by_policy or not self._tts_provider_ready():
            return
        worker = threading.Thread(
            target=self._synthesize_tts_async,
            args=(request_id, session_id, text),
            daemon=True,
        )
        worker.start()
    except Exception as e:
        logger.debug(f"[WebChannel] auto-tts dispatch skipped: {e}")
def _synthesize_tts_async(
    self,
    request_id: str,
    session_id: str,
    text: str,
) -> None:
    """Synthesize ``text`` to speech and deliver the audio URL.

    Steps, in order: run TTS through ``Bridge``, publish the resulting file
    via ``_publish_tts_audio``, attach the URL to the session's last
    assistant message (best effort), then push a ``voice_attach`` event onto
    the request's SSE queue if that queue still exists. Every failure is
    logged and swallowed; nothing is raised to the caller.
    """
    try:
        from bridge.bridge import Bridge

        voice_reply = Bridge().fetch_text_to_voice(text)
        produced_audio = (
            voice_reply is not None
            and voice_reply.type == ReplyType.VOICE
            and voice_reply.content
        )
        if not produced_audio:
            logger.warning(
                f"[WebChannel] TTS produced no audio for request {request_id}: reply={voice_reply}"
            )
            return

        audio_url = self._publish_tts_audio(voice_reply.content)
        if not audio_url:
            logger.warning(f"[WebChannel] TTS publish failed for request {request_id}")
            return

        # Persisting is best effort: a storage problem must not stop the
        # live event from being pushed below.
        extras = {"audio": {"url": audio_url, "kind": "tts"}}
        try:
            from agent.memory import get_conversation_store

            store = get_conversation_store()
            store.attach_extras_to_last_assistant(session_id, extras)
        except Exception as e:
            logger.debug(f"[WebChannel] tts persist skipped: {e}")

        sse_queue = self.sse_queues.get(request_id)
        if sse_queue is not None:
            sse_queue.put({
                "type": "voice_attach",
                "url": audio_url,
                "request_id": request_id,
                "timestamp": time.time(),
            })
            logger.info(f"[WebChannel] TTS voice_attach pushed for request {request_id}: {audio_url}")
            return

        logger.warning(
            f"[WebChannel] TTS ready but SSE queue already closed "
            f"for request {request_id} (url={audio_url})"
        )
    except Exception as e:
        # TTS failures are intentionally silent (no user-facing error).
        logger.warning(f"[WebChannel] TTS synthesis failed: {e}")
@staticmethod
def _publish_tts_audio(src_path: str) -> str:
    """Move a TTS file into uploads/ and return its public URL.

    Args:
        src_path: Path of the synthesized audio file. The file is moved, so
            it no longer exists at this location after a successful call.

    Returns:
        ``/uploads/<name>`` on success, or ``""`` when the source file is
        missing or the move fails. Exceptions are caught and logged.
    """
    try:
        if not src_path or not os.path.isfile(src_path):
            logger.warning(f"[WebChannel] publish_tts_audio missing source: {src_path!r}")
            return ""
        # Keep the source extension; fall back to .mp3 when there is none.
        ext = os.path.splitext(src_path)[1].lower() or ".mp3"
        upload_dir = _get_upload_dir()
        os.makedirs(upload_dir, exist_ok=True)
        ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        # The timestamp only has second resolution, so the suffix must be
        # collision-resistant: two replies published in the same second with
        # the same short random number would make shutil.move overwrite the
        # earlier reply's audio. A uuid4 fragment avoids that.
        dst_name = f"voice_reply_{ts}_{uuid.uuid4().hex[:8]}{ext}"
        dst_path = os.path.join(upload_dir, dst_name)
        shutil.move(src_path, dst_path)
        logger.debug(f"[WebChannel] publish_tts_audio moved {src_path} -> {dst_path}")
        return f"/uploads/{dst_name}"
    except Exception as e:
        logger.warning(f"[WebChannel] publish_tts_audio failed: {e}")
        return ""
@staticmethod @staticmethod
def _cleanup_stale_voice_recordings(max_age_seconds: int = 3600) -> None: def _cleanup_stale_voice_recordings(max_age_seconds: int = 3600) -> None:
"""Delete voice-input audio files older than `max_age_seconds`. """Drop voice_input_* uploads older than max_age_seconds (run at startup)."""
Called once at startup. Web mic recordings live in the upload
directory so the browser can replay them inside the conversation
bubble. We don't persist them to history, so once a process
restarts they're useless — but they're never auto-cleaned
anywhere else, so without this they accumulate over time.
"""
try: try:
upload_dir = _get_upload_dir() upload_dir = _get_upload_dir()
if not os.path.isdir(upload_dir): if not os.path.isdir(upload_dir):
@@ -619,6 +743,10 @@ class WebChannel(ChatChannel):
prompt = json_data.get('message', '') prompt = json_data.get('message', '')
use_sse = json_data.get('stream', True) use_sse = json_data.get('stream', True)
attachments = json_data.get('attachments', []) attachments = json_data.get('attachments', [])
# Tag the message as originating from voice input so the post-reply
# TTS hook can honour the `voice_if_voice` policy (mirrors the
# desire_rtype concept used by other channels).
is_voice_input = bool(json_data.get('is_voice', False))
# Append file references to the prompt (same format as QQ channel) # Append file references to the prompt (same format as QQ channel)
if attachments: if attachments:
@@ -669,6 +797,11 @@ class WebChannel(ChatChannel):
context["session_id"] = session_id context["session_id"] = session_id
context["receiver"] = session_id context["receiver"] = session_id
context["request_id"] = request_id context["request_id"] = request_id
if is_voice_input:
# Web channel runs its own TTS post-pipeline via
# _maybe_dispatch_auto_tts; don't set desire_rtype here or
# chat_channel would synthesize a duplicate VOICE reply.
context["is_voice_input"] = True
if use_sse: if use_sse:
context["on_event"] = self._make_sse_callback(request_id) context["on_event"] = self._make_sse_callback(request_id)
@@ -696,27 +829,39 @@ class WebChannel(ChatChannel):
q = self.sse_queues[request_id] q = self.sse_queues[request_id]
idle_timeout = 600 # 10 minutes without any real event idle_timeout = 600 # 10 minutes without any real event
deadline = time.time() + idle_timeout deadline = time.time() + idle_timeout
done = False # After the main reply is done we keep the stream open for a short
# tail so async post-processing (TTS auto-synthesis) can deliver a
# `voice_attach` event before the client disconnects.
POST_DONE_TAIL_SECONDS = 60
post_done = False
post_deadline = 0.0
try: try:
while time.time() < deadline: while time.time() < deadline:
try: try:
item = q.get(timeout=1) item = q.get(timeout=1)
except Empty: except Empty:
if post_done and time.time() >= post_deadline:
break
yield b": keepalive\n\n" yield b": keepalive\n\n"
continue continue
# Real event received, reset idle deadline
deadline = time.time() + idle_timeout deadline = time.time() + idle_timeout
payload = json.dumps(item, ensure_ascii=False) payload = json.dumps(item, ensure_ascii=False)
yield f"data: {payload}\n\n".encode("utf-8") yield f"data: {payload}\n\n".encode("utf-8")
if item.get("type") == "done": itype = item.get("type")
done = True if itype == "done":
break post_done = True
post_deadline = time.time() + POST_DONE_TAIL_SECONDS
elif itype == "voice_attach":
# WSGI buffers the previous chunk until the next yield;
# shrink the tail so the generator wakes up quickly to
# emit a couple of keepalive comments that push the
# voice_attach payload through to the browser.
post_done = True
post_deadline = time.time() + 2 # 2s post-attach tail
finally: finally:
if done:
self.sse_queues.pop(request_id, None) self.sse_queues.pop(request_id, None)
def poll_response(self): def poll_response(self):
@@ -811,6 +956,7 @@ class WebChannel(ChatChannel):
'/uploads/(.*)', 'UploadsHandler', '/uploads/(.*)', 'UploadsHandler',
'/api/file', 'FileServeHandler', '/api/file', 'FileServeHandler',
'/api/voice/asr', 'VoiceAsrHandler', '/api/voice/asr', 'VoiceAsrHandler',
'/api/voice/tts', 'VoiceTtsHandler',
'/poll', 'PollHandler', '/poll', 'PollHandler',
'/stream', 'StreamHandler', '/stream', 'StreamHandler',
'/chat', 'ChatHandler', '/chat', 'ChatHandler',
@@ -936,15 +1082,8 @@ class UploadHandler:
class VoiceAsrHandler: class VoiceAsrHandler:
""" """Receive a mic recording, persist it under uploads/ and run ASR.
Accept a short audio recording from the web console mic button, Returns {status, text, audio_url} so the UI can render a playback bubble."""
save it under uploads/ so the browser can replay it, then run it
through the currently configured ASR provider.
Returns {status, text, audio_url} on success — the frontend renders
a voice-message bubble with the playable audio and the transcribed
caption.
"""
def POST(self): def POST(self):
_require_auth() _require_auth()
web.header('Content-Type', 'application/json; charset=utf-8') web.header('Content-Type', 'application/json; charset=utf-8')
@@ -997,6 +1136,48 @@ class VoiceAsrHandler:
return json.dumps({"status": "error", "message": str(e)}) return json.dumps({"status": "error", "message": str(e)})
class VoiceTtsHandler:
    """Serve on-demand TTS for the in-chat "read aloud" button.

    ``POST`` expects a JSON body with ``text`` and an optional ``session_id``.
    It responds with ``{"status": "success", "audio_url": ...}`` or
    ``{"status": "error", "message": ...}``. When ``session_id`` is given the
    audio URL is also attached to that session's last assistant message.
    """

    @staticmethod
    def _error(message) -> str:
        """Serialize an error response carrying ``message``."""
        return json.dumps({"status": "error", "message": message})

    def POST(self):
        _require_auth()
        web.header('Content-Type', 'application/json; charset=utf-8')
        try:
            body = json.loads(web.data() or b"{}")
            text = (body.get("text") or "").strip()
            session_id = (body.get("session_id") or "").strip()
            if not text:
                return self._error("empty text")

            # `@singleton` makes WebChannel a factory function — go via instance.
            channel = WebChannel()
            if not channel._tts_provider_ready():
                return self._error("tts not configured")

            from bridge.bridge import Bridge

            reply = Bridge().fetch_text_to_voice(text)
            synthesized = (
                reply is not None
                and reply.type == ReplyType.VOICE
                and reply.content
            )
            if not synthesized:
                # Surface the provider's own message when the reply has one.
                detail = getattr(reply, "content", "") or "tts failed"
                return self._error(str(detail))

            url = channel._publish_tts_audio(reply.content)
            if not url:
                return self._error("publish failed")

            if session_id:
                # Persisting is best effort; the URL is returned regardless.
                try:
                    from agent.memory import get_conversation_store

                    get_conversation_store().attach_extras_to_last_assistant(
                        session_id, {"audio": {"url": url, "kind": "tts"}},
                    )
                except Exception as e:
                    logger.debug(f"[VoiceTtsHandler] persist skipped: {e}")

            return json.dumps({"status": "success", "audio_url": url})
        except Exception as e:
            logger.exception(f"[VoiceTtsHandler] failed: {e}")
            return self._error(str(e))
class UploadsHandler: class UploadsHandler:
def GET(self, file_name): def GET(self, file_name):
_require_auth() _require_auth()
@@ -1357,10 +1538,243 @@ class ModelsHandler:
POST /api/models/capability -> set provider/model for a capability POST /api/models/capability -> set provider/model for a capability
""" """
# Capability -> editable flag, current-value resolver, and supported provider # Capability -> provider ids drawn from ConfigHandler.PROVIDER_MODELS.
# ids drawn from ConfigHandler.PROVIDER_MODELS where applicable.
_ASR_PROVIDERS = ["openai", "dashscope", "zhipu", "linkai"] _ASR_PROVIDERS = ["openai", "dashscope", "zhipu", "linkai"]
_TTS_PROVIDERS = ["openai", "linkai", "minimax", "baidu", "ali", "xunfei", "azure", "google", "elevenlabs", "edge", "pytts"] # Web-console white-list. Other vendors stay usable via direct config.
_TTS_PROVIDERS = ["openai", "minimax", "dashscope", "linkai"]
# TTS engine catalog (speech models, not voice timbres). Entries are
# either a bare code or {value, hint?} when a friendly label helps.
_TTS_PROVIDER_MODELS = {
"openai": ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"],
"minimax": [
{"value": "speech-2.8-hd", "hint": "情绪渲染融合语气词,自然听感"},
{"value": "speech-2.8-turbo", "hint": "极致生成速度,更自然逼真"},
{"value": "speech-2.6-hd", "hint": "超低延时,归一化升级"},
{"value": "speech-2.6-turbo", "hint": "更快更便宜,适合语音聊天/数字人"},
],
"dashscope": [
{"value": "qwen3-tts-flash", "hint": "覆盖普通话、方言与主流外语"},
],
# Aggregating gateway: a single endpoint multiplexes several
# underlying TTS engines, selected via the `model` field.
# Each engine exposes its own voice catalog (see _TTS_PROVIDER_VOICES).
"linkai": [
{"value": "tts-1", "hint": "OpenAI · 多语种通用"},
{"value": "doubao", "hint": "字节豆包 · 中文音色丰富"},
{"value": "baidu", "hint": "百度 · 中文主播音色"},
],
}
# Per-provider voice timbres. Entries can be a bare code string
# (label = code) or {value, hint?} when a friendly secondary label
# helps recognition. We keep `value` as the raw API code so power
# users can cross-reference config.json.
_TTS_PROVIDER_VOICES = {
"openai": [
"alloy", "echo", "fable", "onyx", "nova", "shimmer",
"ash", "ballad", "coral", "sage", "verse",
],
"minimax": [
# Mandarin Chinese (full catalog)
{"value": "male-qn-qingse", "hint": "中文 · 青涩青年(男)"},
{"value": "male-qn-jingying", "hint": "中文 · 精英青年(男)"},
{"value": "male-qn-badao", "hint": "中文 · 霸道青年(男)"},
{"value": "male-qn-daxuesheng", "hint": "中文 · 青年大学生(男)"},
{"value": "female-shaonv", "hint": "中文 · 少女(女)"},
{"value": "female-yujie", "hint": "中文 · 御姐(女)"},
{"value": "female-chengshu", "hint": "中文 · 成熟女性(女)"},
{"value": "female-tianmei", "hint": "中文 · 甜美女性(女)"},
{"value": "male-qn-qingse-jingpin", "hint": "中文 · 青涩青年-beta"},
{"value": "male-qn-jingying-jingpin", "hint": "中文 · 精英青年-beta"},
{"value": "male-qn-badao-jingpin", "hint": "中文 · 霸道青年-beta"},
{"value": "male-qn-daxuesheng-jingpin", "hint": "中文 · 青年大学生-beta"},
{"value": "female-shaonv-jingpin", "hint": "中文 · 少女-beta"},
{"value": "female-yujie-jingpin", "hint": "中文 · 御姐-beta"},
{"value": "female-chengshu-jingpin", "hint": "中文 · 成熟女性-beta"},
{"value": "female-tianmei-jingpin", "hint": "中文 · 甜美女性-beta"},
{"value": "clever_boy", "hint": "中文 · 聪明男童"},
{"value": "cute_boy", "hint": "中文 · 可爱男童"},
{"value": "lovely_girl", "hint": "中文 · 萌萌女童"},
{"value": "cartoon_pig", "hint": "中文 · 卡通猪小琪"},
{"value": "bingjiao_didi", "hint": "中文 · 病娇弟弟"},
{"value": "junlang_nanyou", "hint": "中文 · 俊朗男友"},
{"value": "chunzhen_xuedi", "hint": "中文 · 纯真学弟"},
{"value": "lengdan_xiongzhang", "hint": "中文 · 冷淡学长"},
{"value": "badao_shaoye", "hint": "中文 · 霸道少爷"},
{"value": "tianxin_xiaoling", "hint": "中文 · 甜心小玲"},
{"value": "qiaopi_mengmei", "hint": "中文 · 俏皮萌妹"},
{"value": "wumei_yujie", "hint": "中文 · 妩媚御姐"},
{"value": "diadia_xuemei", "hint": "中文 · 嗲嗲学妹"},
{"value": "danya_xuejie", "hint": "中文 · 淡雅学姐"},
{"value": "Chinese (Mandarin)_Reliable_Executive", "hint": "中文 · 沉稳高管"},
{"value": "Chinese (Mandarin)_News_Anchor", "hint": "中文 · 新闻女声"},
{"value": "Chinese (Mandarin)_Mature_Woman", "hint": "中文 · 傲娇御姐"},
{"value": "Chinese (Mandarin)_Unrestrained_Young_Man","hint": "中文 · 不羁青年"},
{"value": "Arrogant_Miss", "hint": "中文 · 嚣张小姐"},
{"value": "Robot_Armor", "hint": "中文 · 机械战甲"},
{"value": "Chinese (Mandarin)_Kind-hearted_Antie", "hint": "中文 · 热心大婶"},
{"value": "Chinese (Mandarin)_HK_Flight_Attendant", "hint": "中文 · 港普空姐"},
{"value": "Chinese (Mandarin)_Humorous_Elder", "hint": "中文 · 搞笑大爷"},
{"value": "Chinese (Mandarin)_Gentleman", "hint": "中文 · 温润男声"},
{"value": "Chinese (Mandarin)_Warm_Bestie", "hint": "中文 · 温暖闺蜜"},
{"value": "Chinese (Mandarin)_Male_Announcer", "hint": "中文 · 播报男声"},
{"value": "Chinese (Mandarin)_Sweet_Lady", "hint": "中文 · 甜美女声"},
{"value": "Chinese (Mandarin)_Southern_Young_Man", "hint": "中文 · 南方小哥"},
{"value": "Chinese (Mandarin)_Wise_Women", "hint": "中文 · 阅历姐姐"},
{"value": "Chinese (Mandarin)_Gentle_Youth", "hint": "中文 · 温润青年"},
{"value": "Chinese (Mandarin)_Warm_Girl", "hint": "中文 · 温暖少女"},
{"value": "Chinese (Mandarin)_Kind-hearted_Elder", "hint": "中文 · 花甲奶奶"},
{"value": "Chinese (Mandarin)_Cute_Spirit", "hint": "中文 · 憨憨萌兽"},
{"value": "Chinese (Mandarin)_Radio_Host", "hint": "中文 · 电台男主播"},
{"value": "Chinese (Mandarin)_Lyrical_Voice", "hint": "中文 · 抒情男声"},
{"value": "Chinese (Mandarin)_Straightforward_Boy", "hint": "中文 · 率真弟弟"},
{"value": "Chinese (Mandarin)_Sincere_Adult", "hint": "中文 · 真诚青年"},
{"value": "Chinese (Mandarin)_Gentle_Senior", "hint": "中文 · 温柔学姐"},
{"value": "Chinese (Mandarin)_Stubborn_Friend", "hint": "中文 · 嘴硬竹马"},
{"value": "Chinese (Mandarin)_Crisp_Girl", "hint": "中文 · 清脆少女"},
{"value": "Chinese (Mandarin)_Pure-hearted_Boy", "hint": "中文 · 清澈邻家弟弟"},
{"value": "Chinese (Mandarin)_Soft_Girl", "hint": "中文 · 柔和少女"},
# Cantonese (full catalog)
{"value": "Cantonese_ProfessionalHostF)", "hint": "粤语 · 专业女主持"},
{"value": "Cantonese_GentleLady", "hint": "粤语 · 温柔女声"},
{"value": "Cantonese_ProfessionalHostM)", "hint": "粤语 · 专业男主持"},
{"value": "Cantonese_PlayfulMan", "hint": "粤语 · 活泼男声"},
{"value": "Cantonese_CuteGirl", "hint": "粤语 · 可爱女孩"},
{"value": "Cantonese_KindWoman", "hint": "粤语 · 善良女声"},
# English (curated: 1F + 1M)
{"value": "English_Graceful_Lady", "hint": "英文 · Graceful Lady"},
{"value": "English_Trustworthy_Man", "hint": "英文 · Trustworthy Man"},
# Japanese (curated: 1F + 1M)
{"value": "Japanese_KindLady", "hint": "日文 · Kind Lady"},
{"value": "Japanese_LoyalKnight", "hint": "日文 · Loyal Knight"},
# Korean (curated: 1F + 1M)
{"value": "Korean_SweetGirl", "hint": "韩文 · Sweet Girl"},
{"value": "Korean_CheerfulBoyfriend", "hint": "韩文 · Cheerful Boyfriend"},
],
"dashscope": [
{"value": "Cherry", "hint": "芊悦 · 阳光女声"},
{"value": "Serena", "hint": "苏瑶 · 温柔女声"},
{"value": "Chelsie", "hint": "千雪 · 二次元少女"},
{"value": "Ethan", "hint": "晨煦 · 阳光男声"},
{"value": "Moon", "hint": "月白 · 率性男声"},
{"value": "Kai", "hint": "凯 · 治愈男声"},
{"value": "Nofish", "hint": "不吃鱼 · 设计师男声"},
{"value": "Bella", "hint": "萌宝 · 小萝莉"},
{"value": "Bunny", "hint": "萌小姬 · 萌系少女"},
{"value": "Stella", "hint": "少女阿月 · 元气少女"},
{"value": "Neil", "hint": "阿闻 · 新闻主播"},
{"value": "Seren", "hint": "小婉 · 助眠女声"},
{"value": "Jada", "hint": "上海话 · 阿珍"},
{"value": "Dylan", "hint": "北京话 · 晓东"},
{"value": "Sunny", "hint": "四川话 · 晴儿"},
{"value": "Eric", "hint": "四川话 · 程川"},
{"value": "Rocky", "hint": "粤语 · 阿强"},
{"value": "Kiki", "hint": "粤语 · 阿清"},
{"value": "Peter", "hint": "天津话 · 李彼得"},
{"value": "Marcus", "hint": "陕西话 · 秦川"},
{"value": "Roy", "hint": "闽南语 · 阿杰"},
],
# Aggregating gateway: voices are scoped per engine model. The
# frontend picks the correct list based on the selected model so
# users don't see incompatible timbres for the active engine.
"linkai": {
"tts-1": [
"alloy", "echo", "fable", "onyx", "nova", "shimmer",
],
"doubao": [
{"value": "zh_female_wanwanxiaohe_moon_bigtts", "hint": "湾湾小何"},
{"value": "BV007_streaming", "hint": "亲切女声"},
{"value": "BV001_streaming", "hint": "通用女声"},
{"value": "BV002_streaming", "hint": "通用男声"},
{"value": "BV051_streaming", "hint": "奶气萌娃"},
{"value": "zh_female_linjianvhai_moon_bigtts", "hint": "邻家女孩"},
{"value": "BV700_streaming", "hint": "灿灿"},
{"value": "BV019_streaming", "hint": "重庆小伙"},
{"value": "BV524_streaming", "hint": "日语男声"},
{"value": "BV021_streaming", "hint": "东北老铁"},
{"value": "BV701_streaming", "hint": "擎苍"},
{"value": "BV113_streaming", "hint": "甜宠少御"},
{"value": "BV056_streaming", "hint": "阳光男声"},
{"value": "BV213_streaming", "hint": "广西表哥"},
{"value": "BV119_streaming", "hint": "通用赘婿"},
{"value": "BV705_streaming", "hint": "炀炀"},
{"value": "BV033_streaming", "hint": "温柔小哥"},
{"value": "BV102_streaming", "hint": "儒雅青年"},
{"value": "BV522_streaming", "hint": "气质女生"},
{"value": "BV034_streaming", "hint": "知性姐姐 · 双语"},
{"value": "BV005_streaming", "hint": "活泼女声"},
{"value": "zh_female_wanqudashu_moon_bigtts", "hint": "湾区大叔"},
{"value": "zh_female_daimengchuanmei_moon_bigtts", "hint": "呆萌川妹"},
{"value": "zh_male_guozhoudege_moon_bigtts", "hint": "广州德哥"},
{"value": "zh_male_beijingxiaoye_moon_bigtts", "hint": "北京小爷"},
{"value": "zh_male_shaonianzixin_moon_bigtts", "hint": "少年梓辛 / Brayan"},
{"value": "zh_female_meilinvyou_moon_bigtts", "hint": "魅力女友"},
{"value": "zh_male_shenyeboke_moon_bigtts", "hint": "深夜播客"},
{"value": "zh_female_sajiaonvyou_moon_bigtts", "hint": "柔美女友"},
{"value": "zh_female_yuanqinvyou_moon_bigtts", "hint": "撒娇学妹"},
{"value": "zh_male_haoyuxiaoge_moon_bigtts", "hint": "浩宇小哥"},
{"value": "zh_male_guangxiyuanzhou_moon_bigtts", "hint": "广西远舟"},
{"value": "zh_female_meituojieer_moon_bigtts", "hint": "妹坨洁儿"},
{"value": "zh_male_yuzhouzixuan_moon_bigtts", "hint": "豫州子轩"},
{"value": "BV115_streaming", "hint": "古风少御"},
{"value": "zh_female_gaolengyujie_moon_bigtts", "hint": "高冷御姐"},
{"value": "zh_male_yuanboxiaoshu_moon_bigtts", "hint": "渊博小叔"},
{"value": "zh_male_yangguangqingnian_moon_bigtts", "hint": "阳光青年"},
{"value": "zh_male_aojiaobazong_moon_bigtts", "hint": "傲娇霸总"},
{"value": "zh_male_jingqiangkanye_moon_bigtts", "hint": "京腔侃爷 / Harmony"},
{"value": "zh_female_shuangkuaisisi_moon_bigtts", "hint": "爽快思思 / Skye"},
{"value": "zh_male_wennuanahu_moon_bigtts", "hint": "温暖阿虎 / Alvin"},
{"value": "multi_female_shuangkuaisisi_moon_bigtts", "hint": "はるこ / Esmeralda"},
{"value": "multi_male_jingqiangkanye_moon_bigtts", "hint": "かずね / Javier or Álvaro"},
{"value": "multi_female_gaolengyujie_moon_bigtts", "hint": "あけみ"},
{"value": "multi_male_wanqudashu_moon_bigtts", "hint": "ひろし / Roberto"},
{"value": "ICL_zh_female_bingruoshaonv_tob", "hint": "病弱少女"},
{"value": "ICL_zh_female_huoponvhai_tob", "hint": "活泼女孩"},
{"value": "ICL_zh_female_heainainai_tob", "hint": "和蔼奶奶"},
{"value": "ICL_zh_female_linjuayi_tob", "hint": "邻居阿姨"},
{"value": "zh_female_wenrouxiaoya_moon_bigtts", "hint": "温柔小雅"},
{"value": "zh_female_tianmeixiaoyuan_moon_bigtts", "hint": "甜美小源"},
{"value": "zh_female_qingchezizi_moon_bigtts", "hint": "清澈梓梓"},
{"value": "zh_male_dongfanghaoran_moon_bigtts", "hint": "东方浩然"},
{"value": "zh_male_jieshuoxiaoming_moon_bigtts", "hint": "解说小明"},
{"value": "zh_female_kailangjiejie_moon_bigtts", "hint": "开朗姐姐"},
{"value": "zh_male_linjiananhai_moon_bigtts", "hint": "邻家男孩"},
{"value": "zh_female_tianmeiyueyue_moon_bigtts", "hint": "甜美悦悦"},
{"value": "zh_female_xinlingjitang_moon_bigtts", "hint": "心灵鸡汤"},
],
"baidu": [
{"value": "baidu_0", "hint": "度小美 · 标准女主播"},
{"value": "baidu_1", "hint": "度小宇 · 亲切男声"},
{"value": "baidu_3", "hint": "度逍遥 · 情感男声"},
{"value": "baidu_4", "hint": "度丫丫 · 童声"},
{"value": "baidu_5", "hint": "度小娇 · 成熟女主播"},
{"value": "baidu_5003", "hint": "度逍遥 · 情感男声"},
{"value": "baidu_5118", "hint": "度小鹿 · 甜美女声"},
{"value": "baidu_103", "hint": "度米朵 · 可爱童声"},
{"value": "baidu_106", "hint": "度博文 · 专业男主播"},
{"value": "baidu_110", "hint": "度小童 · 童声主播"},
{"value": "baidu_111", "hint": "度小萌 · 软萌妹子"},
{"value": "baidu_4003", "hint": "度逍遥 · 情感男声"},
{"value": "baidu_4100", "hint": "度小雯 · 活力女主播"},
{"value": "baidu_4103", "hint": "度米朵 · 可爱女声"},
{"value": "baidu_4105", "hint": "度灵儿 · 清澈女声"},
{"value": "baidu_4106", "hint": "度博文 · 专业男主播"},
{"value": "baidu_4115", "hint": "度小贤 · 电台男主播"},
{"value": "baidu_4117", "hint": "度小乔 · 活泼女声"},
{"value": "baidu_4119", "hint": "度小鹿 · 甜美女声"},
{"value": "baidu_4129", "hint": "度小彦 · 知识男主播"},
{"value": "baidu_4140", "hint": "度小新 · 专业女主播"},
{"value": "baidu_4143", "hint": "度清风 · 配音男声"},
{"value": "baidu_4144", "hint": "度姗姗 · 娱乐女声"},
{"value": "baidu_4149", "hint": "度星河 · 广告男声"},
{"value": "baidu_4206", "hint": "度博文 · 综艺男声"},
{"value": "baidu_4226", "hint": "南方 · 电台女主播"},
{"value": "baidu_4254", "hint": "度小清 · 广告女声"},
{"value": "baidu_4278", "hint": "度小贝 · 知识女主播"},
],
},
}
_EMBEDDING_PROVIDERS = ["openai", "dashscope", "doubao", "zhipu", "linkai"] _EMBEDDING_PROVIDERS = ["openai", "dashscope", "doubao", "zhipu", "linkai"]
# Capability-scoped model catalogs. The chat dropdown can reuse the # Capability-scoped model catalogs. The chat dropdown can reuse the
@@ -1525,7 +1939,7 @@ class ModelsHandler:
@classmethod @classmethod
def _predict_vision_auto(cls, local_config: dict) -> dict: def _predict_vision_auto(cls, local_config: dict) -> dict:
"""Predict which provider vision.py will actually dispatch to when """Predict which provider vision.py will actually dispatch to when
no tool.vision.model is set. Mirrors the fallback order in no tools.vision.model is set. Mirrors the fallback order in
agent/tools/vision/vision.py::_resolve_providers so the UI hint agent/tools/vision/vision.py::_resolve_providers so the UI hint
matches reality.""" matches reality."""
chat = cls._chat_capability(local_config) chat = cls._chat_capability(local_config)
@@ -1590,12 +2004,12 @@ class ModelsHandler:
@classmethod @classmethod
def _vision_capability(cls, local_config: dict) -> dict: def _vision_capability(cls, local_config: dict) -> dict:
"""Vision model. tool.vision.model is the explicit override; otherwise """Vision model. tools.vision.model is the explicit override; otherwise
the runtime fallback chain in agent/tools/vision/vision.py decides.""" the runtime fallback chain in agent/tools/vision/vision.py decides."""
tool_conf = local_config.get("tool") or {} tools_conf = local_config.get("tools") or local_config.get("tool") or {}
if not isinstance(tool_conf, dict): if not isinstance(tools_conf, dict):
tool_conf = {} tools_conf = {}
vision_conf = tool_conf.get("vision") or {} vision_conf = tools_conf.get("vision") or {}
if not isinstance(vision_conf, dict): if not isinstance(vision_conf, dict):
vision_conf = {} vision_conf = {}
user_specified = (vision_conf.get("model") or "").strip() user_specified = (vision_conf.get("model") or "").strip()
@@ -1652,14 +2066,38 @@ class ModelsHandler:
@classmethod
def _tts_capability(cls, local_config: dict) -> dict:
    """Build the text-to-speech capability descriptor.

    Args:
        local_config: Mapping holding the current configuration values
            (``text_to_voice``, ``text_to_voice_model``, ``tts_voice_id``
            and the provider API-key fields).

    Returns:
        A dict with the selected provider/model/voice (empty strings when
        the configured provider is not in ``_TTS_PROVIDERS``), a
        ``suggested_provider`` hint, the provider/model/voice catalogs and
        the current ``reply_mode``.
    """
    explicit = (local_config.get("text_to_voice") or "").strip().lower()
    # Providers outside the white-list don't drive the picker, but their
    # underlying runtime config is preserved so bridge still routes them.
    ui_provider = explicit if explicit in cls._TTS_PROVIDERS else ""

    # With no usable selection, suggest the first white-listed provider
    # whose API key field holds a real key.
    suggested = ""
    if not ui_provider:
        for pid in cls._TTS_PROVIDERS:
            meta = ConfigHandler.PROVIDER_MODELS.get(pid) or {}
            key_field = meta.get("api_key_field")
            if key_field and cls._is_real_key(local_config.get(key_field, "")):
                suggested = pid
                break

    return {
        "editable": True,
        "current_provider": ui_provider,
        "suggested_provider": suggested,
        # Model and voice are only reported alongside a white-listed provider.
        "current_model": (local_config.get("text_to_voice_model") or "") if ui_provider else "",
        "current_voice": (local_config.get("tts_voice_id") or "") if ui_provider else "",
        "providers": cls._TTS_PROVIDERS,
        "provider_models": cls._TTS_PROVIDER_MODELS,
        "provider_voices": cls._TTS_PROVIDER_VOICES,
        "reply_mode": cls._tts_reply_mode(local_config),
    }
@staticmethod
def _tts_reply_mode(local_config: dict) -> str:
if local_config.get("always_reply_voice", False):
return "always"
if local_config.get("voice_reply_voice", False):
return "voice_if_voice"
return "off"
@classmethod @classmethod
def _embedding_capability(cls, local_config: dict) -> dict: def _embedding_capability(cls, local_config: dict) -> dict:
# Embedding is "pick or empty" — runtime's legacy openai/linkai # Embedding is "pick or empty" — runtime's legacy openai/linkai
@@ -1728,17 +2166,20 @@ class ModelsHandler:
@classmethod @classmethod
def _image_capability(cls, local_config: dict) -> dict: def _image_capability(cls, local_config: dict) -> dict:
"""Image generation. Source of truth: config["skill"]["image-generation"]["model"] """Image generation. Source of truth: config["skills"]["image-generation"]["model"]
(mirrors the per-skill config schema documented in skills/image-generation). (mirrors the per-skill config schema documented in skills/image-generation).
The runtime resolver in skills/image-generation/scripts/generate.py The runtime resolver in skills/image-generation/scripts/generate.py
reads this via the SKILL_IMAGE_GENERATION_MODEL env var that the reads this via the SKILL_IMAGE_GENERATION_MODEL env var that the
agent_initializer syncs at startup; provider is inferred from the agent_initializer syncs at startup; provider is inferred from the
model name prefix, mirroring vision.py's design. model name prefix, mirroring vision.py's design.
``skill`` (singular) is still tolerated as a legacy fallback —
config.load_config() folds it into ``skills`` at startup.
""" """
skill_node = local_config.get("skill") or {} skills_node = local_config.get("skills") or local_config.get("skill") or {}
if not isinstance(skill_node, dict): if not isinstance(skills_node, dict):
skill_node = {} skills_node = {}
img_node = skill_node.get("image-generation") or {} img_node = skills_node.get("image-generation") or {}
if not isinstance(img_node, dict): if not isinstance(img_node, dict):
img_node = {} img_node = {}
explicit_model = (img_node.get("model") or "").strip() explicit_model = (img_node.get("model") or "").strip()
@@ -1832,6 +2273,8 @@ class ModelsHandler:
return self._handle_delete_provider(data) return self._handle_delete_provider(data)
if action == "set_capability": if action == "set_capability":
return self._handle_set_capability(data) return self._handle_set_capability(data)
if action == "set_voice_reply_mode":
return self._handle_set_voice_reply_mode(data)
return json.dumps({"status": "error", "message": f"unknown action: {action!r}"}) return json.dumps({"status": "error", "message": f"unknown action: {action!r}"})
except Exception as e: except Exception as e:
logger.error(f"[ModelsHandler] POST failed: {e}") logger.error(f"[ModelsHandler] POST failed: {e}")
@@ -1918,7 +2361,7 @@ class ModelsHandler:
if capability == "asr": if capability == "asr":
return self._set_simple("voice_to_text", provider_id) return self._set_simple("voice_to_text", provider_id)
if capability == "tts": if capability == "tts":
return self._set_tts(provider_id, model) return self._set_tts(provider_id, model, (data.get("voice") or "").strip())
if capability == "embedding": if capability == "embedding":
return self._set_embedding(provider_id, model) return self._set_embedding(provider_id, model)
if capability == "image": if capability == "image":
@@ -1926,35 +2369,20 @@ class ModelsHandler:
return json.dumps({"status": "error", "message": f"capability not editable: {capability}"}) return json.dumps({"status": "error", "message": f"capability not editable: {capability}"})
def _set_image(self, provider_id: str, model: str) -> str: def _set_image(self, provider_id: str, model: str) -> str:
# Source of truth: config["skill"]["image-generation"]["model"]. # Source of truth: skills.image-generation.model. provider_id is
# provider_id is informational only (used by the UI to highlight a # informational only; the resolver picks the vendor by model prefix.
# vendor card); the runtime resolver infers the provider from the
# model name prefix at request time, mirroring vision.py's design.
# An empty model means "switch back to auto / let the script pick".
local_config = conf() local_config = conf()
file_cfg = self._read_file_config() file_cfg = self._read_file_config()
def _ensure_skill_node(cfg: dict) -> dict: self._set_nested_namespace_value(local_config, "skills", "image-generation", "model", model or "")
skill_node = cfg.get("skill") or {} self._set_nested_namespace_value(file_cfg, "skills", "image-generation", "model", model or "")
if not isinstance(skill_node, dict): self._drop_legacy_namespace(local_config, "skill", "skills", child="image-generation")
skill_node = {} self._drop_legacy_namespace(file_cfg, "skill", "skills", child="image-generation")
img_node = skill_node.get("image-generation") or {}
if not isinstance(img_node, dict):
img_node = {}
skill_node["image-generation"] = img_node
cfg["skill"] = skill_node
return img_node
_ensure_skill_node(local_config)["model"] = model or ""
_ensure_skill_node(file_cfg)["model"] = model or ""
self._write_file_config(file_cfg) self._write_file_config(file_cfg)
# The skill subprocess (skills/image-generation/scripts/generate.py) # The skill subprocess reads SKILL_IMAGE_GENERATION_MODEL from env at
# reads SKILL_IMAGE_GENERATION_MODEL from its environment, which is # startup; mirror the change so live edits apply without restart.
# only synced from config["skill"] at startup. Update os.environ live
# so changes take effect on the next call without a restart. An empty
# model means "clear the override" → drop the env var entirely.
env_key = "SKILL_IMAGE_GENERATION_MODEL" env_key = "SKILL_IMAGE_GENERATION_MODEL"
if model: if model:
os.environ[env_key] = model os.environ[env_key] = model
@@ -1992,8 +2420,6 @@ class ModelsHandler:
applied["model"] = model applied["model"] = model
if not applied: if not applied:
# No-op save (nothing to write). Return success so the UI can
# confirm the click without showing a misleading error.
return json.dumps({"status": "success", "applied": {}, "noop": True}) return json.dumps({"status": "success", "applied": {}, "noop": True})
self._write_file_config(file_cfg) self._write_file_config(file_cfg)
@@ -2002,34 +2428,66 @@ class ModelsHandler:
return json.dumps({"status": "success", "applied": applied}) return json.dumps({"status": "success", "applied": applied})
def _set_vision(self, provider_id: str, model: str) -> str:
    """Persist the vision model under ``tools.vision.model``.

    Args:
        provider_id: Informational only; it is not written anywhere.
        model: Model name to store.

    Returns:
        A JSON string ``{"status": "success", "model": model}``.
    """
    # Source of truth: tools.vision.model. provider_id is informational
    # only; the resolver picks the vendor by model prefix.
    local_config = conf()
    file_cfg = self._read_file_config()

    # Write the canonical key into both the on-disk and in-memory config.
    self._set_nested_namespace_value(file_cfg, "tools", "vision", "model", model)
    self._set_nested_namespace_value(local_config, "tools", "vision", "model", model)
    # Remove the deprecated singular ``tool.vision`` entry from both.
    self._drop_legacy_namespace(file_cfg, "tool", "tools", child="vision")
    self._drop_legacy_namespace(local_config, "tool", "tools", child="vision")

    self._write_file_config(file_cfg)
    logger.info(f"[ModelsHandler] vision model set: {model!r}")
    return json.dumps({"status": "success", "model": model})
@staticmethod
def _set_nested_namespace_value(cfg, top: str, name: str, key: str, value):
"""Set ``cfg[top][name][key] = value``, creating missing dicts."""
bucket = cfg.get(top)
if not isinstance(bucket, dict):
bucket = {}
node = bucket.get(name)
if not isinstance(node, dict):
node = {}
node[key] = value
bucket[name] = node
cfg[top] = bucket
@staticmethod
def _drop_legacy_namespace(cfg, legacy: str, canonical: str, child: str) -> None:
"""Strip the deprecated singular key so config.json stays single-source."""
legacy_section = cfg.get(legacy)
if not isinstance(legacy_section, dict):
return
legacy_section.pop(child, None)
if legacy_section:
cfg[legacy] = legacy_section
else:
cfg.pop(legacy, None)
def _handle_set_voice_reply_mode(self, data: dict) -> str:
# UI picker (off / voice_if_voice / always) maps to the legacy
# always_reply_voice + voice_reply_voice pair that chat_channel.py
# reads, so all channels (web/feishu/wecom/...) share the routing.
mode = (data.get("mode") or "").strip().lower()
if mode not in ("off", "voice_if_voice", "always"):
return json.dumps({"status": "error", "message": f"invalid mode: {mode!r}"})
always = (mode == "always")
if_voice = (mode == "voice_if_voice")
local_config = conf()
file_cfg = self._read_file_config()
local_config["always_reply_voice"] = always
local_config["voice_reply_voice"] = if_voice
file_cfg["always_reply_voice"] = always
file_cfg["voice_reply_voice"] = if_voice
self._write_file_config(file_cfg)
logger.info(
f"[ModelsHandler] voice reply mode set: {mode!r} "
f"(always_reply_voice={always}, voice_reply_voice={if_voice})"
)
return json.dumps({"status": "success", "mode": mode})
def _set_simple(self, key: str, value: str) -> str: def _set_simple(self, key: str, value: str) -> str:
local_config = conf() local_config = conf()
file_cfg = self._read_file_config() file_cfg = self._read_file_config()
@@ -2037,25 +2495,30 @@ class ModelsHandler:
file_cfg[key] = value file_cfg[key] = value
self._write_file_config(file_cfg) self._write_file_config(file_cfg)
logger.info(f"[ModelsHandler] {key} set: {value!r}") logger.info(f"[ModelsHandler] {key} set: {value!r}")
# Bridge caches voice_to_text routing + bot instance; refresh it # Hot-swap the cached voice bot so the change takes effect immediately.
# so the change takes effect on the next voice request.
if key in ("voice_to_text", "text_to_voice"): if key in ("voice_to_text", "text_to_voice"):
self._refresh_voice_routing() self._refresh_voice_routing()
return json.dumps({"status": "success", key: value}) return json.dumps({"status": "success", key: value})
def _set_tts(self, provider_id: str, model: str, voice: str = "") -> str:
    """Persist the text-to-speech provider, model and voice.

    All three values are written unconditionally, so an empty string
    overwrites whatever was stored before.

    Args:
        provider_id: Value stored under ``text_to_voice``.
        model: Value stored under ``text_to_voice_model``.
        voice: Value stored under ``tts_voice_id``.

    Returns:
        A JSON string echoing the stored provider, model and voice.
    """
    local_config = conf()
    file_cfg = self._read_file_config()

    # Keep the in-memory config and the on-disk config in step.
    local_config["text_to_voice"] = provider_id
    file_cfg["text_to_voice"] = provider_id
    local_config["text_to_voice_model"] = model
    file_cfg["text_to_voice_model"] = model
    local_config["tts_voice_id"] = voice
    file_cfg["tts_voice_id"] = voice
    self._write_file_config(file_cfg)

    logger.info(
        f"[ModelsHandler] tts updated: provider={provider_id!r} "
        f"model={model!r} voice={voice!r}"
    )
    self._refresh_voice_routing()
    return json.dumps({
        "status": "success",
        "provider": provider_id, "model": model, "voice": voice,
    })
@staticmethod @staticmethod
def _refresh_voice_routing() -> None: def _refresh_voice_routing() -> None:
@@ -2066,17 +2529,20 @@ class ModelsHandler:
logger.warning(f"[ModelsHandler] Bridge voice refresh failed: {e}") logger.warning(f"[ModelsHandler] Bridge voice refresh failed: {e}")
def _set_embedding(self, provider_id: str, model: str) -> str: def _set_embedding(self, provider_id: str, model: str) -> str:
# provider_id="" + model="" means "switch back to legacy auto mode". # Two valid states: both empty (reset to pick-or-empty) OR both set.
# A provider without a model leaves the runtime in a broken half-state,
# so reject that explicitly instead of silently writing it through.
if provider_id and not model:
return json.dumps({
"status": "error",
"message": "embedding model is required when a provider is selected",
})
local_config = conf() local_config = conf()
file_cfg = self._read_file_config() file_cfg = self._read_file_config()
local_config["embedding_provider"] = provider_id local_config["embedding_provider"] = provider_id
file_cfg["embedding_provider"] = provider_id file_cfg["embedding_provider"] = provider_id
if model:
local_config["embedding_model"] = model local_config["embedding_model"] = model
file_cfg["embedding_model"] = model file_cfg["embedding_model"] = model
else:
local_config["embedding_model"] = ""
file_cfg["embedding_model"] = ""
self._write_file_config(file_cfg) self._write_file_config(file_cfg)
logger.info(f"[ModelsHandler] embedding updated: provider={provider_id!r} model={model!r}") logger.info(f"[ModelsHandler] embedding updated: provider={provider_id!r} model={model!r}")
# The next /memory rebuild-index command hot-swaps the provider onto # The next /memory rebuild-index command hot-swaps the provider onto

View File

@@ -16,8 +16,8 @@
"open_ai_api_base": "https://api.openai.com/v1", "open_ai_api_base": "https://api.openai.com/v1",
"gemini_api_key": "", "gemini_api_key": "",
"gemini_api_base": "https://generativelanguage.googleapis.com", "gemini_api_base": "https://generativelanguage.googleapis.com",
"voice_to_text": "openai", "voice_to_text": "",
"text_to_voice": "openai", "text_to_voice": "",
"voice_reply_voice": false, "voice_reply_voice": false,
"speech_recognition": true, "speech_recognition": true,
"group_speech_recognition": false, "group_speech_recognition": false,

View File

@@ -330,8 +330,18 @@ def load_config():
config_str = read_file(config_path) config_str = read_file(config_path)
logger.debug("[INIT] config str: {}".format(drag_sensitive(config_str))) logger.debug("[INIT] config str: {}".format(drag_sensitive(config_str)))
# 将json字符串反序列化为dict类型 # 将json字符串反序列化为dict类型
config = Config(json.loads(config_str)) # `object_pairs_hook` lets us catch users who accidentally typed the
# same key twice (e.g. two `"tools"` blocks) — json.loads would
# otherwise silently drop all but the last occurrence.
config = Config(json.loads(config_str, object_pairs_hook=_merge_duplicate_keys))
# Migrate legacy singular keys (`tool`, `skill`) into the canonical
# plural buckets so the rest of the codebase only reads one schema.
# Deep-merge so existing `tools`/`skills` entries are preserved and
# only missing namespaces are filled in from the legacy section.
_merge_legacy_namespace(config, legacy="tool", canonical="tools")
_merge_legacy_namespace(config, legacy="skill", canonical="skills")
# override config with environment variables. # override config with environment variables.
# Some online deployment platforms (e.g. Railway) deploy project from github directly. So you shouldn't put your secrets like api key in a config file, instead use environment variables to override the default config. # Some online deployment platforms (e.g. Railway) deploy project from github directly. So you shouldn't put your secrets like api key in a config file, instead use environment variables to override the default config.
@@ -422,7 +432,7 @@ def load_config():
os.environ[env_key] = str(val) os.environ[env_key] = str(val)
injected += 1 injected += 1
injected += _sync_skill_config_to_env(config.get("skill", {})) injected += _sync_skill_config_to_env(config.get("skills", {}))
if injected: if injected:
logger.info("[INIT] Synced {} config values to environment variables".format(injected)) logger.info("[INIT] Synced {} config values to environment variables".format(injected))
@@ -430,11 +440,90 @@ def load_config():
config.load_user_datas() config.load_user_datas()
def _deep_merge_dicts(base: dict, incoming: dict) -> dict:
"""Recursively merge ``incoming`` into ``base`` (incoming wins on leaves)."""
for key, val in incoming.items():
if (
key in base
and isinstance(base[key], dict)
and isinstance(val, dict)
):
_deep_merge_dicts(base[key], val)
else:
base[key] = val
return base
def _merge_duplicate_keys(pairs):
"""object_pairs_hook for json.loads: deep-merge duplicate top-level keys
(lists concat, dicts merge, scalars take the latter) instead of dropping."""
out = {}
duplicates = []
for key, val in pairs:
if key not in out:
out[key] = val
continue
duplicates.append(key)
prev = out[key]
if isinstance(prev, dict) and isinstance(val, dict):
_deep_merge_dicts(prev, val)
elif isinstance(prev, list) and isinstance(val, list):
prev.extend(val)
else:
out[key] = val
if duplicates:
# logger may not be wired yet — fall back to print so we never lose the warning.
unique = sorted(set(duplicates))
try:
logger.warning("[INIT] config.json has duplicate keys (merged): %s", unique)
except Exception:
print("[INIT] config.json has duplicate keys (merged):", unique)
return out
def _merge_legacy_namespace(cfg, legacy: str, canonical: str) -> None:
"""Fold deprecated singular keys (``tool`` / ``skill``) into their plural
canonical counterparts at load time. Canonical entries always win."""
legacy_section = cfg.get(legacy)
if not isinstance(legacy_section, dict) or not legacy_section:
cfg.pop(legacy, None)
return
canonical_section = cfg.get(canonical)
if not isinstance(canonical_section, dict):
canonical_section = {}
merged_keys = []
for name, val in legacy_section.items():
if name in canonical_section:
if isinstance(canonical_section[name], dict) and isinstance(val, dict):
for sub_key, sub_val in val.items():
if (
sub_key in canonical_section[name]
and isinstance(canonical_section[name][sub_key], dict)
and isinstance(sub_val, dict)
):
_deep_merge_dicts(sub_val, canonical_section[name][sub_key])
canonical_section[name][sub_key] = sub_val
else:
canonical_section[name].setdefault(sub_key, sub_val)
continue
canonical_section[name] = val
merged_keys.append(name)
cfg[canonical] = canonical_section
cfg.pop(legacy, None)
if merged_keys:
logger.warning(
"[INIT] Legacy config key '{}' is deprecated; merged into '{}': {}. "
"Please rename '{}' to '{}' in your config.json.".format(
legacy, canonical, merged_keys, legacy, canonical,
)
)
def _sync_skill_config_to_env(skill_section) -> int: def _sync_skill_config_to_env(skill_section) -> int:
"""Flatten skill-namespaced config into environment variables. """Flatten skill-namespaced config into environment variables.
Mapping rule: ``config["skill"][<name>][<key>]`` -> ``SKILL_<NAME>_<KEY>`` Mapping rule: ``config["skills"][<name>][<key>]`` -> ``SKILL_<NAME>_<KEY>``
(e.g. ``skill["image-generation"].model`` -> ``SKILL_IMAGE_GENERATION_MODEL``). (e.g. ``skills["image-generation"].model`` -> ``SKILL_IMAGE_GENERATION_MODEL``).
This lets subprocess-based skill scripts read their own settings without This lets subprocess-based skill scripts read their own settings without
importing project code. Existing env vars are NOT overwritten so the importing project code. Existing env vars are NOT overwritten so the

View File

@@ -40,7 +40,7 @@ To force a specific Vision model, set it explicitly in `config.json`:
```json ```json
{ {
"tool": { "tools": {
"vision": { "vision": {
"model": "ernie-4.5-turbo-vl" "model": "ernie-4.5-turbo-vl"
} }

View File

@@ -11,7 +11,7 @@ New built-in `image-generation` skill supporting text-to-image, image-to-image,
- **Zero model selection**: Just configure an API key and it works — no need to manually specify a model. You can also name a specific model in conversation (e.g. "draw a cat with seedream") - **Zero model selection**: Just configure an API key and it works — no need to manually specify a model. You can also name a specific model in conversation (e.g. "draw a cat with seedream")
- **Flexible control**: Supports `quality`, `size` (512/1K~4K), and `aspect_ratio` parameters, with each provider automatically mapping to its supported values - **Flexible control**: Supports `quality`, `size` (512/1K~4K), and `aspect_ratio` parameters, with each provider automatically mapping to its supported values
- **Image editing**: Pass existing images for editing, style transfer, or multi-image fusion (Seedream supports up to 14 reference images) - **Image editing**: Pass existing images for editing, style transfer, or multi-image fusion (Seedream supports up to 14 reference images)
- **Skill-level config**: Pin a default model via `skill.image-generation.model` in `config.json` - **Skill-level config**: Pin a default model via `skills.image-generation.model` in `config.json`
- **Image lightbox**: All images in the Web console now support click-to-enlarge preview - **Image lightbox**: All images in the Web console now support click-to-enlarge preview
Docs: [Image Generation Skill](https://docs.cowagent.ai/en/skills/image-generation) Docs: [Image Generation Skill](https://docs.cowagent.ai/en/skills/image-generation)

View File

@@ -51,7 +51,7 @@ The voice and streaming building blocks come from a community contribution #2791
## 🔧 Tools and Safety ## 🔧 Tools and Safety
- **Vision model selection**: `tool.vision.model` config now actually takes effect, with automatic fallback when unconfigured #2792 - **Vision model selection**: `tools.vision.model` config now actually takes effect, with automatic fallback when unconfigured #2792
- **Bash safety prompt**: The destructive-deletion confirm prompt is now scoped to paths outside the workspace — routine in-workspace operations are no longer interrupted - **Bash safety prompt**: The destructive-deletion confirm prompt is now scoped to paths outside the workspace — routine in-workspace operations are no longer interrupted
## 🐛 Other Fixes ## 🐛 Other Fixes

View File

@@ -87,7 +87,7 @@ Configure ARK_API_KEY as xxx
To force all image generation through a specific provider's model, add this to `config.json`: To force all image generation through a specific provider's model, add this to `config.json`:
```json ```json
"skill": { "skills": {
"image-generation": { "image-generation": {
"model": "seedream-5.0-lite" "model": "seedream-5.0-lite"
} }

View File

@@ -51,7 +51,7 @@ To specify a particular model for the vision tool, add to `config.json`:
```json ```json
{ {
"tool": { "tools": {
"vision": { "vision": {
"model": "ernie-4.5-turbo-vl" "model": "ernie-4.5-turbo-vl"
} }

View File

@@ -40,7 +40,7 @@ description: Baidu Qianfan ERNIE モデル設定
```json ```json
{ {
"tool": { "tools": {
"vision": { "vision": {
"model": "ernie-4.5-turbo-vl" "model": "ernie-4.5-turbo-vl"
} }

View File

@@ -11,7 +11,7 @@ description: CowAgent 2.0.7 - 画像生成スキル6プロバイダー自動
- **モデル選択不要**API Key を設定するだけで使用可能、モデルを手動で指定する必要なし。会話で特定モデルを指名することも可能「seedream で猫を描いて」) - **モデル選択不要**API Key を設定するだけで使用可能、モデルを手動で指定する必要なし。会話で特定モデルを指名することも可能「seedream で猫を描いて」)
- **柔軟な制御**`quality`(画質)、`size`解像度、512/1K〜4K、`aspect_ratio`(アスペクト比)パラメータ対応、各プロバイダーが自動的に有効な値にマッピング - **柔軟な制御**`quality`(画質)、`size`解像度、512/1K〜4K、`aspect_ratio`(アスペクト比)パラメータ対応、各プロバイダーが自動的に有効な値にマッピング
- **画像編集**既存の画像を渡して編集・スタイル変換・複数画像融合が可能Seedream は最大 14 枚の参照画像をサポート) - **画像編集**既存の画像を渡して編集・スタイル変換・複数画像融合が可能Seedream は最大 14 枚の参照画像をサポート)
- **スキルレベル設定**`config.json` の `skill.image-generation.model` でデフォルトモデルを固定可能 - **スキルレベル設定**`config.json` の `skills.image-generation.model` でデフォルトモデルを固定可能
- **画像ライトボックス**Web コンソールのすべての画像がクリックで拡大プレビュー対応 - **画像ライトボックス**Web コンソールのすべての画像がクリックで拡大プレビュー対応
ドキュメント:[画像生成スキル](https://docs.cowagent.ai/ja/skills/image-generation) ドキュメント:[画像生成スキル](https://docs.cowagent.ai/ja/skills/image-generation)

View File

@@ -51,7 +51,7 @@ description: CowAgent 2.0.8 - 飛書チャネル全面アップグレード(
## 🔧 ツールと安全性 ## 🔧 ツールと安全性
- **Vision モデル選択**`tool.vision.model` 設定が実際に反映されるようになり、未設定時は自動フォールバック #2792 - **Vision モデル選択**`tools.vision.model` 設定が実際に反映されるようになり、未設定時は自動フォールバック #2792
- **Bash セーフティ確認**:破壊的削除の確認プロンプトをワークスペース外のパスに限定。ワークスペース内の通常操作は中断されません - **Bash セーフティ確認**:破壊的削除の確認プロンプトをワークスペース外のパスに限定。ワークスペース内の通常操作は中断されません
## 🐛 その他の修正 ## 🐛 その他の修正

View File

@@ -87,7 +87,7 @@ ARK_API_KEY を xxx に設定して
すべての画像生成を特定のプロバイダーのモデルで固定したい場合、`config.json` に以下を追加: すべての画像生成を特定のプロバイダーのモデルで固定したい場合、`config.json` に以下を追加:
```json ```json
"skill": { "skills": {
"image-generation": { "image-generation": {
"model": "seedream-5.0-lite" "model": "seedream-5.0-lite"
} }

View File

@@ -51,7 +51,7 @@ Vision ツールで使用するモデルを指定するには、`config.json`
```json ```json
{ {
"tool": { "tools": {
"vision": { "vision": {
"model": "ernie-4.5-turbo-vl" "model": "ernie-4.5-turbo-vl"
} }

View File

@@ -40,7 +40,7 @@ description: 百度千帆 ERNIE 模型配置
```json ```json
{ {
"tool": { "tools": {
"vision": { "vision": {
"model": "ernie-4.5-turbo-vl" "model": "ernie-4.5-turbo-vl"
} }

View File

@@ -11,7 +11,7 @@ description: CowAgent 2.0.7 - 图像生成技能(六厂商自动路由)、
- **开箱即用**:配置 API Key 即可使用,无需手动指定模型。也支持在对话中指定特定模型 - **开箱即用**:配置 API Key 即可使用,无需手动指定模型。也支持在对话中指定特定模型
- **灵活控制**:支持 `quality`(画质)、`size`分辨率512/1K~4K、`aspect_ratio`(宽高比)等参数,各厂商自动适配有效值 - **灵活控制**:支持 `quality`(画质)、`size`分辨率512/1K~4K、`aspect_ratio`(宽高比)等参数,各厂商自动适配有效值
- **图片编辑**:传入已有图片即可进行编辑、风格迁移、多图融合 - **图片编辑**:传入已有图片即可进行编辑、风格迁移、多图融合
- **Skill 级配置**:支持通过 `config.json` 中的 `skill.image-generation.model` 固定默认模型 - **Skill 级配置**:支持通过 `config.json` 中的 `skills.image-generation.model` 固定默认模型
相关文档:[图像生成技能](https://docs.cowagent.ai/skills/image-generation) 相关文档:[图像生成技能](https://docs.cowagent.ai/skills/image-generation)

View File

@@ -46,7 +46,7 @@ description: CowAgent 2.0.8 - 飞书渠道全面升级(语音、流式打字
## 🔧 工具与安全 ## 🔧 工具与安全
- **图像识别模型**:让 `tool.vision.model` 配置真正生效,未配置时自动 fallback #2792 Thanks CNXudiandian - **图像识别模型**:让 `tools.vision.model` 配置真正生效,未配置时自动 fallback #2792 Thanks CNXudiandian
- **Bash 安全确认**:仅对工作区外的破坏性删除做二次确认,工作区内常规操作不再打扰 - **Bash 安全确认**:仅对工作区外的破坏性删除做二次确认,工作区内常规操作不再打扰
## 🐛 其他修复 ## 🐛 其他修复

View File

@@ -88,7 +88,7 @@ description: 文生图 / 图生图 / 多图融合,支持多家厂商自动路
如果想让所有图像生成固定走某个厂商的模型,可以在 `config.json` 里加: 如果想让所有图像生成固定走某个厂商的模型,可以在 `config.json` 里加:
```json ```json
"skill": { "skills": {
"image-generation": { "image-generation": {
"model": "seedream-5.0-lite" "model": "seedream-5.0-lite"
} }

View File

@@ -40,7 +40,7 @@ Vision 工具采用多级自动选择 + 自动兜底策略,无需手动配置
```json ```json
{ {
"tool": { "tools": {
"vision": { "vision": {
"model": "gpt-4.1" "model": "gpt-4.1"
} }

View File

@@ -1110,7 +1110,7 @@ def main():
# Model resolution priority: # Model resolution priority:
# 1. Explicit `model` in the call args (agent / user override) # 1. Explicit `model` in the call args (agent / user override)
# 2. SKILL_IMAGE_GENERATION_MODEL env var (synced from # 2. SKILL_IMAGE_GENERATION_MODEL env var (synced from
# config["skill"]["image-generation"]["model"] at startup) # config["skills"]["image-generation"]["model"] at startup)
# 3. None → fall back to automatic provider routing (try every # 3. None → fall back to automatic provider routing (try every
# provider with a configured API key in global priority order) # provider with a configured API key in global priority order)
model = args.get("model") or os.environ.get("SKILL_IMAGE_GENERATION_MODEL") or "" model = args.get("model") or os.environ.get("SKILL_IMAGE_GENERATION_MODEL") or ""

View File

@@ -394,7 +394,7 @@ class TestQianfanVisionTool(unittest.TestCase):
"open_ai_api_key": "", "open_ai_api_key": "",
"linkai_api_key": "", "linkai_api_key": "",
"use_linkai": False, "use_linkai": False,
"tool": {}, "tools": {},
} }
if values: if values:
data.update(values) data.update(values)
@@ -424,7 +424,7 @@ class TestQianfanVisionTool(unittest.TestCase):
def test_vision_routes_ernie_model_override_to_qianfan(self): def test_vision_routes_ernie_model_override_to_qianfan(self):
fake_conf = self._fake_conf({ fake_conf = self._fake_conf({
"qianfan_api_key": "test-qianfan-key", "qianfan_api_key": "test-qianfan-key",
"tool": {"vision": {"model": "ernie-4.5-turbo-vl-32k"}}, "tools": {"vision": {"model": "ernie-4.5-turbo-vl-32k"}},
}) })
fake_bot = MagicMock() fake_bot = MagicMock()
fake_bot.call_vision = MagicMock() fake_bot.call_vision = MagicMock()

View File

@@ -1,20 +1,13 @@
# encoding:utf-8 # encoding:utf-8
""" """DashScope voice: qwen3-asr-flash (ASR) + qwen3-tts-flash (TTS)
DashScope (Aliyun Bailian) voice service. via dashscope.MultiModalConversation."""
import datetime
ASR : qwen3-asr-flash via dashscope.MultiModalConversation
TTS : not yet implemented (see CosyVoice / qwen3-tts)
Why MultiModalConversation instead of the OpenAI-compatible endpoint:
- SDK is already a project dep (used by chat/vision)
- Native API accepts local file:// paths up to 100 QPS without an OSS
round-trip, which is what we need for the "send a short voice
message" flow. Public URLs / Base64 also work.
"""
import os import os
import random
from typing import Optional from typing import Optional
import dashscope import dashscope
import requests
from dashscope import MultiModalConversation from dashscope import MultiModalConversation
from bridge.reply import Reply, ReplyType from bridge.reply import Reply, ReplyType
@@ -25,16 +18,14 @@ from voice.voice import Voice
DEFAULT_ASR_MODEL = "qwen3-asr-flash" DEFAULT_ASR_MODEL = "qwen3-asr-flash"
# qwen3-asr-flash hard cap (single file, sync call). Longer audio needs DEFAULT_TTS_MODEL = "qwen3-tts-flash"
# qwen3-asr-flash-filetrans which is async-only and out of scope here. DEFAULT_TTS_VOICE = "Cherry"
MAX_DURATION_SECONDS = 300 MAX_DURATION_SECONDS = 300
MAX_FILE_BYTES = 10 * 1024 * 1024 MAX_FILE_BYTES = 10 * 1024 * 1024
class DashScopeVoice(Voice): class DashScopeVoice(Voice):
def __init__(self): def __init__(self):
# api_key is applied per-call (chat bot does the same) so a live
# config change via the web console takes effect without restart.
pass pass
def voiceToText(self, voice_file: str): def voiceToText(self, voice_file: str):
@@ -83,14 +74,72 @@ class DashScopeVoice(Voice):
return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~") return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
def textToVoice(self, text: str): def textToVoice(self, text: str):
# TTS will be added in a follow-up commit (qwen3-tts / cosyvoice). try:
return Reply(ReplyType.ERROR, "DashScope 语音合成尚未接入") api_key = conf().get("dashscope_api_key", "")
if not api_key:
logger.error("[DashScopeVoice] dashscope_api_key is not configured")
return Reply(ReplyType.ERROR, "未配置 DashScope API key")
dashscope.api_key = api_key
model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
voice = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE
response = MultiModalConversation.call(
model=model,
api_key=api_key,
text=text,
voice=voice,
stream=False,
)
url = self._extract_audio_url(response)
if not url:
logger.error(f"[DashScopeVoice] textToVoice failed: {response}")
return Reply(ReplyType.ERROR, "语音合成失败")
local_path = self._download_audio(url)
if not local_path:
return Reply(ReplyType.ERROR, "语音合成失败")
logger.info(f"[DashScopeVoice] textToVoice model={model} voice={voice} file={local_path}")
return Reply(ReplyType.VOICE, local_path)
except Exception as e:
logger.exception(f"[DashScopeVoice] textToVoice exception: {e}")
return Reply(ReplyType.ERROR, "语音合成失败")
@staticmethod
def _extract_audio_url(response) -> Optional[str]:
try:
if getattr(response, "status_code", 200) != 200:
return None
audio = response.output.get("audio") if response.output else None
if isinstance(audio, dict):
return audio.get("url") or None
return getattr(audio, "url", None)
except Exception:
return None
@staticmethod
def _download_audio(url: str) -> Optional[str]:
    """Fetch ``url`` into ``<cwd>/tmp`` and return the local file path.

    The file extension is taken from the URL path (query string ignored)
    when it is one of the known audio suffixes, otherwise ``.wav`` is used.
    Returns ``None`` (after logging) if anything fails.
    """
    known_suffixes = (".mp3", ".wav", ".m4a", ".aac", ".opus")
    try:
        out_dir = os.path.join(os.getcwd(), "tmp")
        os.makedirs(out_dir, exist_ok=True)
        stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        url_path = url.split("?", 1)[0]
        suffix = os.path.splitext(url_path)[1].lower()
        if suffix not in known_suffixes:
            # Missing or unexpected extension: default to wav.
            suffix = ".wav"
        file_name = f"dashscope_tts_{stamp}_{random.randint(0, 9999)}{suffix}"
        dst = os.path.join(out_dir, file_name)
        resp = requests.get(url, timeout=60)
        resp.raise_for_status()
        with open(dst, "wb") as f:
            f.write(resp.content)
        return dst
    except Exception as e:
        logger.error(f"[DashScopeVoice] download audio failed: {e}")
        return None
@staticmethod @staticmethod
def _ensure_compatible_format(voice_file: str) -> str: def _ensure_compatible_format(voice_file: str) -> str:
"""Convert AMR/SILK to mp3 since qwen3-asr-flash doesn't accept them. # qwen3-asr-flash doesn't accept AMR/SILK; mp3/wav/m4a/aac/opus pass through.
Other formats (mp3/wav/m4a/aac/opus/webm) are passed through.
"""
lower = voice_file.lower() lower = voice_file.lower()
if lower.endswith(".amr") or lower.endswith(".silk") or lower.endswith(".slk"): if lower.endswith(".amr") or lower.endswith(".silk") or lower.endswith(".slk"):
try: try:
@@ -98,20 +147,11 @@ class DashScopeVoice(Voice):
audio_convert.any_to_mp3(voice_file, mp3_file) audio_convert.any_to_mp3(voice_file, mp3_file)
return mp3_file return mp3_file
except Exception as e: except Exception as e:
logger.warning( logger.warning(f"[DashScopeVoice] mp3 convert failed: {e}")
f"[DashScopeVoice] convert {voice_file} to mp3 failed: {e}; "
f"submitting original file"
)
return voice_file return voice_file
@staticmethod @staticmethod
def _extract_text(response) -> Optional[str]: def _extract_text(response) -> Optional[str]:
"""Pull the recognized text out of MultiModalConversation response.
Successful shape (result_format="message"):
response.output.choices[0].message.content -> list of {"text": "..."}
or in some SDK versions a plain string.
"""
try: try:
if getattr(response, "status_code", 200) != 200: if getattr(response, "status_code", 200) != 200:
return None return None

View File

@@ -1,16 +1,18 @@
""" """LinkAI voice: Whisper ASR + multi-vendor TTS (OpenAI / Doubao / Baidu)
google voice service proxied via https://docs.link-ai.tech/platform/api/voice-speech."""
""" import datetime
import os
import random import random
import requests import requests
from voice import audio_convert
from bridge.reply import Reply, ReplyType from bridge.reply import Reply, ReplyType
from common import const
from common.log import logger from common.log import logger
from config import conf from config import conf
from voice import audio_convert
from voice.voice import Voice from voice.voice import Voice
from common import const
import os
import datetime
class LinkAIVoice(Voice): class LinkAIVoice(Voice):
def __init__(self): def __init__(self):
@@ -21,8 +23,7 @@ class LinkAIVoice(Voice):
try: try:
url = conf().get("linkai_api_base", "https://api.link-ai.tech") + "/v1/audio/transcriptions" url = conf().get("linkai_api_base", "https://api.link-ai.tech") + "/v1/audio/transcriptions"
headers = {"Authorization": "Bearer " + conf().get("linkai_api_key")} headers = {"Authorization": "Bearer " + conf().get("linkai_api_key")}
model = None # Pin whisper-1: gateway ignores any other ASR model id.
if not conf().get("text_to_voice") or conf().get("voice_to_text") == "openai":
model = const.WHISPER_1 model = const.WHISPER_1
if voice_file.endswith(".amr"): if voice_file.endswith(".amr"):
try: try:
@@ -30,54 +31,59 @@ class LinkAIVoice(Voice):
audio_convert.any_to_mp3(voice_file, mp3_file) audio_convert.any_to_mp3(voice_file, mp3_file)
voice_file = mp3_file voice_file = mp3_file
except Exception as e: except Exception as e:
logger.warn(f"[LinkVoice] amr file transfer failed, directly send amr voice file: {format(e)}") logger.warning(f"[LinkVoice] amr file transfer failed, directly send amr voice file: {e}")
file = open(voice_file, "rb") with open(voice_file, "rb") as file:
file_body = { res = requests.post(
"file": file url,
} files={"file": file},
data = { headers=headers,
"model": model data={"model": model},
} timeout=(5, 60),
res = requests.post(url, files=file_body, headers=headers, data=data, timeout=(5, 60)) )
if res.status_code == 200: if res.status_code != 200:
text = res.json().get("text") msg = ""
else: try:
res_json = res.json() msg = res.json().get("message", "")
logger.error(f"[LinkVoice] voiceToText error, status_code={res.status_code}, msg={res_json.get('message')}") except Exception:
pass
logger.error(f"[LinkVoice] voiceToText error, status_code={res.status_code}, msg={msg}")
return None return None
reply = Reply(ReplyType.TEXT, text) text = res.json().get("text")
logger.info(f"[LinkVoice] voiceToText success, text={text}, file name={voice_file}") logger.info(f"[LinkVoice] voiceToText success, text={text}, file name={voice_file}")
return Reply(ReplyType.TEXT, text)
except Exception as e: except Exception as e:
logger.error(e) logger.error(e)
return None return None
return reply
def textToVoice(self, text): def textToVoice(self, text):
try: try:
url = conf().get("linkai_api_base", "https://api.link-ai.tech") + "/v1/audio/speech" url = conf().get("linkai_api_base", "https://api.link-ai.tech") + "/v1/audio/speech"
headers = {"Authorization": "Bearer " + conf().get("linkai_api_key")} headers = {"Authorization": "Bearer " + conf().get("linkai_api_key")}
model = const.TTS_1 # Gateway routes by `model` (tts-1 / doubao / baidu) + `voice` from
if not conf().get("text_to_voice") or conf().get("text_to_voice") in ["openai", const.TTS_1, const.TTS_1_HD]: # that engine's catalog. `app_code` is optional workspace override.
model = conf().get("text_to_voice_model") or const.TTS_1
data = { data = {
"model": model,
"input": text, "input": text,
"voice": conf().get("tts_voice_id"), "voice": conf().get("tts_voice_id"),
"app_code": conf().get("linkai_app_code") "app_code": conf().get("linkai_app_code"),
} }
model = conf().get("text_to_voice_model")
if model:
data["model"] = model
res = requests.post(url, headers=headers, json=data, timeout=(5, 120)) res = requests.post(url, headers=headers, json=data, timeout=(5, 120))
if res.status_code == 200: if res.status_code != 200:
msg = ""
try:
msg = res.json().get("message", "")
except Exception:
pass
logger.error(f"[LinkVoice] textToVoice error, status_code={res.status_code}, msg={msg}")
return None
tmp_file_name = "tmp/" + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + str(random.randint(0, 1000)) + ".mp3" tmp_file_name = "tmp/" + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + str(random.randint(0, 1000)) + ".mp3"
os.makedirs(os.path.dirname(tmp_file_name), exist_ok=True)
with open(tmp_file_name, 'wb') as f: with open(tmp_file_name, 'wb') as f:
f.write(res.content) f.write(res.content)
reply = Reply(ReplyType.VOICE, tmp_file_name) logger.info(f"[LinkVoice] textToVoice success, input={text}, voice_id={data.get('voice')}")
logger.info(f"[LinkVoice] textToVoice success, input={text}, model={model}, voice_id={data.get('voice')}") return Reply(ReplyType.VOICE, tmp_file_name)
return reply
else:
res_json = res.json()
logger.error(f"[LinkVoice] textToVoice error, status_code={res.status_code}, msg={res_json.get('message')}")
return None
except Exception as e: except Exception as e:
logger.error(e) logger.error(e)
# reply = Reply(ReplyType.ERROR, "遇到了一点小问题,请稍后再问我吧")
return None return None

View File

@@ -1,8 +1,7 @@
# encoding:utf-8 # encoding:utf-8
""" """MiniMax TTS via /v1/t2a_v2 (SSE stream, hex-encoded mp3 chunks)."""
MiniMax TTS voice service
"""
import datetime import datetime
import json
import random import random
import requests import requests
@@ -12,24 +11,12 @@ from config import conf
from voice.voice import Voice from voice.voice import Voice
MINIMAX_TTS_VOICES = [
"English_Graceful_Lady",
"English_Insightful_Speaker",
"English_radiant_girl",
"English_Persuasive_Man",
"English_Lucky_Robot",
"English_expressive_narrator",
"Chinese_Warm_Woman",
"Chinese_Gentle_Man",
]
class MinimaxVoice(Voice): class MinimaxVoice(Voice):
def __init__(self): def __init__(self):
self.api_key = conf().get("minimax_api_key") self.api_key = conf().get("minimax_api_key")
self.api_base = conf().get("minimax_api_base") or "https://api.minimax.io" # Mainland endpoint matches `sk-api-0-...` keys; override via
# Strip trailing /v1 if present so we can always append /v1/t2a_v2 # `minimax_api_base` for international (api.minimax.io) workspaces.
self.api_base = self.api_base.rstrip("/") self.api_base = (conf().get("minimax_api_base") or "https://api.minimaxi.com").rstrip("/")
if self.api_base.endswith("/v1"): if self.api_base.endswith("/v1"):
self.api_base = self.api_base[:-3] self.api_base = self.api_base[:-3]
@@ -68,12 +55,14 @@ class MinimaxVoice(Voice):
response = requests.post(url, headers=headers, json=payload, stream=True, timeout=60) response = requests.post(url, headers=headers, json=payload, stream=True, timeout=60)
response.raise_for_status() response.raise_for_status()
# Parse SSE stream and collect hex-encoded audio chunks # MiniMax returns HTTP 200 even on errors; capture base_resp for diagnostics.
audio_chunks = [] audio_chunks = []
buffer = "" last_base_resp = None
event_count = 0
for raw in response.iter_lines(): for raw in response.iter_lines():
if not raw: if not raw:
continue continue
event_count += 1
line = raw.decode("utf-8") if isinstance(raw, bytes) else raw line = raw.decode("utf-8") if isinstance(raw, bytes) else raw
if not line.startswith("data:"): if not line.startswith("data:"):
continue continue
@@ -81,16 +70,31 @@ class MinimaxVoice(Voice):
if not json_str or json_str == "[DONE]": if not json_str or json_str == "[DONE]":
continue continue
try: try:
import json
event_data = json.loads(json_str) event_data = json.loads(json_str)
audio_hex = event_data.get("data", {}).get("audio")
if audio_hex:
audio_chunks.append(bytes.fromhex(audio_hex))
except Exception: except Exception:
continue continue
base_resp = event_data.get("base_resp") or {}
if base_resp:
last_base_resp = base_resp
audio_hex = (event_data.get("data") or {}).get("audio")
if audio_hex:
try:
audio_chunks.append(bytes.fromhex(audio_hex))
except Exception as e:
logger.warning(f"[MINIMAX] skip bad audio hex chunk: {e}")
if not audio_chunks: if not audio_chunks:
logger.error("[MINIMAX] TTS returned no audio data") ct = response.headers.get("Content-Type", "")
if last_base_resp and last_base_resp.get("status_code") not in (None, 0):
logger.error(
f"[MINIMAX] TTS failed: status_code={last_base_resp.get('status_code')}, "
f"status_msg={last_base_resp.get('status_msg')}, model={model}, voice_id={voice_id}"
)
else:
logger.error(
f"[MINIMAX] TTS returned no audio data, model={model}, voice_id={voice_id}, "
f"url={url}, http={response.status_code}, content_type={ct!r}, events={event_count}"
)
return Reply(ReplyType.ERROR, "语音合成失败,未获取到音频数据") return Reply(ReplyType.ERROR, "语音合成失败,未获取到音频数据")
audio_data = b"".join(audio_chunks) audio_data = b"".join(audio_chunks)

View File

@@ -31,7 +31,8 @@ class OpenaiVoice(Voice):
"file": file, "file": file,
} }
data = { data = {
"model": "whisper-1", # Override via `voice_to_text_model` (e.g. fall back to whisper-1).
"model": conf().get("voice_to_text_model") or "gpt-4o-mini-transcribe",
} }
response = requests.post(url, headers=headers, files=files, data=data) response = requests.post(url, headers=headers, files=files, data=data)
response_data = response.json() response_data = response.json()

View File

@@ -1,14 +1,8 @@
# encoding:utf-8 # encoding:utf-8
""" """ZhipuAI voice: glm-asr-2512 (ASR) + glm-tts (TTS) via BigModel REST API."""
ZhipuAI (BigModel) voice service. import datetime
ASR : glm-asr-2512 via the OpenAI-compatible /audio/transcriptions endpoint.
TTS : not yet implemented.
Endpoint accepts multipart/form-data with `model`, `file`, and `stream`.
File size <= 25MB, duration <= 30s per request.
"""
import os import os
import random
import requests import requests
@@ -20,6 +14,8 @@ from voice.voice import Voice
DEFAULT_ASR_MODEL = "glm-asr-2512" DEFAULT_ASR_MODEL = "glm-asr-2512"
DEFAULT_TTS_MODEL = "glm-tts"
DEFAULT_TTS_VOICE = "tongtong"
DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4" DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"
MAX_FILE_BYTES = 25 * 1024 * 1024 MAX_FILE_BYTES = 25 * 1024 * 1024
REQUEST_TIMEOUT = (5, 60) REQUEST_TIMEOUT = (5, 60)
@@ -27,7 +23,6 @@ REQUEST_TIMEOUT = (5, 60)
class ZhipuAIVoice(Voice): class ZhipuAIVoice(Voice):
def __init__(self): def __init__(self):
# api_key / base read per-call so live config edits take effect.
pass pass
def voiceToText(self, voice_file: str): def voiceToText(self, voice_file: str):
@@ -81,12 +76,91 @@ class ZhipuAIVoice(Voice):
return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~") return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
def textToVoice(self, text: str): def textToVoice(self, text: str):
return Reply(ReplyType.ERROR, "ZhipuAI 语音合成尚未接入") try:
api_key = conf().get("zhipu_ai_api_key", "")
if not api_key:
logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")
api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
url = f"{api_base}/audio/speech"
model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
voice_id = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE
payload = {
"model": model,
"input": text,
"voice": voice_id,
"response_format": "wav",
"speed": 1.0,
"volume": 1.0,
}
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
}
response = requests.post(
url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
)
if response.status_code != 200:
logger.error(
f"[ZhipuAIVoice] textToVoice failed: status={response.status_code} "
f"body={response.text[:500]} model={model} voice={voice_id}"
)
return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
# Some errors come back as JSON / SSE with HTTP 200.
ct = response.headers.get("Content-Type", "")
if "application/json" in ct or "text/event-stream" in ct:
try:
err = response.json()
except Exception:
err = {"raw": response.text[:500]}
logger.error(
f"[ZhipuAIVoice] textToVoice unexpected text response "
f"(content_type={ct}): {err}"
)
return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
audio_bytes = response.content
ext = self._sniff_audio_ext(audio_bytes) or "wav"
file_name = (
"tmp/" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+ str(random.randint(0, 1000)) + "." + ext
)
os.makedirs(os.path.dirname(file_name), exist_ok=True)
with open(file_name, "wb") as f:
f.write(audio_bytes)
logger.info(
f"[ZhipuAIVoice] textToVoice model={model} voice={voice_id} "
f"file={file_name} bytes={len(audio_bytes)} ext={ext}"
)
return Reply(ReplyType.VOICE, file_name)
except Exception as e:
logger.exception(f"[ZhipuAIVoice] textToVoice exception: {e}")
return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
@staticmethod
def _sniff_audio_ext(data: bytes) -> str:
"""Detect audio container by magic bytes; returns '' on unknown."""
if len(data) < 12:
return ""
head = data[:12]
if head[:4] == b"RIFF" and head[8:12] == b"WAVE":
return "wav"
if head[:3] == b"ID3" or head[:2] == b"\xff\xfb" or head[:2] == b"\xff\xf3" or head[:2] == b"\xff\xf2":
return "mp3"
if head[:4] == b"OggS":
return "ogg"
if head[:4] == b"fLaC":
return "flac"
return ""
@staticmethod @staticmethod
def _ensure_compatible_format(voice_file: str) -> str: def _ensure_compatible_format(voice_file: str) -> str:
# glm-asr-2512 only accepts .wav / .mp3 — convert everything else # glm-asr-2512 only accepts .wav / .mp3
# (webm from the browser mic, m4a/amr/silk from chat channels, etc).
lower = voice_file.lower() lower = voice_file.lower()
if lower.endswith(".mp3") or lower.endswith(".wav"): if lower.endswith(".mp3") or lower.endswith(".wav"):
return voice_file return voice_file
@@ -95,8 +169,5 @@ class ZhipuAIVoice(Voice):
audio_convert.any_to_mp3(voice_file, mp3_file) audio_convert.any_to_mp3(voice_file, mp3_file)
return mp3_file return mp3_file
except Exception as e: except Exception as e:
logger.warning( logger.warning(f"[ZhipuAIVoice] mp3 convert failed: {e}")
f"[ZhipuAIVoice] convert {voice_file} to mp3 failed: {e}; "
f"submitting original file"
)
return voice_file return voice_file