mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-06-02 00:57:41 +08:00
feat(voice): rework TTS/ASR stack and unify tool/skill config schema
This commit is contained in:
@@ -44,6 +44,7 @@ CREATE TABLE IF NOT EXISTS messages (
|
|||||||
role TEXT NOT NULL,
|
role TEXT NOT NULL,
|
||||||
content TEXT NOT NULL,
|
content TEXT NOT NULL,
|
||||||
created_at INTEGER NOT NULL,
|
created_at INTEGER NOT NULL,
|
||||||
|
extras TEXT NOT NULL DEFAULT '',
|
||||||
UNIQUE (session_id, seq)
|
UNIQUE (session_id, seq)
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -67,6 +68,12 @@ _MIGRATION_ADD_CONTEXT_START_SEQ = """
|
|||||||
ALTER TABLE sessions ADD COLUMN context_start_seq INTEGER NOT NULL DEFAULT 0;
|
ALTER TABLE sessions ADD COLUMN context_start_seq INTEGER NOT NULL DEFAULT 0;
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Generic JSON sidecar for per-message attachments (TTS audio URL, future use).
|
||||||
|
# Always optional — readers must tolerate missing column / empty / invalid JSON.
|
||||||
|
_MIGRATION_ADD_MSG_EXTRAS = """
|
||||||
|
ALTER TABLE messages ADD COLUMN extras TEXT NOT NULL DEFAULT '';
|
||||||
|
"""
|
||||||
|
|
||||||
DEFAULT_MAX_AGE_DAYS: int = 30
|
DEFAULT_MAX_AGE_DAYS: int = 30
|
||||||
|
|
||||||
|
|
||||||
@@ -169,20 +176,26 @@ def _group_into_display_turns(
|
|||||||
cur_rest: List[tuple] = []
|
cur_rest: List[tuple] = []
|
||||||
started = False
|
started = False
|
||||||
|
|
||||||
for role, raw_content, created_at in rows:
|
for role, raw_content, created_at, raw_extras in rows:
|
||||||
try:
|
try:
|
||||||
content = json.loads(raw_content)
|
content = json.loads(raw_content)
|
||||||
except Exception:
|
except Exception:
|
||||||
content = raw_content
|
content = raw_content
|
||||||
|
try:
|
||||||
|
extras = json.loads(raw_extras) if raw_extras else {}
|
||||||
|
if not isinstance(extras, dict):
|
||||||
|
extras = {}
|
||||||
|
except Exception:
|
||||||
|
extras = {}
|
||||||
|
|
||||||
if role == "user" and _is_visible_user_message(content):
|
if role == "user" and _is_visible_user_message(content):
|
||||||
if started:
|
if started:
|
||||||
groups.append((cur_user, cur_rest))
|
groups.append((cur_user, cur_rest))
|
||||||
cur_user = (content, created_at)
|
cur_user = (content, created_at, extras)
|
||||||
cur_rest = []
|
cur_rest = []
|
||||||
started = True
|
started = True
|
||||||
else:
|
else:
|
||||||
cur_rest.append((role, content, created_at))
|
cur_rest.append((role, content, created_at, extras))
|
||||||
|
|
||||||
if started:
|
if started:
|
||||||
groups.append((cur_user, cur_rest))
|
groups.append((cur_user, cur_rest))
|
||||||
@@ -195,7 +208,7 @@ def _group_into_display_turns(
|
|||||||
for user_row, rest in groups:
|
for user_row, rest in groups:
|
||||||
# User turn
|
# User turn
|
||||||
if user_row:
|
if user_row:
|
||||||
content, created_at = user_row
|
content, created_at, _u_extras = user_row
|
||||||
text = _extract_display_text(content)
|
text = _extract_display_text(content)
|
||||||
if text:
|
if text:
|
||||||
turns.append({"role": "user", "content": text, "created_at": created_at})
|
turns.append({"role": "user", "content": text, "created_at": created_at})
|
||||||
@@ -206,8 +219,11 @@ def _group_into_display_turns(
|
|||||||
tool_results: Dict[str, str] = {}
|
tool_results: Dict[str, str] = {}
|
||||||
final_text = ""
|
final_text = ""
|
||||||
final_ts: Optional[int] = None
|
final_ts: Optional[int] = None
|
||||||
|
merged_extras: Dict[str, Any] = {}
|
||||||
|
|
||||||
for role, content, created_at in rest:
|
for role, content, created_at, extras in rest:
|
||||||
|
if role == "assistant" and isinstance(extras, dict):
|
||||||
|
merged_extras.update(extras)
|
||||||
if role == "user":
|
if role == "user":
|
||||||
tool_results.update(_extract_tool_results(content))
|
tool_results.update(_extract_tool_results(content))
|
||||||
elif role == "assistant":
|
elif role == "assistant":
|
||||||
@@ -256,6 +272,8 @@ def _group_into_display_turns(
|
|||||||
"steps": steps,
|
"steps": steps,
|
||||||
"created_at": final_ts or (user_row[1] if user_row else 0),
|
"created_at": final_ts or (user_row[1] if user_row else 0),
|
||||||
}
|
}
|
||||||
|
if merged_extras:
|
||||||
|
turn["extras"] = merged_extras
|
||||||
turns.append(turn)
|
turns.append(turn)
|
||||||
|
|
||||||
return turns
|
return turns
|
||||||
@@ -411,13 +429,15 @@ class ConversationStore:
|
|||||||
content = json.dumps(
|
content = json.dumps(
|
||||||
msg.get("content", ""), ensure_ascii=False
|
msg.get("content", ""), ensure_ascii=False
|
||||||
)
|
)
|
||||||
|
extras_obj = msg.get("extras") or {}
|
||||||
|
extras = json.dumps(extras_obj, ensure_ascii=False) if extras_obj else ""
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"""
|
"""
|
||||||
INSERT OR IGNORE INTO messages
|
INSERT OR IGNORE INTO messages
|
||||||
(session_id, seq, role, content, created_at)
|
(session_id, seq, role, content, created_at, extras)
|
||||||
VALUES (?, ?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?, ?)
|
||||||
""",
|
""",
|
||||||
(session_id, next_seq, role, content, now),
|
(session_id, next_seq, role, content, now, extras),
|
||||||
)
|
)
|
||||||
next_seq += 1
|
next_seq += 1
|
||||||
|
|
||||||
@@ -651,6 +671,55 @@ class ConversationStore:
|
|||||||
logger.info(f"[ConversationStore] Pruned {deleted} expired sessions")
|
logger.info(f"[ConversationStore] Pruned {deleted} expired sessions")
|
||||||
return deleted
|
return deleted
|
||||||
|
|
||||||
|
def attach_extras_to_last_assistant(
    self,
    session_id: str,
    extras: Dict[str, Any],
) -> Optional[int]:
    """
    Merge ``extras`` into the latest assistant message of a session.

    Used by post-processing (e.g. TTS) that needs to annotate an already
    persisted bot reply with attachments such as audio URLs.

    The stored ``extras`` JSON of the assistant row with the highest ``seq``
    is loaded, shallow-merged with ``extras`` (incoming keys win) and
    written back. A stored value that is empty, invalid JSON, or not a JSON
    object is treated as an empty dict.

    Args:
        session_id: Session whose latest assistant message is annotated.
        extras: Key/value pairs to merge; must be JSON-serialisable.

    Returns:
        The message seq that was updated, or ``None`` if ``extras`` is
        empty, no assistant message exists, or the update could not be
        applied (including a failure to open the database connection).
    """
    if not extras:
        return None
    with self._lock:
        # Open the connection inside the guarded region so that a failing
        # connect is reported as ``None`` like every other failure, as the
        # docstring promises, instead of propagating to the caller.
        conn = None
        try:
            conn = self._connect()
            row = conn.execute(
                """
                SELECT seq, extras FROM messages
                WHERE session_id = ? AND role = 'assistant'
                ORDER BY seq DESC LIMIT 1
                """,
                (session_id,),
            ).fetchone()
            if not row:
                return None
            seq, raw = row
            # Tolerate empty / corrupt / non-object payloads: start fresh.
            try:
                cur = json.loads(raw) if raw else {}
            except Exception:
                cur = {}
            if not isinstance(cur, dict):
                cur = {}
            cur.update(extras)
            conn.execute(
                "UPDATE messages SET extras = ? WHERE session_id = ? AND seq = ?",
                (json.dumps(cur, ensure_ascii=False), session_id, seq),
            )
            conn.commit()
            return seq
        except Exception as e:
            logger.warning(f"[ConversationStore] attach_extras failed: {e}")
            return None
        finally:
            # ``conn`` is still None when the connect itself failed.
            if conn is not None:
                conn.close()
|
||||||
|
|
||||||
def load_history_page(
|
def load_history_page(
|
||||||
self,
|
self,
|
||||||
session_id: str,
|
session_id: str,
|
||||||
@@ -698,7 +767,22 @@ class ConversationStore:
|
|||||||
).fetchone()
|
).fetchone()
|
||||||
ctx_start = ctx_row[0] if ctx_row else 0
|
ctx_start = ctx_row[0] if ctx_row else 0
|
||||||
|
|
||||||
|
# extras column is added by migration; tolerate older DBs that
|
||||||
|
# might miss it by substituting an empty-string placeholder for extras.
|
||||||
|
try:
|
||||||
rows = conn.execute(
|
rows = conn.execute(
|
||||||
|
"""
|
||||||
|
SELECT seq, role, content, created_at, extras
|
||||||
|
FROM messages
|
||||||
|
WHERE session_id = ?
|
||||||
|
ORDER BY seq ASC
|
||||||
|
""",
|
||||||
|
(session_id,),
|
||||||
|
).fetchall()
|
||||||
|
except sqlite3.OperationalError:
|
||||||
|
rows = [
|
||||||
|
(seq, role, content, created_at, "")
|
||||||
|
for (seq, role, content, created_at) in conn.execute(
|
||||||
"""
|
"""
|
||||||
SELECT seq, role, content, created_at
|
SELECT seq, role, content, created_at
|
||||||
FROM messages
|
FROM messages
|
||||||
@@ -707,6 +791,7 @@ class ConversationStore:
|
|||||||
""",
|
""",
|
||||||
(session_id,),
|
(session_id,),
|
||||||
).fetchall()
|
).fetchall()
|
||||||
|
]
|
||||||
finally:
|
finally:
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
@@ -719,13 +804,16 @@ class ConversationStore:
|
|||||||
include_thinking = False
|
include_thinking = False
|
||||||
|
|
||||||
# Strip seq for display grouping, but record max seq per visible user group
|
# Strip seq for display grouping, but record max seq per visible user group
|
||||||
plain_rows = [(role, content, created_at) for _seq, role, content, created_at in rows]
|
plain_rows = [
|
||||||
|
(role, content, created_at, extras_raw)
|
||||||
|
for _seq, role, content, created_at, extras_raw in rows
|
||||||
|
]
|
||||||
visible = _group_into_display_turns(plain_rows, include_thinking=include_thinking)
|
visible = _group_into_display_turns(plain_rows, include_thinking=include_thinking)
|
||||||
|
|
||||||
# Build a mapping: find the seq of each visible user message to annotate context boundary.
|
# Build a mapping: find the seq of each visible user message to annotate context boundary.
|
||||||
# Walk through rows to find visible user message seqs in order.
|
# Walk through rows to find visible user message seqs in order.
|
||||||
visible_user_seqs: List[int] = []
|
visible_user_seqs: List[int] = []
|
||||||
for seq, role, raw_content, _ts in rows:
|
for seq, role, raw_content, _ts, _extras in rows:
|
||||||
if role != "user":
|
if role != "user":
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
@@ -911,6 +999,18 @@ class ConversationStore:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"[ConversationStore] Migration (context_start_seq) failed: {e}")
|
logger.warning(f"[ConversationStore] Migration (context_start_seq) failed: {e}")
|
||||||
|
|
||||||
|
msg_cols = {
|
||||||
|
row[1]
|
||||||
|
for row in conn.execute("PRAGMA table_info(messages)").fetchall()
|
||||||
|
}
|
||||||
|
if "extras" not in msg_cols:
|
||||||
|
try:
|
||||||
|
conn.execute(_MIGRATION_ADD_MSG_EXTRAS)
|
||||||
|
conn.commit()
|
||||||
|
logger.info("[ConversationStore] Migrated: added messages.extras column")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"[ConversationStore] Migration (extras) failed: {e}")
|
||||||
|
|
||||||
def _connect(self) -> sqlite3.Connection:
|
def _connect(self) -> sqlite3.Connection:
|
||||||
conn = sqlite3.connect(str(self._db_path), timeout=10)
|
conn = sqlite3.connect(str(self._db_path), timeout=10)
|
||||||
conn.execute("PRAGMA journal_mode=WAL")
|
conn.execute("PRAGMA journal_mode=WAL")
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ Vision tool - Analyze images using Vision API.
|
|||||||
Supports local files (auto base64-encoded) and HTTP URLs.
|
Supports local files (auto base64-encoded) and HTTP URLs.
|
||||||
|
|
||||||
Provider resolution:
|
Provider resolution:
|
||||||
- tool.vision.model (if set) means "prefer this model first; fall back to
|
- tools.vision.model (if set) means "prefer this model first; fall back to
|
||||||
other configured providers if it fails". The model name is mapped to its
|
other configured providers if it fails". The model name is mapped to its
|
||||||
native provider (e.g. doubao-* → Doubao, kimi-* → Moonshot, gpt-* →
|
native provider (e.g. doubao-* → Doubao, kimi-* → Moonshot, gpt-* →
|
||||||
OpenAI/LinkAI). That provider is tried first, then the standard auto
|
OpenAI/LinkAI). That provider is tried first, then the standard auto
|
||||||
@@ -60,7 +60,7 @@ _DISCOVERABLE_MODELS = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
# Model name prefix → discoverable provider display_name.
|
# Model name prefix → discoverable provider display_name.
|
||||||
# Used to auto-route tool.vision.model to its native provider.
|
# Used to auto-route tools.vision.model to its native provider.
|
||||||
# Matched case-insensitively; longest prefix wins.
|
# Matched case-insensitively; longest prefix wins.
|
||||||
_MODEL_PREFIX_TO_PROVIDER = [
|
_MODEL_PREFIX_TO_PROVIDER = [
|
||||||
("doubao-", "Doubao"),
|
("doubao-", "Doubao"),
|
||||||
@@ -154,7 +154,7 @@ class Vision(BaseTool):
|
|||||||
|
|
||||||
# Default model is only used as a last-resort placeholder for providers
|
# Default model is only used as a last-resort placeholder for providers
|
||||||
# whose VisionProvider.model_override is None (e.g. raw OpenAI provider
|
# whose VisionProvider.model_override is None (e.g. raw OpenAI provider
|
||||||
# when the user did not configure tool.vision.model).
|
# when the user did not configure tools.vision.model).
|
||||||
return self._call_with_fallback(providers, DEFAULT_MODEL, question, image_content)
|
return self._call_with_fallback(providers, DEFAULT_MODEL, question, image_content)
|
||||||
|
|
||||||
def _call_with_fallback(self, providers: List[VisionProvider], model: str,
|
def _call_with_fallback(self, providers: List[VisionProvider], model: str,
|
||||||
@@ -193,12 +193,12 @@ class Vision(BaseTool):
|
|||||||
"""
|
"""
|
||||||
Build an ordered list of providers to try.
|
Build an ordered list of providers to try.
|
||||||
|
|
||||||
Semantics of `tool.vision.model`:
|
Semantics of `tools.vision.model`:
|
||||||
"Prefer this model first; fall back to other configured providers
|
"Prefer this model first; fall back to other configured providers
|
||||||
if it fails."
|
if it fails."
|
||||||
|
|
||||||
Order:
|
Order:
|
||||||
1. The provider that natively serves `tool.vision.model` (if any
|
1. The provider that natively serves `tools.vision.model` (if any
|
||||||
and its API key is configured) — using the user-specified model
|
and its API key is configured) — using the user-specified model
|
||||||
name verbatim.
|
name verbatim.
|
||||||
2. Auto-discovery chain as fallback:
|
2. Auto-discovery chain as fallback:
|
||||||
@@ -213,7 +213,7 @@ class Vision(BaseTool):
|
|||||||
user_model = self._resolve_user_vision_model()
|
user_model = self._resolve_user_vision_model()
|
||||||
providers: List[VisionProvider] = []
|
providers: List[VisionProvider] = []
|
||||||
|
|
||||||
# Step 1: preferred provider derived from tool.vision.model
|
# Step 1: preferred provider derived from tools.vision.model
|
||||||
if user_model:
|
if user_model:
|
||||||
preferred = self._route_by_model_name(user_model)
|
preferred = self._route_by_model_name(user_model)
|
||||||
if preferred:
|
if preferred:
|
||||||
@@ -251,11 +251,11 @@ class Vision(BaseTool):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _resolve_user_vision_model() -> Optional[str]:
|
def _resolve_user_vision_model() -> Optional[str]:
|
||||||
"""Read tool.vision.model from config; return None if unset/blank."""
|
"""Read tools.vision.model (legacy ``tool`` section is used only when ``tools`` is missing or empty); return None if unset/blank."""
|
||||||
tool_conf = conf().get("tool", {})
|
tools_conf = conf().get("tools") or conf().get("tool") or {}
|
||||||
if not isinstance(tool_conf, dict):
|
if not isinstance(tools_conf, dict):
|
||||||
return None
|
return None
|
||||||
vision_conf = tool_conf.get("vision", {})
|
vision_conf = tools_conf.get("vision", {})
|
||||||
if not isinstance(vision_conf, dict):
|
if not isinstance(vision_conf, dict):
|
||||||
return None
|
return None
|
||||||
m = vision_conf.get("model")
|
m = vision_conf.get("model")
|
||||||
@@ -303,7 +303,7 @@ class Vision(BaseTool):
|
|||||||
self._append_provider(providers, lambda: self._build_linkai_provider(user_model))
|
self._append_provider(providers, lambda: self._build_linkai_provider(user_model))
|
||||||
if providers:
|
if providers:
|
||||||
return providers
|
return providers
|
||||||
logger.warning(f"[Vision] tool.vision.model='{user_model}' looks like an OpenAI "
|
logger.warning(f"[Vision] tools.vision.model='{user_model}' looks like an OpenAI "
|
||||||
f"model but neither OPENAI_API_KEY nor LINKAI_API_KEY is configured.")
|
f"model but neither OPENAI_API_KEY nor LINKAI_API_KEY is configured.")
|
||||||
return None # fall through to auto
|
return None # fall through to auto
|
||||||
|
|
||||||
@@ -317,7 +317,7 @@ class Vision(BaseTool):
|
|||||||
continue
|
continue
|
||||||
api_key = conf().get(config_key, "")
|
api_key = conf().get(config_key, "")
|
||||||
if not api_key or not api_key.strip():
|
if not api_key or not api_key.strip():
|
||||||
logger.warning(f"[Vision] tool.vision.model='{user_model}' routes to "
|
logger.warning(f"[Vision] tools.vision.model='{user_model}' routes to "
|
||||||
f"'{display_name}' but '{config_key}' is not configured. "
|
f"'{display_name}' but '{config_key}' is not configured. "
|
||||||
f"Falling back to auto-discovery.")
|
f"Falling back to auto-discovery.")
|
||||||
return None # fall through to auto
|
return None # fall through to auto
|
||||||
@@ -452,8 +452,8 @@ class Vision(BaseTool):
|
|||||||
if not self._main_bot_supports_vision(bot):
|
if not self._main_bot_supports_vision(bot):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Use the configured main model name; do NOT inject tool.vision.model
|
# Use the configured main model name; do NOT inject tools.vision.model
|
||||||
# here, because by the time we reach this branch the tool.vision.model
|
# here, because by the time we reach this branch the tools.vision.model
|
||||||
# routing has already been attempted (and either matched the main bot
|
# routing has already been attempted (and either matched the main bot
|
||||||
# or failed to find a provider).
|
# or failed to find a provider).
|
||||||
main_model_name = conf().get("model") or None
|
main_model_name = conf().get("model") or None
|
||||||
|
|||||||
@@ -171,7 +171,13 @@ class ChatChannel(Channel):
|
|||||||
if "desire_rtype" not in context and conf().get("always_reply_voice") and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE:
|
if "desire_rtype" not in context and conf().get("always_reply_voice") and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE:
|
||||||
context["desire_rtype"] = ReplyType.VOICE
|
context["desire_rtype"] = ReplyType.VOICE
|
||||||
elif context.type == ContextType.VOICE:
|
elif context.type == ContextType.VOICE:
|
||||||
if "desire_rtype" not in context and conf().get("voice_reply_voice") and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE:
|
# Voice input replies with voice when either voice_reply_voice
|
||||||
|
# (mirror voice) or the global always_reply_voice toggle is on.
|
||||||
|
if (
|
||||||
|
"desire_rtype" not in context
|
||||||
|
and (conf().get("voice_reply_voice") or conf().get("always_reply_voice"))
|
||||||
|
and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE
|
||||||
|
):
|
||||||
context["desire_rtype"] = ReplyType.VOICE
|
context["desire_rtype"] = ReplyType.VOICE
|
||||||
return context
|
return context
|
||||||
|
|
||||||
|
|||||||
@@ -1515,10 +1515,16 @@ class FeiShuChanel(ChatChannel):
|
|||||||
else:
|
else:
|
||||||
context.type = ContextType.TEXT
|
context.type = ContextType.TEXT
|
||||||
context.content = content.strip()
|
context.content = content.strip()
|
||||||
|
# Text input opts into voice replies only when the always-on toggle is set.
|
||||||
|
if "desire_rtype" not in context and conf().get("always_reply_voice"):
|
||||||
|
context["desire_rtype"] = ReplyType.VOICE
|
||||||
|
|
||||||
elif context.type == ContextType.VOICE:
|
elif context.type == ContextType.VOICE:
|
||||||
# 2.语音请求
|
# 2.语音请求: voice input replies with voice if either
|
||||||
if "desire_rtype" not in context and conf().get("voice_reply_voice"):
|
# voice_reply_voice (mirror reply) or always_reply_voice is on.
|
||||||
|
if "desire_rtype" not in context and (
|
||||||
|
conf().get("voice_reply_voice") or conf().get("always_reply_voice")
|
||||||
|
):
|
||||||
context["desire_rtype"] = ReplyType.VOICE
|
context["desire_rtype"] = ReplyType.VOICE
|
||||||
|
|
||||||
return context
|
return context
|
||||||
|
|||||||
@@ -1294,3 +1294,76 @@
|
|||||||
overflow: hidden;
|
overflow: hidden;
|
||||||
min-height: 2.5em; /* ~2 lines at text-sm leading-relaxed */
|
min-height: 2.5em; /* ~2 lines at text-sm leading-relaxed */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* --------------------------------------------------------------------
|
||||||
|
* Voice pill — compact custom audio player used by mic uploads and TTS
|
||||||
|
* replies. Replaces the bulky native <audio controls> with a play/pause
|
||||||
|
* icon + thin progress bar + duration counter so it blends into chat
|
||||||
|
* bubbles without the chrome-grey browser default look.
|
||||||
|
* ------------------------------------------------------------------ */
|
||||||
|
.voice-pill {
|
||||||
|
display: inline-flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 8px;
|
||||||
|
padding: 6px 10px;
|
||||||
|
border-radius: 999px;
|
||||||
|
background: rgba(15, 23, 42, 0.05);
|
||||||
|
color: rgb(71, 85, 105);
|
||||||
|
font-size: 12px;
|
||||||
|
line-height: 1;
|
||||||
|
max-width: 240px;
|
||||||
|
user-select: none;
|
||||||
|
cursor: default;
|
||||||
|
}
|
||||||
|
.dark .voice-pill {
|
||||||
|
background: rgba(255, 255, 255, 0.08);
|
||||||
|
color: rgb(203, 213, 225);
|
||||||
|
}
|
||||||
|
.voice-pill[data-loading="1"] {
|
||||||
|
opacity: 0.65;
|
||||||
|
}
|
||||||
|
.voice-pill-btn {
|
||||||
|
width: 22px;
|
||||||
|
height: 22px;
|
||||||
|
border-radius: 999px;
|
||||||
|
display: inline-flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
background: var(--color-primary-500, #2563eb);
|
||||||
|
color: #fff;
|
||||||
|
flex-shrink: 0;
|
||||||
|
cursor: pointer;
|
||||||
|
transition: transform 0.1s ease;
|
||||||
|
}
|
||||||
|
.voice-pill-btn:hover { transform: scale(1.05); }
|
||||||
|
.voice-pill-btn i { font-size: 9px; margin-left: 1px; }
|
||||||
|
.voice-pill-btn[data-state="play"] i { margin-left: 2px; }
|
||||||
|
.voice-pill-btn[data-state="pause"] i { margin-left: 0; }
|
||||||
|
.voice-pill-track {
|
||||||
|
flex: 1;
|
||||||
|
height: 3px;
|
||||||
|
border-radius: 999px;
|
||||||
|
background: rgba(100, 116, 139, 0.25);
|
||||||
|
overflow: hidden;
|
||||||
|
min-width: 70px;
|
||||||
|
}
|
||||||
|
.dark .voice-pill-track {
|
||||||
|
background: rgba(148, 163, 184, 0.25);
|
||||||
|
}
|
||||||
|
.voice-pill-fill {
|
||||||
|
height: 100%;
|
||||||
|
width: 0%;
|
||||||
|
background: var(--color-primary-500, #2563eb);
|
||||||
|
border-radius: inherit;
|
||||||
|
transition: width 0.1s linear;
|
||||||
|
}
|
||||||
|
.voice-pill-time {
|
||||||
|
font-variant-numeric: tabular-nums;
|
||||||
|
font-size: 11px;
|
||||||
|
color: inherit;
|
||||||
|
opacity: 0.75;
|
||||||
|
flex-shrink: 0;
|
||||||
|
min-width: 28px;
|
||||||
|
text-align: right;
|
||||||
|
}
|
||||||
|
.voice-pill audio { display: none; }
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ const I18N = {
|
|||||||
models_add_vendor: '添加厂商',
|
models_add_vendor: '添加厂商',
|
||||||
models_provider: '厂商',
|
models_provider: '厂商',
|
||||||
models_model: '模型',
|
models_model: '模型',
|
||||||
|
models_voice: '音色',
|
||||||
models_configured: '已配置',
|
models_configured: '已配置',
|
||||||
models_not_configured: '未配置',
|
models_not_configured: '未配置',
|
||||||
models_pick_to_configure: '选择以配置',
|
models_pick_to_configure: '选择以配置',
|
||||||
@@ -160,6 +161,11 @@ const I18N = {
|
|||||||
mic_permission_denied: '无法访问麦克风,请检查浏览器权限',
|
mic_permission_denied: '无法访问麦克风,请检查浏览器权限',
|
||||||
mic_too_short: '录音太短,请重试',
|
mic_too_short: '录音太短,请重试',
|
||||||
mic_error: '语音识别失败',
|
mic_error: '语音识别失败',
|
||||||
|
speak_msg: '朗读这段回复',
|
||||||
|
voice_reply_mode_label: '语音回复策略',
|
||||||
|
voice_reply_off: '关闭',
|
||||||
|
voice_reply_if_voice: '仅语音问/语音答',
|
||||||
|
voice_reply_always: '总是语音回复',
|
||||||
attach_menu_folder: '上传文件夹',
|
attach_menu_folder: '上传文件夹',
|
||||||
confirm_yes: '确认',
|
confirm_yes: '确认',
|
||||||
confirm_cancel: '取消',
|
confirm_cancel: '取消',
|
||||||
@@ -180,6 +186,7 @@ const I18N = {
|
|||||||
models_add_vendor: 'Add Vendor',
|
models_add_vendor: 'Add Vendor',
|
||||||
models_provider: 'Provider',
|
models_provider: 'Provider',
|
||||||
models_model: 'Model',
|
models_model: 'Model',
|
||||||
|
models_voice: 'Voice',
|
||||||
models_configured: 'configured',
|
models_configured: 'configured',
|
||||||
models_not_configured: 'not configured',
|
models_not_configured: 'not configured',
|
||||||
models_pick_to_configure: 'pick to configure',
|
models_pick_to_configure: 'pick to configure',
|
||||||
@@ -315,6 +322,11 @@ const I18N = {
|
|||||||
mic_permission_denied: 'Cannot access microphone — check browser permissions',
|
mic_permission_denied: 'Cannot access microphone — check browser permissions',
|
||||||
mic_too_short: 'Recording too short, please retry',
|
mic_too_short: 'Recording too short, please retry',
|
||||||
mic_error: 'Speech recognition failed',
|
mic_error: 'Speech recognition failed',
|
||||||
|
speak_msg: 'Read this reply aloud',
|
||||||
|
voice_reply_mode_label: 'Voice reply policy',
|
||||||
|
voice_reply_off: 'Off',
|
||||||
|
voice_reply_if_voice: 'Voice only if voice input',
|
||||||
|
voice_reply_always: 'Always reply with voice',
|
||||||
attach_menu_folder: 'Upload Folder',
|
attach_menu_folder: 'Upload Folder',
|
||||||
confirm_yes: 'Confirm',
|
confirm_yes: 'Confirm',
|
||||||
confirm_cancel: 'Cancel',
|
confirm_cancel: 'Cancel',
|
||||||
@@ -1474,6 +1486,7 @@ function sendVoiceMessage(text, audioUrl) {
|
|||||||
message: text,
|
message: text,
|
||||||
stream: true,
|
stream: true,
|
||||||
timestamp: timestamp.toISOString(),
|
timestamp: timestamp.toISOString(),
|
||||||
|
is_voice: true,
|
||||||
};
|
};
|
||||||
|
|
||||||
const MAX_RETRIES = 2;
|
const MAX_RETRIES = 2;
|
||||||
@@ -1512,19 +1525,19 @@ function sendVoiceMessage(text, audioUrl) {
|
|||||||
function addUserVoiceMessage(audioUrl, caption, timestamp) {
|
function addUserVoiceMessage(audioUrl, caption, timestamp) {
|
||||||
const el = document.createElement('div');
|
const el = document.createElement('div');
|
||||||
el.className = 'flex justify-end px-4 sm:px-6 py-3';
|
el.className = 'flex justify-end px-4 sm:px-6 py-3';
|
||||||
// Voice-message bubble: playable <audio> on top, ASR caption beneath.
|
// Voice-message bubble: compact voice pill on top, ASR caption beneath.
|
||||||
// The bubble keeps the same primary tint as a normal user message so
|
// The bubble keeps the same primary tint as a normal user message so
|
||||||
// it visually slots into the conversation flow.
|
// it visually slots into the conversation flow.
|
||||||
el.innerHTML = `
|
el.innerHTML = `
|
||||||
<div class="max-w-[75%] sm:max-w-[60%]">
|
<div class="max-w-[75%] sm:max-w-[60%]">
|
||||||
<div class="bg-slate-100 dark:bg-white/10 text-slate-700 dark:text-slate-200 rounded-2xl px-3 py-2 msg-content user-bubble">
|
<div class="bg-slate-100 dark:bg-white/10 text-slate-700 dark:text-slate-200 rounded-2xl px-3 py-2 msg-content user-bubble">
|
||||||
<audio controls preload="metadata" src="${audioUrl}"
|
<div class="user-voice-slot"></div>
|
||||||
class="block w-[260px] max-w-full h-9"></audio>
|
|
||||||
${caption ? `<div class="text-xs mt-1.5 leading-snug text-slate-500 dark:text-slate-400 whitespace-pre-wrap break-words">${escapeHtml(caption)}</div>` : ''}
|
${caption ? `<div class="text-xs mt-1.5 leading-snug text-slate-500 dark:text-slate-400 whitespace-pre-wrap break-words">${escapeHtml(caption)}</div>` : ''}
|
||||||
</div>
|
</div>
|
||||||
<div class="text-xs text-slate-400 dark:text-slate-500 mt-1.5 text-right">${formatTime(timestamp)}</div>
|
<div class="text-xs text-slate-400 dark:text-slate-500 mt-1.5 text-right">${formatTime(timestamp)}</div>
|
||||||
</div>
|
</div>
|
||||||
`;
|
`;
|
||||||
|
el.querySelector('.user-voice-slot').appendChild(renderVoicePill(audioUrl));
|
||||||
messagesDiv.appendChild(el);
|
messagesDiv.appendChild(el);
|
||||||
_autoScrollEnabled = true;
|
_autoScrollEnabled = true;
|
||||||
scrollChatToBottom(true);
|
scrollChatToBottom(true);
|
||||||
@@ -1639,12 +1652,16 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
|
|||||||
<div class="agent-steps"></div>
|
<div class="agent-steps"></div>
|
||||||
<div class="answer-content sse-streaming"></div>
|
<div class="answer-content sse-streaming"></div>
|
||||||
<div class="media-content"></div>
|
<div class="media-content"></div>
|
||||||
|
<div class="bot-audio-slot"></div>
|
||||||
</div>
|
</div>
|
||||||
<div class="flex items-center gap-2 mt-1.5">
|
<div class="flex items-center gap-2 mt-1.5">
|
||||||
<span class="text-xs text-slate-400 dark:text-slate-500">${formatTime(timestamp)}</span>
|
<span class="text-xs text-slate-400 dark:text-slate-500">${formatTime(timestamp)}</span>
|
||||||
<button class="copy-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${currentLang === 'zh' ? '复制' : 'Copy'}" style="display:none">
|
<button class="copy-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${currentLang === 'zh' ? '复制' : 'Copy'}" style="display:none">
|
||||||
<i class="fas fa-copy"></i>
|
<i class="fas fa-copy"></i>
|
||||||
</button>
|
</button>
|
||||||
|
<button class="speak-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${t('speak_msg')}" style="display:none;">
|
||||||
|
<i class="fas fa-volume-up"></i>
|
||||||
|
</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
`;
|
`;
|
||||||
@@ -1856,11 +1873,12 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
|
|||||||
scrollChatToBottom();
|
scrollChatToBottom();
|
||||||
|
|
||||||
} else if (item.type === 'done') {
|
} else if (item.type === 'done') {
|
||||||
|
// Don't close the stream yet: the backend keeps it open
|
||||||
|
// for a short tail to deliver async attachments such as
|
||||||
|
// TTS audio (`voice_attach`). It will close the stream on
|
||||||
|
// its own via onerror once the tail expires.
|
||||||
done = true;
|
done = true;
|
||||||
es.close();
|
|
||||||
delete activeStreams[requestId];
|
|
||||||
|
|
||||||
// item.content may be empty when "done" is only a stream-close signal after media.
|
|
||||||
const finalText = item.content || accumulatedText;
|
const finalText = item.content || accumulatedText;
|
||||||
|
|
||||||
if (!botEl && finalText) {
|
if (!botEl && finalText) {
|
||||||
@@ -1874,6 +1892,7 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
|
|||||||
if (copyBtn && finalText) copyBtn.style.display = '';
|
if (copyBtn && finalText) copyBtn.style.display = '';
|
||||||
applyHighlighting(botEl);
|
applyHighlighting(botEl);
|
||||||
}
|
}
|
||||||
|
renderBotSpeakerButton(botEl, finalText);
|
||||||
scrollChatToBottom();
|
scrollChatToBottom();
|
||||||
|
|
||||||
if (titleInfo) {
|
if (titleInfo) {
|
||||||
@@ -1883,6 +1902,15 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
|
|||||||
loadSessionList();
|
loadSessionList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else if (item.type === 'voice_attach') {
|
||||||
|
// TTS finished — attach a playable audio element to the
|
||||||
|
// current bot bubble. The stream closes right after.
|
||||||
|
if (botEl && item.url) {
|
||||||
|
attachAudioToBotBubble(botEl, item.url, { autoplay: true });
|
||||||
|
}
|
||||||
|
es.close();
|
||||||
|
delete activeStreams[requestId];
|
||||||
|
|
||||||
} else if (item.type === 'error') {
|
} else if (item.type === 'error') {
|
||||||
done = true;
|
done = true;
|
||||||
es.close();
|
es.close();
|
||||||
@@ -1896,7 +1924,10 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) {
|
|||||||
es.close();
|
es.close();
|
||||||
delete activeStreams[requestId];
|
delete activeStreams[requestId];
|
||||||
|
|
||||||
if (done) return;
|
if (done) {
|
||||||
|
// Normal close after the post-done tail expired; nothing to do.
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (currentReasoningEl) {
|
if (currentReasoningEl) {
|
||||||
finalizeThinking(currentReasoningEl, reasoningStartTime, reasoningText);
|
finalizeThinking(currentReasoningEl, reasoningStartTime, reasoningText);
|
||||||
@@ -2187,21 +2218,174 @@ function createBotMessageEl(content, timestamp, requestId, msg) {
|
|||||||
<div class="bg-white dark:bg-[#1A1A1A] border border-slate-200 dark:border-white/10 rounded-2xl px-4 py-3 text-sm leading-relaxed msg-content text-slate-700 dark:text-slate-200">
|
<div class="bg-white dark:bg-[#1A1A1A] border border-slate-200 dark:border-white/10 rounded-2xl px-4 py-3 text-sm leading-relaxed msg-content text-slate-700 dark:text-slate-200">
|
||||||
${stepsHtml ? `<div class="agent-steps">${stepsHtml}</div>` : ''}
|
${stepsHtml ? `<div class="agent-steps">${stepsHtml}</div>` : ''}
|
||||||
<div class="answer-content">${renderMarkdown(displayContent)}</div>
|
<div class="answer-content">${renderMarkdown(displayContent)}</div>
|
||||||
|
<div class="bot-audio-slot"></div>
|
||||||
</div>
|
</div>
|
||||||
<div class="flex items-center gap-2 mt-1.5">
|
<div class="flex items-center gap-2 mt-1.5">
|
||||||
<span class="text-xs text-slate-400 dark:text-slate-500">${formatTime(timestamp)}</span>
|
<span class="text-xs text-slate-400 dark:text-slate-500">${formatTime(timestamp)}</span>
|
||||||
<button class="copy-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${currentLang === 'zh' ? '复制' : 'Copy'}">
|
<button class="copy-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${currentLang === 'zh' ? '复制' : 'Copy'}">
|
||||||
<i class="fas fa-copy"></i>
|
<i class="fas fa-copy"></i>
|
||||||
</button>
|
</button>
|
||||||
|
<button class="speak-msg-btn text-xs text-slate-300 dark:text-slate-600 hover:text-slate-500 dark:hover:text-slate-400 transition-colors cursor-pointer" title="${t('speak_msg')}" style="display:none;">
|
||||||
|
<i class="fas fa-volume-up"></i>
|
||||||
|
</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
`;
|
`;
|
||||||
el.querySelector('.answer-content').dataset.rawMd = displayContent;
|
el.querySelector('.answer-content').dataset.rawMd = displayContent;
|
||||||
|
// Existing TTS attachment (history replay): mount the player up-front.
|
||||||
|
const existingAudio = msg && msg.extras && msg.extras.audio && msg.extras.audio.url;
|
||||||
|
if (existingAudio) {
|
||||||
|
attachAudioToBotBubble(el, existingAudio, { autoplay: false });
|
||||||
|
}
|
||||||
|
renderBotSpeakerButton(el, displayContent);
|
||||||
applyHighlighting(el);
|
applyHighlighting(el);
|
||||||
bindChatKnowledgeLinks(el);
|
bindChatKnowledgeLinks(el);
|
||||||
return el;
|
return el;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Mount a voice player inside a bot bubble's `.bot-audio-slot`, replacing
 * whatever the slot previously held, and hide the manual "speak" button.
 *
 * Called for live TTS pushes, manual TTS results and history replay.
 * Best-effort: any error is swallowed so a rendering problem never breaks
 * the chat flow.
 *
 * @param {Element} botEl     Root element of the bot message bubble.
 * @param {string}  audioUrl  URL of the audio to play.
 * @param {{autoplay?: boolean}} [opts]  Pass `autoplay: true` to start playback.
 */
function attachAudioToBotBubble(botEl, audioUrl, opts) {
    try {
        const audioSlot = (botEl && audioUrl) ? botEl.querySelector('.bot-audio-slot') : null;
        if (audioSlot) {
            // Clear first so a second attach replaces rather than stacks players.
            audioSlot.innerHTML = '';
            audioSlot.style.marginTop = '6px';
            const wantsAutoplay = !!(opts && opts.autoplay);
            audioSlot.appendChild(renderVoicePill(audioUrl, { autoplay: wantsAutoplay }));

            // The bubble now has audio, so the manual trigger is redundant.
            const manualBtn = botEl.querySelector('.speak-msg-btn');
            if (manualBtn) manualBtn.style.display = 'none';
        }
    } catch (_) { /* silent */ }
}
|
||||||
|
|
||||||
|
/**
 * Build a compact play/pause + progress + remaining-time pill wrapping a
 * hidden <audio> element.
 *
 * The audio URL is assigned through the `src` property instead of being
 * interpolated into the HTML template, so a URL containing quotes or angle
 * brackets cannot break out of the attribute and inject markup.
 *
 * @param {string} audioUrl  URL of the audio resource.
 * @param {{autoplay?: boolean}} [opts]  Pass `autoplay: true` to try playing
 *        as soon as the media can play.
 * @returns {HTMLDivElement} Root `.voice-pill` element (not yet attached).
 */
function renderVoicePill(audioUrl, opts) {
    opts = opts || {};
    const wrap = document.createElement('div');
    wrap.className = 'voice-pill';
    // Static markup only; no caller-supplied data goes through innerHTML.
    wrap.innerHTML = `
        <button type="button" class="voice-pill-btn" data-state="play" aria-label="play">
            <i class="fas fa-play"></i>
        </button>
        <div class="voice-pill-track"><div class="voice-pill-fill"></div></div>
        <span class="voice-pill-time">0:00</span>
        <audio preload="metadata"></audio>
    `;
    const btn = wrap.querySelector('.voice-pill-btn');
    const fill = wrap.querySelector('.voice-pill-fill');
    const timeEl = wrap.querySelector('.voice-pill-time');
    const audio = wrap.querySelector('audio');

    // Format seconds as m:ss; non-finite or negative input renders as 0:00.
    const fmt = (s) => {
        if (!isFinite(s) || s < 0) s = 0;
        const m = Math.floor(s / 60);
        const r = Math.floor(s % 60);
        return `${m}:${r < 10 ? '0' : ''}${r}`;
    };
    // Keep the button's data-state, icon class and aria-label in sync.
    const setIcon = (state) => {
        btn.dataset.state = state;
        btn.querySelector('i').className = state === 'pause' ? 'fas fa-pause' : 'fas fa-play';
        btn.setAttribute('aria-label', state === 'pause' ? 'pause' : 'play');
    };

    audio.addEventListener('loadedmetadata', () => {
        if (audio.duration && isFinite(audio.duration)) timeEl.textContent = fmt(audio.duration);
    });
    audio.addEventListener('timeupdate', () => {
        const dur = audio.duration || 0;
        if (dur > 0) {
            fill.style.width = `${Math.min(100, (audio.currentTime / dur) * 100)}%`;
            // Show time remaining while playing.
            timeEl.textContent = fmt(dur - audio.currentTime);
        }
    });
    audio.addEventListener('ended', () => {
        setIcon('play');
        fill.style.width = '0%';
        timeEl.textContent = fmt(audio.duration || 0);
    });
    audio.addEventListener('play', () => setIcon('pause'));
    audio.addEventListener('pause', () => setIcon('play'));

    btn.addEventListener('click', (e) => {
        // Do not let the click bubble to handlers on the surrounding bubble.
        e.stopPropagation();
        if (audio.paused) {
            audio.play().catch(() => {});
        } else {
            audio.pause();
        }
    });

    // Assign the source as a property (safe against attribute injection),
    // after the listeners are wired so no media event can be missed.
    audio.src = audioUrl;

    if (opts.autoplay) {
        // Autoplay may be blocked by the browser; fall back silently and
        // let the user tap the play button.
        const tryPlay = () => audio.play().catch(() => {});
        if (audio.readyState >= 2) tryPlay();
        else audio.addEventListener('canplay', tryPlay, { once: true });
    }
    return wrap;
}
|
||||||
|
|
||||||
|
/**
 * Reveal the manual "read aloud" button on a bot bubble when TTS is
 * available and the bubble has no audio player yet.
 *
 * Availability is probed asynchronously through `_isTtsReady()`. Because an
 * audio player can be attached to the bubble while that probe is still in
 * flight (e.g. a `voice_attach` event arriving first), the "already has
 * audio" check is repeated once the probe resolves; otherwise the button
 * would be re-shown next to an existing player.
 *
 * @param {Element} botEl  Root element of the bot message bubble.
 * @param {string}  text   Message text to synthesize on click.
 */
function renderBotSpeakerButton(botEl, text) {
    if (!botEl || !text || !text.trim()) return;
    const btn = botEl.querySelector('.speak-msg-btn');
    if (!btn) return;

    const hasAudio = () => Boolean(botEl.querySelector('.bot-audio-slot audio'));
    if (hasAudio()) return;

    _isTtsReady().then(ready => {
        // Re-check: audio may have been mounted while the probe was pending.
        if (!ready || hasAudio()) return;
        btn.style.display = '';
        btn.onclick = () => _triggerManualTts(btn, botEl, text);
    });
}
|
||||||
|
|
||||||
|
// Memoized capability probe and the time (ms since epoch) it was started.
let _ttsReadyPromise = null;
let _ttsReadyTs = 0;

/**
 * Report whether text-to-speech can currently be used.
 *
 * Fetches `/api/models` and resolves to true when the `tts` capability has
 * either a `current_provider` or a `suggested_provider`. The promise is
 * reused for 30 seconds so rendering many bubbles does not issue one
 * request per bubble. Never rejects: any failure resolves to false.
 *
 * @returns {Promise<boolean>}
 */
function _isTtsReady() {
    const cacheIsFresh = _ttsReadyPromise && Date.now() - _ttsReadyTs < 30000;
    if (!cacheIsFresh) {
        _ttsReadyTs = Date.now();
        const hasProvider = (payload) => {
            const ttsCap = payload && payload.capabilities && payload.capabilities.tts;
            return ttsCap
                ? Boolean(ttsCap.current_provider || ttsCap.suggested_provider)
                : false;
        };
        _ttsReadyPromise = fetch('/api/models')
            .then((resp) => resp.json())
            .then(hasProvider)
            .catch(() => false);
    }
    return _ttsReadyPromise;
}
|
||||||
|
|
||||||
|
/**
 * Synthesize speech for one bot message on demand and mount the player.
 *
 * POSTs the text to `/api/voice/tts`; on a `success` response carrying an
 * `audio_url`, the audio is attached to the bubble with autoplay. While the
 * request is pending the button shows a spinner and ignores further clicks
 * (`data-busy="1"`). Failures are silent; the button is always restored.
 *
 * @param {HTMLElement} btn    The speak button that was clicked.
 * @param {Element}     botEl  Root element of the bot message bubble.
 * @param {string}      text   Text to synthesize.
 */
function _triggerManualTts(btn, botEl, text) {
    const inFlight = btn.dataset.busy === '1';
    if (inFlight) return;
    btn.dataset.busy = '1';

    // Swap the icon for a spinner, remembering what to restore afterwards.
    const glyph = btn.querySelector('i');
    const savedGlyphClass = glyph ? glyph.className : '';
    if (glyph) glyph.className = 'fas fa-spinner fa-spin';

    const request = {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text, session_id: sessionId }),
    };
    const mountAudio = (result) => {
        if (result && result.status === 'success' && result.audio_url) {
            attachAudioToBotBubble(botEl, result.audio_url, { autoplay: true });
        }
    };
    const restoreButton = () => {
        btn.dataset.busy = '0';
        if (glyph) glyph.className = savedGlyphClass || 'fas fa-volume-up';
    };

    fetch('/api/voice/tts', request)
        .then((resp) => resp.json())
        .then(mountAudio)
        .catch(() => {})
        .finally(restoreButton);
}
|
||||||
|
|
||||||
function addUserMessage(content, timestamp, attachments) {
|
function addUserMessage(content, timestamp, attachments) {
|
||||||
const el = createUserMessageEl(content, timestamp, attachments);
|
const el = createUserMessageEl(content, timestamp, attachments);
|
||||||
messagesDiv.appendChild(el);
|
messagesDiv.appendChild(el);
|
||||||
@@ -3842,14 +4026,39 @@ function renderCapabilityBody(def, cap, body) {
|
|||||||
|
|
||||||
body.innerHTML = providerHtml + modelHtml + dimHtml + footer;
|
body.innerHTML = providerHtml + modelHtml + dimHtml + footer;
|
||||||
|
|
||||||
// The body subtree is detached from `document` at this moment (the parent
|
// TTS: mount reply-mode above provider; defer off-mode toggle to the end.
|
||||||
// wrap is not yet appended), so we must scope lookups to `body` rather
|
if (def.id === 'tts') {
|
||||||
// than calling document.getElementById, which would return null and crash
|
renderVoiceReplyMode(body, cap.reply_mode || 'off', { skipVisibilityToggle: true });
|
||||||
// initDropdown's internal querySelector.
|
// Voice-timbre picker depends on provider+model; rebuilt by callbacks.
|
||||||
|
const modelWrap = body.querySelector(`#cap-${def.id}-model-wrap`);
|
||||||
|
if (modelWrap) {
|
||||||
|
const voiceWrap = document.createElement('div');
|
||||||
|
voiceWrap.id = `cap-${def.id}-voice-wrap`;
|
||||||
|
voiceWrap.innerHTML = `
|
||||||
|
<label class="block text-sm font-medium text-slate-600 dark:text-slate-400 mb-1.5">${t('models_voice')}</label>
|
||||||
|
<div id="cap-${def.id}-voice" class="cfg-dropdown" tabindex="0">
|
||||||
|
<div class="cfg-dropdown-selected">
|
||||||
|
<span class="cfg-dropdown-text">--</span>
|
||||||
|
<i class="fas fa-chevron-down cfg-dropdown-arrow"></i>
|
||||||
|
</div>
|
||||||
|
<div class="cfg-dropdown-menu"></div>
|
||||||
|
</div>
|
||||||
|
<div id="cap-${def.id}-voice-custom-wrap" class="hidden mt-2">
|
||||||
|
<input id="cap-${def.id}-voice-custom" type="text"
|
||||||
|
class="w-full px-3 py-2 text-sm rounded-md border border-slate-200 dark:border-slate-700
|
||||||
|
bg-white dark:bg-slate-800 text-slate-700 dark:text-slate-200
|
||||||
|
placeholder:text-slate-400 dark:placeholder:text-slate-500
|
||||||
|
focus:outline-none focus:ring-2 focus:ring-primary-500"
|
||||||
|
placeholder="voice id" />
|
||||||
|
</div>
|
||||||
|
`;
|
||||||
|
modelWrap.parentNode.insertBefore(voiceWrap, modelWrap.nextSibling);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// `body` is still detached from `document`; scope lookups locally.
|
||||||
const provDd = body.querySelector(`#cap-${def.id}-provider`);
|
const provDd = body.querySelector(`#cap-${def.id}-provider`);
|
||||||
// initDropdown's option shape is {value, label}; we strip our private
|
// Strip private fields before handing to the generic initDropdown helper.
|
||||||
// _configured/_tracked fields before handing it over so the helper stays
|
|
||||||
// generic, then re-attach status decorations afterwards.
|
|
||||||
const ddOpts = providerOpts.map(o => ({ value: o.value, label: o.label }));
|
const ddOpts = providerOpts.map(o => ({ value: o.value, label: o.label }));
|
||||||
|
|
||||||
let pendingProvider = null;
|
let pendingProvider = null;
|
||||||
@@ -3860,15 +4069,9 @@ function renderCapabilityBody(def, cap, body) {
|
|||||||
pendingCapabilitySelection = null;
|
pendingCapabilitySelection = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// For auto-capable capabilities, an "auto" strategy means the user has
|
// Auto strategy => leave empty sentinel selected. `suggested_provider`
|
||||||
// not pinned a vendor; we honor that by selecting the empty-string
|
// is a UI-only preselect (not persisted until the user clicks Save).
|
||||||
// sentinel rather than the resolved fallback provider name.
|
// No current + no suggestion => leave unselected with a placeholder.
|
||||||
// `suggested_provider` is a UI-only preselect (used by embedding & ASR)
|
|
||||||
// when the user has not pinned a vendor yet — purely cosmetic, not
|
|
||||||
// persisted until the user clicks Save.
|
|
||||||
// For "pick or empty" capabilities (no current, no suggestion), we leave
|
|
||||||
// the dropdown unselected and show a muted placeholder so the user is
|
|
||||||
// nudged to pick explicitly.
|
|
||||||
const noSelectionAndNoHint = !cap.current_provider && !cap.suggested_provider;
|
const noSelectionAndNoHint = !cap.current_provider && !cap.suggested_provider;
|
||||||
const initialProviderValue = pendingProvider
|
const initialProviderValue = pendingProvider
|
||||||
? pendingProvider
|
? pendingProvider
|
||||||
@@ -3889,20 +4092,82 @@ function renderCapabilityBody(def, cap, body) {
|
|||||||
|
|
||||||
if (def.needsModel) {
|
if (def.needsModel) {
|
||||||
rebuildCapabilityModelDropdown(def, initialProviderValue, cap.current_model || '', body);
|
rebuildCapabilityModelDropdown(def, initialProviderValue, cap.current_model || '', body);
|
||||||
// Hide the model picker entirely while the capability is in `auto`
|
// Hide model picker in auto mode — fallback hint below covers it.
|
||||||
// mode — there is nothing useful to pin, and the fallback hint
|
|
||||||
// below explains what'll actually run.
|
|
||||||
setCapabilityModelPickerVisible(def, initialProviderValue !== '' || !capabilitySupportsAuto(def.id), body);
|
setCapabilityModelPickerVisible(def, initialProviderValue !== '' || !capabilitySupportsAuto(def.id), body);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (def.id === 'tts') {
|
||||||
|
rebuildCapabilityVoiceDropdown(
|
||||||
|
initialProviderValue,
|
||||||
|
cap.current_voice || '',
|
||||||
|
body,
|
||||||
|
cap.current_model || ''
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
// Inject auto/router-pending hint banners before the action footer.
|
// Inject auto/router-pending hint banners before the action footer.
|
||||||
renderCapabilityHints(def, cap, body, initialProviderValue);
|
renderCapabilityHints(def, cap, body, initialProviderValue);
|
||||||
|
|
||||||
|
if (def.id === 'tts') {
|
||||||
|
_setTtsConfigVisible(body, (cap.reply_mode || 'off') !== 'off');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Toggle visibility of the model picker. Used both at first render and
|
/**
 * Render the TTS reply-policy dropdown (off / voice_if_voice / always) as
 * the first child of `host`.
 *
 * Selecting a mode immediately shows or hides the rest of the TTS card and
 * persists the choice through `POST /api/models` with action
 * `set_voice_reply_mode`. A successful save drops the cached TTS readiness
 * probe so the next bubble re-checks. Persistence errors are silent.
 *
 * @param {Element} host         Container the dropdown is prepended to.
 * @param {string}  currentMode  Current mode; unknown values fall back to 'off'.
 * @param {{skipVisibilityToggle?: boolean}} [options]  Set
 *        `skipVisibilityToggle` to leave the initial show/hide to the caller.
 */
function renderVoiceReplyMode(host, currentMode, options) {
    const settings = options || {};
    const MODES = ['off', 'voice_if_voice', 'always'];
    const LABEL_KEYS = {
        off: 'voice_reply_off',
        voice_if_voice: 'voice_reply_if_voice',
        always: 'voice_reply_always',
    };
    const choices = MODES.map((mode) => ({ value: mode, label: t(LABEL_KEYS[mode]) }));

    const container = document.createElement('div');
    container.id = 'voice-reply-mode-wrap';
    container.innerHTML = `
        <label class="block text-sm font-medium text-slate-600 dark:text-slate-400 mb-1.5">${t('voice_reply_mode_label')}</label>
        <div id="voice-reply-mode-dd" class="cfg-dropdown" tabindex="0">
            <div class="cfg-dropdown-selected">
                <span class="cfg-dropdown-text">--</span>
                <i class="fas fa-chevron-down cfg-dropdown-arrow"></i>
            </div>
            <div class="cfg-dropdown-menu"></div>
        </div>
    `;
    host.prepend(container);

    const dropdown = container.querySelector('#voice-reply-mode-dd');
    const startMode = MODES.includes(currentMode) ? currentMode : 'off';
    if (!settings.skipVisibilityToggle) {
        _setTtsConfigVisible(host, startMode !== 'off');
    }

    // Fire-and-forget save; only a confirmed success invalidates the probe.
    const persistMode = (mode) => {
        fetch('/api/models', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ action: 'set_voice_reply_mode', mode }),
        })
            .then((resp) => resp.json())
            .then((result) => {
                if (result && result.status === 'success') {
                    _ttsReadyPromise = null; // force re-probe on next bubble
                }
            })
            .catch(() => {});
    };

    initDropdown(dropdown, choices, startMode, (mode) => {
        if (!MODES.includes(mode)) return;
        _setTtsConfigVisible(host, mode !== 'off');
        persistMode(mode);
    });
}
|
||||||
|
|
||||||
|
/**
 * Show or hide every direct child of the TTS card except the reply-mode
 * dropdown wrapper (`#voice-reply-mode-wrap`), which always stays visible.
 *
 * @param {Element} host     The TTS card body; a falsy value is a no-op.
 * @param {boolean} visible  True to show the children, false to hide them.
 */
function _setTtsConfigVisible(host, visible) {
    if (!host) return;
    for (const section of Array.from(host.children)) {
        if (section.id !== 'voice-reply-mode-wrap') {
            section.classList.toggle('hidden', !visible);
        }
    }
}
|
||||||
|
|
||||||
|
// Toggle wrapper visibility instead of re-rendering so dropdown state survives.
|
||||||
function setCapabilityModelPickerVisible(def, visible, scope) {
|
function setCapabilityModelPickerVisible(def, visible, scope) {
|
||||||
const root = scope || document;
|
const root = scope || document;
|
||||||
const wrap = root.querySelector(`#cap-${def.id}-model-wrap`);
|
const wrap = root.querySelector(`#cap-${def.id}-model-wrap`);
|
||||||
@@ -4135,7 +4400,7 @@ function rebuildCapabilityModelDropdown(def, providerId, selectedModel, scope) {
|
|||||||
|
|
||||||
initDropdown(el, opts, initialValue, (value) => {
|
initDropdown(el, opts, initialValue, (value) => {
|
||||||
const customWrap = document.getElementById(`cap-${def.id}-model-custom-wrap`);
|
const customWrap = document.getElementById(`cap-${def.id}-model-custom-wrap`);
|
||||||
if (!customWrap) return;
|
if (customWrap) {
|
||||||
if (value === '__custom__') {
|
if (value === '__custom__') {
|
||||||
customWrap.classList.remove('hidden');
|
customWrap.classList.remove('hidden');
|
||||||
const input = document.getElementById(`cap-${def.id}-model-custom`);
|
const input = document.getElementById(`cap-${def.id}-model-custom`);
|
||||||
@@ -4143,6 +4408,14 @@ function rebuildCapabilityModelDropdown(def, providerId, selectedModel, scope) {
|
|||||||
} else {
|
} else {
|
||||||
customWrap.classList.add('hidden');
|
customWrap.classList.add('hidden');
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
// TTS voice catalog may be scoped per engine model (aggregating
|
||||||
|
// gateways). Rebuild the voice picker whenever the model changes.
|
||||||
|
if (def.id === 'tts') {
|
||||||
|
const provDd = document.getElementById('cap-tts-provider');
|
||||||
|
const provId = provDd ? getDropdownValue(provDd) : '';
|
||||||
|
rebuildCapabilityVoiceDropdown(provId, '', null, value);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
const customWrap = root.querySelector(`#cap-${def.id}-model-custom-wrap`);
|
const customWrap = root.querySelector(`#cap-${def.id}-model-custom-wrap`);
|
||||||
@@ -4157,22 +4430,93 @@ function rebuildCapabilityModelDropdown(def, providerId, selectedModel, scope) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * TTS-only: rebuild the voice picker from the voice catalog published in
 * `modelsState.capabilities.tts.provider_voices`.
 *
 * The catalog entry for a provider is either a list of voices or an object
 * keyed by engine model id (each value being such a list). A voice is
 * either a bare string (code doubles as label) or `{value, label, hint?}`;
 * the dropdown value is always the raw code. A trailing "Custom..." option
 * reveals a free-text input, and a `selectedVoice` that is not in the
 * catalog is routed through that custom branch.
 *
 * When no voices are available the whole picker wrapper is hidden.
 * NOTE(review): in that case the dropdown's previous value is left as-is,
 * while saveCapability reads `#cap-tts-voice` regardless of visibility —
 * confirm a stale voice cannot be persisted after switching provider.
 *
 * @param {string}  providerId     Selected TTS provider ('' for none).
 * @param {string}  selectedVoice  Voice code to preselect ('' for first).
 * @param {Element} [scope]        Lookup root; defaults to `document`.
 * @param {string}  [modelId]      Engine model id for model-scoped catalogs;
 *                                 defaults to the current `#cap-tts-model` value.
 */
function rebuildCapabilityVoiceDropdown(providerId, selectedVoice, scope, modelId) {
    const root = scope || document;
    const pickerWrap = root.querySelector('#cap-tts-voice-wrap');
    const pickerEl = root.querySelector('#cap-tts-voice');
    if (!pickerWrap || !pickerEl) return;

    const ttsCap = modelsState.capabilities.tts || {};
    const catalog = ttsCap.provider_voices || {};
    let voices = (providerId && catalog[providerId]) || [];

    // Some providers (gateways) scope voices by engine model id.
    if (voices && !Array.isArray(voices) && typeof voices === 'object') {
        let activeModel = modelId;
        if (!activeModel) {
            const modelEl = root.querySelector('#cap-tts-model');
            activeModel = modelEl ? getDropdownValue(modelEl) : '';
        }
        voices = (activeModel && voices[activeModel]) || [];
    }

    if (!voices || voices.length === 0) {
        pickerWrap.classList.add('hidden');
        return;
    }
    pickerWrap.classList.remove('hidden');

    // Raw API codes, in catalog order, used for membership / default pick.
    const codes = voices.map((entry) => (typeof entry === 'string' ? entry : entry.value));

    // Friendly text on the left, raw code as the right-hand hint.
    const toOption = (entry) => {
        if (typeof entry === 'string') {
            return { value: entry, label: entry };
        }
        const code = entry.value;
        const friendly = entry.hint || entry.label || code;
        return { value: code, label: friendly, hint: friendly === code ? '' : code };
    };
    const choices = voices.map(toOption);
    choices.push({ value: '__custom__', label: currentLang === 'zh' ? '自定义...' : 'Custom...' });

    // Off-catalog values route through the custom branch.
    const offCatalog = Boolean(selectedVoice) && !codes.includes(selectedVoice);
    let startValue;
    if (offCatalog) {
        startValue = '__custom__';
    } else {
        startValue = selectedVoice || codes[0];
    }

    // Show/hide the free-text input. On the initial render the input is
    // always (re)seeded; on later user changes it is only seeded when empty
    // so text the user already typed is kept.
    const syncCustomInput = (value, overwrite) => {
        const customBox = root.querySelector('#cap-tts-voice-custom-wrap');
        if (!customBox) return;
        if (value !== '__custom__') {
            customBox.classList.add('hidden');
            return;
        }
        customBox.classList.remove('hidden');
        const field = root.querySelector('#cap-tts-voice-custom');
        if (field && (overwrite || !field.value)) {
            field.value = offCatalog ? selectedVoice : '';
        }
    };

    initDropdown(pickerEl, choices, startValue, (value) => {
        syncCustomInput(value, false);
    });
    syncCustomInput(startValue, true);
}
|
||||||
|
|
||||||
function onCapabilityProviderChange(def, providerId, scope) {
|
function onCapabilityProviderChange(def, providerId, scope) {
|
||||||
if (def.needsModel) {
|
if (def.needsModel) {
|
||||||
// For capabilities that support `auto`, switching to the empty
|
// Empty sentinel hides the model picker (capability is in auto mode).
|
||||||
// sentinel hides the model picker entirely so the card reads as
|
|
||||||
// "we'll figure it out"; switching back to a real vendor re-runs
|
|
||||||
// the rebuild against the capability-scoped model list.
|
|
||||||
const isAuto = providerId === '' && capabilitySupportsAuto(def.id);
|
const isAuto = providerId === '' && capabilitySupportsAuto(def.id);
|
||||||
if (!isAuto) {
|
if (!isAuto) {
|
||||||
rebuildCapabilityModelDropdown(def, providerId, '', scope);
|
rebuildCapabilityModelDropdown(def, providerId, '', scope);
|
||||||
}
|
}
|
||||||
setCapabilityModelPickerVisible(def, !isAuto, scope);
|
setCapabilityModelPickerVisible(def, !isAuto, scope);
|
||||||
}
|
}
|
||||||
// Refresh the auto-hint so it disappears once the user pins a vendor
|
if (def.id === 'tts') {
|
||||||
// and reappears when they swing back to "auto". renderCapabilityHints
|
rebuildCapabilityVoiceDropdown(providerId, '', scope);
|
||||||
// now writes directly into the footer's hint slot, so we just call it
|
}
|
||||||
// again — no need to clean up stale DOM nodes.
|
|
||||||
const body = scope || document.querySelector(`[data-cap-body="${def.id}"]`);
|
const body = scope || document.querySelector(`[data-cap-body="${def.id}"]`);
|
||||||
if (body) {
|
if (body) {
|
||||||
const cap = modelsState.capabilities[def.id] || {};
|
const cap = modelsState.capabilities[def.id] || {};
|
||||||
@@ -4202,6 +4546,16 @@ function saveCapability(capId) {
|
|||||||
// the backend treats this as "fall back to the runtime chain".
|
// the backend treats this as "fall back to the runtime chain".
|
||||||
const isAuto = provider === '' && capabilitySupportsAuto(capId);
|
const isAuto = provider === '' && capabilitySupportsAuto(capId);
|
||||||
const model = isAuto ? '' : getCapabilityModelValue(def);
|
const model = isAuto ? '' : getCapabilityModelValue(def);
|
||||||
|
// TTS carries an extra voice timbre (supports free-text custom ids).
|
||||||
|
let voice = '';
|
||||||
|
if (capId === 'tts' && !isAuto) {
|
||||||
|
const voiceDd = document.getElementById(`cap-${capId}-voice`);
|
||||||
|
voice = voiceDd ? getDropdownValue(voiceDd) : '';
|
||||||
|
if (voice === '__custom__') {
|
||||||
|
const input = document.getElementById(`cap-${capId}-voice-custom`);
|
||||||
|
voice = input ? input.value.trim() : '';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Embedding changes invalidate any pre-existing vector index because
|
// Embedding changes invalidate any pre-existing vector index because
|
||||||
// dimensions / vendor differ. Gate the save behind a confirm, and on
|
// dimensions / vendor differ. Gate the save behind a confirm, and on
|
||||||
@@ -4243,19 +4597,19 @@ function saveCapability(capId) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_persistCapability(capId, provider, model);
|
_persistCapability(capId, provider, model, undefined, { voice });
|
||||||
}
|
}
|
||||||
|
|
||||||
function _persistCapability(capId, provider, model, onAfterSuccess) {
|
function _persistCapability(capId, provider, model, onAfterSuccess, extras) {
|
||||||
|
const payload = { action: 'set_capability', capability: capId, provider_id: provider, model: model };
|
||||||
|
if (extras && extras.voice !== undefined) payload.voice = extras.voice;
|
||||||
fetch('/api/models', {
|
fetch('/api/models', {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: { 'Content-Type': 'application/json' },
|
headers: { 'Content-Type': 'application/json' },
|
||||||
body: JSON.stringify({ action: 'set_capability', capability: capId, provider_id: provider, model: model }),
|
body: JSON.stringify(payload),
|
||||||
}).then(r => r.json()).then(data => {
|
}).then(r => r.json()).then(data => {
|
||||||
if (data.status === 'success') {
|
if (data.status === 'success') {
|
||||||
// Show "Saved" first, then refresh — loadModelsView would
|
// Flash "Saved" before reload so the status survives the rebuild.
|
||||||
// otherwise rebuild the card and wipe the status span before
|
|
||||||
// the user can register the confirmation.
|
|
||||||
showStatus(`cap-${capId}-status`, 'models_save_success', false);
|
showStatus(`cap-${capId}-status`, 'models_save_success', false);
|
||||||
setTimeout(() => {
|
setTimeout(() => {
|
||||||
loadModelsView({ preserveScroll: true });
|
loadModelsView({ preserveScroll: true });
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import logging
|
|||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
import shutil
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
@@ -295,6 +296,12 @@ class WebChannel(ChatChannel):
|
|||||||
"timestamp": time.time()
|
"timestamp": time.time()
|
||||||
})
|
})
|
||||||
logger.debug(f"SSE done sent for request {request_id}")
|
logger.debug(f"SSE done sent for request {request_id}")
|
||||||
|
# Auto-trigger TTS once the bot finishes its text reply. The
|
||||||
|
# synthesis runs in the background so the chat stream is never
|
||||||
|
# blocked; the resulting audio URL is pushed via a follow-up
|
||||||
|
# `voice_attach` SSE event and persisted to messages.extras.
|
||||||
|
if reply.type == ReplyType.TEXT and content.strip():
|
||||||
|
self._maybe_dispatch_auto_tts(request_id, session_id, content, context)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Fallback: polling mode
|
# Fallback: polling mode
|
||||||
@@ -461,16 +468,133 @@ class WebChannel(ChatChannel):
|
|||||||
|
|
||||||
return on_event
|
return on_event
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# TTS auto-dispatch
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
@staticmethod
def _resolve_voice_reply_mode() -> str:
    """Return the TTS auto-reply policy: "always", "voice_if_voice" or "off".

    The policy is derived from two boolean config flags, checked in
    priority order: ``always_reply_voice`` wins over ``voice_reply_voice``;
    when neither is set the policy is "off". The web UI exposes the pair as
    a single three-state picker.
    """
    flag_by_mode = (
        ("always", "always_reply_voice"),
        ("voice_if_voice", "voice_reply_voice"),
    )
    for mode, flag_name in flag_by_mode:
        if conf().get(flag_name, False):
            return mode
    return "off"
|
||||||
|
|
||||||
|
# Mirror of ModelsHandler._TTS_PROVIDERS. zhipu is intentionally omitted
|
||||||
|
# from the UI (GLM-TTS prelude beep); pinning it in config.json still works.
|
||||||
|
_TTS_PROVIDERS_SUGGEST_ORDER = ["openai", "minimax", "dashscope", "linkai"]
|
||||||
|
|
||||||
|
@classmethod
def _tts_provider_ready(cls) -> bool:
    """Return True when some TTS provider can be used.

    That is the case when ``text_to_voice`` is set to a non-blank value, or
    when any provider in ``_TTS_PROVIDERS_SUGGEST_ORDER`` has a non-blank
    API key configured that is not one of the template placeholders.
    """
    pinned_provider = (conf().get("text_to_voice") or "").strip()
    if pinned_provider:
        return True

    placeholders = ("YOUR API KEY", "YOUR_API_KEY")
    for provider_id in cls._TTS_PROVIDERS_SUGGEST_ORDER:
        provider_meta = ConfigHandler.PROVIDER_MODELS.get(provider_id) or {}
        key_field = provider_meta.get("api_key_field")
        if key_field:
            api_key = (conf().get(key_field) or "").strip()
            if api_key and api_key not in placeholders:
                return True
    return False
|
||||||
|
|
||||||
|
def _maybe_dispatch_auto_tts(
    self,
    request_id: str,
    session_id: str,
    text: str,
    context: dict,
) -> None:
    """Start background TTS for a finished text reply when policy allows.

    Synthesis is skipped when the reply policy is "off", when the policy is
    "voice_if_voice" and ``context`` has no truthy ``is_voice_input``, or
    when no TTS provider is ready. Otherwise a daemon thread runs
    ``_synthesize_tts_async`` so the caller is never blocked. Any error
    while deciding or starting the thread is logged at debug level and
    otherwise ignored.
    """
    try:
        policy = self._resolve_voice_reply_mode()
        # "voice_if_voice" additionally requires the request to be voice input.
        policy_allows = policy != "off" and (
            policy != "voice_if_voice" or context.get("is_voice_input")
        )
        if policy_allows and self._tts_provider_ready():
            worker = threading.Thread(
                target=self._synthesize_tts_async,
                args=(request_id, session_id, text),
                daemon=True,
            )
            worker.start()
    except Exception as exc:
        logger.debug(f"[WebChannel] auto-tts dispatch skipped: {exc}")
|
||||||
|
|
||||||
|
def _synthesize_tts_async(
    self,
    request_id: str,
    session_id: str,
    text: str,
) -> None:
    """Synthesize ``text`` to audio and hand the result to the client.

    Steps, in order: ask the bridge for a voice reply, publish the audio
    file to get a URL, best-effort persist that URL onto the session's last
    assistant message, then push a ``voice_attach`` event onto the SSE queue
    registered for ``request_id``.

    Every failure is logged and swallowed; nothing is raised and no
    user-facing error is produced.
    """
    try:
        from bridge.bridge import Bridge

        reply = Bridge().fetch_text_to_voice(text)
        produced_audio = (
            reply is not None
            and reply.type == ReplyType.VOICE
            and reply.content
        )
        if not produced_audio:
            logger.warning(
                f"[WebChannel] TTS produced no audio for request {request_id}: "
                f"reply={reply}"
            )
            return

        url = self._publish_tts_audio(reply.content)
        if not url:
            logger.warning(f"[WebChannel] TTS publish failed for request {request_id}")
            return

        # Persisting is optional: a store failure must not block delivery.
        extras = {"audio": {"url": url, "kind": "tts"}}
        try:
            from agent.memory import get_conversation_store

            store = get_conversation_store()
            store.attach_extras_to_last_assistant(session_id, extras)
        except Exception as e:
            logger.debug(f"[WebChannel] tts persist skipped: {e}")

        queue = self.sse_queues.get(request_id)
        if queue is None:
            logger.warning(
                f"[WebChannel] TTS ready but SSE queue already closed "
                f"for request {request_id} (url={url})"
            )
            return

        event = {
            "type": "voice_attach",
            "url": url,
            "request_id": request_id,
            "timestamp": time.time(),
        }
        queue.put(event)
        logger.info(f"[WebChannel] TTS voice_attach pushed for request {request_id}: {url}")
    except Exception as e:
        # TTS failures are intentionally silent (no user-facing error).
        logger.warning(f"[WebChannel] TTS synthesis failed: {e}")
|
||||||
|
|
||||||
|
@staticmethod
def _publish_tts_audio(src_path: str) -> str:
    """Move a TTS audio file into the upload directory and return its URL.

    Args:
        src_path: Filesystem path of the synthesized audio file.

    Returns:
        ``/uploads/<file name>`` for the moved file, or ``""`` when the
        source is missing or the move fails (the failure is logged, never
        raised).
    """
    try:
        if not src_path or not os.path.isfile(src_path):
            logger.warning(f"[WebChannel] publish_tts_audio missing source: {src_path!r}")
            return ""

        import uuid

        # Fall back to .mp3 when the source path carries no extension.
        ext = os.path.splitext(src_path)[1].lower() or ".mp3"
        upload_dir = _get_upload_dir()
        os.makedirs(upload_dir, exist_ok=True)
        ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        # A uuid suffix keeps names unique: shutil.move may silently overwrite
        # an existing destination, so two replies published within the same
        # second must never be able to land on the same file name.
        dst_name = f"voice_reply_{ts}_{uuid.uuid4().hex[:12]}{ext}"
        dst_path = os.path.join(upload_dir, dst_name)
        shutil.move(src_path, dst_path)
        logger.debug(f"[WebChannel] publish_tts_audio moved {src_path} -> {dst_path}")
        return f"/uploads/{dst_name}"
    except Exception as e:
        logger.warning(f"[WebChannel] publish_tts_audio failed: {e}")
        return ""
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _cleanup_stale_voice_recordings(max_age_seconds: int = 3600) -> None:
|
def _cleanup_stale_voice_recordings(max_age_seconds: int = 3600) -> None:
|
||||||
"""Delete voice-input audio files older than `max_age_seconds`.
|
"""Drop voice_input_* uploads older than max_age_seconds (run at startup)."""
|
||||||
|
|
||||||
Called once at startup. Web mic recordings live in the upload
|
|
||||||
directory so the browser can replay them inside the conversation
|
|
||||||
bubble. We don't persist them to history, so once a process
|
|
||||||
restarts they're useless — but they're never auto-cleaned
|
|
||||||
anywhere else, so without this they accumulate over time.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
upload_dir = _get_upload_dir()
|
upload_dir = _get_upload_dir()
|
||||||
if not os.path.isdir(upload_dir):
|
if not os.path.isdir(upload_dir):
|
||||||
@@ -619,6 +743,10 @@ class WebChannel(ChatChannel):
|
|||||||
prompt = json_data.get('message', '')
|
prompt = json_data.get('message', '')
|
||||||
use_sse = json_data.get('stream', True)
|
use_sse = json_data.get('stream', True)
|
||||||
attachments = json_data.get('attachments', [])
|
attachments = json_data.get('attachments', [])
|
||||||
|
# Tag the message as originating from voice input so the post-reply
|
||||||
|
# TTS hook can honour the `voice_if_voice` policy (mirrors the
|
||||||
|
# desire_rtype concept used by other channels).
|
||||||
|
is_voice_input = bool(json_data.get('is_voice', False))
|
||||||
|
|
||||||
# Append file references to the prompt (same format as QQ channel)
|
# Append file references to the prompt (same format as QQ channel)
|
||||||
if attachments:
|
if attachments:
|
||||||
@@ -669,6 +797,11 @@ class WebChannel(ChatChannel):
|
|||||||
context["session_id"] = session_id
|
context["session_id"] = session_id
|
||||||
context["receiver"] = session_id
|
context["receiver"] = session_id
|
||||||
context["request_id"] = request_id
|
context["request_id"] = request_id
|
||||||
|
if is_voice_input:
|
||||||
|
# Web channel runs its own TTS post-pipeline via
|
||||||
|
# _maybe_dispatch_auto_tts; don't set desire_rtype here or
|
||||||
|
# chat_channel would synthesize a duplicate VOICE reply.
|
||||||
|
context["is_voice_input"] = True
|
||||||
|
|
||||||
if use_sse:
|
if use_sse:
|
||||||
context["on_event"] = self._make_sse_callback(request_id)
|
context["on_event"] = self._make_sse_callback(request_id)
|
||||||
@@ -696,27 +829,39 @@ class WebChannel(ChatChannel):
|
|||||||
q = self.sse_queues[request_id]
|
q = self.sse_queues[request_id]
|
||||||
idle_timeout = 600 # 10 minutes without any real event
|
idle_timeout = 600 # 10 minutes without any real event
|
||||||
deadline = time.time() + idle_timeout
|
deadline = time.time() + idle_timeout
|
||||||
done = False
|
# After the main reply is done we keep the stream open for a short
|
||||||
|
# tail so async post-processing (TTS auto-synthesis) can deliver a
|
||||||
|
# `voice_attach` event before the client disconnects.
|
||||||
|
POST_DONE_TAIL_SECONDS = 60
|
||||||
|
post_done = False
|
||||||
|
post_deadline = 0.0
|
||||||
|
|
||||||
try:
|
try:
|
||||||
while time.time() < deadline:
|
while time.time() < deadline:
|
||||||
try:
|
try:
|
||||||
item = q.get(timeout=1)
|
item = q.get(timeout=1)
|
||||||
except Empty:
|
except Empty:
|
||||||
|
if post_done and time.time() >= post_deadline:
|
||||||
|
break
|
||||||
yield b": keepalive\n\n"
|
yield b": keepalive\n\n"
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Real event received, reset idle deadline
|
|
||||||
deadline = time.time() + idle_timeout
|
deadline = time.time() + idle_timeout
|
||||||
|
|
||||||
payload = json.dumps(item, ensure_ascii=False)
|
payload = json.dumps(item, ensure_ascii=False)
|
||||||
yield f"data: {payload}\n\n".encode("utf-8")
|
yield f"data: {payload}\n\n".encode("utf-8")
|
||||||
|
|
||||||
if item.get("type") == "done":
|
itype = item.get("type")
|
||||||
done = True
|
if itype == "done":
|
||||||
break
|
post_done = True
|
||||||
|
post_deadline = time.time() + POST_DONE_TAIL_SECONDS
|
||||||
|
elif itype == "voice_attach":
|
||||||
|
# WSGI buffers the previous chunk until the next yield;
|
||||||
|
# shrink the tail so the generator wakes up quickly to
|
||||||
|
# emit a couple of keepalive comments that push the
|
||||||
|
# voice_attach payload through to the browser.
|
||||||
|
post_done = True
|
||||||
|
post_deadline = time.time() + 2 # 2s post-attach tail
|
||||||
finally:
|
finally:
|
||||||
if done:
|
|
||||||
self.sse_queues.pop(request_id, None)
|
self.sse_queues.pop(request_id, None)
|
||||||
|
|
||||||
def poll_response(self):
|
def poll_response(self):
|
||||||
@@ -811,6 +956,7 @@ class WebChannel(ChatChannel):
|
|||||||
'/uploads/(.*)', 'UploadsHandler',
|
'/uploads/(.*)', 'UploadsHandler',
|
||||||
'/api/file', 'FileServeHandler',
|
'/api/file', 'FileServeHandler',
|
||||||
'/api/voice/asr', 'VoiceAsrHandler',
|
'/api/voice/asr', 'VoiceAsrHandler',
|
||||||
|
'/api/voice/tts', 'VoiceTtsHandler',
|
||||||
'/poll', 'PollHandler',
|
'/poll', 'PollHandler',
|
||||||
'/stream', 'StreamHandler',
|
'/stream', 'StreamHandler',
|
||||||
'/chat', 'ChatHandler',
|
'/chat', 'ChatHandler',
|
||||||
@@ -936,15 +1082,8 @@ class UploadHandler:
|
|||||||
|
|
||||||
|
|
||||||
class VoiceAsrHandler:
|
class VoiceAsrHandler:
|
||||||
"""
|
"""Receive a mic recording, persist it under uploads/ and run ASR.
|
||||||
Accept a short audio recording from the web console mic button,
|
Returns {status, text, audio_url} so the UI can render a playback bubble."""
|
||||||
save it under uploads/ so the browser can replay it, then run it
|
|
||||||
through the currently configured ASR provider.
|
|
||||||
|
|
||||||
Returns {status, text, audio_url} on success — the frontend renders
|
|
||||||
a voice-message bubble with the playable audio and the transcribed
|
|
||||||
caption.
|
|
||||||
"""
|
|
||||||
def POST(self):
|
def POST(self):
|
||||||
_require_auth()
|
_require_auth()
|
||||||
web.header('Content-Type', 'application/json; charset=utf-8')
|
web.header('Content-Type', 'application/json; charset=utf-8')
|
||||||
@@ -997,6 +1136,48 @@ class VoiceAsrHandler:
|
|||||||
return json.dumps({"status": "error", "message": str(e)})
|
return json.dumps({"status": "error", "message": str(e)})
|
||||||
|
|
||||||
|
|
||||||
|
class VoiceTtsHandler:
    """HTTP handler behind the in-chat "read aloud" button.

    Synthesizes the posted text on demand and replies with JSON: either
    ``{"status": "success", "audio_url": ...}`` or
    ``{"status": "error", "message": ...}``. When a ``session_id`` is
    supplied, the audio URL is also attached (best effort) to that session's
    last assistant message.
    """

    @staticmethod
    def _error(message) -> str:
        """Serialize an error response carrying ``message``."""
        return json.dumps({"status": "error", "message": message})

    def POST(self):
        _require_auth()
        web.header('Content-Type', 'application/json; charset=utf-8')
        try:
            body = json.loads(web.data() or b"{}")
            text = (body.get("text") or "").strip()
            session_id = (body.get("session_id") or "").strip()
            if not text:
                return self._error("empty text")

            # `@singleton` makes WebChannel a factory function — go via instance.
            channel = WebChannel()
            if not channel._tts_provider_ready():
                return self._error("tts not configured")

            from bridge.bridge import Bridge

            reply = Bridge().fetch_text_to_voice(text)
            synthesized = (
                reply is not None
                and reply.type == ReplyType.VOICE
                and reply.content
            )
            if not synthesized:
                # Surface the reply's own content as the message when it has one.
                msg = getattr(reply, "content", "") or "tts failed"
                return self._error(str(msg))

            url = channel._publish_tts_audio(reply.content)
            if not url:
                return self._error("publish failed")

            if session_id:
                # Persistence is optional; a store failure still returns the URL.
                try:
                    from agent.memory import get_conversation_store

                    store = get_conversation_store()
                    store.attach_extras_to_last_assistant(
                        session_id, {"audio": {"url": url, "kind": "tts"}},
                    )
                except Exception as e:
                    logger.debug(f"[VoiceTtsHandler] persist skipped: {e}")

            return json.dumps({"status": "success", "audio_url": url})
        except Exception as e:
            logger.exception(f"[VoiceTtsHandler] failed: {e}")
            return self._error(str(e))
|
||||||
|
|
||||||
|
|
||||||
class UploadsHandler:
|
class UploadsHandler:
|
||||||
def GET(self, file_name):
|
def GET(self, file_name):
|
||||||
_require_auth()
|
_require_auth()
|
||||||
@@ -1357,10 +1538,243 @@ class ModelsHandler:
|
|||||||
POST /api/models/capability -> set provider/model for a capability
|
POST /api/models/capability -> set provider/model for a capability
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Capability -> editable flag, current-value resolver, and supported provider
|
# Capability -> provider ids drawn from ConfigHandler.PROVIDER_MODELS.
|
||||||
# ids drawn from ConfigHandler.PROVIDER_MODELS where applicable.
|
|
||||||
_ASR_PROVIDERS = ["openai", "dashscope", "zhipu", "linkai"]
|
_ASR_PROVIDERS = ["openai", "dashscope", "zhipu", "linkai"]
|
||||||
_TTS_PROVIDERS = ["openai", "linkai", "minimax", "baidu", "ali", "xunfei", "azure", "google", "elevenlabs", "edge", "pytts"]
|
# Web-console white-list. Other vendors stay usable via direct config.
|
||||||
|
_TTS_PROVIDERS = ["openai", "minimax", "dashscope", "linkai"]
|
||||||
|
|
||||||
|
# TTS engine catalog (speech models, not voice timbres). Entries are
|
||||||
|
# either a bare code or {value, hint?} when a friendly label helps.
|
||||||
|
_TTS_PROVIDER_MODELS = {
|
||||||
|
"openai": ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"],
|
||||||
|
"minimax": [
|
||||||
|
{"value": "speech-2.8-hd", "hint": "情绪渲染融合语气词,自然听感"},
|
||||||
|
{"value": "speech-2.8-turbo", "hint": "极致生成速度,更自然逼真"},
|
||||||
|
{"value": "speech-2.6-hd", "hint": "超低延时,归一化升级"},
|
||||||
|
{"value": "speech-2.6-turbo", "hint": "更快更便宜,适合语音聊天/数字人"},
|
||||||
|
],
|
||||||
|
"dashscope": [
|
||||||
|
{"value": "qwen3-tts-flash", "hint": "覆盖普通话、方言与主流外语"},
|
||||||
|
],
|
||||||
|
# Aggregating gateway: a single endpoint multiplexes several
|
||||||
|
# underlying TTS engines, selected via the `model` field.
|
||||||
|
# Each engine exposes its own voice catalog (see _TTS_PROVIDER_VOICES).
|
||||||
|
"linkai": [
|
||||||
|
{"value": "tts-1", "hint": "OpenAI · 多语种通用"},
|
||||||
|
{"value": "doubao", "hint": "字节豆包 · 中文音色丰富"},
|
||||||
|
{"value": "baidu", "hint": "百度 · 中文主播音色"},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
# Per-provider voice timbres. Entries can be a bare code string
|
||||||
|
# (label = code) or {value, hint?} when a friendly secondary label
|
||||||
|
# helps recognition. We keep `value` as the raw API code so power
|
||||||
|
# users can cross-reference config.json.
|
||||||
|
_TTS_PROVIDER_VOICES = {
|
||||||
|
"openai": [
|
||||||
|
"alloy", "echo", "fable", "onyx", "nova", "shimmer",
|
||||||
|
"ash", "ballad", "coral", "sage", "verse",
|
||||||
|
],
|
||||||
|
"minimax": [
|
||||||
|
# Mandarin Chinese (full catalog)
|
||||||
|
{"value": "male-qn-qingse", "hint": "中文 · 青涩青年(男)"},
|
||||||
|
{"value": "male-qn-jingying", "hint": "中文 · 精英青年(男)"},
|
||||||
|
{"value": "male-qn-badao", "hint": "中文 · 霸道青年(男)"},
|
||||||
|
{"value": "male-qn-daxuesheng", "hint": "中文 · 青年大学生(男)"},
|
||||||
|
{"value": "female-shaonv", "hint": "中文 · 少女(女)"},
|
||||||
|
{"value": "female-yujie", "hint": "中文 · 御姐(女)"},
|
||||||
|
{"value": "female-chengshu", "hint": "中文 · 成熟女性(女)"},
|
||||||
|
{"value": "female-tianmei", "hint": "中文 · 甜美女性(女)"},
|
||||||
|
{"value": "male-qn-qingse-jingpin", "hint": "中文 · 青涩青年-beta(男)"},
|
||||||
|
{"value": "male-qn-jingying-jingpin", "hint": "中文 · 精英青年-beta(男)"},
|
||||||
|
{"value": "male-qn-badao-jingpin", "hint": "中文 · 霸道青年-beta(男)"},
|
||||||
|
{"value": "male-qn-daxuesheng-jingpin", "hint": "中文 · 青年大学生-beta(男)"},
|
||||||
|
{"value": "female-shaonv-jingpin", "hint": "中文 · 少女-beta(女)"},
|
||||||
|
{"value": "female-yujie-jingpin", "hint": "中文 · 御姐-beta(女)"},
|
||||||
|
{"value": "female-chengshu-jingpin", "hint": "中文 · 成熟女性-beta(女)"},
|
||||||
|
{"value": "female-tianmei-jingpin", "hint": "中文 · 甜美女性-beta(女)"},
|
||||||
|
{"value": "clever_boy", "hint": "中文 · 聪明男童"},
|
||||||
|
{"value": "cute_boy", "hint": "中文 · 可爱男童"},
|
||||||
|
{"value": "lovely_girl", "hint": "中文 · 萌萌女童"},
|
||||||
|
{"value": "cartoon_pig", "hint": "中文 · 卡通猪小琪"},
|
||||||
|
{"value": "bingjiao_didi", "hint": "中文 · 病娇弟弟"},
|
||||||
|
{"value": "junlang_nanyou", "hint": "中文 · 俊朗男友"},
|
||||||
|
{"value": "chunzhen_xuedi", "hint": "中文 · 纯真学弟"},
|
||||||
|
{"value": "lengdan_xiongzhang", "hint": "中文 · 冷淡学长"},
|
||||||
|
{"value": "badao_shaoye", "hint": "中文 · 霸道少爷"},
|
||||||
|
{"value": "tianxin_xiaoling", "hint": "中文 · 甜心小玲"},
|
||||||
|
{"value": "qiaopi_mengmei", "hint": "中文 · 俏皮萌妹"},
|
||||||
|
{"value": "wumei_yujie", "hint": "中文 · 妩媚御姐"},
|
||||||
|
{"value": "diadia_xuemei", "hint": "中文 · 嗲嗲学妹"},
|
||||||
|
{"value": "danya_xuejie", "hint": "中文 · 淡雅学姐"},
|
||||||
|
{"value": "Chinese (Mandarin)_Reliable_Executive", "hint": "中文 · 沉稳高管"},
|
||||||
|
{"value": "Chinese (Mandarin)_News_Anchor", "hint": "中文 · 新闻女声"},
|
||||||
|
{"value": "Chinese (Mandarin)_Mature_Woman", "hint": "中文 · 傲娇御姐"},
|
||||||
|
{"value": "Chinese (Mandarin)_Unrestrained_Young_Man","hint": "中文 · 不羁青年"},
|
||||||
|
{"value": "Arrogant_Miss", "hint": "中文 · 嚣张小姐"},
|
||||||
|
{"value": "Robot_Armor", "hint": "中文 · 机械战甲"},
|
||||||
|
{"value": "Chinese (Mandarin)_Kind-hearted_Antie", "hint": "中文 · 热心大婶"},
|
||||||
|
{"value": "Chinese (Mandarin)_HK_Flight_Attendant", "hint": "中文 · 港普空姐"},
|
||||||
|
{"value": "Chinese (Mandarin)_Humorous_Elder", "hint": "中文 · 搞笑大爷"},
|
||||||
|
{"value": "Chinese (Mandarin)_Gentleman", "hint": "中文 · 温润男声"},
|
||||||
|
{"value": "Chinese (Mandarin)_Warm_Bestie", "hint": "中文 · 温暖闺蜜"},
|
||||||
|
{"value": "Chinese (Mandarin)_Male_Announcer", "hint": "中文 · 播报男声"},
|
||||||
|
{"value": "Chinese (Mandarin)_Sweet_Lady", "hint": "中文 · 甜美女声"},
|
||||||
|
{"value": "Chinese (Mandarin)_Southern_Young_Man", "hint": "中文 · 南方小哥"},
|
||||||
|
{"value": "Chinese (Mandarin)_Wise_Women", "hint": "中文 · 阅历姐姐"},
|
||||||
|
{"value": "Chinese (Mandarin)_Gentle_Youth", "hint": "中文 · 温润青年"},
|
||||||
|
{"value": "Chinese (Mandarin)_Warm_Girl", "hint": "中文 · 温暖少女"},
|
||||||
|
{"value": "Chinese (Mandarin)_Kind-hearted_Elder", "hint": "中文 · 花甲奶奶"},
|
||||||
|
{"value": "Chinese (Mandarin)_Cute_Spirit", "hint": "中文 · 憨憨萌兽"},
|
||||||
|
{"value": "Chinese (Mandarin)_Radio_Host", "hint": "中文 · 电台男主播"},
|
||||||
|
{"value": "Chinese (Mandarin)_Lyrical_Voice", "hint": "中文 · 抒情男声"},
|
||||||
|
{"value": "Chinese (Mandarin)_Straightforward_Boy", "hint": "中文 · 率真弟弟"},
|
||||||
|
{"value": "Chinese (Mandarin)_Sincere_Adult", "hint": "中文 · 真诚青年"},
|
||||||
|
{"value": "Chinese (Mandarin)_Gentle_Senior", "hint": "中文 · 温柔学姐"},
|
||||||
|
{"value": "Chinese (Mandarin)_Stubborn_Friend", "hint": "中文 · 嘴硬竹马"},
|
||||||
|
{"value": "Chinese (Mandarin)_Crisp_Girl", "hint": "中文 · 清脆少女"},
|
||||||
|
{"value": "Chinese (Mandarin)_Pure-hearted_Boy", "hint": "中文 · 清澈邻家弟弟"},
|
||||||
|
{"value": "Chinese (Mandarin)_Soft_Girl", "hint": "中文 · 柔和少女"},
|
||||||
|
# Cantonese (full catalog)
|
||||||
|
{"value": "Cantonese_ProfessionalHost(F)", "hint": "粤语 · 专业女主持"},
|
||||||
|
{"value": "Cantonese_GentleLady", "hint": "粤语 · 温柔女声"},
|
||||||
|
{"value": "Cantonese_ProfessionalHost(M)", "hint": "粤语 · 专业男主持"},
|
||||||
|
{"value": "Cantonese_PlayfulMan", "hint": "粤语 · 活泼男声"},
|
||||||
|
{"value": "Cantonese_CuteGirl", "hint": "粤语 · 可爱女孩"},
|
||||||
|
{"value": "Cantonese_KindWoman", "hint": "粤语 · 善良女声"},
|
||||||
|
# English (curated: 1F + 1M)
|
||||||
|
{"value": "English_Graceful_Lady", "hint": "英文 · Graceful Lady(女)"},
|
||||||
|
{"value": "English_Trustworthy_Man", "hint": "英文 · Trustworthy Man(男)"},
|
||||||
|
# Japanese (curated: 1F + 1M)
|
||||||
|
{"value": "Japanese_KindLady", "hint": "日文 · Kind Lady(女)"},
|
||||||
|
{"value": "Japanese_LoyalKnight", "hint": "日文 · Loyal Knight(男)"},
|
||||||
|
# Korean (curated: 1F + 1M)
|
||||||
|
{"value": "Korean_SweetGirl", "hint": "韩文 · Sweet Girl(女)"},
|
||||||
|
{"value": "Korean_CheerfulBoyfriend", "hint": "韩文 · Cheerful Boyfriend(男)"},
|
||||||
|
],
|
||||||
|
"dashscope": [
|
||||||
|
{"value": "Cherry", "hint": "芊悦 · 阳光女声"},
|
||||||
|
{"value": "Serena", "hint": "苏瑶 · 温柔女声"},
|
||||||
|
{"value": "Chelsie", "hint": "千雪 · 二次元少女"},
|
||||||
|
{"value": "Ethan", "hint": "晨煦 · 阳光男声"},
|
||||||
|
{"value": "Moon", "hint": "月白 · 率性男声"},
|
||||||
|
{"value": "Kai", "hint": "凯 · 治愈男声"},
|
||||||
|
{"value": "Nofish", "hint": "不吃鱼 · 设计师男声"},
|
||||||
|
{"value": "Bella", "hint": "萌宝 · 小萝莉"},
|
||||||
|
{"value": "Bunny", "hint": "萌小姬 · 萌系少女"},
|
||||||
|
{"value": "Stella", "hint": "少女阿月 · 元气少女"},
|
||||||
|
{"value": "Neil", "hint": "阿闻 · 新闻主播"},
|
||||||
|
{"value": "Seren", "hint": "小婉 · 助眠女声"},
|
||||||
|
{"value": "Jada", "hint": "上海话 · 阿珍"},
|
||||||
|
{"value": "Dylan", "hint": "北京话 · 晓东"},
|
||||||
|
{"value": "Sunny", "hint": "四川话 · 晴儿"},
|
||||||
|
{"value": "Eric", "hint": "四川话 · 程川"},
|
||||||
|
{"value": "Rocky", "hint": "粤语 · 阿强"},
|
||||||
|
{"value": "Kiki", "hint": "粤语 · 阿清"},
|
||||||
|
{"value": "Peter", "hint": "天津话 · 李彼得"},
|
||||||
|
{"value": "Marcus", "hint": "陕西话 · 秦川"},
|
||||||
|
{"value": "Roy", "hint": "闽南语 · 阿杰"},
|
||||||
|
],
|
||||||
|
# Aggregating gateway: voices are scoped per engine model. The
|
||||||
|
# frontend picks the correct list based on the selected model so
|
||||||
|
# users don't see incompatible timbres for the active engine.
|
||||||
|
"linkai": {
|
||||||
|
"tts-1": [
|
||||||
|
"alloy", "echo", "fable", "onyx", "nova", "shimmer",
|
||||||
|
],
|
||||||
|
"doubao": [
|
||||||
|
{"value": "zh_female_wanwanxiaohe_moon_bigtts", "hint": "湾湾小何"},
|
||||||
|
{"value": "BV007_streaming", "hint": "亲切女声"},
|
||||||
|
{"value": "BV001_streaming", "hint": "通用女声"},
|
||||||
|
{"value": "BV002_streaming", "hint": "通用男声"},
|
||||||
|
{"value": "BV051_streaming", "hint": "奶气萌娃"},
|
||||||
|
{"value": "zh_female_linjianvhai_moon_bigtts", "hint": "邻家女孩"},
|
||||||
|
{"value": "BV700_streaming", "hint": "灿灿"},
|
||||||
|
{"value": "BV019_streaming", "hint": "重庆小伙"},
|
||||||
|
{"value": "BV524_streaming", "hint": "日语男声"},
|
||||||
|
{"value": "BV021_streaming", "hint": "东北老铁"},
|
||||||
|
{"value": "BV701_streaming", "hint": "擎苍"},
|
||||||
|
{"value": "BV113_streaming", "hint": "甜宠少御"},
|
||||||
|
{"value": "BV056_streaming", "hint": "阳光男声"},
|
||||||
|
{"value": "BV213_streaming", "hint": "广西表哥"},
|
||||||
|
{"value": "BV119_streaming", "hint": "通用赘婿"},
|
||||||
|
{"value": "BV705_streaming", "hint": "炀炀"},
|
||||||
|
{"value": "BV033_streaming", "hint": "温柔小哥"},
|
||||||
|
{"value": "BV102_streaming", "hint": "儒雅青年"},
|
||||||
|
{"value": "BV522_streaming", "hint": "气质女生"},
|
||||||
|
{"value": "BV034_streaming", "hint": "知性姐姐 · 双语"},
|
||||||
|
{"value": "BV005_streaming", "hint": "活泼女声"},
|
||||||
|
{"value": "zh_female_wanqudashu_moon_bigtts", "hint": "湾区大叔"},
|
||||||
|
{"value": "zh_female_daimengchuanmei_moon_bigtts", "hint": "呆萌川妹"},
|
||||||
|
{"value": "zh_male_guozhoudege_moon_bigtts", "hint": "广州德哥"},
|
||||||
|
{"value": "zh_male_beijingxiaoye_moon_bigtts", "hint": "北京小爷"},
|
||||||
|
{"value": "zh_male_shaonianzixin_moon_bigtts", "hint": "少年梓辛 / Brayan"},
|
||||||
|
{"value": "zh_female_meilinvyou_moon_bigtts", "hint": "魅力女友"},
|
||||||
|
{"value": "zh_male_shenyeboke_moon_bigtts", "hint": "深夜播客"},
|
||||||
|
{"value": "zh_female_sajiaonvyou_moon_bigtts", "hint": "柔美女友"},
|
||||||
|
{"value": "zh_female_yuanqinvyou_moon_bigtts", "hint": "撒娇学妹"},
|
||||||
|
{"value": "zh_male_haoyuxiaoge_moon_bigtts", "hint": "浩宇小哥"},
|
||||||
|
{"value": "zh_male_guangxiyuanzhou_moon_bigtts", "hint": "广西远舟"},
|
||||||
|
{"value": "zh_female_meituojieer_moon_bigtts", "hint": "妹坨洁儿"},
|
||||||
|
{"value": "zh_male_yuzhouzixuan_moon_bigtts", "hint": "豫州子轩"},
|
||||||
|
{"value": "BV115_streaming", "hint": "古风少御"},
|
||||||
|
{"value": "zh_female_gaolengyujie_moon_bigtts", "hint": "高冷御姐"},
|
||||||
|
{"value": "zh_male_yuanboxiaoshu_moon_bigtts", "hint": "渊博小叔"},
|
||||||
|
{"value": "zh_male_yangguangqingnian_moon_bigtts", "hint": "阳光青年"},
|
||||||
|
{"value": "zh_male_aojiaobazong_moon_bigtts", "hint": "傲娇霸总"},
|
||||||
|
{"value": "zh_male_jingqiangkanye_moon_bigtts", "hint": "京腔侃爷 / Harmony"},
|
||||||
|
{"value": "zh_female_shuangkuaisisi_moon_bigtts", "hint": "爽快思思 / Skye"},
|
||||||
|
{"value": "zh_male_wennuanahu_moon_bigtts", "hint": "温暖阿虎 / Alvin"},
|
||||||
|
{"value": "multi_female_shuangkuaisisi_moon_bigtts", "hint": "はるこ / Esmeralda"},
|
||||||
|
{"value": "multi_male_jingqiangkanye_moon_bigtts", "hint": "かずね / Javier or Álvaro"},
|
||||||
|
{"value": "multi_female_gaolengyujie_moon_bigtts", "hint": "あけみ"},
|
||||||
|
{"value": "multi_male_wanqudashu_moon_bigtts", "hint": "ひろし / Roberto"},
|
||||||
|
{"value": "ICL_zh_female_bingruoshaonv_tob", "hint": "病弱少女"},
|
||||||
|
{"value": "ICL_zh_female_huoponvhai_tob", "hint": "活泼女孩"},
|
||||||
|
{"value": "ICL_zh_female_heainainai_tob", "hint": "和蔼奶奶"},
|
||||||
|
{"value": "ICL_zh_female_linjuayi_tob", "hint": "邻居阿姨"},
|
||||||
|
{"value": "zh_female_wenrouxiaoya_moon_bigtts", "hint": "温柔小雅"},
|
||||||
|
{"value": "zh_female_tianmeixiaoyuan_moon_bigtts", "hint": "甜美小源"},
|
||||||
|
{"value": "zh_female_qingchezizi_moon_bigtts", "hint": "清澈梓梓"},
|
||||||
|
{"value": "zh_male_dongfanghaoran_moon_bigtts", "hint": "东方浩然"},
|
||||||
|
{"value": "zh_male_jieshuoxiaoming_moon_bigtts", "hint": "解说小明"},
|
||||||
|
{"value": "zh_female_kailangjiejie_moon_bigtts", "hint": "开朗姐姐"},
|
||||||
|
{"value": "zh_male_linjiananhai_moon_bigtts", "hint": "邻家男孩"},
|
||||||
|
{"value": "zh_female_tianmeiyueyue_moon_bigtts", "hint": "甜美悦悦"},
|
||||||
|
{"value": "zh_female_xinlingjitang_moon_bigtts", "hint": "心灵鸡汤"},
|
||||||
|
],
|
||||||
|
"baidu": [
|
||||||
|
{"value": "baidu_0", "hint": "度小美 · 标准女主播"},
|
||||||
|
{"value": "baidu_1", "hint": "度小宇 · 亲切男声"},
|
||||||
|
{"value": "baidu_3", "hint": "度逍遥 · 情感男声"},
|
||||||
|
{"value": "baidu_4", "hint": "度丫丫 · 童声"},
|
||||||
|
{"value": "baidu_5", "hint": "度小娇 · 成熟女主播"},
|
||||||
|
{"value": "baidu_5003", "hint": "度逍遥 · 情感男声"},
|
||||||
|
{"value": "baidu_5118", "hint": "度小鹿 · 甜美女声"},
|
||||||
|
{"value": "baidu_103", "hint": "度米朵 · 可爱童声"},
|
||||||
|
{"value": "baidu_106", "hint": "度博文 · 专业男主播"},
|
||||||
|
{"value": "baidu_110", "hint": "度小童 · 童声主播"},
|
||||||
|
{"value": "baidu_111", "hint": "度小萌 · 软萌妹子"},
|
||||||
|
{"value": "baidu_4003", "hint": "度逍遥 · 情感男声"},
|
||||||
|
{"value": "baidu_4100", "hint": "度小雯 · 活力女主播"},
|
||||||
|
{"value": "baidu_4103", "hint": "度米朵 · 可爱女声"},
|
||||||
|
{"value": "baidu_4105", "hint": "度灵儿 · 清澈女声"},
|
||||||
|
{"value": "baidu_4106", "hint": "度博文 · 专业男主播"},
|
||||||
|
{"value": "baidu_4115", "hint": "度小贤 · 电台男主播"},
|
||||||
|
{"value": "baidu_4117", "hint": "度小乔 · 活泼女声"},
|
||||||
|
{"value": "baidu_4119", "hint": "度小鹿 · 甜美女声"},
|
||||||
|
{"value": "baidu_4129", "hint": "度小彦 · 知识男主播"},
|
||||||
|
{"value": "baidu_4140", "hint": "度小新 · 专业女主播"},
|
||||||
|
{"value": "baidu_4143", "hint": "度清风 · 配音男声"},
|
||||||
|
{"value": "baidu_4144", "hint": "度姗姗 · 娱乐女声"},
|
||||||
|
{"value": "baidu_4149", "hint": "度星河 · 广告男声"},
|
||||||
|
{"value": "baidu_4206", "hint": "度博文 · 综艺男声"},
|
||||||
|
{"value": "baidu_4226", "hint": "南方 · 电台女主播"},
|
||||||
|
{"value": "baidu_4254", "hint": "度小清 · 广告女声"},
|
||||||
|
{"value": "baidu_4278", "hint": "度小贝 · 知识女主播"},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
}
|
||||||
_EMBEDDING_PROVIDERS = ["openai", "dashscope", "doubao", "zhipu", "linkai"]
|
_EMBEDDING_PROVIDERS = ["openai", "dashscope", "doubao", "zhipu", "linkai"]
|
||||||
|
|
||||||
# Capability-scoped model catalogs. The chat dropdown can reuse the
|
# Capability-scoped model catalogs. The chat dropdown can reuse the
|
||||||
@@ -1525,7 +1939,7 @@ class ModelsHandler:
|
|||||||
@classmethod
|
@classmethod
|
||||||
def _predict_vision_auto(cls, local_config: dict) -> dict:
|
def _predict_vision_auto(cls, local_config: dict) -> dict:
|
||||||
"""Predict which provider vision.py will actually dispatch to when
|
"""Predict which provider vision.py will actually dispatch to when
|
||||||
no tool.vision.model is set. Mirrors the fallback order in
|
no tools.vision.model is set. Mirrors the fallback order in
|
||||||
agent/tools/vision/vision.py::_resolve_providers so the UI hint
|
agent/tools/vision/vision.py::_resolve_providers so the UI hint
|
||||||
matches reality."""
|
matches reality."""
|
||||||
chat = cls._chat_capability(local_config)
|
chat = cls._chat_capability(local_config)
|
||||||
@@ -1590,12 +2004,12 @@ class ModelsHandler:
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _vision_capability(cls, local_config: dict) -> dict:
|
def _vision_capability(cls, local_config: dict) -> dict:
|
||||||
"""Vision model. tool.vision.model is the explicit override; otherwise
|
"""Vision model. tools.vision.model is the explicit override; otherwise
|
||||||
the runtime fallback chain in agent/tools/vision/vision.py decides."""
|
the runtime fallback chain in agent/tools/vision/vision.py decides."""
|
||||||
tool_conf = local_config.get("tool") or {}
|
tools_conf = local_config.get("tools") or local_config.get("tool") or {}
|
||||||
if not isinstance(tool_conf, dict):
|
if not isinstance(tools_conf, dict):
|
||||||
tool_conf = {}
|
tools_conf = {}
|
||||||
vision_conf = tool_conf.get("vision") or {}
|
vision_conf = tools_conf.get("vision") or {}
|
||||||
if not isinstance(vision_conf, dict):
|
if not isinstance(vision_conf, dict):
|
||||||
vision_conf = {}
|
vision_conf = {}
|
||||||
user_specified = (vision_conf.get("model") or "").strip()
|
user_specified = (vision_conf.get("model") or "").strip()
|
||||||
@@ -1652,14 +2066,38 @@ class ModelsHandler:
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _tts_capability(cls, local_config: dict) -> dict:
|
def _tts_capability(cls, local_config: dict) -> dict:
|
||||||
provider_id = (local_config.get("text_to_voice") or "openai").strip().lower()
|
explicit = (local_config.get("text_to_voice") or "").strip().lower()
|
||||||
|
# Providers outside the white-list don't drive the picker, but their
|
||||||
|
# underlying runtime config is preserved so bridge still routes them.
|
||||||
|
ui_provider = explicit if explicit in cls._TTS_PROVIDERS else ""
|
||||||
|
suggested = ""
|
||||||
|
if not ui_provider:
|
||||||
|
for pid in cls._TTS_PROVIDERS:
|
||||||
|
meta = ConfigHandler.PROVIDER_MODELS.get(pid) or {}
|
||||||
|
key_field = meta.get("api_key_field")
|
||||||
|
if key_field and cls._is_real_key(local_config.get(key_field, "")):
|
||||||
|
suggested = pid
|
||||||
|
break
|
||||||
return {
|
return {
|
||||||
"editable": True,
|
"editable": True,
|
||||||
"current_provider": provider_id,
|
"current_provider": ui_provider,
|
||||||
"current_model": local_config.get("text_to_voice_model", "") or "",
|
"suggested_provider": suggested,
|
||||||
|
"current_model": (local_config.get("text_to_voice_model") or "") if ui_provider else "",
|
||||||
|
"current_voice": (local_config.get("tts_voice_id") or "") if ui_provider else "",
|
||||||
"providers": cls._TTS_PROVIDERS,
|
"providers": cls._TTS_PROVIDERS,
|
||||||
|
"provider_models": cls._TTS_PROVIDER_MODELS,
|
||||||
|
"provider_voices": cls._TTS_PROVIDER_VOICES,
|
||||||
|
"reply_mode": cls._tts_reply_mode(local_config),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@staticmethod
def _tts_reply_mode(local_config: dict) -> str:
    """Translate the two voice-reply flags into a single mode name.

    Args:
        local_config: Configuration mapping to read the flags from.

    Returns:
        ``"always"`` when ``always_reply_voice`` is truthy, else
        ``"voice_if_voice"`` when ``voice_reply_voice`` is truthy, else
        ``"off"``.
    """
    # Ordered by precedence: the first truthy flag decides the mode.
    flag_to_mode = (
        ("always_reply_voice", "always"),
        ("voice_reply_voice", "voice_if_voice"),
    )
    for flag, mode in flag_to_mode:
        if local_config.get(flag, False):
            return mode
    return "off"
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _embedding_capability(cls, local_config: dict) -> dict:
|
def _embedding_capability(cls, local_config: dict) -> dict:
|
||||||
# Embedding is "pick or empty" — runtime's legacy openai/linkai
|
# Embedding is "pick or empty" — runtime's legacy openai/linkai
|
||||||
@@ -1728,17 +2166,20 @@ class ModelsHandler:
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _image_capability(cls, local_config: dict) -> dict:
|
def _image_capability(cls, local_config: dict) -> dict:
|
||||||
"""Image generation. Source of truth: config["skill"]["image-generation"]["model"]
|
"""Image generation. Source of truth: config["skills"]["image-generation"]["model"]
|
||||||
(mirrors the per-skill config schema documented in skills/image-generation).
|
(mirrors the per-skill config schema documented in skills/image-generation).
|
||||||
The runtime resolver in skills/image-generation/scripts/generate.py
|
The runtime resolver in skills/image-generation/scripts/generate.py
|
||||||
reads this via the SKILL_IMAGE_GENERATION_MODEL env var that the
|
reads this via the SKILL_IMAGE_GENERATION_MODEL env var that the
|
||||||
agent_initializer syncs at startup; provider is inferred from the
|
agent_initializer syncs at startup; provider is inferred from the
|
||||||
model name prefix, mirroring vision.py's design.
|
model name prefix, mirroring vision.py's design.
|
||||||
|
|
||||||
|
``skill`` (singular) is still tolerated as a legacy fallback —
|
||||||
|
config.load_config() folds it into ``skills`` at startup.
|
||||||
"""
|
"""
|
||||||
skill_node = local_config.get("skill") or {}
|
skills_node = local_config.get("skills") or local_config.get("skill") or {}
|
||||||
if not isinstance(skill_node, dict):
|
if not isinstance(skills_node, dict):
|
||||||
skill_node = {}
|
skills_node = {}
|
||||||
img_node = skill_node.get("image-generation") or {}
|
img_node = skills_node.get("image-generation") or {}
|
||||||
if not isinstance(img_node, dict):
|
if not isinstance(img_node, dict):
|
||||||
img_node = {}
|
img_node = {}
|
||||||
explicit_model = (img_node.get("model") or "").strip()
|
explicit_model = (img_node.get("model") or "").strip()
|
||||||
@@ -1832,6 +2273,8 @@ class ModelsHandler:
|
|||||||
return self._handle_delete_provider(data)
|
return self._handle_delete_provider(data)
|
||||||
if action == "set_capability":
|
if action == "set_capability":
|
||||||
return self._handle_set_capability(data)
|
return self._handle_set_capability(data)
|
||||||
|
if action == "set_voice_reply_mode":
|
||||||
|
return self._handle_set_voice_reply_mode(data)
|
||||||
return json.dumps({"status": "error", "message": f"unknown action: {action!r}"})
|
return json.dumps({"status": "error", "message": f"unknown action: {action!r}"})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"[ModelsHandler] POST failed: {e}")
|
logger.error(f"[ModelsHandler] POST failed: {e}")
|
||||||
@@ -1918,7 +2361,7 @@ class ModelsHandler:
|
|||||||
if capability == "asr":
|
if capability == "asr":
|
||||||
return self._set_simple("voice_to_text", provider_id)
|
return self._set_simple("voice_to_text", provider_id)
|
||||||
if capability == "tts":
|
if capability == "tts":
|
||||||
return self._set_tts(provider_id, model)
|
return self._set_tts(provider_id, model, (data.get("voice") or "").strip())
|
||||||
if capability == "embedding":
|
if capability == "embedding":
|
||||||
return self._set_embedding(provider_id, model)
|
return self._set_embedding(provider_id, model)
|
||||||
if capability == "image":
|
if capability == "image":
|
||||||
@@ -1926,35 +2369,20 @@ class ModelsHandler:
|
|||||||
return json.dumps({"status": "error", "message": f"capability not editable: {capability}"})
|
return json.dumps({"status": "error", "message": f"capability not editable: {capability}"})
|
||||||
|
|
||||||
def _set_image(self, provider_id: str, model: str) -> str:
|
def _set_image(self, provider_id: str, model: str) -> str:
|
||||||
# Source of truth: config["skill"]["image-generation"]["model"].
|
# Source of truth: skills.image-generation.model. provider_id is
|
||||||
# provider_id is informational only (used by the UI to highlight a
|
# informational only; the resolver picks the vendor by model prefix.
|
||||||
# vendor card); the runtime resolver infers the provider from the
|
|
||||||
# model name prefix at request time, mirroring vision.py's design.
|
|
||||||
# An empty model means "switch back to auto / let the script pick".
|
|
||||||
local_config = conf()
|
local_config = conf()
|
||||||
file_cfg = self._read_file_config()
|
file_cfg = self._read_file_config()
|
||||||
|
|
||||||
def _ensure_skill_node(cfg: dict) -> dict:
|
self._set_nested_namespace_value(local_config, "skills", "image-generation", "model", model or "")
|
||||||
skill_node = cfg.get("skill") or {}
|
self._set_nested_namespace_value(file_cfg, "skills", "image-generation", "model", model or "")
|
||||||
if not isinstance(skill_node, dict):
|
self._drop_legacy_namespace(local_config, "skill", "skills", child="image-generation")
|
||||||
skill_node = {}
|
self._drop_legacy_namespace(file_cfg, "skill", "skills", child="image-generation")
|
||||||
img_node = skill_node.get("image-generation") or {}
|
|
||||||
if not isinstance(img_node, dict):
|
|
||||||
img_node = {}
|
|
||||||
skill_node["image-generation"] = img_node
|
|
||||||
cfg["skill"] = skill_node
|
|
||||||
return img_node
|
|
||||||
|
|
||||||
_ensure_skill_node(local_config)["model"] = model or ""
|
|
||||||
_ensure_skill_node(file_cfg)["model"] = model or ""
|
|
||||||
|
|
||||||
self._write_file_config(file_cfg)
|
self._write_file_config(file_cfg)
|
||||||
|
|
||||||
# The skill subprocess (skills/image-generation/scripts/generate.py)
|
# The skill subprocess reads SKILL_IMAGE_GENERATION_MODEL from env at
|
||||||
# reads SKILL_IMAGE_GENERATION_MODEL from its environment, which is
|
# startup; mirror the change so live edits apply without restart.
|
||||||
# only synced from config["skill"] at startup. Update os.environ live
|
|
||||||
# so changes take effect on the next call without a restart. An empty
|
|
||||||
# model means "clear the override" → drop the env var entirely.
|
|
||||||
env_key = "SKILL_IMAGE_GENERATION_MODEL"
|
env_key = "SKILL_IMAGE_GENERATION_MODEL"
|
||||||
if model:
|
if model:
|
||||||
os.environ[env_key] = model
|
os.environ[env_key] = model
|
||||||
@@ -1992,8 +2420,6 @@ class ModelsHandler:
|
|||||||
applied["model"] = model
|
applied["model"] = model
|
||||||
|
|
||||||
if not applied:
|
if not applied:
|
||||||
# No-op save (nothing to write). Return success so the UI can
|
|
||||||
# confirm the click without showing a misleading error.
|
|
||||||
return json.dumps({"status": "success", "applied": {}, "noop": True})
|
return json.dumps({"status": "success", "applied": {}, "noop": True})
|
||||||
|
|
||||||
self._write_file_config(file_cfg)
|
self._write_file_config(file_cfg)
|
||||||
@@ -2002,34 +2428,66 @@ class ModelsHandler:
|
|||||||
return json.dumps({"status": "success", "applied": applied})
|
return json.dumps({"status": "success", "applied": applied})
|
||||||
|
|
||||||
def _set_vision(self, provider_id: str, model: str) -> str:
|
def _set_vision(self, provider_id: str, model: str) -> str:
|
||||||
# Vision uses tool.vision.model (nested). provider_id is informational
|
# Source of truth: tools.vision.model. provider_id is informational
|
||||||
# only; the runtime resolver auto-routes by model name prefix.
|
# only; the resolver picks the vendor by model prefix.
|
||||||
local_config = conf()
|
local_config = conf()
|
||||||
file_cfg = self._read_file_config()
|
file_cfg = self._read_file_config()
|
||||||
tool_node = file_cfg.get("tool") or {}
|
self._set_nested_namespace_value(file_cfg, "tools", "vision", "model", model)
|
||||||
if not isinstance(tool_node, dict):
|
self._set_nested_namespace_value(local_config, "tools", "vision", "model", model)
|
||||||
tool_node = {}
|
self._drop_legacy_namespace(file_cfg, "tool", "tools", child="vision")
|
||||||
vision_node = tool_node.get("vision") or {}
|
self._drop_legacy_namespace(local_config, "tool", "tools", child="vision")
|
||||||
if not isinstance(vision_node, dict):
|
|
||||||
vision_node = {}
|
|
||||||
vision_node["model"] = model
|
|
||||||
tool_node["vision"] = vision_node
|
|
||||||
file_cfg["tool"] = tool_node
|
|
||||||
# Mirror into in-memory config so the live agent sees the change.
|
|
||||||
runtime_tool = local_config.get("tool") or {}
|
|
||||||
if not isinstance(runtime_tool, dict):
|
|
||||||
runtime_tool = {}
|
|
||||||
runtime_vision = runtime_tool.get("vision") or {}
|
|
||||||
if not isinstance(runtime_vision, dict):
|
|
||||||
runtime_vision = {}
|
|
||||||
runtime_vision["model"] = model
|
|
||||||
runtime_tool["vision"] = runtime_vision
|
|
||||||
local_config["tool"] = runtime_tool
|
|
||||||
|
|
||||||
self._write_file_config(file_cfg)
|
self._write_file_config(file_cfg)
|
||||||
logger.info(f"[ModelsHandler] vision model set: {model!r}")
|
logger.info(f"[ModelsHandler] vision model set: {model!r}")
|
||||||
return json.dumps({"status": "success", "model": model})
|
return json.dumps({"status": "success", "model": model})
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _set_nested_namespace_value(cfg, top: str, name: str, key: str, value):
|
||||||
|
"""Set ``cfg[top][name][key] = value``, creating missing dicts."""
|
||||||
|
bucket = cfg.get(top)
|
||||||
|
if not isinstance(bucket, dict):
|
||||||
|
bucket = {}
|
||||||
|
node = bucket.get(name)
|
||||||
|
if not isinstance(node, dict):
|
||||||
|
node = {}
|
||||||
|
node[key] = value
|
||||||
|
bucket[name] = node
|
||||||
|
cfg[top] = bucket
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _drop_legacy_namespace(cfg, legacy: str, canonical: str, child: str) -> None:
|
||||||
|
"""Strip the deprecated singular key so config.json stays single-source."""
|
||||||
|
legacy_section = cfg.get(legacy)
|
||||||
|
if not isinstance(legacy_section, dict):
|
||||||
|
return
|
||||||
|
legacy_section.pop(child, None)
|
||||||
|
if legacy_section:
|
||||||
|
cfg[legacy] = legacy_section
|
||||||
|
else:
|
||||||
|
cfg.pop(legacy, None)
|
||||||
|
|
||||||
|
def _handle_set_voice_reply_mode(self, data: dict) -> str:
|
||||||
|
# UI picker (off / voice_if_voice / always) maps to the legacy
|
||||||
|
# always_reply_voice + voice_reply_voice pair that chat_channel.py
|
||||||
|
# reads, so all channels (web/feishu/wecom/...) share the routing.
|
||||||
|
mode = (data.get("mode") or "").strip().lower()
|
||||||
|
if mode not in ("off", "voice_if_voice", "always"):
|
||||||
|
return json.dumps({"status": "error", "message": f"invalid mode: {mode!r}"})
|
||||||
|
always = (mode == "always")
|
||||||
|
if_voice = (mode == "voice_if_voice")
|
||||||
|
local_config = conf()
|
||||||
|
file_cfg = self._read_file_config()
|
||||||
|
local_config["always_reply_voice"] = always
|
||||||
|
local_config["voice_reply_voice"] = if_voice
|
||||||
|
file_cfg["always_reply_voice"] = always
|
||||||
|
file_cfg["voice_reply_voice"] = if_voice
|
||||||
|
self._write_file_config(file_cfg)
|
||||||
|
logger.info(
|
||||||
|
f"[ModelsHandler] voice reply mode set: {mode!r} "
|
||||||
|
f"(always_reply_voice={always}, voice_reply_voice={if_voice})"
|
||||||
|
)
|
||||||
|
return json.dumps({"status": "success", "mode": mode})
|
||||||
|
|
||||||
def _set_simple(self, key: str, value: str) -> str:
|
def _set_simple(self, key: str, value: str) -> str:
|
||||||
local_config = conf()
|
local_config = conf()
|
||||||
file_cfg = self._read_file_config()
|
file_cfg = self._read_file_config()
|
||||||
@@ -2037,25 +2495,30 @@ class ModelsHandler:
|
|||||||
file_cfg[key] = value
|
file_cfg[key] = value
|
||||||
self._write_file_config(file_cfg)
|
self._write_file_config(file_cfg)
|
||||||
logger.info(f"[ModelsHandler] {key} set: {value!r}")
|
logger.info(f"[ModelsHandler] {key} set: {value!r}")
|
||||||
# Bridge caches voice_to_text routing + bot instance; refresh it
|
# Hot-swap the cached voice bot so the change takes effect immediately.
|
||||||
# so the change takes effect on the next voice request.
|
|
||||||
if key in ("voice_to_text", "text_to_voice"):
|
if key in ("voice_to_text", "text_to_voice"):
|
||||||
self._refresh_voice_routing()
|
self._refresh_voice_routing()
|
||||||
return json.dumps({"status": "success", key: value})
|
return json.dumps({"status": "success", key: value})
|
||||||
|
|
||||||
def _set_tts(self, provider_id: str, model: str) -> str:
|
def _set_tts(self, provider_id: str, model: str, voice: str = "") -> str:
|
||||||
local_config = conf()
|
local_config = conf()
|
||||||
file_cfg = self._read_file_config()
|
file_cfg = self._read_file_config()
|
||||||
if provider_id:
|
|
||||||
local_config["text_to_voice"] = provider_id
|
local_config["text_to_voice"] = provider_id
|
||||||
file_cfg["text_to_voice"] = provider_id
|
file_cfg["text_to_voice"] = provider_id
|
||||||
if model:
|
|
||||||
local_config["text_to_voice_model"] = model
|
local_config["text_to_voice_model"] = model
|
||||||
file_cfg["text_to_voice_model"] = model
|
file_cfg["text_to_voice_model"] = model
|
||||||
|
local_config["tts_voice_id"] = voice
|
||||||
|
file_cfg["tts_voice_id"] = voice
|
||||||
self._write_file_config(file_cfg)
|
self._write_file_config(file_cfg)
|
||||||
logger.info(f"[ModelsHandler] tts updated: provider={provider_id!r} model={model!r}")
|
logger.info(
|
||||||
|
f"[ModelsHandler] tts updated: provider={provider_id!r} "
|
||||||
|
f"model={model!r} voice={voice!r}"
|
||||||
|
)
|
||||||
self._refresh_voice_routing()
|
self._refresh_voice_routing()
|
||||||
return json.dumps({"status": "success", "provider": provider_id, "model": model})
|
return json.dumps({
|
||||||
|
"status": "success",
|
||||||
|
"provider": provider_id, "model": model, "voice": voice,
|
||||||
|
})
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _refresh_voice_routing() -> None:
|
def _refresh_voice_routing() -> None:
|
||||||
@@ -2066,17 +2529,20 @@ class ModelsHandler:
|
|||||||
logger.warning(f"[ModelsHandler] Bridge voice refresh failed: {e}")
|
logger.warning(f"[ModelsHandler] Bridge voice refresh failed: {e}")
|
||||||
|
|
||||||
def _set_embedding(self, provider_id: str, model: str) -> str:
|
def _set_embedding(self, provider_id: str, model: str) -> str:
|
||||||
# provider_id="" + model="" means "switch back to legacy auto mode".
|
# Two valid states: both empty (reset to pick-or-empty) OR both set.
|
||||||
|
# A provider without a model leaves the runtime in a broken half-state,
|
||||||
|
# so reject that explicitly instead of silently writing it through.
|
||||||
|
if provider_id and not model:
|
||||||
|
return json.dumps({
|
||||||
|
"status": "error",
|
||||||
|
"message": "embedding model is required when a provider is selected",
|
||||||
|
})
|
||||||
local_config = conf()
|
local_config = conf()
|
||||||
file_cfg = self._read_file_config()
|
file_cfg = self._read_file_config()
|
||||||
local_config["embedding_provider"] = provider_id
|
local_config["embedding_provider"] = provider_id
|
||||||
file_cfg["embedding_provider"] = provider_id
|
file_cfg["embedding_provider"] = provider_id
|
||||||
if model:
|
|
||||||
local_config["embedding_model"] = model
|
local_config["embedding_model"] = model
|
||||||
file_cfg["embedding_model"] = model
|
file_cfg["embedding_model"] = model
|
||||||
else:
|
|
||||||
local_config["embedding_model"] = ""
|
|
||||||
file_cfg["embedding_model"] = ""
|
|
||||||
self._write_file_config(file_cfg)
|
self._write_file_config(file_cfg)
|
||||||
logger.info(f"[ModelsHandler] embedding updated: provider={provider_id!r} model={model!r}")
|
logger.info(f"[ModelsHandler] embedding updated: provider={provider_id!r} model={model!r}")
|
||||||
# The next /memory rebuild-index command hot-swaps the provider onto
|
# The next /memory rebuild-index command hot-swaps the provider onto
|
||||||
|
|||||||
@@ -16,8 +16,8 @@
|
|||||||
"open_ai_api_base": "https://api.openai.com/v1",
|
"open_ai_api_base": "https://api.openai.com/v1",
|
||||||
"gemini_api_key": "",
|
"gemini_api_key": "",
|
||||||
"gemini_api_base": "https://generativelanguage.googleapis.com",
|
"gemini_api_base": "https://generativelanguage.googleapis.com",
|
||||||
"voice_to_text": "openai",
|
"voice_to_text": "",
|
||||||
"text_to_voice": "openai",
|
"text_to_voice": "",
|
||||||
"voice_reply_voice": false,
|
"voice_reply_voice": false,
|
||||||
"speech_recognition": true,
|
"speech_recognition": true,
|
||||||
"group_speech_recognition": false,
|
"group_speech_recognition": false,
|
||||||
|
|||||||
99
config.py
99
config.py
@@ -330,8 +330,18 @@ def load_config():
|
|||||||
config_str = read_file(config_path)
|
config_str = read_file(config_path)
|
||||||
logger.debug("[INIT] config str: {}".format(drag_sensitive(config_str)))
|
logger.debug("[INIT] config str: {}".format(drag_sensitive(config_str)))
|
||||||
|
|
||||||
# 将json字符串反序列化为dict类型
|
# 将json字符串反序列化为dict类型。
|
||||||
config = Config(json.loads(config_str))
|
# `object_pairs_hook` lets us catch users who accidentally typed the
|
||||||
|
# same key twice (e.g. two `"tools"` blocks) — json.loads would
|
||||||
|
# otherwise silently drop all but the last occurrence.
|
||||||
|
config = Config(json.loads(config_str, object_pairs_hook=_merge_duplicate_keys))
|
||||||
|
|
||||||
|
# Migrate legacy singular keys (`tool`, `skill`) into the canonical
|
||||||
|
# plural buckets so the rest of the codebase only reads one schema.
|
||||||
|
# Deep-merge so existing `tools`/`skills` entries are preserved and
|
||||||
|
# only missing namespaces are filled in from the legacy section.
|
||||||
|
_merge_legacy_namespace(config, legacy="tool", canonical="tools")
|
||||||
|
_merge_legacy_namespace(config, legacy="skill", canonical="skills")
|
||||||
|
|
||||||
# override config with environment variables.
|
# override config with environment variables.
|
||||||
# Some online deployment platforms (e.g. Railway) deploy project from github directly. So you shouldn't put your secrets like api key in a config file, instead use environment variables to override the default config.
|
# Some online deployment platforms (e.g. Railway) deploy project from github directly. So you shouldn't put your secrets like api key in a config file, instead use environment variables to override the default config.
|
||||||
@@ -422,7 +432,7 @@ def load_config():
|
|||||||
os.environ[env_key] = str(val)
|
os.environ[env_key] = str(val)
|
||||||
injected += 1
|
injected += 1
|
||||||
|
|
||||||
injected += _sync_skill_config_to_env(config.get("skill", {}))
|
injected += _sync_skill_config_to_env(config.get("skills", {}))
|
||||||
|
|
||||||
if injected:
|
if injected:
|
||||||
logger.info("[INIT] Synced {} config values to environment variables".format(injected))
|
logger.info("[INIT] Synced {} config values to environment variables".format(injected))
|
||||||
@@ -430,11 +440,90 @@ def load_config():
|
|||||||
config.load_user_datas()
|
config.load_user_datas()
|
||||||
|
|
||||||
|
|
||||||
|
def _deep_merge_dicts(base: dict, incoming: dict) -> dict:
|
||||||
|
"""Recursively merge ``incoming`` into ``base`` (incoming wins on leaves)."""
|
||||||
|
for key, val in incoming.items():
|
||||||
|
if (
|
||||||
|
key in base
|
||||||
|
and isinstance(base[key], dict)
|
||||||
|
and isinstance(val, dict)
|
||||||
|
):
|
||||||
|
_deep_merge_dicts(base[key], val)
|
||||||
|
else:
|
||||||
|
base[key] = val
|
||||||
|
return base
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_duplicate_keys(pairs):
|
||||||
|
"""object_pairs_hook for json.loads: deep-merge duplicate top-level keys
|
||||||
|
(lists concat, dicts merge, scalars take the latter) instead of dropping."""
|
||||||
|
out = {}
|
||||||
|
duplicates = []
|
||||||
|
for key, val in pairs:
|
||||||
|
if key not in out:
|
||||||
|
out[key] = val
|
||||||
|
continue
|
||||||
|
duplicates.append(key)
|
||||||
|
prev = out[key]
|
||||||
|
if isinstance(prev, dict) and isinstance(val, dict):
|
||||||
|
_deep_merge_dicts(prev, val)
|
||||||
|
elif isinstance(prev, list) and isinstance(val, list):
|
||||||
|
prev.extend(val)
|
||||||
|
else:
|
||||||
|
out[key] = val
|
||||||
|
if duplicates:
|
||||||
|
# logger may not be wired yet — fall back to print so we never lose the warning.
|
||||||
|
unique = sorted(set(duplicates))
|
||||||
|
try:
|
||||||
|
logger.warning("[INIT] config.json has duplicate keys (merged): %s", unique)
|
||||||
|
except Exception:
|
||||||
|
print("[INIT] config.json has duplicate keys (merged):", unique)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_legacy_namespace(cfg, legacy: str, canonical: str) -> None:
|
||||||
|
"""Fold deprecated singular keys (``tool`` / ``skill``) into their plural
|
||||||
|
canonical counterparts at load time. Canonical entries always win."""
|
||||||
|
legacy_section = cfg.get(legacy)
|
||||||
|
if not isinstance(legacy_section, dict) or not legacy_section:
|
||||||
|
cfg.pop(legacy, None)
|
||||||
|
return
|
||||||
|
canonical_section = cfg.get(canonical)
|
||||||
|
if not isinstance(canonical_section, dict):
|
||||||
|
canonical_section = {}
|
||||||
|
merged_keys = []
|
||||||
|
for name, val in legacy_section.items():
|
||||||
|
if name in canonical_section:
|
||||||
|
if isinstance(canonical_section[name], dict) and isinstance(val, dict):
|
||||||
|
for sub_key, sub_val in val.items():
|
||||||
|
if (
|
||||||
|
sub_key in canonical_section[name]
|
||||||
|
and isinstance(canonical_section[name][sub_key], dict)
|
||||||
|
and isinstance(sub_val, dict)
|
||||||
|
):
|
||||||
|
_deep_merge_dicts(sub_val, canonical_section[name][sub_key])
|
||||||
|
canonical_section[name][sub_key] = sub_val
|
||||||
|
else:
|
||||||
|
canonical_section[name].setdefault(sub_key, sub_val)
|
||||||
|
continue
|
||||||
|
canonical_section[name] = val
|
||||||
|
merged_keys.append(name)
|
||||||
|
cfg[canonical] = canonical_section
|
||||||
|
cfg.pop(legacy, None)
|
||||||
|
if merged_keys:
|
||||||
|
logger.warning(
|
||||||
|
"[INIT] Legacy config key '{}' is deprecated; merged into '{}': {}. "
|
||||||
|
"Please rename '{}' to '{}' in your config.json.".format(
|
||||||
|
legacy, canonical, merged_keys, legacy, canonical,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _sync_skill_config_to_env(skill_section) -> int:
|
def _sync_skill_config_to_env(skill_section) -> int:
|
||||||
"""Flatten skill-namespaced config into environment variables.
|
"""Flatten skill-namespaced config into environment variables.
|
||||||
|
|
||||||
Mapping rule: ``config["skill"][<name>][<key>]`` -> ``SKILL_<NAME>_<KEY>``
|
Mapping rule: ``config["skills"][<name>][<key>]`` -> ``SKILL_<NAME>_<KEY>``
|
||||||
(e.g. ``skill["image-generation"].model`` -> ``SKILL_IMAGE_GENERATION_MODEL``).
|
(e.g. ``skills["image-generation"].model`` -> ``SKILL_IMAGE_GENERATION_MODEL``).
|
||||||
|
|
||||||
This lets subprocess-based skill scripts read their own settings without
|
This lets subprocess-based skill scripts read their own settings without
|
||||||
importing project code. Existing env vars are NOT overwritten so the
|
importing project code. Existing env vars are NOT overwritten so the
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ To force a specific Vision model, set it explicitly in `config.json`:
|
|||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"tool": {
|
"tools": {
|
||||||
"vision": {
|
"vision": {
|
||||||
"model": "ernie-4.5-turbo-vl"
|
"model": "ernie-4.5-turbo-vl"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ New built-in `image-generation` skill supporting text-to-image, image-to-image,
|
|||||||
- **Zero model selection**: Just configure an API key and it works — no need to manually specify a model. You can also name a specific model in conversation (e.g. "draw a cat with seedream")
|
- **Zero model selection**: Just configure an API key and it works — no need to manually specify a model. You can also name a specific model in conversation (e.g. "draw a cat with seedream")
|
||||||
- **Flexible control**: Supports `quality`, `size` (512/1K–4K), and `aspect_ratio` parameters, with each provider automatically mapping to its supported values
|
- **Flexible control**: Supports `quality`, `size` (512/1K–4K), and `aspect_ratio` parameters, with each provider automatically mapping to its supported values
|
||||||
- **Image editing**: Pass existing images for editing, style transfer, or multi-image fusion (Seedream supports up to 14 reference images)
|
- **Image editing**: Pass existing images for editing, style transfer, or multi-image fusion (Seedream supports up to 14 reference images)
|
||||||
- **Skill-level config**: Pin a default model via `skill.image-generation.model` in `config.json`
|
- **Skill-level config**: Pin a default model via `skills.image-generation.model` in `config.json`
|
||||||
- **Image lightbox**: All images in the Web console now support click-to-enlarge preview
|
- **Image lightbox**: All images in the Web console now support click-to-enlarge preview
|
||||||
|
|
||||||
Docs: [Image Generation Skill](https://docs.cowagent.ai/en/skills/image-generation)
|
Docs: [Image Generation Skill](https://docs.cowagent.ai/en/skills/image-generation)
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ The voice and streaming building blocks come from a community contribution #2791
|
|||||||
|
|
||||||
## 🔧 Tools and Safety
|
## 🔧 Tools and Safety
|
||||||
|
|
||||||
- **Vision model selection**: `tool.vision.model` config now actually takes effect, with automatic fallback when unconfigured #2792
|
- **Vision model selection**: `tools.vision.model` config now actually takes effect, with automatic fallback when unconfigured #2792
|
||||||
- **Bash safety prompt**: The destructive-deletion confirm prompt is now scoped to paths outside the workspace — routine in-workspace operations are no longer interrupted
|
- **Bash safety prompt**: The destructive-deletion confirm prompt is now scoped to paths outside the workspace — routine in-workspace operations are no longer interrupted
|
||||||
|
|
||||||
## 🐛 Other Fixes
|
## 🐛 Other Fixes
|
||||||
|
|||||||
@@ -87,7 +87,7 @@ Configure ARK_API_KEY as xxx
|
|||||||
To force all image generation through a specific provider's model, add this to `config.json`:
|
To force all image generation through a specific provider's model, add this to `config.json`:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
"skill": {
|
"skills": {
|
||||||
"image-generation": {
|
"image-generation": {
|
||||||
"model": "seedream-5.0-lite"
|
"model": "seedream-5.0-lite"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ To specify a particular model for the vision tool, add to `config.json`:
|
|||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"tool": {
|
"tools": {
|
||||||
"vision": {
|
"vision": {
|
||||||
"model": "ernie-4.5-turbo-vl"
|
"model": "ernie-4.5-turbo-vl"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ description: Baidu Qianfan ERNIE モデル設定
|
|||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"tool": {
|
"tools": {
|
||||||
"vision": {
|
"vision": {
|
||||||
"model": "ernie-4.5-turbo-vl"
|
"model": "ernie-4.5-turbo-vl"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ description: CowAgent 2.0.7 - 画像生成スキル(6プロバイダー自動
|
|||||||
- **モデル選択不要**:API Key を設定するだけで使用可能、モデルを手動で指定する必要なし。会話で特定モデルを指名することも可能(例:「seedream で猫を描いて」)
|
- **モデル選択不要**:API Key を設定するだけで使用可能、モデルを手動で指定する必要なし。会話で特定モデルを指名することも可能(例:「seedream で猫を描いて」)
|
||||||
- **柔軟な制御**:`quality`(画質)、`size`(解像度、512/1K〜4K)、`aspect_ratio`(アスペクト比)パラメータ対応、各プロバイダーが自動的に有効な値にマッピング
|
- **柔軟な制御**:`quality`(画質)、`size`(解像度、512/1K〜4K)、`aspect_ratio`(アスペクト比)パラメータ対応、各プロバイダーが自動的に有効な値にマッピング
|
||||||
- **画像編集**:既存の画像を渡して編集・スタイル変換・複数画像融合が可能(Seedream は最大 14 枚の参照画像をサポート)
|
- **画像編集**:既存の画像を渡して編集・スタイル変換・複数画像融合が可能(Seedream は最大 14 枚の参照画像をサポート)
|
||||||
- **スキルレベル設定**:`config.json` の `skill.image-generation.model` でデフォルトモデルを固定可能
|
- **スキルレベル設定**:`config.json` の `skills.image-generation.model` でデフォルトモデルを固定可能
|
||||||
- **画像ライトボックス**:Web コンソールのすべての画像がクリックで拡大プレビュー対応
|
- **画像ライトボックス**:Web コンソールのすべての画像がクリックで拡大プレビュー対応
|
||||||
|
|
||||||
ドキュメント:[画像生成スキル](https://docs.cowagent.ai/ja/skills/image-generation)
|
ドキュメント:[画像生成スキル](https://docs.cowagent.ai/ja/skills/image-generation)
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ description: CowAgent 2.0.8 - 飛書チャネル全面アップグレード(
|
|||||||
|
|
||||||
## 🔧 ツールと安全性
|
## 🔧 ツールと安全性
|
||||||
|
|
||||||
- **Vision モデル選択**:`tool.vision.model` 設定が実際に反映されるようになり、未設定時は自動フォールバック #2792
|
- **Vision モデル選択**:`tools.vision.model` 設定が実際に反映されるようになり、未設定時は自動フォールバック #2792
|
||||||
- **Bash セーフティ確認**:破壊的削除の確認プロンプトをワークスペース外のパスに限定。ワークスペース内の通常操作は中断されません
|
- **Bash セーフティ確認**:破壊的削除の確認プロンプトをワークスペース外のパスに限定。ワークスペース内の通常操作は中断されません
|
||||||
|
|
||||||
## 🐛 その他の修正
|
## 🐛 その他の修正
|
||||||
|
|||||||
@@ -87,7 +87,7 @@ ARK_API_KEY を xxx に設定して
|
|||||||
すべての画像生成を特定のプロバイダーのモデルで固定したい場合、`config.json` に以下を追加:
|
すべての画像生成を特定のプロバイダーのモデルで固定したい場合、`config.json` に以下を追加:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
"skill": {
|
"skills": {
|
||||||
"image-generation": {
|
"image-generation": {
|
||||||
"model": "seedream-5.0-lite"
|
"model": "seedream-5.0-lite"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ Vision ツールで使用するモデルを指定するには、`config.json`
|
|||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"tool": {
|
"tools": {
|
||||||
"vision": {
|
"vision": {
|
||||||
"model": "ernie-4.5-turbo-vl"
|
"model": "ernie-4.5-turbo-vl"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ description: 百度千帆 ERNIE 模型配置
|
|||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"tool": {
|
"tools": {
|
||||||
"vision": {
|
"vision": {
|
||||||
"model": "ernie-4.5-turbo-vl"
|
"model": "ernie-4.5-turbo-vl"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ description: CowAgent 2.0.7 - 图像生成技能(六厂商自动路由)、
|
|||||||
- **开箱即用**:配置 API Key 即可使用,无需手动指定模型。也支持在对话中指定特定模型
|
- **开箱即用**:配置 API Key 即可使用,无需手动指定模型。也支持在对话中指定特定模型
|
||||||
- **灵活控制**:支持 `quality`(画质)、`size`(分辨率,512/1K~4K)、`aspect_ratio`(宽高比)等参数,各厂商自动适配有效值
|
- **灵活控制**:支持 `quality`(画质)、`size`(分辨率,512/1K~4K)、`aspect_ratio`(宽高比)等参数,各厂商自动适配有效值
|
||||||
- **图片编辑**:传入已有图片即可进行编辑、风格迁移、多图融合
|
- **图片编辑**:传入已有图片即可进行编辑、风格迁移、多图融合
|
||||||
- **Skill 级配置**:支持通过 `config.json` 中的 `skill.image-generation.model` 固定默认模型
|
- **Skill 级配置**:支持通过 `config.json` 中的 `skills.image-generation.model` 固定默认模型
|
||||||
|
|
||||||
相关文档:[图像生成技能](https://docs.cowagent.ai/skills/image-generation)
|
相关文档:[图像生成技能](https://docs.cowagent.ai/skills/image-generation)
|
||||||
|
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ description: CowAgent 2.0.8 - 飞书渠道全面升级(语音、流式打字
|
|||||||
|
|
||||||
## 🔧 工具与安全
|
## 🔧 工具与安全
|
||||||
|
|
||||||
- **图像识别模型**:让 `tool.vision.model` 配置真正生效,未配置时自动 fallback #2792 Thanks CNXudiandian
|
- **图像识别模型**:让 `tools.vision.model` 配置真正生效,未配置时自动 fallback #2792 Thanks CNXudiandian
|
||||||
- **Bash 安全确认**:仅对工作区外的破坏性删除做二次确认,工作区内常规操作不再打扰
|
- **Bash 安全确认**:仅对工作区外的破坏性删除做二次确认,工作区内常规操作不再打扰
|
||||||
|
|
||||||
## 🐛 其他修复
|
## 🐛 其他修复
|
||||||
|
|||||||
@@ -88,7 +88,7 @@ description: 文生图 / 图生图 / 多图融合,支持多家厂商自动路
|
|||||||
如果想让所有图像生成固定走某个厂商的模型,可以在 `config.json` 里加:
|
如果想让所有图像生成固定走某个厂商的模型,可以在 `config.json` 里加:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
"skill": {
|
"skills": {
|
||||||
"image-generation": {
|
"image-generation": {
|
||||||
"model": "seedream-5.0-lite"
|
"model": "seedream-5.0-lite"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ Vision 工具采用多级自动选择 + 自动兜底策略,无需手动配置
|
|||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"tool": {
|
"tools": {
|
||||||
"vision": {
|
"vision": {
|
||||||
"model": "gpt-4.1"
|
"model": "gpt-4.1"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1110,7 +1110,7 @@ def main():
|
|||||||
# Model resolution priority:
|
# Model resolution priority:
|
||||||
# 1. Explicit `model` in the call args (agent / user override)
|
# 1. Explicit `model` in the call args (agent / user override)
|
||||||
# 2. SKILL_IMAGE_GENERATION_MODEL env var (synced from
|
# 2. SKILL_IMAGE_GENERATION_MODEL env var (synced from
|
||||||
# config["skill"]["image-generation"]["model"] at startup)
|
# config["skills"]["image-generation"]["model"] at startup)
|
||||||
# 3. None → fall back to automatic provider routing (try every
|
# 3. None → fall back to automatic provider routing (try every
|
||||||
# provider with a configured API key in global priority order)
|
# provider with a configured API key in global priority order)
|
||||||
model = args.get("model") or os.environ.get("SKILL_IMAGE_GENERATION_MODEL") or ""
|
model = args.get("model") or os.environ.get("SKILL_IMAGE_GENERATION_MODEL") or ""
|
||||||
|
|||||||
@@ -394,7 +394,7 @@ class TestQianfanVisionTool(unittest.TestCase):
|
|||||||
"open_ai_api_key": "",
|
"open_ai_api_key": "",
|
||||||
"linkai_api_key": "",
|
"linkai_api_key": "",
|
||||||
"use_linkai": False,
|
"use_linkai": False,
|
||||||
"tool": {},
|
"tools": {},
|
||||||
}
|
}
|
||||||
if values:
|
if values:
|
||||||
data.update(values)
|
data.update(values)
|
||||||
@@ -424,7 +424,7 @@ class TestQianfanVisionTool(unittest.TestCase):
|
|||||||
def test_vision_routes_ernie_model_override_to_qianfan(self):
|
def test_vision_routes_ernie_model_override_to_qianfan(self):
|
||||||
fake_conf = self._fake_conf({
|
fake_conf = self._fake_conf({
|
||||||
"qianfan_api_key": "test-qianfan-key",
|
"qianfan_api_key": "test-qianfan-key",
|
||||||
"tool": {"vision": {"model": "ernie-4.5-turbo-vl-32k"}},
|
"tools": {"vision": {"model": "ernie-4.5-turbo-vl-32k"}},
|
||||||
})
|
})
|
||||||
fake_bot = MagicMock()
|
fake_bot = MagicMock()
|
||||||
fake_bot.call_vision = MagicMock()
|
fake_bot.call_vision = MagicMock()
|
||||||
|
|||||||
@@ -1,20 +1,13 @@
|
|||||||
# encoding:utf-8
|
# encoding:utf-8
|
||||||
"""
|
"""DashScope voice: qwen3-asr-flash (ASR) + qwen3-tts-flash (TTS)
|
||||||
DashScope (Aliyun Bailian) voice service.
|
via dashscope.MultiModalConversation."""
|
||||||
|
import datetime
|
||||||
ASR : qwen3-asr-flash via dashscope.MultiModalConversation
|
|
||||||
TTS : not yet implemented (see CosyVoice / qwen3-tts)
|
|
||||||
|
|
||||||
Why MultiModalConversation instead of the OpenAI-compatible endpoint:
|
|
||||||
- SDK is already a project dep (used by chat/vision)
|
|
||||||
- Native API accepts local file:// paths up to 100 QPS without an OSS
|
|
||||||
round-trip, which is what we need for the "send a short voice
|
|
||||||
message" flow. Public URLs / Base64 also work.
|
|
||||||
"""
|
|
||||||
import os
|
import os
|
||||||
|
import random
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import dashscope
|
import dashscope
|
||||||
|
import requests
|
||||||
from dashscope import MultiModalConversation
|
from dashscope import MultiModalConversation
|
||||||
|
|
||||||
from bridge.reply import Reply, ReplyType
|
from bridge.reply import Reply, ReplyType
|
||||||
@@ -25,16 +18,14 @@ from voice.voice import Voice
|
|||||||
|
|
||||||
|
|
||||||
DEFAULT_ASR_MODEL = "qwen3-asr-flash"
|
DEFAULT_ASR_MODEL = "qwen3-asr-flash"
|
||||||
# qwen3-asr-flash hard cap (single file, sync call). Longer audio needs
|
DEFAULT_TTS_MODEL = "qwen3-tts-flash"
|
||||||
# qwen3-asr-flash-filetrans which is async-only and out of scope here.
|
DEFAULT_TTS_VOICE = "Cherry"
|
||||||
MAX_DURATION_SECONDS = 300
|
MAX_DURATION_SECONDS = 300
|
||||||
MAX_FILE_BYTES = 10 * 1024 * 1024
|
MAX_FILE_BYTES = 10 * 1024 * 1024
|
||||||
|
|
||||||
|
|
||||||
class DashScopeVoice(Voice):
|
class DashScopeVoice(Voice):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
# api_key is applied per-call (chat bot does the same) so a live
|
|
||||||
# config change via the web console takes effect without restart.
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def voiceToText(self, voice_file: str):
|
def voiceToText(self, voice_file: str):
|
||||||
@@ -83,14 +74,72 @@ class DashScopeVoice(Voice):
|
|||||||
return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
|
return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
|
||||||
|
|
||||||
def textToVoice(self, text: str):
|
def textToVoice(self, text: str):
|
||||||
# TTS will be added in a follow-up commit (qwen3-tts / cosyvoice).
|
try:
|
||||||
return Reply(ReplyType.ERROR, "DashScope 语音合成尚未接入")
|
api_key = conf().get("dashscope_api_key", "")
|
||||||
|
if not api_key:
|
||||||
|
logger.error("[DashScopeVoice] dashscope_api_key is not configured")
|
||||||
|
return Reply(ReplyType.ERROR, "未配置 DashScope API key")
|
||||||
|
dashscope.api_key = api_key
|
||||||
|
|
||||||
|
model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
|
||||||
|
voice = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE
|
||||||
|
response = MultiModalConversation.call(
|
||||||
|
model=model,
|
||||||
|
api_key=api_key,
|
||||||
|
text=text,
|
||||||
|
voice=voice,
|
||||||
|
stream=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
url = self._extract_audio_url(response)
|
||||||
|
if not url:
|
||||||
|
logger.error(f"[DashScopeVoice] textToVoice failed: {response}")
|
||||||
|
return Reply(ReplyType.ERROR, "语音合成失败")
|
||||||
|
|
||||||
|
local_path = self._download_audio(url)
|
||||||
|
if not local_path:
|
||||||
|
return Reply(ReplyType.ERROR, "语音合成失败")
|
||||||
|
|
||||||
|
logger.info(f"[DashScopeVoice] textToVoice model={model} voice={voice} file={local_path}")
|
||||||
|
return Reply(ReplyType.VOICE, local_path)
|
||||||
|
except Exception as e:
|
||||||
|
logger.exception(f"[DashScopeVoice] textToVoice exception: {e}")
|
||||||
|
return Reply(ReplyType.ERROR, "语音合成失败")
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _extract_audio_url(response) -> Optional[str]:
|
||||||
|
try:
|
||||||
|
if getattr(response, "status_code", 200) != 200:
|
||||||
|
return None
|
||||||
|
audio = response.output.get("audio") if response.output else None
|
||||||
|
if isinstance(audio, dict):
|
||||||
|
return audio.get("url") or None
|
||||||
|
return getattr(audio, "url", None)
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _download_audio(url: str) -> Optional[str]:
|
||||||
|
try:
|
||||||
|
tmp_dir = os.path.join(os.getcwd(), "tmp")
|
||||||
|
os.makedirs(tmp_dir, exist_ok=True)
|
||||||
|
ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
|
||||||
|
ext = os.path.splitext(url.split("?", 1)[0])[1].lower() or ".wav"
|
||||||
|
if ext not in (".mp3", ".wav", ".m4a", ".aac", ".opus"):
|
||||||
|
ext = ".wav"
|
||||||
|
dst = os.path.join(tmp_dir, f"dashscope_tts_{ts}_{random.randint(0, 9999)}{ext}")
|
||||||
|
resp = requests.get(url, timeout=60)
|
||||||
|
resp.raise_for_status()
|
||||||
|
with open(dst, "wb") as f:
|
||||||
|
f.write(resp.content)
|
||||||
|
return dst
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"[DashScopeVoice] download audio failed: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _ensure_compatible_format(voice_file: str) -> str:
|
def _ensure_compatible_format(voice_file: str) -> str:
|
||||||
"""Convert AMR/SILK to mp3 since qwen3-asr-flash doesn't accept them.
|
# qwen3-asr-flash doesn't accept AMR/SILK; mp3/wav/m4a/aac/opus pass through.
|
||||||
Other formats (mp3/wav/m4a/aac/opus/webm) are passed through.
|
|
||||||
"""
|
|
||||||
lower = voice_file.lower()
|
lower = voice_file.lower()
|
||||||
if lower.endswith(".amr") or lower.endswith(".silk") or lower.endswith(".slk"):
|
if lower.endswith(".amr") or lower.endswith(".silk") or lower.endswith(".slk"):
|
||||||
try:
|
try:
|
||||||
@@ -98,20 +147,11 @@ class DashScopeVoice(Voice):
|
|||||||
audio_convert.any_to_mp3(voice_file, mp3_file)
|
audio_convert.any_to_mp3(voice_file, mp3_file)
|
||||||
return mp3_file
|
return mp3_file
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(
|
logger.warning(f"[DashScopeVoice] mp3 convert failed: {e}")
|
||||||
f"[DashScopeVoice] convert {voice_file} to mp3 failed: {e}; "
|
|
||||||
f"submitting original file"
|
|
||||||
)
|
|
||||||
return voice_file
|
return voice_file
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _extract_text(response) -> Optional[str]:
|
def _extract_text(response) -> Optional[str]:
|
||||||
"""Pull the recognized text out of MultiModalConversation response.
|
|
||||||
|
|
||||||
Successful shape (result_format="message"):
|
|
||||||
response.output.choices[0].message.content -> list of {"text": "..."}
|
|
||||||
or in some SDK versions a plain string.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
if getattr(response, "status_code", 200) != 200:
|
if getattr(response, "status_code", 200) != 200:
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -1,16 +1,18 @@
|
|||||||
"""
|
"""LinkAI voice: Whisper ASR + multi-vendor TTS (OpenAI / Doubao / Baidu)
|
||||||
google voice service
|
proxied via https://docs.link-ai.tech/platform/api/voice-speech."""
|
||||||
"""
|
import datetime
|
||||||
|
import os
|
||||||
import random
|
import random
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from voice import audio_convert
|
|
||||||
from bridge.reply import Reply, ReplyType
|
from bridge.reply import Reply, ReplyType
|
||||||
|
from common import const
|
||||||
from common.log import logger
|
from common.log import logger
|
||||||
from config import conf
|
from config import conf
|
||||||
|
from voice import audio_convert
|
||||||
from voice.voice import Voice
|
from voice.voice import Voice
|
||||||
from common import const
|
|
||||||
import os
|
|
||||||
import datetime
|
|
||||||
|
|
||||||
class LinkAIVoice(Voice):
|
class LinkAIVoice(Voice):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@@ -21,8 +23,7 @@ class LinkAIVoice(Voice):
|
|||||||
try:
|
try:
|
||||||
url = conf().get("linkai_api_base", "https://api.link-ai.tech") + "/v1/audio/transcriptions"
|
url = conf().get("linkai_api_base", "https://api.link-ai.tech") + "/v1/audio/transcriptions"
|
||||||
headers = {"Authorization": "Bearer " + conf().get("linkai_api_key")}
|
headers = {"Authorization": "Bearer " + conf().get("linkai_api_key")}
|
||||||
model = None
|
# Pin whisper-1: gateway ignores any other ASR model id.
|
||||||
if not conf().get("text_to_voice") or conf().get("voice_to_text") == "openai":
|
|
||||||
model = const.WHISPER_1
|
model = const.WHISPER_1
|
||||||
if voice_file.endswith(".amr"):
|
if voice_file.endswith(".amr"):
|
||||||
try:
|
try:
|
||||||
@@ -30,54 +31,59 @@ class LinkAIVoice(Voice):
|
|||||||
audio_convert.any_to_mp3(voice_file, mp3_file)
|
audio_convert.any_to_mp3(voice_file, mp3_file)
|
||||||
voice_file = mp3_file
|
voice_file = mp3_file
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warn(f"[LinkVoice] amr file transfer failed, directly send amr voice file: {format(e)}")
|
logger.warning(f"[LinkVoice] amr file transfer failed, directly send amr voice file: {e}")
|
||||||
file = open(voice_file, "rb")
|
with open(voice_file, "rb") as file:
|
||||||
file_body = {
|
res = requests.post(
|
||||||
"file": file
|
url,
|
||||||
}
|
files={"file": file},
|
||||||
data = {
|
headers=headers,
|
||||||
"model": model
|
data={"model": model},
|
||||||
}
|
timeout=(5, 60),
|
||||||
res = requests.post(url, files=file_body, headers=headers, data=data, timeout=(5, 60))
|
)
|
||||||
if res.status_code == 200:
|
if res.status_code != 200:
|
||||||
text = res.json().get("text")
|
msg = ""
|
||||||
else:
|
try:
|
||||||
res_json = res.json()
|
msg = res.json().get("message", "")
|
||||||
logger.error(f"[LinkVoice] voiceToText error, status_code={res.status_code}, msg={res_json.get('message')}")
|
except Exception:
|
||||||
|
pass
|
||||||
|
logger.error(f"[LinkVoice] voiceToText error, status_code={res.status_code}, msg={msg}")
|
||||||
return None
|
return None
|
||||||
reply = Reply(ReplyType.TEXT, text)
|
text = res.json().get("text")
|
||||||
logger.info(f"[LinkVoice] voiceToText success, text={text}, file name={voice_file}")
|
logger.info(f"[LinkVoice] voiceToText success, text={text}, file name={voice_file}")
|
||||||
|
return Reply(ReplyType.TEXT, text)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(e)
|
logger.error(e)
|
||||||
return None
|
return None
|
||||||
return reply
|
|
||||||
|
|
||||||
def textToVoice(self, text):
|
def textToVoice(self, text):
|
||||||
try:
|
try:
|
||||||
url = conf().get("linkai_api_base", "https://api.link-ai.tech") + "/v1/audio/speech"
|
url = conf().get("linkai_api_base", "https://api.link-ai.tech") + "/v1/audio/speech"
|
||||||
headers = {"Authorization": "Bearer " + conf().get("linkai_api_key")}
|
headers = {"Authorization": "Bearer " + conf().get("linkai_api_key")}
|
||||||
model = const.TTS_1
|
# Gateway routes by `model` (tts-1 / doubao / baidu) + `voice` from
|
||||||
if not conf().get("text_to_voice") or conf().get("text_to_voice") in ["openai", const.TTS_1, const.TTS_1_HD]:
|
# that engine's catalog. `app_code` is optional workspace override.
|
||||||
model = conf().get("text_to_voice_model") or const.TTS_1
|
|
||||||
data = {
|
data = {
|
||||||
"model": model,
|
|
||||||
"input": text,
|
"input": text,
|
||||||
"voice": conf().get("tts_voice_id"),
|
"voice": conf().get("tts_voice_id"),
|
||||||
"app_code": conf().get("linkai_app_code")
|
"app_code": conf().get("linkai_app_code"),
|
||||||
}
|
}
|
||||||
|
model = conf().get("text_to_voice_model")
|
||||||
|
if model:
|
||||||
|
data["model"] = model
|
||||||
res = requests.post(url, headers=headers, json=data, timeout=(5, 120))
|
res = requests.post(url, headers=headers, json=data, timeout=(5, 120))
|
||||||
if res.status_code == 200:
|
if res.status_code != 200:
|
||||||
|
msg = ""
|
||||||
|
try:
|
||||||
|
msg = res.json().get("message", "")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
logger.error(f"[LinkVoice] textToVoice error, status_code={res.status_code}, msg={msg}")
|
||||||
|
return None
|
||||||
tmp_file_name = "tmp/" + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + str(random.randint(0, 1000)) + ".mp3"
|
tmp_file_name = "tmp/" + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + str(random.randint(0, 1000)) + ".mp3"
|
||||||
|
os.makedirs(os.path.dirname(tmp_file_name), exist_ok=True)
|
||||||
with open(tmp_file_name, 'wb') as f:
|
with open(tmp_file_name, 'wb') as f:
|
||||||
f.write(res.content)
|
f.write(res.content)
|
||||||
reply = Reply(ReplyType.VOICE, tmp_file_name)
|
logger.info(f"[LinkVoice] textToVoice success, input={text}, voice_id={data.get('voice')}")
|
||||||
logger.info(f"[LinkVoice] textToVoice success, input={text}, model={model}, voice_id={data.get('voice')}")
|
return Reply(ReplyType.VOICE, tmp_file_name)
|
||||||
return reply
|
|
||||||
else:
|
|
||||||
res_json = res.json()
|
|
||||||
logger.error(f"[LinkVoice] textToVoice error, status_code={res.status_code}, msg={res_json.get('message')}")
|
|
||||||
return None
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(e)
|
logger.error(e)
|
||||||
# reply = Reply(ReplyType.ERROR, "遇到了一点小问题,请稍后再问我吧")
|
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -1,8 +1,7 @@
|
|||||||
# encoding:utf-8
|
# encoding:utf-8
|
||||||
"""
|
"""MiniMax TTS via /v1/t2a_v2 (SSE stream, hex-encoded mp3 chunks)."""
|
||||||
MiniMax TTS voice service
|
|
||||||
"""
|
|
||||||
import datetime
|
import datetime
|
||||||
|
import json
|
||||||
import random
|
import random
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
@@ -12,24 +11,12 @@ from config import conf
|
|||||||
from voice.voice import Voice
|
from voice.voice import Voice
|
||||||
|
|
||||||
|
|
||||||
MINIMAX_TTS_VOICES = [
|
|
||||||
"English_Graceful_Lady",
|
|
||||||
"English_Insightful_Speaker",
|
|
||||||
"English_radiant_girl",
|
|
||||||
"English_Persuasive_Man",
|
|
||||||
"English_Lucky_Robot",
|
|
||||||
"English_expressive_narrator",
|
|
||||||
"Chinese_Warm_Woman",
|
|
||||||
"Chinese_Gentle_Man",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class MinimaxVoice(Voice):
|
class MinimaxVoice(Voice):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.api_key = conf().get("minimax_api_key")
|
self.api_key = conf().get("minimax_api_key")
|
||||||
self.api_base = conf().get("minimax_api_base") or "https://api.minimax.io"
|
# Mainland endpoint matches `sk-api-0-...` keys; override via
|
||||||
# Strip trailing /v1 if present so we can always append /v1/t2a_v2
|
# `minimax_api_base` for international (api.minimax.io) workspaces.
|
||||||
self.api_base = self.api_base.rstrip("/")
|
self.api_base = (conf().get("minimax_api_base") or "https://api.minimaxi.com").rstrip("/")
|
||||||
if self.api_base.endswith("/v1"):
|
if self.api_base.endswith("/v1"):
|
||||||
self.api_base = self.api_base[:-3]
|
self.api_base = self.api_base[:-3]
|
||||||
|
|
||||||
@@ -68,12 +55,14 @@ class MinimaxVoice(Voice):
|
|||||||
response = requests.post(url, headers=headers, json=payload, stream=True, timeout=60)
|
response = requests.post(url, headers=headers, json=payload, stream=True, timeout=60)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
# Parse SSE stream and collect hex-encoded audio chunks
|
# MiniMax returns HTTP 200 even on errors; capture base_resp for diagnostics.
|
||||||
audio_chunks = []
|
audio_chunks = []
|
||||||
buffer = ""
|
last_base_resp = None
|
||||||
|
event_count = 0
|
||||||
for raw in response.iter_lines():
|
for raw in response.iter_lines():
|
||||||
if not raw:
|
if not raw:
|
||||||
continue
|
continue
|
||||||
|
event_count += 1
|
||||||
line = raw.decode("utf-8") if isinstance(raw, bytes) else raw
|
line = raw.decode("utf-8") if isinstance(raw, bytes) else raw
|
||||||
if not line.startswith("data:"):
|
if not line.startswith("data:"):
|
||||||
continue
|
continue
|
||||||
@@ -81,16 +70,31 @@ class MinimaxVoice(Voice):
|
|||||||
if not json_str or json_str == "[DONE]":
|
if not json_str or json_str == "[DONE]":
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
import json
|
|
||||||
event_data = json.loads(json_str)
|
event_data = json.loads(json_str)
|
||||||
audio_hex = event_data.get("data", {}).get("audio")
|
|
||||||
if audio_hex:
|
|
||||||
audio_chunks.append(bytes.fromhex(audio_hex))
|
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
|
base_resp = event_data.get("base_resp") or {}
|
||||||
|
if base_resp:
|
||||||
|
last_base_resp = base_resp
|
||||||
|
audio_hex = (event_data.get("data") or {}).get("audio")
|
||||||
|
if audio_hex:
|
||||||
|
try:
|
||||||
|
audio_chunks.append(bytes.fromhex(audio_hex))
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"[MINIMAX] skip bad audio hex chunk: {e}")
|
||||||
|
|
||||||
if not audio_chunks:
|
if not audio_chunks:
|
||||||
logger.error("[MINIMAX] TTS returned no audio data")
|
ct = response.headers.get("Content-Type", "")
|
||||||
|
if last_base_resp and last_base_resp.get("status_code") not in (None, 0):
|
||||||
|
logger.error(
|
||||||
|
f"[MINIMAX] TTS failed: status_code={last_base_resp.get('status_code')}, "
|
||||||
|
f"status_msg={last_base_resp.get('status_msg')}, model={model}, voice_id={voice_id}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.error(
|
||||||
|
f"[MINIMAX] TTS returned no audio data, model={model}, voice_id={voice_id}, "
|
||||||
|
f"url={url}, http={response.status_code}, content_type={ct!r}, events={event_count}"
|
||||||
|
)
|
||||||
return Reply(ReplyType.ERROR, "语音合成失败,未获取到音频数据")
|
return Reply(ReplyType.ERROR, "语音合成失败,未获取到音频数据")
|
||||||
|
|
||||||
audio_data = b"".join(audio_chunks)
|
audio_data = b"".join(audio_chunks)
|
||||||
|
|||||||
@@ -31,7 +31,8 @@ class OpenaiVoice(Voice):
|
|||||||
"file": file,
|
"file": file,
|
||||||
}
|
}
|
||||||
data = {
|
data = {
|
||||||
"model": "whisper-1",
|
# Override via `voice_to_text_model` (e.g. fall back to whisper-1).
|
||||||
|
"model": conf().get("voice_to_text_model") or "gpt-4o-mini-transcribe",
|
||||||
}
|
}
|
||||||
response = requests.post(url, headers=headers, files=files, data=data)
|
response = requests.post(url, headers=headers, files=files, data=data)
|
||||||
response_data = response.json()
|
response_data = response.json()
|
||||||
|
|||||||
@@ -1,14 +1,8 @@
|
|||||||
# encoding:utf-8
|
# encoding:utf-8
|
||||||
"""
|
"""ZhipuAI voice: glm-asr-2512 (ASR) + glm-tts (TTS) via BigModel REST API."""
|
||||||
ZhipuAI (BigModel) voice service.
|
import datetime
|
||||||
|
|
||||||
ASR : glm-asr-2512 via the OpenAI-compatible /audio/transcriptions endpoint.
|
|
||||||
TTS : not yet implemented.
|
|
||||||
|
|
||||||
Endpoint accepts multipart/form-data with `model`, `file`, and `stream`.
|
|
||||||
File size <= 25MB, duration <= 30s per request.
|
|
||||||
"""
|
|
||||||
import os
|
import os
|
||||||
|
import random
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
@@ -20,6 +14,8 @@ from voice.voice import Voice
|
|||||||
|
|
||||||
|
|
||||||
DEFAULT_ASR_MODEL = "glm-asr-2512"
|
DEFAULT_ASR_MODEL = "glm-asr-2512"
|
||||||
|
DEFAULT_TTS_MODEL = "glm-tts"
|
||||||
|
DEFAULT_TTS_VOICE = "tongtong"
|
||||||
DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"
|
DEFAULT_API_BASE = "https://open.bigmodel.cn/api/paas/v4"
|
||||||
MAX_FILE_BYTES = 25 * 1024 * 1024
|
MAX_FILE_BYTES = 25 * 1024 * 1024
|
||||||
REQUEST_TIMEOUT = (5, 60)
|
REQUEST_TIMEOUT = (5, 60)
|
||||||
@@ -27,7 +23,6 @@ REQUEST_TIMEOUT = (5, 60)
|
|||||||
|
|
||||||
class ZhipuAIVoice(Voice):
|
class ZhipuAIVoice(Voice):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
# api_key / base read per-call so live config edits take effect.
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def voiceToText(self, voice_file: str):
|
def voiceToText(self, voice_file: str):
|
||||||
@@ -81,12 +76,91 @@ class ZhipuAIVoice(Voice):
|
|||||||
return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
|
return Reply(ReplyType.ERROR, "我暂时还无法听清您的语音,请稍后再试吧~")
|
||||||
|
|
||||||
def textToVoice(self, text: str):
|
def textToVoice(self, text: str):
|
||||||
return Reply(ReplyType.ERROR, "ZhipuAI 语音合成尚未接入")
|
try:
|
||||||
|
api_key = conf().get("zhipu_ai_api_key", "")
|
||||||
|
if not api_key:
|
||||||
|
logger.error("[ZhipuAIVoice] zhipu_ai_api_key is not configured")
|
||||||
|
return Reply(ReplyType.ERROR, "未配置 ZhipuAI API key")
|
||||||
|
|
||||||
|
api_base = (conf().get("zhipu_ai_api_base") or DEFAULT_API_BASE).rstrip("/")
|
||||||
|
url = f"{api_base}/audio/speech"
|
||||||
|
model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
|
||||||
|
voice_id = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"model": model,
|
||||||
|
"input": text,
|
||||||
|
"voice": voice_id,
|
||||||
|
"response_format": "wav",
|
||||||
|
"speed": 1.0,
|
||||||
|
"volume": 1.0,
|
||||||
|
}
|
||||||
|
headers = {
|
||||||
|
"Authorization": f"Bearer {api_key}",
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
}
|
||||||
|
response = requests.post(
|
||||||
|
url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.status_code != 200:
|
||||||
|
logger.error(
|
||||||
|
f"[ZhipuAIVoice] textToVoice failed: status={response.status_code} "
|
||||||
|
f"body={response.text[:500]} model={model} voice={voice_id}"
|
||||||
|
)
|
||||||
|
return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
|
||||||
|
|
||||||
|
# Some errors come back as JSON / SSE with HTTP 200.
|
||||||
|
ct = response.headers.get("Content-Type", "")
|
||||||
|
if "application/json" in ct or "text/event-stream" in ct:
|
||||||
|
try:
|
||||||
|
err = response.json()
|
||||||
|
except Exception:
|
||||||
|
err = {"raw": response.text[:500]}
|
||||||
|
logger.error(
|
||||||
|
f"[ZhipuAIVoice] textToVoice unexpected text response "
|
||||||
|
f"(content_type={ct}): {err}"
|
||||||
|
)
|
||||||
|
return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
|
||||||
|
|
||||||
|
audio_bytes = response.content
|
||||||
|
ext = self._sniff_audio_ext(audio_bytes) or "wav"
|
||||||
|
|
||||||
|
file_name = (
|
||||||
|
"tmp/" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
|
||||||
|
+ str(random.randint(0, 1000)) + "." + ext
|
||||||
|
)
|
||||||
|
os.makedirs(os.path.dirname(file_name), exist_ok=True)
|
||||||
|
with open(file_name, "wb") as f:
|
||||||
|
f.write(audio_bytes)
|
||||||
|
logger.info(
|
||||||
|
f"[ZhipuAIVoice] textToVoice model={model} voice={voice_id} "
|
||||||
|
f"file={file_name} bytes={len(audio_bytes)} ext={ext}"
|
||||||
|
)
|
||||||
|
return Reply(ReplyType.VOICE, file_name)
|
||||||
|
except Exception as e:
|
||||||
|
logger.exception(f"[ZhipuAIVoice] textToVoice exception: {e}")
|
||||||
|
return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _sniff_audio_ext(data: bytes) -> str:
|
||||||
|
"""Detect audio container by magic bytes; returns '' on unknown."""
|
||||||
|
if len(data) < 12:
|
||||||
|
return ""
|
||||||
|
head = data[:12]
|
||||||
|
if head[:4] == b"RIFF" and head[8:12] == b"WAVE":
|
||||||
|
return "wav"
|
||||||
|
if head[:3] == b"ID3" or head[:2] == b"\xff\xfb" or head[:2] == b"\xff\xf3" or head[:2] == b"\xff\xf2":
|
||||||
|
return "mp3"
|
||||||
|
if head[:4] == b"OggS":
|
||||||
|
return "ogg"
|
||||||
|
if head[:4] == b"fLaC":
|
||||||
|
return "flac"
|
||||||
|
return ""
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _ensure_compatible_format(voice_file: str) -> str:
|
def _ensure_compatible_format(voice_file: str) -> str:
|
||||||
# glm-asr-2512 only accepts .wav / .mp3 — convert everything else
|
# glm-asr-2512 only accepts .wav / .mp3
|
||||||
# (webm from the browser mic, m4a/amr/silk from chat channels, etc).
|
|
||||||
lower = voice_file.lower()
|
lower = voice_file.lower()
|
||||||
if lower.endswith(".mp3") or lower.endswith(".wav"):
|
if lower.endswith(".mp3") or lower.endswith(".wav"):
|
||||||
return voice_file
|
return voice_file
|
||||||
@@ -95,8 +169,5 @@ class ZhipuAIVoice(Voice):
|
|||||||
audio_convert.any_to_mp3(voice_file, mp3_file)
|
audio_convert.any_to_mp3(voice_file, mp3_file)
|
||||||
return mp3_file
|
return mp3_file
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(
|
logger.warning(f"[ZhipuAIVoice] mp3 convert failed: {e}")
|
||||||
f"[ZhipuAIVoice] convert {voice_file} to mp3 failed: {e}; "
|
|
||||||
f"submitting original file"
|
|
||||||
)
|
|
||||||
return voice_file
|
return voice_file
|
||||||
|
|||||||
Reference in New Issue
Block a user