From b8333e351c63f6ae75fd199954a4d32591f5b9d8 Mon Sep 17 00:00:00 2001 From: zhayujie Date: Thu, 21 May 2026 16:00:54 +0800 Subject: [PATCH] feat(voice): rework TTS/ASR stack and unify tool/skill config schema --- agent/memory/conversation_store.py | 138 +++- agent/tools/vision/vision.py | 28 +- channel/chat_channel.py | 8 +- channel/feishu/feishu_channel.py | 10 +- channel/web/static/css/console.css | 73 +++ channel/web/static/js/console.js | 456 +++++++++++-- channel/web/web_channel.py | 680 +++++++++++++++++--- config-template.json | 4 +- config.py | 99 ++- docs/en/models/qianfan.mdx | 2 +- docs/en/releases/v2.0.7.mdx | 2 +- docs/en/releases/v2.0.8.mdx | 2 +- docs/en/skills/image-generation.mdx | 2 +- docs/en/tools/vision.mdx | 2 +- docs/ja/models/qianfan.mdx | 2 +- docs/ja/releases/v2.0.7.mdx | 2 +- docs/ja/releases/v2.0.8.mdx | 2 +- docs/ja/skills/image-generation.mdx | 2 +- docs/ja/tools/vision.mdx | 2 +- docs/models/qianfan.mdx | 2 +- docs/releases/v2.0.7.mdx | 2 +- docs/releases/v2.0.8.mdx | 2 +- docs/skills/image-generation.mdx | 2 +- docs/tools/vision.mdx | 2 +- skills/image-generation/scripts/generate.py | 2 +- tests/test_qianfan_provider.py | 4 +- voice/dashscope/dashscope_voice.py | 102 ++- voice/linkai/linkai_voice.py | 90 +-- voice/minimax/minimax_voice.py | 54 +- voice/openai/openai_voice.py | 3 +- voice/zhipuai/zhipuai_voice.py | 105 ++- 31 files changed, 1551 insertions(+), 335 deletions(-) diff --git a/agent/memory/conversation_store.py b/agent/memory/conversation_store.py index c5d215bf..48148f61 100644 --- a/agent/memory/conversation_store.py +++ b/agent/memory/conversation_store.py @@ -44,6 +44,7 @@ CREATE TABLE IF NOT EXISTS messages ( role TEXT NOT NULL, content TEXT NOT NULL, created_at INTEGER NOT NULL, + extras TEXT NOT NULL DEFAULT '', UNIQUE (session_id, seq) ); @@ -67,6 +68,12 @@ _MIGRATION_ADD_CONTEXT_START_SEQ = """ ALTER TABLE sessions ADD COLUMN context_start_seq INTEGER NOT NULL DEFAULT 0; """ +# Generic JSON sidecar for per-message attachments (TTS audio URL, future use). +# Always optional — readers must tolerate missing column / empty / invalid JSON. +_MIGRATION_ADD_MSG_EXTRAS = """ +ALTER TABLE messages ADD COLUMN extras TEXT NOT NULL DEFAULT ''; +""" + DEFAULT_MAX_AGE_DAYS: int = 30 @@ -169,20 +176,26 @@ def _group_into_display_turns( cur_rest: List[tuple] = [] started = False - for role, raw_content, created_at in rows: + for role, raw_content, created_at, raw_extras in rows: try: content = json.loads(raw_content) except Exception: content = raw_content + try: + extras = json.loads(raw_extras) if raw_extras else {} + if not isinstance(extras, dict): + extras = {} + except Exception: + extras = {} if role == "user" and _is_visible_user_message(content): if started: groups.append((cur_user, cur_rest)) - cur_user = (content, created_at) + cur_user = (content, created_at, extras) cur_rest = [] started = True else: - cur_rest.append((role, content, created_at)) + cur_rest.append((role, content, created_at, extras)) if started: groups.append((cur_user, cur_rest)) @@ -195,7 +208,7 @@ def _group_into_display_turns( for user_row, rest in groups: # User turn if user_row: - content, created_at = user_row + content, created_at, _u_extras = user_row text = _extract_display_text(content) if text: turns.append({"role": "user", "content": text, "created_at": created_at}) @@ -206,8 +219,11 @@ def _group_into_display_turns( tool_results: Dict[str, str] = {} final_text = "" final_ts: Optional[int] = None + merged_extras: Dict[str, Any] = {} - for role, content, created_at in rest: + for role, content, created_at, extras in rest: + if role == "assistant" and isinstance(extras, dict): + merged_extras.update(extras) if role == "user": tool_results.update(_extract_tool_results(content)) elif role == "assistant": @@ -256,6 +272,8 @@ def _group_into_display_turns( "steps": steps, "created_at": final_ts or (user_row[1] if user_row else 0), } + if merged_extras: + turn["extras"] = merged_extras turns.append(turn) return turns @@ -411,13 +429,15 @@ class ConversationStore: content = json.dumps( msg.get("content", ""), ensure_ascii=False ) + extras_obj = msg.get("extras") or {} + extras = json.dumps(extras_obj, ensure_ascii=False) if extras_obj else "" conn.execute( """ INSERT OR IGNORE INTO messages - (session_id, seq, role, content, created_at) - VALUES (?, ?, ?, ?, ?) + (session_id, seq, role, content, created_at, extras) + VALUES (?, ?, ?, ?, ?, ?) """, - (session_id, next_seq, role, content, now), + (session_id, next_seq, role, content, now, extras), ) next_seq += 1 @@ -651,6 +671,55 @@ class ConversationStore: logger.info(f"[ConversationStore] Pruned {deleted} expired sessions") return deleted + def attach_extras_to_last_assistant( + self, + session_id: str, + extras: Dict[str, Any], + ) -> Optional[int]: + """ + Merge ``extras`` into the latest assistant message of a session. + + Used by post-processing (e.g. TTS) that needs to annotate an already + persisted bot reply with attachments such as audio URLs. + + Returns the message seq that was updated, or ``None`` if no assistant + message exists or the update could not be applied. + """ + if not extras: + return None + with self._lock: + conn = self._connect() + try: + row = conn.execute( + """ + SELECT seq, extras FROM messages + WHERE session_id = ? AND role = 'assistant' + ORDER BY seq DESC LIMIT 1 + """, + (session_id,), + ).fetchone() + if not row: + return None + seq, raw = row + try: + cur = json.loads(raw) if raw else {} + if not isinstance(cur, dict): + cur = {} + except Exception: + cur = {} + cur.update(extras) + conn.execute( + "UPDATE messages SET extras = ? WHERE session_id = ? AND seq = ?", + (json.dumps(cur, ensure_ascii=False), session_id, seq), + ) + conn.commit() + return seq + except Exception as e: + logger.warning(f"[ConversationStore] attach_extras failed: {e}") + return None + finally: + conn.close() + def load_history_page( self, session_id: str, @@ -698,15 +767,31 @@ class ConversationStore: ).fetchone() ctx_start = ctx_row[0] if ctx_row else 0 - rows = conn.execute( - """ - SELECT seq, role, content, created_at - FROM messages - WHERE session_id = ? - ORDER BY seq ASC - """, - (session_id,), - ).fetchall() + # extras column is added by migration; tolerate older DBs that + # might miss it by falling back to a NULL literal. + try: + rows = conn.execute( + """ + SELECT seq, role, content, created_at, extras + FROM messages + WHERE session_id = ? + ORDER BY seq ASC + """, + (session_id,), + ).fetchall() + except sqlite3.OperationalError: + rows = [ + (seq, role, content, created_at, "") + for (seq, role, content, created_at) in conn.execute( + """ + SELECT seq, role, content, created_at + FROM messages + WHERE session_id = ? + ORDER BY seq ASC + """, + (session_id,), + ).fetchall() + ] finally: conn.close() @@ -719,13 +804,16 @@ class ConversationStore: include_thinking = False # Strip seq for display grouping, but record max seq per visible user group - plain_rows = [(role, content, created_at) for _seq, role, content, created_at in rows] + plain_rows = [ + (role, content, created_at, extras_raw) + for _seq, role, content, created_at, extras_raw in rows + ] visible = _group_into_display_turns(plain_rows, include_thinking=include_thinking) # Build a mapping: find the seq of each visible user message to annotate context boundary. # Walk through rows to find visible user message seqs in order. visible_user_seqs: List[int] = [] - for seq, role, raw_content, _ts in rows: + for seq, role, raw_content, _ts, _extras in rows: if role != "user": continue try: @@ -911,6 +999,18 @@ class ConversationStore: except Exception as e: logger.warning(f"[ConversationStore] Migration (context_start_seq) failed: {e}") + msg_cols = { + row[1] + for row in conn.execute("PRAGMA table_info(messages)").fetchall() + } + if "extras" not in msg_cols: + try: + conn.execute(_MIGRATION_ADD_MSG_EXTRAS) + conn.commit() + logger.info("[ConversationStore] Migrated: added messages.extras column") + except Exception as e: + logger.warning(f"[ConversationStore] Migration (extras) failed: {e}") + def _connect(self) -> sqlite3.Connection: conn = sqlite3.connect(str(self._db_path), timeout=10) conn.execute("PRAGMA journal_mode=WAL") diff --git a/agent/tools/vision/vision.py b/agent/tools/vision/vision.py index a1c3265f..6fe6250f 100644 --- a/agent/tools/vision/vision.py +++ b/agent/tools/vision/vision.py @@ -3,7 +3,7 @@ Vision tool - Analyze images using Vision API. Supports local files (auto base64-encoded) and HTTP URLs. Provider resolution: - - tool.vision.model (if set) means "prefer this model first; fall back to + - tools.vision.model (if set) means "prefer this model first; fall back to other configured providers if it fails". The model name is mapped to its native provider (e.g. doubao-* → Doubao, kimi-* → Moonshot, gpt-* → OpenAI/LinkAI). That provider is tried first, then the standard auto @@ -60,7 +60,7 @@ _DISCOVERABLE_MODELS = [ ] # Model name prefix → discoverable provider display_name. -# Used to auto-route tool.vision.model to its native provider. +# Used to auto-route tools.vision.model to its native provider. # Matched case-insensitively; longest prefix wins. _MODEL_PREFIX_TO_PROVIDER = [ ("doubao-", "Doubao"), @@ -154,7 +154,7 @@ class Vision(BaseTool): # Default model is only used as a last-resort placeholder for providers # whose VisionProvider.model_override is None (e.g. raw OpenAI provider - # when the user did not configure tool.vision.model). + # when the user did not configure tools.vision.model). return self._call_with_fallback(providers, DEFAULT_MODEL, question, image_content) def _call_with_fallback(self, providers: List[VisionProvider], model: str, @@ -193,12 +193,12 @@ class Vision(BaseTool): """ Build an ordered list of providers to try. - Semantics of `tool.vision.model`: + Semantics of `tools.vision.model`: "Prefer this model first; fall back to other configured providers if it fails." Order: - 1. The provider that natively serves `tool.vision.model` (if any + 1. The provider that natively serves `tools.vision.model` (if any and its API key is configured) — using the user-specified model name verbatim. 2. Auto-discovery chain as fallback: @@ -213,7 +213,7 @@ class Vision(BaseTool): user_model = self._resolve_user_vision_model() providers: List[VisionProvider] = [] - # Step 1: preferred provider derived from tool.vision.model + # Step 1: preferred provider derived from tools.vision.model if user_model: preferred = self._route_by_model_name(user_model) if preferred: @@ -251,11 +251,11 @@ class Vision(BaseTool): @staticmethod def _resolve_user_vision_model() -> Optional[str]: - """Read tool.vision.model from config; return None if unset/blank.""" - tool_conf = conf().get("tool", {}) - if not isinstance(tool_conf, dict): + """Read tools.vision.model (singular ``tool`` kept as runtime fallback).""" + tools_conf = conf().get("tools") or conf().get("tool") or {} + if not isinstance(tools_conf, dict): return None - vision_conf = tool_conf.get("vision", {}) + vision_conf = tools_conf.get("vision", {}) if not isinstance(vision_conf, dict): return None m = vision_conf.get("model") @@ -303,7 +303,7 @@ class Vision(BaseTool): self._append_provider(providers, lambda: self._build_linkai_provider(user_model)) if providers: return providers - logger.warning(f"[Vision] tool.vision.model='{user_model}' looks like an OpenAI " + logger.warning(f"[Vision] tools.vision.model='{user_model}' looks like an OpenAI " f"model but neither OPENAI_API_KEY nor LINKAI_API_KEY is configured.") return None # fall through to auto @@ -317,7 +317,7 @@ class Vision(BaseTool): continue api_key = conf().get(config_key, "") if not api_key or not api_key.strip(): - logger.warning(f"[Vision] tool.vision.model='{user_model}' routes to " + logger.warning(f"[Vision] tools.vision.model='{user_model}' routes to " f"'{display_name}' but '{config_key}' is not configured. " f"Falling back to auto-discovery.") return None # fall through to auto @@ -452,8 +452,8 @@ class Vision(BaseTool): if not self._main_bot_supports_vision(bot): return None - # Use the configured main model name; do NOT inject tool.vision.model - # here, because by the time we reach this branch the tool.vision.model + # Use the configured main model name; do NOT inject tools.vision.model + # here, because by the time we reach this branch the tools.vision.model # routing has already been attempted (and either matched the main bot # or failed to find a provider). main_model_name = conf().get("model") or None diff --git a/channel/chat_channel.py b/channel/chat_channel.py index 3251c286..760bf860 100644 --- a/channel/chat_channel.py +++ b/channel/chat_channel.py @@ -171,7 +171,13 @@ class ChatChannel(Channel): if "desire_rtype" not in context and conf().get("always_reply_voice") and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE: context["desire_rtype"] = ReplyType.VOICE elif context.type == ContextType.VOICE: - if "desire_rtype" not in context and conf().get("voice_reply_voice") and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE: + # Voice input replies with voice when either voice_reply_voice + # (mirror voice) or the global always_reply_voice toggle is on. + if ( + "desire_rtype" not in context + and (conf().get("voice_reply_voice") or conf().get("always_reply_voice")) + and ReplyType.VOICE not in self.NOT_SUPPORT_REPLYTYPE + ): context["desire_rtype"] = ReplyType.VOICE return context diff --git a/channel/feishu/feishu_channel.py b/channel/feishu/feishu_channel.py index f479394a..ca18e64b 100644 --- a/channel/feishu/feishu_channel.py +++ b/channel/feishu/feishu_channel.py @@ -1515,10 +1515,16 @@ class FeiShuChanel(ChatChannel): else: context.type = ContextType.TEXT context.content = content.strip() + # Text input opts into voice replies only when the always-on toggle is set. + if "desire_rtype" not in context and conf().get("always_reply_voice"): + context["desire_rtype"] = ReplyType.VOICE elif context.type == ContextType.VOICE: - # 2.语音请求 - if "desire_rtype" not in context and conf().get("voice_reply_voice"): + # 2.语音请求: voice input replies with voice if either + # voice_reply_voice (mirror reply) or always_reply_voice is on. + if "desire_rtype" not in context and ( + conf().get("voice_reply_voice") or conf().get("always_reply_voice") + ): context["desire_rtype"] = ReplyType.VOICE return context diff --git a/channel/web/static/css/console.css b/channel/web/static/css/console.css index cbb2a39b..35fc307b 100644 --- a/channel/web/static/css/console.css +++ b/channel/web/static/css/console.css @@ -1294,3 +1294,76 @@ overflow: hidden; min-height: 2.5em; /* ~2 lines at text-sm leading-relaxed */ } + +/* -------------------------------------------------------------------- + * Voice pill — compact custom audio player used by mic uploads and TTS + * replies. Replaces the bulky native