feat(voice): rework TTS/ASR stack and unify tool/skill config schema

This commit is contained in:
zhayujie
2026-05-21 16:00:54 +08:00
parent 2b90f377e6
commit b8333e351c
31 changed files with 1551 additions and 335 deletions

View File

@@ -44,6 +44,7 @@ CREATE TABLE IF NOT EXISTS messages (
role TEXT NOT NULL,
content TEXT NOT NULL,
created_at INTEGER NOT NULL,
extras TEXT NOT NULL DEFAULT '',
UNIQUE (session_id, seq)
);
@@ -67,6 +68,12 @@ _MIGRATION_ADD_CONTEXT_START_SEQ = """
ALTER TABLE sessions ADD COLUMN context_start_seq INTEGER NOT NULL DEFAULT 0;
"""
# Generic JSON sidecar for per-message attachments (TTS audio URL, future use).
# Always optional — readers must tolerate missing column / empty / invalid JSON.
_MIGRATION_ADD_MSG_EXTRAS = """
ALTER TABLE messages ADD COLUMN extras TEXT NOT NULL DEFAULT '';
"""
DEFAULT_MAX_AGE_DAYS: int = 30
@@ -169,20 +176,26 @@ def _group_into_display_turns(
cur_rest: List[tuple] = []
started = False
for role, raw_content, created_at in rows:
for role, raw_content, created_at, raw_extras in rows:
try:
content = json.loads(raw_content)
except Exception:
content = raw_content
try:
extras = json.loads(raw_extras) if raw_extras else {}
if not isinstance(extras, dict):
extras = {}
except Exception:
extras = {}
if role == "user" and _is_visible_user_message(content):
if started:
groups.append((cur_user, cur_rest))
cur_user = (content, created_at)
cur_user = (content, created_at, extras)
cur_rest = []
started = True
else:
cur_rest.append((role, content, created_at))
cur_rest.append((role, content, created_at, extras))
if started:
groups.append((cur_user, cur_rest))
@@ -195,7 +208,7 @@ def _group_into_display_turns(
for user_row, rest in groups:
# User turn
if user_row:
content, created_at = user_row
content, created_at, _u_extras = user_row
text = _extract_display_text(content)
if text:
turns.append({"role": "user", "content": text, "created_at": created_at})
@@ -206,8 +219,11 @@ def _group_into_display_turns(
tool_results: Dict[str, str] = {}
final_text = ""
final_ts: Optional[int] = None
merged_extras: Dict[str, Any] = {}
for role, content, created_at in rest:
for role, content, created_at, extras in rest:
if role == "assistant" and isinstance(extras, dict):
merged_extras.update(extras)
if role == "user":
tool_results.update(_extract_tool_results(content))
elif role == "assistant":
@@ -256,6 +272,8 @@ def _group_into_display_turns(
"steps": steps,
"created_at": final_ts or (user_row[1] if user_row else 0),
}
if merged_extras:
turn["extras"] = merged_extras
turns.append(turn)
return turns
@@ -411,13 +429,15 @@ class ConversationStore:
content = json.dumps(
msg.get("content", ""), ensure_ascii=False
)
extras_obj = msg.get("extras") or {}
extras = json.dumps(extras_obj, ensure_ascii=False) if extras_obj else ""
conn.execute(
"""
INSERT OR IGNORE INTO messages
(session_id, seq, role, content, created_at)
VALUES (?, ?, ?, ?, ?)
(session_id, seq, role, content, created_at, extras)
VALUES (?, ?, ?, ?, ?, ?)
""",
(session_id, next_seq, role, content, now),
(session_id, next_seq, role, content, now, extras),
)
next_seq += 1
@@ -651,6 +671,55 @@ class ConversationStore:
logger.info(f"[ConversationStore] Pruned {deleted} expired sessions")
return deleted
def attach_extras_to_last_assistant(
    self,
    session_id: str,
    extras: Dict[str, Any],
) -> Optional[int]:
    """
    Merge ``extras`` into the latest assistant message of a session.

    Used by post-processing (e.g. TTS) that needs to annotate an already
    persisted bot reply with attachments such as audio URLs.

    The stored ``extras`` value is parsed as JSON; anything that is empty,
    unparsable, or not a JSON object is treated as an empty dict before the
    new keys are merged in (new keys overwrite existing ones).

    Args:
        session_id: Session whose most recent assistant row is updated.
        extras: Keys to merge into that row's ``extras`` JSON object.

    Returns:
        The message seq that was updated, or ``None`` if ``extras`` is
        empty, no assistant message exists, or the update could not be
        applied (including failure to open the database connection).
    """
    if not extras:
        return None

    with self._lock:
        # Opened inside the try block so that a connection failure is
        # logged and reported as None, like every other failure here,
        # instead of escaping to the best-effort caller.
        conn = None
        try:
            conn = self._connect()
            row = conn.execute(
                """
                SELECT seq, extras FROM messages
                WHERE session_id = ? AND role = 'assistant'
                ORDER BY seq DESC LIMIT 1
                """,
                (session_id,),
            ).fetchone()
            if not row:
                return None

            seq, raw = row
            try:
                merged = json.loads(raw) if raw else {}
            except Exception:
                # Corrupt sidecar JSON must not block the new attachment.
                merged = {}
            if not isinstance(merged, dict):
                merged = {}
            merged.update(extras)

            conn.execute(
                "UPDATE messages SET extras = ? WHERE session_id = ? AND seq = ?",
                (json.dumps(merged, ensure_ascii=False), session_id, seq),
            )
            conn.commit()
            return seq
        except Exception as e:
            logger.warning(f"[ConversationStore] attach_extras failed: {e}")
            return None
        finally:
            if conn is not None:
                conn.close()
def load_history_page(
self,
session_id: str,
@@ -698,15 +767,31 @@ class ConversationStore:
).fetchone()
ctx_start = ctx_row[0] if ctx_row else 0
rows = conn.execute(
"""
SELECT seq, role, content, created_at
FROM messages
WHERE session_id = ?
ORDER BY seq ASC
""",
(session_id,),
).fetchall()
# extras column is added by migration; tolerate older DBs that
# might miss it by falling back to an empty string for extras.
try:
rows = conn.execute(
"""
SELECT seq, role, content, created_at, extras
FROM messages
WHERE session_id = ?
ORDER BY seq ASC
""",
(session_id,),
).fetchall()
except sqlite3.OperationalError:
rows = [
(seq, role, content, created_at, "")
for (seq, role, content, created_at) in conn.execute(
"""
SELECT seq, role, content, created_at
FROM messages
WHERE session_id = ?
ORDER BY seq ASC
""",
(session_id,),
).fetchall()
]
finally:
conn.close()
@@ -719,13 +804,16 @@ class ConversationStore:
include_thinking = False
# Strip seq for display grouping, but record max seq per visible user group
plain_rows = [(role, content, created_at) for _seq, role, content, created_at in rows]
plain_rows = [
(role, content, created_at, extras_raw)
for _seq, role, content, created_at, extras_raw in rows
]
visible = _group_into_display_turns(plain_rows, include_thinking=include_thinking)
# Build a mapping: find the seq of each visible user message to annotate context boundary.
# Walk through rows to find visible user message seqs in order.
visible_user_seqs: List[int] = []
for seq, role, raw_content, _ts in rows:
for seq, role, raw_content, _ts, _extras in rows:
if role != "user":
continue
try:
@@ -911,6 +999,18 @@ class ConversationStore:
except Exception as e:
logger.warning(f"[ConversationStore] Migration (context_start_seq) failed: {e}")
msg_cols = {
row[1]
for row in conn.execute("PRAGMA table_info(messages)").fetchall()
}
if "extras" not in msg_cols:
try:
conn.execute(_MIGRATION_ADD_MSG_EXTRAS)
conn.commit()
logger.info("[ConversationStore] Migrated: added messages.extras column")
except Exception as e:
logger.warning(f"[ConversationStore] Migration (extras) failed: {e}")
def _connect(self) -> sqlite3.Connection:
conn = sqlite3.connect(str(self._db_path), timeout=10)
conn.execute("PRAGMA journal_mode=WAL")

View File

@@ -3,7 +3,7 @@ Vision tool - Analyze images using Vision API.
Supports local files (auto base64-encoded) and HTTP URLs.
Provider resolution:
- tool.vision.model (if set) means "prefer this model first; fall back to
- tools.vision.model (if set) means "prefer this model first; fall back to
other configured providers if it fails". The model name is mapped to its
native provider (e.g. doubao-* → Doubao, kimi-* → Moonshot, gpt-* →
OpenAI/LinkAI). That provider is tried first, then the standard auto
@@ -60,7 +60,7 @@ _DISCOVERABLE_MODELS = [
]
# Model name prefix → discoverable provider display_name.
# Used to auto-route tool.vision.model to its native provider.
# Used to auto-route tools.vision.model to its native provider.
# Matched case-insensitively; longest prefix wins.
_MODEL_PREFIX_TO_PROVIDER = [
("doubao-", "Doubao"),
@@ -154,7 +154,7 @@ class Vision(BaseTool):
# Default model is only used as a last-resort placeholder for providers
# whose VisionProvider.model_override is None (e.g. raw OpenAI provider
# when the user did not configure tool.vision.model).
# when the user did not configure tools.vision.model).
return self._call_with_fallback(providers, DEFAULT_MODEL, question, image_content)
def _call_with_fallback(self, providers: List[VisionProvider], model: str,
@@ -193,12 +193,12 @@ class Vision(BaseTool):
"""
Build an ordered list of providers to try.
Semantics of `tool.vision.model`:
Semantics of `tools.vision.model`:
"Prefer this model first; fall back to other configured providers
if it fails."
Order:
1. The provider that natively serves `tool.vision.model` (if any
1. The provider that natively serves `tools.vision.model` (if any
and its API key is configured) — using the user-specified model
name verbatim.
2. Auto-discovery chain as fallback:
@@ -213,7 +213,7 @@ class Vision(BaseTool):
user_model = self._resolve_user_vision_model()
providers: List[VisionProvider] = []
# Step 1: preferred provider derived from tool.vision.model
# Step 1: preferred provider derived from tools.vision.model
if user_model:
preferred = self._route_by_model_name(user_model)
if preferred:
@@ -251,11 +251,11 @@ class Vision(BaseTool):
@staticmethod
def _resolve_user_vision_model() -> Optional[str]:
"""Read tool.vision.model from config; return None if unset/blank."""
tool_conf = conf().get("tool", {})
if not isinstance(tool_conf, dict):
"""Read tools.vision.model (singular ``tool`` kept as runtime fallback)."""
tools_conf = conf().get("tools") or conf().get("tool") or {}
if not isinstance(tools_conf, dict):
return None
vision_conf = tool_conf.get("vision", {})
vision_conf = tools_conf.get("vision", {})
if not isinstance(vision_conf, dict):
return None
m = vision_conf.get("model")
@@ -303,7 +303,7 @@ class Vision(BaseTool):
self._append_provider(providers, lambda: self._build_linkai_provider(user_model))
if providers:
return providers
logger.warning(f"[Vision] tool.vision.model='{user_model}' looks like an OpenAI "
logger.warning(f"[Vision] tools.vision.model='{user_model}' looks like an OpenAI "
f"model but neither OPENAI_API_KEY nor LINKAI_API_KEY is configured.")
return None # fall through to auto
@@ -317,7 +317,7 @@ class Vision(BaseTool):
continue
api_key = conf().get(config_key, "")
if not api_key or not api_key.strip():
logger.warning(f"[Vision] tool.vision.model='{user_model}' routes to "
logger.warning(f"[Vision] tools.vision.model='{user_model}' routes to "
f"'{display_name}' but '{config_key}' is not configured. "
f"Falling back to auto-discovery.")
return None # fall through to auto
@@ -452,8 +452,8 @@ class Vision(BaseTool):
if not self._main_bot_supports_vision(bot):
return None
# Use the configured main model name; do NOT inject tool.vision.model
# here, because by the time we reach this branch the tool.vision.model
# Use the configured main model name; do NOT inject tools.vision.model
# here, because by the time we reach this branch the tools.vision.model
# routing has already been attempted (and either matched the main bot
# or failed to find a provider).
main_model_name = conf().get("model") or None