diff --git a/agent/tools/vision/vision.py b/agent/tools/vision/vision.py index 3f8ad308..8a2756c2 100644 --- a/agent/tools/vision/vision.py +++ b/agent/tools/vision/vision.py @@ -1,7 +1,13 @@ """ -Vision tool - Analyze images using OpenAI-compatible Vision API. +Vision tool - Analyze images using Vision API. Supports local files (auto base64-encoded) and HTTP URLs. -Providers are tried in priority order with automatic fallback on failure. + +Provider priority (default): + 1. Main model via bot.call_vision — zero extra cost + 2. Other models whose API key is configured — auto-discovered + 3. OpenAI / LinkAI raw HTTP — reliable fallback + When use_linkai=true, LinkAI is promoted to #1. + When tool.vision.model is set, that model is used exclusively first. """ import base64 @@ -14,10 +20,11 @@ from typing import Any, Dict, List, Optional import requests from agent.tools.base_tool import BaseTool, ToolResult +from common import const from common.log import logger from config import conf -DEFAULT_MODEL = "gpt-4.1-mini" +DEFAULT_MODEL = const.GPT_41_MINI DEFAULT_TIMEOUT = 60 MAX_TOKENS = 1000 COMPRESS_THRESHOLD = 1_048_576 # 1 MB @@ -30,8 +37,20 @@ SUPPORTED_EXTENSIONS = { "webp": "image/webp", } +_MAIN_MODEL_PROVIDER_NAME = "MainModel" -OPENAI_COMPATIBLE_BOT_TYPES = {"openai", "openAI", "chatGPT"} +# (config_key_for_api_key, bot_type, default_vision_model, provider_display_name) +# Auto-discovered as fallback vision providers when their API key is configured. +# OpenAI and LinkAI are handled separately (raw HTTP providers), so not listed here. 
+_DISCOVERABLE_MODELS = [ + ("moonshot_api_key", const.MOONSHOT, const.KIMI_K2_5, "Moonshot"), + ("ark_api_key", const.DOUBAO, const.DOUBAO_SEED_2_PRO, "Doubao"), + ("dashscope_api_key", const.QWEN_DASHSCOPE, const.QWEN36_PLUS, "DashScope"), + ("claude_api_key", const.CLAUDEAPI, const.CLAUDE_4_6_SONNET, "Claude"), + ("gemini_api_key", const.GEMINI, const.GEMINI_31_FLASH_LITE_PRE, "Gemini"), + ("zhipu_ai_api_key", const.ZHIPU_AI, const.GLM_4_7, "ZhipuAI"), + ("minimax_api_key", const.MiniMax, const.MINIMAX_M2_7, "MiniMax"), +] @dataclass @@ -42,6 +61,8 @@ class VisionProvider: api_base: str extra_headers: dict = field(default_factory=dict) model_override: Optional[str] = None + use_bot: bool = False # When True, call via bot.call_vision instead of raw HTTP + fallback_bot: Any = None # Bot instance for non-main-model providers class VisionAPIError(Exception): @@ -50,13 +71,12 @@ class VisionAPIError(Exception): class Vision(BaseTool): - """Analyze images using OpenAI-compatible Vision API""" + """Analyze images using Vision API""" name: str = "vision" description: str = ( "Analyze a local image or image URL (jpg/jpeg/png) using Vision API. " "Can describe content, extract text, identify objects, colors, etc. " - "Requires OPENAI_API_KEY or LINKAI_API_KEY." ) params: dict = { @@ -70,13 +90,6 @@ class Vision(BaseTool): "type": "string", "description": "Question to ask about the image", }, - "model": { - "type": "string", - "description": ( - f"Vision model to use (default: {DEFAULT_MODEL}). 
" - "Options: gpt-4.1-mini, gpt-4.1, gpt-4o-mini, gpt-4o" - ), - }, }, "required": ["image", "question"], } @@ -86,15 +99,11 @@ class Vision(BaseTool): @staticmethod def is_available() -> bool: - return bool( - conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY") - or conf().get("linkai_api_key") or os.environ.get("LINKAI_API_KEY") - ) + return True def execute(self, args: Dict[str, Any]) -> ToolResult: image = args.get("image", "").strip() question = args.get("question", "").strip() - model = args.get("model", DEFAULT_MODEL).strip() or DEFAULT_MODEL if not image: return ToolResult.fail("Error: 'image' parameter is required") @@ -104,11 +113,12 @@ class Vision(BaseTool): providers = self._resolve_providers() if not providers: return ToolResult.fail( - "Error: No API key configured for Vision.\n" - "Please configure one of the following using env_config tool:\n" - " 1. OPENAI_API_KEY (preferred): env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n" - " 2. LINKAI_API_KEY (fallback): env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")\n\n" - "Get your key at: https://platform.openai.com/api-keys or https://link-ai.tech" + "Error: No model available for Vision.\n" + "The main model does not support vision and no other API keys are configured.\n" + "Options:\n" + " 1. Switch to a multimodal model (e.g. qwen3.6-plus, claude-sonnet-4-6, gemini-2.0-flash)\n" + " 2. Configure OPENAI_API_KEY: env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n" + " 3. 
Configure LINKAI_API_KEY: env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")" ) try: @@ -116,7 +126,7 @@ class Vision(BaseTool): except Exception as e: return ToolResult.fail(f"Error: {e}") - return self._call_with_fallback(providers, model, question, image_content) + return self._call_with_fallback(providers, DEFAULT_MODEL, question, image_content) def _call_with_fallback(self, providers: List[VisionProvider], model: str, question: str, image_content: dict) -> ToolResult: @@ -125,9 +135,14 @@ class Vision(BaseTool): for i, provider in enumerate(providers): use_model = provider.model_override or model try: - logger.debug(f"[Vision] Trying provider '{provider.name}' " - f"with model '{use_model}' ({i + 1}/{len(providers)})") - return self._call_api(provider, use_model, question, image_content) + logger.info(f"[Vision] Trying provider '{provider.name}' " + f"with model '{use_model}' ({i + 1}/{len(providers)})") + if provider.use_bot: + result = self._call_via_bot(use_model, question, image_content, provider) + else: + result = self._call_api(provider, use_model, question, image_content) + logger.info(f"[Vision] ✅ Success via {provider.name} (model={use_model})") + return result except VisionAPIError as e: errors.append(f"[{provider.name}/{use_model}] {e}") logger.warning(f"[Vision] Provider '{provider.name}' failed: {e}") @@ -148,35 +163,113 @@ class Vision(BaseTool): def _resolve_providers(self) -> List[VisionProvider]: """ Build an ordered list of available providers. - Each provider builder returns a VisionProvider or None. - To add a new provider, append a builder method to _PROVIDER_BUILDERS. + + Priority: + - use_linkai=true → [LinkAI, MainModel, OtherModels…, OpenAI] + - default → [MainModel, OtherModels…, OpenAI, LinkAI] + + "OtherModels" are auto-discovered from configured API keys. + The main model's bot_type is excluded from OtherModels to avoid + duplicating the MainModel provider. 
""" + use_linkai = conf().get("use_linkai", False) and conf().get("linkai_api_key") providers: List[VisionProvider] = [] - for builder in self._PROVIDER_BUILDERS: - provider = builder(self) - if provider: - providers.append(provider) + + if use_linkai: + self._append_provider(providers, self._build_linkai_provider) + self._append_provider(providers, self._build_main_model_provider) + self._append_other_model_providers(providers) + self._append_provider(providers, self._build_openai_provider) + else: + self._append_provider(providers, self._build_main_model_provider) + self._append_other_model_providers(providers) + self._append_provider(providers, self._build_openai_provider) + self._append_provider(providers, self._build_linkai_provider) + return providers - def _build_custom_model_provider(self) -> Optional[VisionProvider]: + @staticmethod + def _append_provider(providers: List[VisionProvider], builder) -> None: + p = builder() + if p: + providers.append(p) + + def _append_other_model_providers(self, providers: List[VisionProvider]) -> None: """ - When bot_type is openai-compatible and a custom model is configured, - try the user's own model first — it may already support multimodal input. + Auto-discover other models whose API key is configured. + Skip the main model's own bot_type (already covered by MainModel provider). + Skip bot_types that already have a provider in the list (e.g. OpenAI). 
""" - bot_type = conf().get("bot_type", "") - if bot_type not in OPENAI_COMPATIBLE_BOT_TYPES: + # Determine main model's bot_type so we can skip it + main_bot_type = None + if self.model and hasattr(self.model, '_resolve_bot_type'): + main_bot_type = self.model._resolve_bot_type(conf().get("model", "")) + + existing_names = {p.name for p in providers} + + for config_key, bot_type, default_model, display_name in _DISCOVERABLE_MODELS: + if display_name in existing_names: + continue + if bot_type == main_bot_type: + continue + api_key = conf().get(config_key, "") + if not api_key or not api_key.strip(): + continue + + # Create a bot instance and check if it supports call_vision + try: + from models.bot_factory import create_bot + bot = create_bot(bot_type) + if not hasattr(bot, 'call_vision'): + continue + except Exception: + continue + + providers.append(VisionProvider( + name=display_name, + api_key="", + api_base="", + model_override=default_model, + use_bot=True, + fallback_bot=bot, + )) + + def _resolve_vision_model(self) -> Optional[str]: + """ + Determine which model to use for vision. + + 1. User explicit config: tool.vision.model in config.json + 2. Fallback to the main configured model name + """ + tool_conf = conf().get("tool", {}) + user_vision_model = tool_conf.get("vision", {}).get("model") if isinstance(tool_conf, dict) else None + if user_vision_model: + return user_vision_model + model_name = conf().get("model", "") + return model_name or None + + def _build_main_model_provider(self) -> Optional[VisionProvider]: + """ + Use the vendor's own model for vision via bot.call_vision. + Only available when the bot class has call_vision. 
+ """ + if not (self.model and hasattr(self.model, 'bot')): return None - custom_model = conf().get("model", "") - if not custom_model or custom_model == DEFAULT_MODEL: + try: + bot = self.model.bot + if not hasattr(bot, 'call_vision'): + return None + except Exception: return None - api_key = conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY") - if not api_key: - return None - api_base = (conf().get("open_ai_api_base") or os.environ.get("OPENAI_API_BASE", "")).rstrip("/") \ - or "https://api.openai.com/v1" + + vision_model = self._resolve_vision_model() + return VisionProvider( - name="CustomModel", api_key=api_key, api_base=self._ensure_v1(api_base), - model_override=custom_model, + name=_MAIN_MODEL_PROVIDER_NAME, + api_key="", + api_base="", + model_override=vision_model, + use_bot=True, ) def _build_openai_provider(self) -> Optional[VisionProvider]: @@ -200,7 +293,54 @@ class Vision(BaseTool): return VisionProvider(name="LinkAI", api_key=api_key, api_base=self._ensure_v1(api_base), extra_headers=extra) - _PROVIDER_BUILDERS = [_build_custom_model_provider, _build_openai_provider, _build_linkai_provider] + def _call_via_bot(self, model: str, question: str, image_content: dict, + provider: Optional[VisionProvider] = None) -> ToolResult: + """ + Call a model's call_vision with vendor-native API format. + Uses the provider's _fallback_bot if set, otherwise the main model bot. + Raises VisionAPIError on failure so fallback can proceed. 
+ """ + try: + bot = (provider and provider.fallback_bot) or self.model.bot + except Exception as e: + raise VisionAPIError(f"Cannot access bot: {e}") + + # Extract the raw image URL from the OpenAI-format image_content block + image_url = image_content.get("image_url", {}).get("url", "") + if not image_url: + raise VisionAPIError("No image URL in content block") + + try: + response = bot.call_vision( + image_url=image_url, + question=question, + model=model, + max_tokens=MAX_TOKENS, + ) + except Exception as e: + raise VisionAPIError(f"call_vision failed: {e}") + + if response is NotImplemented: + raise VisionAPIError("Bot does not support vision") + + if isinstance(response, dict) and response.get("error"): + raise VisionAPIError(f"API error - {response.get('message', 'Unknown')}") + + content = response.get("content", "") if isinstance(response, dict) else "" + if not content: + raise VisionAPIError("Empty response from main model") + + usage_info = response.get("usage", {}) if isinstance(response, dict) else {} + + # Use the actual model name from the bot response if available + actual_model = response.get("model", model) if isinstance(response, dict) else model + provider_name = provider.name if provider else _MAIN_MODEL_PROVIDER_NAME + return ToolResult.success({ + "model": actual_model, + "provider": provider_name, + "content": content, + "usage": usage_info, + }) @staticmethod def _ensure_v1(api_base: str) -> str: @@ -213,9 +353,13 @@ class Vision(BaseTool): return api_base.rstrip("/") + "/v1" def _build_image_content(self, image: str) -> dict: - """Build the image_url content block for the API request.""" + """ + Build the image_url content block. + Both remote URLs and local files are converted to base64 data URLs + so every bot backend can consume them without extra downloads. 
+ """ if image.startswith(("http://", "https://")): - return {"type": "image_url", "image_url": {"url": image}} + return self._download_to_data_url(image) if not os.path.isfile(image): raise FileNotFoundError(f"Image file not found: {image}") @@ -239,6 +383,19 @@ class Vision(BaseTool): data_url = f"data:{mime_type};base64,{b64}" return {"type": "image_url", "image_url": {"url": data_url}} + @staticmethod + def _download_to_data_url(url: str) -> dict: + """Download a remote image and return it as a base64 data URL.""" + resp = requests.get(url, timeout=30) + if resp.status_code != 200: + raise VisionAPIError(f"Failed to download image: HTTP {resp.status_code}") + content_type = resp.headers.get("Content-Type", "image/jpeg").split(";")[0].strip() + if not content_type.startswith("image/"): + content_type = "image/jpeg" + b64 = base64.b64encode(resp.content).decode("ascii") + data_url = f"data:{content_type};base64,{b64}" + return {"type": "image_url", "image_url": {"url": data_url}} + @staticmethod def _maybe_compress(path: str) -> str: """Compress image to under COMPRESS_THRESHOLD with max long-edge 1536px.""" @@ -312,7 +469,6 @@ class Vision(BaseTool): ], } ], - "max_completion_tokens": MAX_TOKENS, } headers = { diff --git a/bridge/agent_bridge.py b/bridge/agent_bridge.py index 665abd22..073cfa83 100644 --- a/bridge/agent_bridge.py +++ b/bridge/agent_bridge.py @@ -124,14 +124,15 @@ class AgentLLMModel(LLMModel): @property def bot(self): - """Lazy load the bot, re-create when model changes""" + """Lazy load the bot, re-create when model or bot_type changes""" from models.bot_factory import create_bot cur_model = self.model - if self._bot is None or self._bot_model != cur_model: - bot_type = self._resolve_bot_type(cur_model) - self._bot = create_bot(bot_type) + cur_bot_type = self._resolve_bot_type(cur_model) + if self._bot is None or self._bot_model != cur_model or getattr(self, '_bot_type', None) != cur_bot_type: + self._bot = create_bot(cur_bot_type) self._bot = 
add_openai_compatible_support(self._bot) self._bot_model = cur_model + self._bot_type = cur_bot_type return self._bot def call(self, request: LLMRequest): @@ -509,15 +510,15 @@ class AgentBridge: def _migrate_config_to_env(self, workspace_root: str): """ - Migrate API keys from config.json to .env file if not already set - + Sync API keys from config.json to .env file. + Adds new keys and updates changed values on each startup. + Args: workspace_root: Workspace directory path (not used, kept for compatibility) """ from config import conf import os - # Mapping from config.json keys to environment variable names key_mapping = { "open_ai_api_key": "OPENAI_API_KEY", "open_ai_api_base": "OPENAI_API_BASE", @@ -526,10 +527,9 @@ class AgentBridge: "linkai_api_key": "LINKAI_API_KEY", } - # Use fixed secure location for .env file env_file = expand_path("~/.cow/.env") - # Read existing env vars from .env file + # Read existing env vars (key -> value) existing_env_vars = {} if os.path.exists(env_file): try: @@ -537,48 +537,46 @@ class AgentBridge: for line in f: line = line.strip() if line and not line.startswith('#') and '=' in line: - key, _ = line.split('=', 1) - existing_env_vars[key.strip()] = True + key, val = line.split('=', 1) + existing_env_vars[key.strip()] = val.strip() except Exception as e: logger.warning(f"[AgentBridge] Failed to read .env file: {e}") - # Check which keys need to be migrated - keys_to_migrate = {} + # Sync config.json values into .env (add/update/remove) + updated = False for config_key, env_key in key_mapping.items(): - # Skip if already in .env file - if env_key in existing_env_vars: - continue - - # Get value from config.json - value = conf().get(config_key, "") - if value and value.strip(): # Only migrate non-empty values - keys_to_migrate[env_key] = value.strip() - - # Log summary if there are keys to skip - if existing_env_vars: - logger.debug(f"[AgentBridge] {len(existing_env_vars)} env vars already in .env") - - # Write new keys to .env 
file - if keys_to_migrate: + raw = conf().get(config_key, "") + value = raw.strip() if raw else "" + old_value = existing_env_vars.get(env_key) + + if value: + if old_value == value: + continue + existing_env_vars[env_key] = value + os.environ[env_key] = value + updated = True + else: + if old_value is None: + continue + existing_env_vars.pop(env_key, None) + os.environ.pop(env_key, None) + updated = True + updated = True + + if updated: try: - # Ensure ~/.cow directory and .env file exist env_dir = os.path.dirname(env_file) - if not os.path.exists(env_dir): - os.makedirs(env_dir, exist_ok=True) - if not os.path.exists(env_file): - open(env_file, 'a').close() - - # Append new keys - with open(env_file, 'a', encoding='utf-8') as f: - f.write('\n# Auto-migrated from config.json\n') - for key, value in keys_to_migrate.items(): + os.makedirs(env_dir, exist_ok=True) + + with open(env_file, 'w', encoding='utf-8') as f: + f.write('# Environment variables for agent\n') + f.write('# Auto-managed - synced from config.json on startup\n\n') + for key, value in sorted(existing_env_vars.items()): f.write(f'{key}={value}\n') - # Also set in current process - os.environ[key] = value - - logger.info(f"[AgentBridge] Migrated {len(keys_to_migrate)} API keys from config.json to .env: {list(keys_to_migrate.keys())}") + + logger.info(f"[AgentBridge] Synced API keys from config.json to .env") except Exception as e: - logger.warning(f"[AgentBridge] Failed to migrate API keys: {e}") + logger.warning(f"[AgentBridge] Failed to sync API keys: {e}") def _persist_messages( self, session_id: str, new_messages: list, channel_type: str = "" diff --git a/bridge/agent_initializer.py b/bridge/agent_initializer.py index 58bbbfb3..5e0fe01b 100644 --- a/bridge/agent_initializer.py +++ b/bridge/agent_initializer.py @@ -490,7 +490,7 @@ class AgentInitializer: env_file = expand_path("~/.cow/.env") - # Read existing env vars + # Read existing env vars (key -> value) existing_env_vars = {} if 
os.path.exists(env_file): try: @@ -498,38 +498,46 @@ class AgentInitializer: for line in f: line = line.strip() if line and not line.startswith('#') and '=' in line: - key, _ = line.split('=', 1) - existing_env_vars[key.strip()] = True + key, val = line.split('=', 1) + existing_env_vars[key.strip()] = val.strip() except Exception as e: logger.warning(f"[AgentInitializer] Failed to read .env file: {e}") - # Check which keys need migration - keys_to_migrate = {} + # Sync config.json values into .env (add/update/remove) + updated = False for config_key, env_key in key_mapping.items(): - if env_key in existing_env_vars: - continue - value = conf().get(config_key, "") - if value and value.strip(): - keys_to_migrate[env_key] = value.strip() - - # Write new keys - if keys_to_migrate: + raw = conf().get(config_key, "") + value = raw.strip() if raw else "" + old_value = existing_env_vars.get(env_key) + + if value: + if old_value == value: + continue + existing_env_vars[env_key] = value + os.environ[env_key] = value + updated = True + else: + if old_value is None: + continue + existing_env_vars.pop(env_key, None) + os.environ.pop(env_key, None) + updated = True + + if updated: try: env_dir = os.path.dirname(env_file) - if not os.path.exists(env_dir): - os.makedirs(env_dir, exist_ok=True) - if not os.path.exists(env_file): - open(env_file, 'a').close() - - with open(env_file, 'a', encoding='utf-8') as f: - f.write('\n# Auto-migrated from config.json\n') - for key, value in keys_to_migrate.items(): + os.makedirs(env_dir, exist_ok=True) + + # Rewrite the entire .env file to ensure consistency + with open(env_file, 'w', encoding='utf-8') as f: + f.write('# Environment variables for agent\n') + f.write('# Auto-managed - synced from config.json on startup\n\n') + for key, value in sorted(existing_env_vars.items()): f.write(f'{key}={value}\n') - os.environ[key] = value - - logger.info(f"[AgentInitializer] Migrated {len(keys_to_migrate)} API keys to .env: 
{list(keys_to_migrate.keys())}") + + logger.info(f"[AgentInitializer] Synced API keys from config.json to .env") except Exception as e: - logger.warning(f"[AgentInitializer] Failed to migrate API keys: {e}") + logger.warning(f"[AgentInitializer] Failed to sync API keys: {e}") def _start_daily_flush_timer(self): """Start a background thread that flushes all agents' memory daily at 23:55.""" diff --git a/channel/web/static/js/console.js b/channel/web/static/js/console.js index 24e120be..0f6c2a29 100644 --- a/channel/web/static/js/console.js +++ b/channel/web/static/js/console.js @@ -806,15 +806,17 @@ function sendMessage() { } function startSSE(requestId, loadingEl, timestamp) { - const es = new EventSource(`/stream?request_id=${encodeURIComponent(requestId)}`); - activeStreams[requestId] = es; - let botEl = null; let stepsEl = null; // .agent-steps (thinking summaries + tool indicators) let contentEl = null; // .answer-content (final streaming answer) let mediaEl = null; // .media-content (images & file attachments) let accumulatedText = ''; let currentToolEl = null; + let done = false; + + const MAX_RECONNECTS = 10; + const RECONNECT_BASE_MS = 1000; + let reconnectCount = 0; function ensureBotEl() { if (botEl) return; @@ -839,180 +841,204 @@ function startSSE(requestId, loadingEl, timestamp) { mediaEl = botEl.querySelector('.media-content'); } - es.onmessage = function(e) { - let item; - try { item = JSON.parse(e.data); } catch (_) { return; } + function connect() { + const es = new EventSource(`/stream?request_id=${encodeURIComponent(requestId)}`); + activeStreams[requestId] = es; - if (item.type === 'delta') { - ensureBotEl(); - accumulatedText += item.content; - contentEl.innerHTML = renderMarkdown(accumulatedText); - scrollChatToBottom(); + es.onmessage = function(e) { + let item; + try { item = JSON.parse(e.data); } catch (_) { return; } - } else if (item.type === 'tool_start') { - ensureBotEl(); + // Successful data received, reset reconnect counter + 
reconnectCount = 0; - // Save current thinking as a collapsible step - if (accumulatedText.trim()) { - const fullText = accumulatedText.trim(); - const oneLine = fullText.replace(/\n+/g, ' '); - const needsTruncate = oneLine.length > 80; - const stepEl = document.createElement('div'); - stepEl.className = 'agent-step agent-thinking-step' + (needsTruncate ? '' : ' no-expand'); - if (needsTruncate) { - const truncated = oneLine.substring(0, 80) + '…'; - stepEl.innerHTML = ` -
${argsStr}
+ // Add tool execution indicator (collapsible)
+ currentToolEl = document.createElement('div');
+ currentToolEl.className = 'agent-step agent-tool-step';
+ const argsStr = formatToolArgs(item.arguments || {});
+ currentToolEl.innerHTML = `
+ ${argsStr}
+ ${escapeHtml(String(item.result))}`;
+ }
+
+ if (isError) currentToolEl.classList.add('tool-failed');
+ currentToolEl = null;
}
- // Fill output section
- const outputSection = currentToolEl.querySelector('.tool-output-section');
- if (outputSection && item.result) {
- outputSection.innerHTML = `
- ${escapeHtml(String(item.result))}`;
- }
+ } else if (item.type === 'image') {
+ ensureBotEl();
+ const imgEl = document.createElement('img');
+ imgEl.src = item.content;
+ imgEl.alt = 'screenshot';
+ imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);';
+ imgEl.onclick = () => window.open(item.content, '_blank');
+ mediaEl.appendChild(imgEl);
+ scrollChatToBottom();
- if (isError) currentToolEl.classList.add('tool-failed');
- currentToolEl = null;
+ } else if (item.type === 'text') {
+ // Intermediate text sent before media items; display it but keep SSE open.
+ ensureBotEl();
+ contentEl.classList.remove('sse-streaming');
+ const textContent = item.content || accumulatedText;
+ if (textContent) contentEl.innerHTML = renderMarkdown(textContent);
+ applyHighlighting(botEl);
+ scrollChatToBottom();
+
+ } else if (item.type === 'video') {
+ ensureBotEl();
+ const wrapper = document.createElement('div');
+ wrapper.innerHTML = _buildVideoHtml(item.content);
+ mediaEl.appendChild(wrapper.firstElementChild || wrapper);
+ scrollChatToBottom();
+
+ } else if (item.type === 'file') {
+ ensureBotEl();
+ const fileName = item.file_name || item.content.split('/').pop();
+ const fileEl = document.createElement('a');
+ fileEl.href = item.content;
+ fileEl.download = fileName;
+ fileEl.target = '_blank';
+ fileEl.className = 'file-attachment';
+ fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);';
+ fileEl.innerHTML = ` ${fileName}`;
+ mediaEl.appendChild(fileEl);
+ scrollChatToBottom();
+
+ } else if (item.type === 'phase') {
+ // Coarse progress (e.g. cow install-browser); must not close SSE (unlike "done")
+ ensureBotEl();
+ const wrap = document.createElement('div');
+ wrap.className = 'text-xs sm:text-sm text-slate-600 dark:text-slate-400 border-l-2 border-primary-400 pl-2 py-1 my-0.5';
+ wrap.textContent = String(item.content || '');
+ stepsEl.appendChild(wrap);
+ scrollChatToBottom();
+
+ } else if (item.type === 'done') {
+ done = true;
+ es.close();
+ delete activeStreams[requestId];
+
+ // item.content may be empty when "done" is only a stream-close signal after media.
+ const finalText = item.content || accumulatedText;
+
+ if (!botEl && finalText) {
+ if (loadingEl) { loadingEl.remove(); loadingEl = null; }
+ addBotMessage(finalText, new Date((item.timestamp || Date.now() / 1000) * 1000), requestId);
+ } else if (botEl) {
+ contentEl.classList.remove('sse-streaming');
+ // Only update text content when there is something new to show.
+ if (finalText) contentEl.innerHTML = renderMarkdown(finalText);
+ applyHighlighting(botEl);
+ }
+ scrollChatToBottom();
+
+ } else if (item.type === 'error') {
+ done = true;
+ es.close();
+ delete activeStreams[requestId];
+ if (loadingEl) { loadingEl.remove(); loadingEl = null; }
+ addBotMessage(t('error_send'), new Date());
}
+ };
- } else if (item.type === 'image') {
- ensureBotEl();
- const imgEl = document.createElement('img');
- imgEl.src = item.content;
- imgEl.alt = 'screenshot';
- imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);';
- imgEl.onclick = () => window.open(item.content, '_blank');
- mediaEl.appendChild(imgEl);
- scrollChatToBottom();
-
- } else if (item.type === 'text') {
- // Intermediate text sent before media items; display it but keep SSE open.
- ensureBotEl();
- contentEl.classList.remove('sse-streaming');
- const textContent = item.content || accumulatedText;
- if (textContent) contentEl.innerHTML = renderMarkdown(textContent);
- applyHighlighting(botEl);
- scrollChatToBottom();
-
- } else if (item.type === 'video') {
- ensureBotEl();
- const wrapper = document.createElement('div');
- wrapper.innerHTML = _buildVideoHtml(item.content);
- mediaEl.appendChild(wrapper.firstElementChild || wrapper);
- scrollChatToBottom();
-
- } else if (item.type === 'file') {
- ensureBotEl();
- const fileName = item.file_name || item.content.split('/').pop();
- const fileEl = document.createElement('a');
- fileEl.href = item.content;
- fileEl.download = fileName;
- fileEl.target = '_blank';
- fileEl.className = 'file-attachment';
- fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);';
- fileEl.innerHTML = ` ${fileName}`;
- mediaEl.appendChild(fileEl);
- scrollChatToBottom();
-
- } else if (item.type === 'phase') {
- // Coarse progress (e.g. cow install-browser); must not close SSE (unlike "done")
- ensureBotEl();
- const wrap = document.createElement('div');
- wrap.className = 'text-xs sm:text-sm text-slate-600 dark:text-slate-400 border-l-2 border-primary-400 pl-2 py-1 my-0.5';
- wrap.textContent = String(item.content || '');
- stepsEl.appendChild(wrap);
- scrollChatToBottom();
-
- } else if (item.type === 'done') {
+ es.onerror = function() {
es.close();
delete activeStreams[requestId];
- // item.content may be empty when "done" is only a stream-close signal after media.
- const finalText = item.content || accumulatedText;
+ if (done) return;
- if (!botEl && finalText) {
- if (loadingEl) { loadingEl.remove(); loadingEl = null; }
- addBotMessage(finalText, new Date((item.timestamp || Date.now() / 1000) * 1000), requestId);
- } else if (botEl) {
+ if (reconnectCount < MAX_RECONNECTS) {
+ reconnectCount++;
+ const delay = Math.min(RECONNECT_BASE_MS * reconnectCount, 5000);
+ console.warn(`[SSE] connection lost for ${requestId}, reconnecting in ${delay}ms (attempt ${reconnectCount}/${MAX_RECONNECTS})`);
+ setTimeout(connect, delay);
+ return;
+ }
+
+ // Exhausted retries, show whatever we have
+ if (loadingEl) { loadingEl.remove(); loadingEl = null; }
+ if (!botEl) {
+ addBotMessage(t('error_send'), new Date());
+ } else if (accumulatedText) {
contentEl.classList.remove('sse-streaming');
- // Only update text content when there is something new to show.
- if (finalText) contentEl.innerHTML = renderMarkdown(finalText);
+ contentEl.innerHTML = renderMarkdown(accumulatedText);
applyHighlighting(botEl);
}
- scrollChatToBottom();
+ };
+ }
- } else if (item.type === 'error') {
- es.close();
- delete activeStreams[requestId];
- if (loadingEl) { loadingEl.remove(); loadingEl = null; }
- addBotMessage(t('error_send'), new Date());
- }
- };
-
- es.onerror = function() {
- es.close();
- delete activeStreams[requestId];
- if (loadingEl) { loadingEl.remove(); loadingEl = null; }
- if (!botEl) {
- addBotMessage(t('error_send'), new Date());
- } else if (accumulatedText) {
- contentEl.classList.remove('sse-streaming');
- contentEl.innerHTML = renderMarkdown(accumulatedText);
- applyHighlighting(botEl);
- }
- };
+ connect();
}
function startPolling() {
diff --git a/channel/web/web_channel.py b/channel/web/web_channel.py
index 32b27062..bd686f9f 100644
--- a/channel/web/web_channel.py
+++ b/channel/web/web_channel.py
@@ -329,14 +329,18 @@ class WebChannel(ChatChannel):
"""
SSE generator for a given request_id.
Yields UTF-8 encoded bytes to avoid WSGI Latin-1 mangling.
+ Supports client reconnection: the queue is only removed after a
+ "done" event is consumed, so a new GET /stream with the same
+ request_id can resume reading remaining events.
"""
if request_id not in self.sse_queues:
yield b"data: {\"type\": \"error\", \"message\": \"invalid request_id\"}\n\n"
return
q = self.sse_queues[request_id]
- timeout = 300 # 5 minutes max
- deadline = time.time() + timeout
+ idle_timeout = 600 # 10 minutes without any real event
+ deadline = time.time() + idle_timeout
+ done = False
try:
while time.time() < deadline:
@@ -346,13 +350,18 @@ class WebChannel(ChatChannel):
yield b": keepalive\n\n"
continue
+ # Real event received, reset idle deadline
+ deadline = time.time() + idle_timeout
+
payload = json.dumps(item, ensure_ascii=False)
yield f"data: {payload}\n\n".encode("utf-8")
if item.get("type") == "done":
+ done = True
break
finally:
- self.sse_queues.pop(request_id, None)
+ if done:
+ self.sse_queues.pop(request_id, None)
def poll_response(self):
"""
diff --git a/docs/en/tools/vision.mdx b/docs/en/tools/vision.mdx
new file mode 100644
index 00000000..cebecbea
--- /dev/null
+++ b/docs/en/tools/vision.mdx
@@ -0,0 +1,72 @@
+---
+title: vision - Image Analysis
+description: Analyze image content (recognition, description, OCR, etc.)
+---
+
+Analyze local images or image URLs using a Vision API. Supports content description, text extraction (OCR), object recognition, and more.
+
+## Model Selection
+
+The vision tool uses a multi-level auto-selection strategy with automatic fallback — no manual configuration required:
+
+1. **Main model** — uses the currently configured main model for image recognition (zero extra cost)
+2. **Other configured models** — auto-discovers other models with configured API keys as alternatives
+3. **OpenAI** — uses `open_ai_api_key` to call gpt-4.1-mini
+4. **LinkAI** — uses `linkai_api_key` to call the LinkAI vision service
+
+When `use_linkai=true` and `linkai_api_key` is configured, LinkAI is promoted to the highest priority.
+
+If the current provider fails, the tool automatically tries the next one until it succeeds or all fail.
+
+### Supported Models
+
+| Vendor | Vision Model | Notes |
+| --- | --- | --- |
+| OpenAI / Compatible | Main model | All OpenAI-compatible multimodal models |
+| Qwen (DashScope) | Main model | Via MultiModalConversation API |
+| Claude | Main model | Anthropic native image format |
+| Gemini | Main model | inlineData format |
+| Doubao | Main model | doubao-seed-2-0 series natively supported |
+| Kimi (Moonshot) | Main model | kimi-k2.5 natively supported |
+| ZhipuAI | glm-5v-turbo | Always uses dedicated vision model |
+| MiniMax | MiniMax-Text-01 | Always uses dedicated vision model |
+
+