diff --git a/README.md b/README.md
index 89dee8fe..c48f6a72 100644
--- a/README.md
+++ b/README.md
@@ -214,6 +214,7 @@ cow install-browser
+ 添加 `"speech_recognition": true` 将开启语音识别,默认使用 openai 的 whisper 模型识别为文字,同时以文字回复,该参数仅支持私聊 (注意由于语音消息无法匹配前缀,一旦开启将对所有语音自动回复,支持语音触发画图);
+ 添加 `"group_speech_recognition": true` 将开启群组语音识别,默认使用 openai 的 whisper 模型识别为文字,同时以文字回复,参数仅支持群聊 (会匹配 group_chat_prefix 和 group_chat_keyword, 支持语音触发画图);
+ 添加 `"voice_reply_voice": true` 将开启语音回复语音(同时作用于私聊和群聊)
++ 使用 MiniMax TTS:设置 `"text_to_voice": "minimax"`,并配置 `minimax_api_key`;可通过 `"tts_voice_id"` 指定发音人(如 `English_Graceful_Lady`),`"text_to_voice_model"` 指定模型(如 `speech-2.8-hd`、`speech-2.8-turbo`)
@@ -358,7 +359,7 @@ sudo docker logs -f chatgpt-on-wechat
"minimax_api_key": ""
}
```
- - `model`: 可填写 `MiniMax-M2.7、MiniMax-M2.5、MiniMax-M2.1、MiniMax-M2.1-lightning、MiniMax-M2、abab6.5-chat` 等
+ - `model`: 可填写 `MiniMax-M2.7、MiniMax-M2.7-highspeed、MiniMax-M2.5、MiniMax-M2.1、MiniMax-M2.1-lightning、MiniMax-M2、abab6.5-chat` 等
- `minimax_api_key`:MiniMax 平台的 API-KEY,在 [控制台](https://platform.minimaxi.com/user-center/basic-information/interface-key) 创建
方式二:OpenAI 兼容方式接入,配置如下:
@@ -371,7 +372,7 @@ sudo docker logs -f chatgpt-on-wechat
}
```
- `bot_type`: OpenAI 兼容方式
-- `model`: 可填 `MiniMax-M2.7、MiniMax-M2.5、MiniMax-M2.1、MiniMax-M2.1-lightning、MiniMax-M2`,参考[API文档](https://platform.minimaxi.com/document/%E5%AF%B9%E8%AF%9D?key=66701d281d57f38758d581d0#QklxsNSbaf6kM4j6wjO5eEek)
+- `model`: 可填 `MiniMax-M2.7、MiniMax-M2.7-highspeed、MiniMax-M2.5、MiniMax-M2.1、MiniMax-M2.1-lightning、MiniMax-M2`,参考[API文档](https://platform.minimaxi.com/document/%E5%AF%B9%E8%AF%9D?key=66701d281d57f38758d581d0#QklxsNSbaf6kM4j6wjO5eEek)
- `open_ai_api_base`: MiniMax 平台 API 的 BASE URL
- `open_ai_api_key`: MiniMax 平台的 API-KEY
diff --git a/agent/tools/vision/vision.py b/agent/tools/vision/vision.py
index 3f8ad308..8a2756c2 100644
--- a/agent/tools/vision/vision.py
+++ b/agent/tools/vision/vision.py
@@ -1,7 +1,13 @@
"""
-Vision tool - Analyze images using OpenAI-compatible Vision API.
+Vision tool - Analyze images using Vision API.
Supports local files (auto base64-encoded) and HTTP URLs.
-Providers are tried in priority order with automatic fallback on failure.
+
+Provider priority (default):
+ 1. Main model via bot.call_vision — zero extra cost
+ 2. Other models whose API key is configured — auto-discovered
+ 3. OpenAI / LinkAI raw HTTP — reliable fallback
+ When use_linkai=true and linkai_api_key is set in config, LinkAI is promoted to #1.
+ When tool.vision.model is set, the main-model provider uses it instead of the configured main model name.
"""
import base64
@@ -14,10 +20,11 @@ from typing import Any, Dict, List, Optional
import requests
from agent.tools.base_tool import BaseTool, ToolResult
+from common import const
from common.log import logger
from config import conf
-DEFAULT_MODEL = "gpt-4.1-mini"
+DEFAULT_MODEL = const.GPT_41_MINI
DEFAULT_TIMEOUT = 60
MAX_TOKENS = 1000
COMPRESS_THRESHOLD = 1_048_576 # 1 MB
@@ -30,8 +37,20 @@ SUPPORTED_EXTENSIONS = {
"webp": "image/webp",
}
+_MAIN_MODEL_PROVIDER_NAME = "MainModel"
-OPENAI_COMPATIBLE_BOT_TYPES = {"openai", "openAI", "chatGPT"}
+# (config_key_for_api_key, bot_type, default_vision_model, provider_display_name)
+# Auto-discovered as fallback vision providers when their API key is configured.
+# OpenAI and LinkAI are handled separately (raw HTTP providers), so not listed here.
+_DISCOVERABLE_MODELS = [
+ ("moonshot_api_key", const.MOONSHOT, const.KIMI_K2_5, "Moonshot"),
+ ("ark_api_key", const.DOUBAO, const.DOUBAO_SEED_2_PRO, "Doubao"),
+ ("dashscope_api_key", const.QWEN_DASHSCOPE, const.QWEN36_PLUS, "DashScope"),
+ ("claude_api_key", const.CLAUDEAPI, const.CLAUDE_4_6_SONNET, "Claude"),
+ ("gemini_api_key", const.GEMINI, const.GEMINI_31_FLASH_LITE_PRE, "Gemini"),
+ ("zhipu_ai_api_key", const.ZHIPU_AI, const.GLM_4_7, "ZhipuAI"),
+ ("minimax_api_key", const.MiniMax, const.MINIMAX_M2_7, "MiniMax"),
+]
@dataclass
@@ -42,6 +61,8 @@ class VisionProvider:
api_base: str
extra_headers: dict = field(default_factory=dict)
model_override: Optional[str] = None
+ use_bot: bool = False # When True, call via bot.call_vision instead of raw HTTP
+ fallback_bot: Any = None # Bot instance for non-main-model providers
class VisionAPIError(Exception):
@@ -50,13 +71,12 @@ class VisionAPIError(Exception):
class Vision(BaseTool):
- """Analyze images using OpenAI-compatible Vision API"""
+ """Analyze images using Vision API"""
name: str = "vision"
description: str = (
"Analyze a local image or image URL (jpg/jpeg/png) using Vision API. "
"Can describe content, extract text, identify objects, colors, etc. "
- "Requires OPENAI_API_KEY or LINKAI_API_KEY."
)
params: dict = {
@@ -70,13 +90,6 @@ class Vision(BaseTool):
"type": "string",
"description": "Question to ask about the image",
},
- "model": {
- "type": "string",
- "description": (
- f"Vision model to use (default: {DEFAULT_MODEL}). "
- "Options: gpt-4.1-mini, gpt-4.1, gpt-4o-mini, gpt-4o"
- ),
- },
},
"required": ["image", "question"],
}
@@ -86,15 +99,11 @@ class Vision(BaseTool):
@staticmethod
def is_available() -> bool:
- return bool(
- conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY")
- or conf().get("linkai_api_key") or os.environ.get("LINKAI_API_KEY")
- )
+ return True
def execute(self, args: Dict[str, Any]) -> ToolResult:
image = args.get("image", "").strip()
question = args.get("question", "").strip()
- model = args.get("model", DEFAULT_MODEL).strip() or DEFAULT_MODEL
if not image:
return ToolResult.fail("Error: 'image' parameter is required")
@@ -104,11 +113,12 @@ class Vision(BaseTool):
providers = self._resolve_providers()
if not providers:
return ToolResult.fail(
- "Error: No API key configured for Vision.\n"
- "Please configure one of the following using env_config tool:\n"
- " 1. OPENAI_API_KEY (preferred): env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n"
- " 2. LINKAI_API_KEY (fallback): env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")\n\n"
- "Get your key at: https://platform.openai.com/api-keys or https://link-ai.tech"
+ "Error: No model available for Vision.\n"
+ "The main model does not support vision and no other API keys are configured.\n"
+ "Options:\n"
+ " 1. Switch to a multimodal model (e.g. qwen3.6-plus, claude-sonnet-4-6, gemini-2.0-flash)\n"
+ " 2. Configure OPENAI_API_KEY: env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n"
+ " 3. Configure LINKAI_API_KEY: env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")"
)
try:
@@ -116,7 +126,7 @@ class Vision(BaseTool):
except Exception as e:
return ToolResult.fail(f"Error: {e}")
- return self._call_with_fallback(providers, model, question, image_content)
+ return self._call_with_fallback(providers, DEFAULT_MODEL, question, image_content)
def _call_with_fallback(self, providers: List[VisionProvider], model: str,
question: str, image_content: dict) -> ToolResult:
@@ -125,9 +135,14 @@ class Vision(BaseTool):
for i, provider in enumerate(providers):
use_model = provider.model_override or model
try:
- logger.debug(f"[Vision] Trying provider '{provider.name}' "
- f"with model '{use_model}' ({i + 1}/{len(providers)})")
- return self._call_api(provider, use_model, question, image_content)
+ logger.info(f"[Vision] Trying provider '{provider.name}' "
+ f"with model '{use_model}' ({i + 1}/{len(providers)})")
+ if provider.use_bot:
+ result = self._call_via_bot(use_model, question, image_content, provider)
+ else:
+ result = self._call_api(provider, use_model, question, image_content)
+ logger.info(f"[Vision] ✅ Success via {provider.name} (model={use_model})")
+ return result
except VisionAPIError as e:
errors.append(f"[{provider.name}/{use_model}] {e}")
logger.warning(f"[Vision] Provider '{provider.name}' failed: {e}")
@@ -148,35 +163,113 @@ class Vision(BaseTool):
def _resolve_providers(self) -> List[VisionProvider]:
"""
Build an ordered list of available providers.
- Each provider builder returns a VisionProvider or None.
- To add a new provider, append a builder method to _PROVIDER_BUILDERS.
+
+ Priority:
+ - use_linkai=true (and linkai_api_key set) → [LinkAI, MainModel, OtherModels…, OpenAI]
+ - default → [MainModel, OtherModels…, OpenAI, LinkAI]
+
+ "OtherModels" are auto-discovered from configured API keys.
+ The main model's bot_type is excluded from OtherModels to avoid
+ duplicating the MainModel provider.
"""
+ use_linkai = conf().get("use_linkai", False) and conf().get("linkai_api_key")
providers: List[VisionProvider] = []
- for builder in self._PROVIDER_BUILDERS:
- provider = builder(self)
- if provider:
- providers.append(provider)
+
+ if use_linkai:
+ self._append_provider(providers, self._build_linkai_provider)
+ self._append_provider(providers, self._build_main_model_provider)
+ self._append_other_model_providers(providers)
+ self._append_provider(providers, self._build_openai_provider)
+ else:
+ self._append_provider(providers, self._build_main_model_provider)
+ self._append_other_model_providers(providers)
+ self._append_provider(providers, self._build_openai_provider)
+ self._append_provider(providers, self._build_linkai_provider)
+
return providers
- def _build_custom_model_provider(self) -> Optional[VisionProvider]:
+ @staticmethod
+ def _append_provider(providers: List[VisionProvider], builder) -> None:
+ p = builder()
+ if p:
+ providers.append(p)
+
+ def _append_other_model_providers(self, providers: List[VisionProvider]) -> None:
"""
- When bot_type is openai-compatible and a custom model is configured,
- try the user's own model first — it may already support multimodal input.
+ Auto-discover other models whose API key is configured.
+ Skip the main model's own bot_type (already covered by MainModel provider).
+ Skip entries whose display name already matches a provider in the list.
"""
- bot_type = conf().get("bot_type", "")
- if bot_type not in OPENAI_COMPATIBLE_BOT_TYPES:
+ # Determine main model's bot_type so we can skip it
+ main_bot_type = None
+ if self.model and hasattr(self.model, '_resolve_bot_type'):
+ main_bot_type = self.model._resolve_bot_type(conf().get("model", ""))
+
+ existing_names = {p.name for p in providers}
+
+ for config_key, bot_type, default_model, display_name in _DISCOVERABLE_MODELS:
+ if display_name in existing_names:
+ continue
+ if bot_type == main_bot_type:
+ continue
+ api_key = conf().get(config_key, "")
+ if not api_key or not api_key.strip():
+ continue
+
+ # Create a bot instance and check if it supports call_vision
+ try:
+ from models.bot_factory import create_bot
+ bot = create_bot(bot_type)
+ if not hasattr(bot, 'call_vision'):
+ continue
+ except Exception:
+ continue
+
+ providers.append(VisionProvider(
+ name=display_name,
+ api_key="",
+ api_base="",
+ model_override=default_model,
+ use_bot=True,
+ fallback_bot=bot,
+ ))
+
+ def _resolve_vision_model(self) -> Optional[str]:
+ """
+ Determine which model to use for vision.
+
+ 1. User explicit config: tool.vision.model in config.json
+ 2. Fallback to the main configured model name
+ """
+ tool_conf = conf().get("tool", {})
+ user_vision_model = tool_conf.get("vision", {}).get("model") if isinstance(tool_conf, dict) else None
+ if user_vision_model:
+ return user_vision_model
+ model_name = conf().get("model", "")
+ return model_name or None
+
+ def _build_main_model_provider(self) -> Optional[VisionProvider]:
+ """
+ Use the vendor's own model for vision via bot.call_vision.
+ Only available when the bot class has call_vision.
+ """
+ if not (self.model and hasattr(self.model, 'bot')):
return None
- custom_model = conf().get("model", "")
- if not custom_model or custom_model == DEFAULT_MODEL:
+ try:
+ bot = self.model.bot
+ if not hasattr(bot, 'call_vision'):
+ return None
+ except Exception:
return None
- api_key = conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY")
- if not api_key:
- return None
- api_base = (conf().get("open_ai_api_base") or os.environ.get("OPENAI_API_BASE", "")).rstrip("/") \
- or "https://api.openai.com/v1"
+
+ vision_model = self._resolve_vision_model()
+
return VisionProvider(
- name="CustomModel", api_key=api_key, api_base=self._ensure_v1(api_base),
- model_override=custom_model,
+ name=_MAIN_MODEL_PROVIDER_NAME,
+ api_key="",
+ api_base="",
+ model_override=vision_model,
+ use_bot=True,
)
def _build_openai_provider(self) -> Optional[VisionProvider]:
@@ -200,7 +293,54 @@ class Vision(BaseTool):
return VisionProvider(name="LinkAI", api_key=api_key, api_base=self._ensure_v1(api_base),
extra_headers=extra)
- _PROVIDER_BUILDERS = [_build_custom_model_provider, _build_openai_provider, _build_linkai_provider]
+ def _call_via_bot(self, model: str, question: str, image_content: dict,
+ provider: Optional[VisionProvider] = None) -> ToolResult:
+ """
+ Call a model's call_vision with vendor-native API format.
+ Uses the provider's fallback_bot if set, otherwise the main model bot.
+ Raises VisionAPIError on failure so fallback can proceed.
+ """
+ try:
+ bot = (provider and provider.fallback_bot) or self.model.bot
+ except Exception as e:
+ raise VisionAPIError(f"Cannot access bot: {e}")
+
+ # Extract the raw image URL from the OpenAI-format image_content block
+ image_url = image_content.get("image_url", {}).get("url", "")
+ if not image_url:
+ raise VisionAPIError("No image URL in content block")
+
+ try:
+ response = bot.call_vision(
+ image_url=image_url,
+ question=question,
+ model=model,
+ max_tokens=MAX_TOKENS,
+ )
+ except Exception as e:
+ raise VisionAPIError(f"call_vision failed: {e}")
+
+ if response is NotImplemented:
+ raise VisionAPIError("Bot does not support vision")
+
+ if isinstance(response, dict) and response.get("error"):
+ raise VisionAPIError(f"API error - {response.get('message', 'Unknown')}")
+
+ content = response.get("content", "") if isinstance(response, dict) else ""
+ if not content:
+ raise VisionAPIError("Empty response from main model")
+
+ usage_info = response.get("usage", {}) if isinstance(response, dict) else {}
+
+ # Use the actual model name from the bot response if available
+ actual_model = response.get("model", model) if isinstance(response, dict) else model
+ provider_name = provider.name if provider else _MAIN_MODEL_PROVIDER_NAME
+ return ToolResult.success({
+ "model": actual_model,
+ "provider": provider_name,
+ "content": content,
+ "usage": usage_info,
+ })
@staticmethod
def _ensure_v1(api_base: str) -> str:
@@ -213,9 +353,13 @@ class Vision(BaseTool):
return api_base.rstrip("/") + "/v1"
def _build_image_content(self, image: str) -> dict:
- """Build the image_url content block for the API request."""
+ """
+ Build the image_url content block.
+ Both remote URLs and local files are converted to base64 data URLs
+ so every bot backend can consume them without extra downloads.
+ """
if image.startswith(("http://", "https://")):
- return {"type": "image_url", "image_url": {"url": image}}
+ return self._download_to_data_url(image)
if not os.path.isfile(image):
raise FileNotFoundError(f"Image file not found: {image}")
@@ -239,6 +383,19 @@ class Vision(BaseTool):
data_url = f"data:{mime_type};base64,{b64}"
return {"type": "image_url", "image_url": {"url": data_url}}
+ @staticmethod
+ def _download_to_data_url(url: str) -> dict:
+ """Download a remote image and return it as a base64 data URL."""
+ resp = requests.get(url, timeout=30)
+ if resp.status_code != 200:
+ raise VisionAPIError(f"Failed to download image: HTTP {resp.status_code}")
+ content_type = resp.headers.get("Content-Type", "image/jpeg").split(";")[0].strip()
+ if not content_type.startswith("image/"):
+ content_type = "image/jpeg"
+ b64 = base64.b64encode(resp.content).decode("ascii")
+ data_url = f"data:{content_type};base64,{b64}"
+ return {"type": "image_url", "image_url": {"url": data_url}}
+
@staticmethod
def _maybe_compress(path: str) -> str:
"""Compress image to under COMPRESS_THRESHOLD with max long-edge 1536px."""
@@ -312,7 +469,6 @@ class Vision(BaseTool):
],
}
],
- "max_completion_tokens": MAX_TOKENS,
}
headers = {
diff --git a/bridge/agent_bridge.py b/bridge/agent_bridge.py
index 84b7aad6..073cfa83 100644
--- a/bridge/agent_bridge.py
+++ b/bridge/agent_bridge.py
@@ -124,14 +124,15 @@ class AgentLLMModel(LLMModel):
@property
def bot(self):
- """Lazy load the bot, re-create when model changes"""
+ """Lazy load the bot, re-create when model or bot_type changes"""
from models.bot_factory import create_bot
cur_model = self.model
- if self._bot is None or self._bot_model != cur_model:
- bot_type = self._resolve_bot_type(cur_model)
- self._bot = create_bot(bot_type)
+ cur_bot_type = self._resolve_bot_type(cur_model)
+ if self._bot is None or self._bot_model != cur_model or getattr(self, '_bot_type', None) != cur_bot_type:
+ self._bot = create_bot(cur_bot_type)
self._bot = add_openai_compatible_support(self._bot)
self._bot_model = cur_model
+ self._bot_type = cur_bot_type
return self._bot
def call(self, request: LLMRequest):
@@ -498,22 +499,26 @@ class AgentBridge:
reply.text_content = text_response
return reply
- # For other unknown file types, return text with file info
- message = text_response or file_info.get("message", "文件已准备")
- message += f"\n\n[文件: {file_info.get('file_name', file_path)}]"
- return Reply(ReplyType.TEXT, message)
+ # For all other file types (tar.gz, zip, etc.), also use FILE type
+ file_url = f"file://{file_path}"
+ logger.info(f"[AgentBridge] Sending generic file: {file_url}")
+ reply = Reply(ReplyType.FILE, file_url)
+ reply.file_name = file_info.get("file_name", os.path.basename(file_path))
+ if text_response:
+ reply.text_content = text_response
+ return reply
def _migrate_config_to_env(self, workspace_root: str):
"""
- Migrate API keys from config.json to .env file if not already set
-
+ Sync API keys from config.json to .env file.
+ Adds new keys, updates changed values, and removes mapped keys left empty in config.json, on each startup.
+
Args:
workspace_root: Workspace directory path (not used, kept for compatibility)
"""
from config import conf
import os
- # Mapping from config.json keys to environment variable names
key_mapping = {
"open_ai_api_key": "OPENAI_API_KEY",
"open_ai_api_base": "OPENAI_API_BASE",
@@ -522,10 +527,9 @@ class AgentBridge:
"linkai_api_key": "LINKAI_API_KEY",
}
- # Use fixed secure location for .env file
env_file = expand_path("~/.cow/.env")
- # Read existing env vars from .env file
+ # Read existing env vars (key -> value)
existing_env_vars = {}
if os.path.exists(env_file):
try:
@@ -533,48 +537,46 @@ class AgentBridge:
for line in f:
line = line.strip()
if line and not line.startswith('#') and '=' in line:
- key, _ = line.split('=', 1)
- existing_env_vars[key.strip()] = True
+ key, val = line.split('=', 1)
+ existing_env_vars[key.strip()] = val.strip()
except Exception as e:
logger.warning(f"[AgentBridge] Failed to read .env file: {e}")
- # Check which keys need to be migrated
- keys_to_migrate = {}
+ # Sync config.json values into .env (add/update/remove)
+ updated = False
for config_key, env_key in key_mapping.items():
- # Skip if already in .env file
- if env_key in existing_env_vars:
- continue
-
- # Get value from config.json
- value = conf().get(config_key, "")
- if value and value.strip(): # Only migrate non-empty values
- keys_to_migrate[env_key] = value.strip()
-
- # Log summary if there are keys to skip
- if existing_env_vars:
- logger.debug(f"[AgentBridge] {len(existing_env_vars)} env vars already in .env")
-
- # Write new keys to .env file
- if keys_to_migrate:
+ raw = conf().get(config_key, "")
+ value = raw.strip() if raw else ""
+ old_value = existing_env_vars.get(env_key)
+
+ if value:
+ if old_value == value:
+ continue
+ existing_env_vars[env_key] = value
+ os.environ[env_key] = value
+ updated = True
+ else:
+ if old_value is None:
+ continue
+ existing_env_vars.pop(env_key, None)
+ os.environ.pop(env_key, None)
+ updated = True
+ updated = True
+
+ if updated:
try:
- # Ensure ~/.cow directory and .env file exist
env_dir = os.path.dirname(env_file)
- if not os.path.exists(env_dir):
- os.makedirs(env_dir, exist_ok=True)
- if not os.path.exists(env_file):
- open(env_file, 'a').close()
-
- # Append new keys
- with open(env_file, 'a', encoding='utf-8') as f:
- f.write('\n# Auto-migrated from config.json\n')
- for key, value in keys_to_migrate.items():
+ os.makedirs(env_dir, exist_ok=True)
+
+ with open(env_file, 'w', encoding='utf-8') as f:
+ f.write('# Environment variables for agent\n')
+ f.write('# Auto-managed - synced from config.json on startup\n\n')
+ for key, value in sorted(existing_env_vars.items()):
f.write(f'{key}={value}\n')
- # Also set in current process
- os.environ[key] = value
-
- logger.info(f"[AgentBridge] Migrated {len(keys_to_migrate)} API keys from config.json to .env: {list(keys_to_migrate.keys())}")
+
+ logger.info(f"[AgentBridge] Synced API keys from config.json to .env")
except Exception as e:
- logger.warning(f"[AgentBridge] Failed to migrate API keys: {e}")
+ logger.warning(f"[AgentBridge] Failed to sync API keys: {e}")
def _persist_messages(
self, session_id: str, new_messages: list, channel_type: str = ""
diff --git a/bridge/agent_initializer.py b/bridge/agent_initializer.py
index 58bbbfb3..5e0fe01b 100644
--- a/bridge/agent_initializer.py
+++ b/bridge/agent_initializer.py
@@ -490,7 +490,7 @@ class AgentInitializer:
env_file = expand_path("~/.cow/.env")
- # Read existing env vars
+ # Read existing env vars (key -> value)
existing_env_vars = {}
if os.path.exists(env_file):
try:
@@ -498,38 +498,46 @@ class AgentInitializer:
for line in f:
line = line.strip()
if line and not line.startswith('#') and '=' in line:
- key, _ = line.split('=', 1)
- existing_env_vars[key.strip()] = True
+ key, val = line.split('=', 1)
+ existing_env_vars[key.strip()] = val.strip()
except Exception as e:
logger.warning(f"[AgentInitializer] Failed to read .env file: {e}")
- # Check which keys need migration
- keys_to_migrate = {}
+ # Sync config.json values into .env (add/update/remove)
+ updated = False
for config_key, env_key in key_mapping.items():
- if env_key in existing_env_vars:
- continue
- value = conf().get(config_key, "")
- if value and value.strip():
- keys_to_migrate[env_key] = value.strip()
-
- # Write new keys
- if keys_to_migrate:
+ raw = conf().get(config_key, "")
+ value = raw.strip() if raw else ""
+ old_value = existing_env_vars.get(env_key)
+
+ if value:
+ if old_value == value:
+ continue
+ existing_env_vars[env_key] = value
+ os.environ[env_key] = value
+ updated = True
+ else:
+ if old_value is None:
+ continue
+ existing_env_vars.pop(env_key, None)
+ os.environ.pop(env_key, None)
+ updated = True
+
+ if updated:
try:
env_dir = os.path.dirname(env_file)
- if not os.path.exists(env_dir):
- os.makedirs(env_dir, exist_ok=True)
- if not os.path.exists(env_file):
- open(env_file, 'a').close()
-
- with open(env_file, 'a', encoding='utf-8') as f:
- f.write('\n# Auto-migrated from config.json\n')
- for key, value in keys_to_migrate.items():
+ os.makedirs(env_dir, exist_ok=True)
+
+ # Rewrite the entire .env file to ensure consistency
+ with open(env_file, 'w', encoding='utf-8') as f:
+ f.write('# Environment variables for agent\n')
+ f.write('# Auto-managed - synced from config.json on startup\n\n')
+ for key, value in sorted(existing_env_vars.items()):
f.write(f'{key}={value}\n')
- os.environ[key] = value
-
- logger.info(f"[AgentInitializer] Migrated {len(keys_to_migrate)} API keys to .env: {list(keys_to_migrate.keys())}")
+
+ logger.info(f"[AgentInitializer] Synced API keys from config.json to .env")
except Exception as e:
- logger.warning(f"[AgentInitializer] Failed to migrate API keys: {e}")
+ logger.warning(f"[AgentInitializer] Failed to sync API keys: {e}")
def _start_daily_flush_timer(self):
"""Start a background thread that flushes all agents' memory daily at 23:55."""
diff --git a/channel/web/static/js/console.js b/channel/web/static/js/console.js
index 33a3f484..4beb7272 100644
--- a/channel/web/static/js/console.js
+++ b/channel/web/static/js/console.js
@@ -823,9 +823,6 @@ function sendMessage() {
}
function startSSE(requestId, loadingEl, timestamp) {
- const es = new EventSource(`/stream?request_id=${encodeURIComponent(requestId)}`);
- activeStreams[requestId] = es;
-
let botEl = null;
let stepsEl = null; // .agent-steps (thinking summaries + tool indicators)
let contentEl = null; // .answer-content (final streaming answer)
@@ -834,6 +831,11 @@ function startSSE(requestId, loadingEl, timestamp) {
let currentToolEl = null;
let currentReasoningEl = null; // live reasoning bubble
let reasoningText = '';
+ let done = false;
+
+ const MAX_RECONNECTS = 10;
+ const RECONNECT_BASE_MS = 1000;
+ let reconnectCount = 0;
function ensureBotEl() {
if (botEl) return;
@@ -858,168 +860,202 @@ function startSSE(requestId, loadingEl, timestamp) {
mediaEl = botEl.querySelector('.media-content');
}
- es.onmessage = function(e) {
- let item;
- try { item = JSON.parse(e.data); } catch (_) { return; }
+ function connect() {
+ const es = new EventSource(`/stream?request_id=${encodeURIComponent(requestId)}`);
+ activeStreams[requestId] = es;
- if (item.type === 'reasoning') {
- ensureBotEl();
- reasoningText += item.content;
- if (!currentReasoningEl) {
- currentReasoningEl = document.createElement('div');
- currentReasoningEl.className = 'agent-step agent-thinking-step';
- currentReasoningEl.innerHTML = `
-
- `;
- stepsEl.appendChild(currentReasoningEl);
- }
- // Stream reasoning as a single-line summary (collapsed); full text available on expand
- const oneLine = reasoningText.trim().replace(/\n+/g, ' ');
- currentReasoningEl.querySelector('.thinking-summary').textContent =
- oneLine.length > 80 ? oneLine.substring(0, 80) + '…' : oneLine;
- currentReasoningEl.querySelector('.thinking-full').innerHTML = renderMarkdown(reasoningText);
- scrollChatToBottom();
+ es.onmessage = function(e) {
+ let item;
+ try { item = JSON.parse(e.data); } catch (_) { return; }
- } else if (item.type === 'delta') {
- ensureBotEl();
- if (currentReasoningEl) {
- if (reasoningText.trim().replace(/\n+/g, ' ').length <= 80)
- currentReasoningEl.classList.add('no-expand');
- currentReasoningEl = null;
- reasoningText = '';
- }
- accumulatedText += item.content;
- contentEl.innerHTML = renderMarkdown(accumulatedText);
- scrollChatToBottom();
+ // Successful data received, reset reconnect counter
+ reconnectCount = 0;
- } else if (item.type === 'message_end') {
- // Backend already strips reasoning_content; all deltas are real content.
- // Freeze accumulated text as visible content before tool execution begins.
- if (item.has_tool_calls && accumulatedText.trim()) {
+ if (item.type === 'reasoning') {
ensureBotEl();
- const frozenEl = document.createElement('div');
- frozenEl.className = 'agent-step agent-content-step';
- frozenEl.innerHTML = `${renderMarkdown(accumulatedText.trim())}
`;
- stepsEl.appendChild(frozenEl);
+ reasoningText += item.content;
+ if (!currentReasoningEl) {
+ currentReasoningEl = document.createElement('div');
+ currentReasoningEl.className = 'agent-step agent-thinking-step';
+ currentReasoningEl.innerHTML = `
+
+ `;
+ stepsEl.appendChild(currentReasoningEl);
+ }
+ const oneLine = reasoningText.trim().replace(/\n+/g, ' ');
+ currentReasoningEl.querySelector('.thinking-summary').textContent =
+ oneLine.length > 80 ? oneLine.substring(0, 80) + '…' : oneLine;
+ currentReasoningEl.querySelector('.thinking-full').innerHTML = renderMarkdown(reasoningText);
+ scrollChatToBottom();
+
+ } else if (item.type === 'delta') {
+ ensureBotEl();
+ if (currentReasoningEl) {
+ if (reasoningText.trim().replace(/\n+/g, ' ').length <= 80)
+ currentReasoningEl.classList.add('no-expand');
+ currentReasoningEl = null;
+ reasoningText = '';
+ }
+ accumulatedText += item.content;
+ contentEl.innerHTML = renderMarkdown(accumulatedText);
+ scrollChatToBottom();
+
+ } else if (item.type === 'message_end') {
+ if (item.has_tool_calls && accumulatedText.trim()) {
+ ensureBotEl();
+ const frozenEl = document.createElement('div');
+ frozenEl.className = 'agent-step agent-content-step';
+ frozenEl.innerHTML = `${renderMarkdown(accumulatedText.trim())}
`;
+ stepsEl.appendChild(frozenEl);
+ accumulatedText = '';
+ contentEl.innerHTML = '';
+ scrollChatToBottom();
+ }
+
+ } else if (item.type === 'tool_start') {
+ ensureBotEl();
+ if (currentReasoningEl) {
+ if (reasoningText.trim().replace(/\n+/g, ' ').length <= 80)
+ currentReasoningEl.classList.add('no-expand');
+ currentReasoningEl = null;
+ reasoningText = '';
+ }
accumulatedText = '';
contentEl.innerHTML = '';
- scrollChatToBottom();
- }
- } else if (item.type === 'tool_start') {
- ensureBotEl();
- if (currentReasoningEl) {
- if (reasoningText.trim().replace(/\n+/g, ' ').length <= 80)
- currentReasoningEl.classList.add('no-expand');
- currentReasoningEl = null;
- reasoningText = '';
- }
- accumulatedText = '';
- contentEl.innerHTML = '';
-
- // Add tool execution indicator (collapsible)
- currentToolEl = document.createElement('div');
- currentToolEl.className = 'agent-step agent-tool-step';
- const argsStr = formatToolArgs(item.arguments || {});
- currentToolEl.innerHTML = `
-
-
-
`;
- stepsEl.appendChild(currentToolEl);
+
`;
+ stepsEl.appendChild(currentToolEl);
- scrollChatToBottom();
+ scrollChatToBottom();
- } else if (item.type === 'tool_end') {
- if (currentToolEl) {
- const isError = item.status !== 'success';
- const icon = currentToolEl.querySelector('.tool-icon');
- icon.className = isError
- ? 'fas fa-times text-red-400 flex-shrink-0 tool-icon'
- : 'fas fa-check text-primary-400 flex-shrink-0 tool-icon';
+ } else if (item.type === 'tool_end') {
+ if (currentToolEl) {
+ const isError = item.status !== 'success';
+ const icon = currentToolEl.querySelector('.tool-icon');
+ icon.className = isError
+ ? 'fas fa-times text-red-400 flex-shrink-0 tool-icon'
+ : 'fas fa-check text-primary-400 flex-shrink-0 tool-icon';
- // Show execution time
- const nameEl = currentToolEl.querySelector('.tool-name');
- if (item.execution_time !== undefined) {
- nameEl.innerHTML += `
${item.execution_time}s`;
+ // Show execution time
+ const nameEl = currentToolEl.querySelector('.tool-name');
+ if (item.execution_time !== undefined) {
+ nameEl.innerHTML += `
${item.execution_time}s`;
+ }
+
+ // Fill output section
+ const outputSection = currentToolEl.querySelector('.tool-output-section');
+ if (outputSection && item.result) {
+ outputSection.innerHTML = `
+
${isError ? 'Error' : 'Output'}
+
${escapeHtml(String(item.result))}`;
+ }
+
+ if (isError) currentToolEl.classList.add('tool-failed');
+ currentToolEl = null;
}
- // Fill output section
- const outputSection = currentToolEl.querySelector('.tool-output-section');
- if (outputSection && item.result) {
- outputSection.innerHTML = `
-
${isError ? 'Error' : 'Output'}
-
${escapeHtml(String(item.result))}`;
- }
+ } else if (item.type === 'image') {
+ ensureBotEl();
+ const imgEl = document.createElement('img');
+ imgEl.src = item.content;
+ imgEl.alt = 'screenshot';
+ imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);';
+ imgEl.onclick = () => window.open(item.content, '_blank');
+ mediaEl.appendChild(imgEl);
+ scrollChatToBottom();
- if (isError) currentToolEl.classList.add('tool-failed');
- currentToolEl = null;
+ } else if (item.type === 'text') {
+ // Intermediate text sent before media items; display it but keep SSE open.
+ ensureBotEl();
+ contentEl.classList.remove('sse-streaming');
+ const textContent = item.content || accumulatedText;
+ if (textContent) contentEl.innerHTML = renderMarkdown(textContent);
+ applyHighlighting(botEl);
+ scrollChatToBottom();
+
+ } else if (item.type === 'video') {
+ ensureBotEl();
+ const wrapper = document.createElement('div');
+ wrapper.innerHTML = _buildVideoHtml(item.content);
+ mediaEl.appendChild(wrapper.firstElementChild || wrapper);
+ scrollChatToBottom();
+
+ } else if (item.type === 'file') {
+ ensureBotEl();
+ const fileName = item.file_name || item.content.split('/').pop();
+ const fileEl = document.createElement('a');
+ fileEl.href = item.content;
+ fileEl.download = fileName;
+ fileEl.target = '_blank';
+ fileEl.className = 'file-attachment';
+ fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);';
+ fileEl.innerHTML = `
${fileName}`;
+ mediaEl.appendChild(fileEl);
+ scrollChatToBottom();
+
+ } else if (item.type === 'phase') {
+ // Coarse progress (e.g. cow install-browser); must not close SSE (unlike "done")
+ ensureBotEl();
+ const wrap = document.createElement('div');
+ wrap.className = 'text-xs sm:text-sm text-slate-600 dark:text-slate-400 border-l-2 border-primary-400 pl-2 py-1 my-0.5';
+ wrap.textContent = String(item.content || '');
+ stepsEl.appendChild(wrap);
+ scrollChatToBottom();
+
+ } else if (item.type === 'done') {
+ done = true;
+ es.close();
+ delete activeStreams[requestId];
+
+ // item.content may be empty when "done" is only a stream-close signal after media.
+ const finalText = item.content || accumulatedText;
+
+ if (!botEl && finalText) {
+ if (loadingEl) { loadingEl.remove(); loadingEl = null; }
+ addBotMessage(finalText, new Date((item.timestamp || Date.now() / 1000) * 1000), requestId);
+ } else if (botEl) {
+ contentEl.classList.remove('sse-streaming');
+ // Only update text content when there is something new to show.
+ if (finalText) contentEl.innerHTML = renderMarkdown(finalText);
+ applyHighlighting(botEl);
+ }
+ scrollChatToBottom();
+
+ } else if (item.type === 'error') {
+ done = true;
+ es.close();
+ delete activeStreams[requestId];
+ if (loadingEl) { loadingEl.remove(); loadingEl = null; }
+ addBotMessage(t('error_send'), new Date());
}
+ };
- } else if (item.type === 'image') {
- ensureBotEl();
- const imgEl = document.createElement('img');
- imgEl.src = item.content;
- imgEl.alt = 'screenshot';
- imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);';
- imgEl.onclick = () => window.open(item.content, '_blank');
- mediaEl.appendChild(imgEl);
- scrollChatToBottom();
-
- } else if (item.type === 'text') {
- // Intermediate text sent before media items; display it but keep SSE open.
- ensureBotEl();
- contentEl.classList.remove('sse-streaming');
- const textContent = item.content || accumulatedText;
- if (textContent) contentEl.innerHTML = renderMarkdown(textContent);
- applyHighlighting(botEl);
- scrollChatToBottom();
-
- } else if (item.type === 'video') {
- ensureBotEl();
- const wrapper = document.createElement('div');
- wrapper.innerHTML = _buildVideoHtml(item.content);
- mediaEl.appendChild(wrapper.firstElementChild || wrapper);
- scrollChatToBottom();
-
- } else if (item.type === 'file') {
- ensureBotEl();
- const fileName = item.file_name || item.content.split('/').pop();
- const fileEl = document.createElement('a');
- fileEl.href = item.content;
- fileEl.download = fileName;
- fileEl.target = '_blank';
- fileEl.className = 'file-attachment';
- fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);';
- fileEl.innerHTML = `
${fileName}`;
- mediaEl.appendChild(fileEl);
- scrollChatToBottom();
-
- } else if (item.type === 'phase') {
- // Coarse progress (e.g. cow install-browser); must not close SSE (unlike "done")
- ensureBotEl();
- const wrap = document.createElement('div');
- wrap.className = 'text-xs sm:text-sm text-slate-600 dark:text-slate-400 border-l-2 border-primary-400 pl-2 py-1 my-0.5';
- wrap.textContent = String(item.content || '');
- stepsEl.appendChild(wrap);
- scrollChatToBottom();
-
- } else if (item.type === 'done') {
+ es.onerror = function() {
es.close();
delete activeStreams[requestId];
+ if (done) return;
+
if (currentReasoningEl) {
if (reasoningText.trim().replace(/\n+/g, ' ').length <= 80)
currentReasoningEl.classList.add('no-expand');
@@ -1027,41 +1063,28 @@ function startSSE(requestId, loadingEl, timestamp) {
reasoningText = '';
}
- // item.content may be empty when "done" is only a stream-close signal after media.
- const finalText = item.content || accumulatedText;
+ if (reconnectCount < MAX_RECONNECTS) {
+ reconnectCount++;
+ const delay = Math.min(RECONNECT_BASE_MS * reconnectCount, 5000);
+ console.warn(`[SSE] connection lost for ${requestId}, reconnecting in ${delay}ms (attempt ${reconnectCount}/${MAX_RECONNECTS})`);
+ setTimeout(connect, delay);
+ return;
+ }
- if (!botEl && finalText) {
- if (loadingEl) { loadingEl.remove(); loadingEl = null; }
- addBotMessage(finalText, new Date((item.timestamp || Date.now() / 1000) * 1000), requestId);
- } else if (botEl) {
+ // Exhausted retries, show whatever we have
+ if (loadingEl) { loadingEl.remove(); loadingEl = null; }
+ if (!botEl) {
+ addBotMessage(t('error_send'), new Date());
+ } else if (accumulatedText) {
contentEl.classList.remove('sse-streaming');
- // Only update text content when there is something new to show.
- if (finalText) contentEl.innerHTML = renderMarkdown(finalText);
+ contentEl.innerHTML = renderMarkdown(accumulatedText);
applyHighlighting(botEl);
bindChatKnowledgeLinks(botEl);
}
- scrollChatToBottom();
+ };
+ }
- } else if (item.type === 'error') {
- es.close();
- delete activeStreams[requestId];
- if (loadingEl) { loadingEl.remove(); loadingEl = null; }
- addBotMessage(t('error_send'), new Date());
- }
- };
-
- es.onerror = function() {
- es.close();
- delete activeStreams[requestId];
- if (loadingEl) { loadingEl.remove(); loadingEl = null; }
- if (!botEl) {
- addBotMessage(t('error_send'), new Date());
- } else if (accumulatedText) {
- contentEl.classList.remove('sse-streaming');
- contentEl.innerHTML = renderMarkdown(accumulatedText);
- applyHighlighting(botEl);
- }
- };
+ connect();
}
function startPolling() {
diff --git a/channel/web/web_channel.py b/channel/web/web_channel.py
index 8a5fbaf0..c5a2dd58 100644
--- a/channel/web/web_channel.py
+++ b/channel/web/web_channel.py
@@ -339,14 +339,18 @@ class WebChannel(ChatChannel):
"""
SSE generator for a given request_id.
Yields UTF-8 encoded bytes to avoid WSGI Latin-1 mangling.
+ Supports client reconnection: the queue is only removed after a
+ "done" event is consumed, so a new GET /stream with the same
+ request_id can resume reading remaining events.
"""
if request_id not in self.sse_queues:
yield b"data: {\"type\": \"error\", \"message\": \"invalid request_id\"}\n\n"
return
q = self.sse_queues[request_id]
- timeout = 300 # 5 minutes max
- deadline = time.time() + timeout
+ idle_timeout = 600 # 10 minutes without any real event
+ deadline = time.time() + idle_timeout
+ done = False
try:
while time.time() < deadline:
@@ -356,13 +360,18 @@ class WebChannel(ChatChannel):
yield b": keepalive\n\n"
continue
+ # Real event received, reset idle deadline
+ deadline = time.time() + idle_timeout
+
payload = json.dumps(item, ensure_ascii=False)
yield f"data: {payload}\n\n".encode("utf-8")
if item.get("type") == "done":
+ done = True
break
finally:
- self.sse_queues.pop(request_id, None)
+ if done:
+ self.sse_queues.pop(request_id, None)
def poll_response(self):
"""
diff --git a/common/cloud_client.py b/common/cloud_client.py
index 870f5dc4..2c07dda1 100644
--- a/common/cloud_client.py
+++ b/common/cloud_client.py
@@ -47,8 +47,8 @@ CREDENTIAL_MAP = {
class CloudClient(LinkAIClient):
- def __init__(self, api_key: str, channel, host: str = ""):
- super().__init__(api_key, host)
+ def __init__(self, api_key: str, channel, host: str = "", port=None):
+ super().__init__(api_key, host, port=port)
self.channel = channel
self.client_type = channel.channel_type
self.channel_mgr = None
@@ -770,7 +770,7 @@ def start(channel, channel_mgr=None):
return
global chat_client
- chat_client = CloudClient(api_key=conf().get("linkai_api_key"), host=conf().get("cloud_host", ""), channel=channel)
+ chat_client = CloudClient(api_key=conf().get("linkai_api_key"), host=conf().get("cloud_host", ""), port=conf().get("cloud_port"), channel=channel)
chat_client.channel_mgr = channel_mgr
chat_client.config = _build_config()
chat_client.start()
diff --git a/common/const.py b/common/const.py
index f7e67e52..ecaf5b0f 100644
--- a/common/const.py
+++ b/common/const.py
@@ -93,6 +93,7 @@ QWQ_PLUS = "qwq-plus"
# MiniMax
MINIMAX_M2_7 = "MiniMax-M2.7" # MiniMax M2.7 - Latest
+MINIMAX_M2_7_HIGHSPEED = "MiniMax-M2.7-highspeed" # MiniMax M2.7 highspeed
MINIMAX_M2_5 = "MiniMax-M2.5" # MiniMax M2.5
MINIMAX_M2_1 = "MiniMax-M2.1" # MiniMax M2.1
MINIMAX_M2_1_LIGHTNING = "MiniMax-M2.1-lightning" # MiniMax M2.1 极速版
@@ -175,7 +176,7 @@ MODEL_LIST = [
QWEN36_PLUS, QWEN35_PLUS, QWEN3_MAX, QWEN_MAX, QWEN_PLUS, QWEN_TURBO, QWEN_LONG,
# MiniMax
- MiniMax, MINIMAX_M2_7, MINIMAX_M2_5, MINIMAX_M2_1, MINIMAX_M2_1_LIGHTNING, MINIMAX_M2, MINIMAX_ABAB6_5,
+ MiniMax, MINIMAX_M2_7, MINIMAX_M2_7_HIGHSPEED, MINIMAX_M2_5, MINIMAX_M2_1, MINIMAX_M2_1_LIGHTNING, MINIMAX_M2, MINIMAX_ABAB6_5,
# GLM
ZHIPU_AI, GLM_5_TURBO, GLM_5, GLM_4, GLM_4_PLUS, GLM_4_flash, GLM_4_LONG, GLM_4_ALLTOOLS,
diff --git a/config.py b/config.py
index 6b231ed2..34bcab67 100644
--- a/config.py
+++ b/config.py
@@ -189,6 +189,7 @@ available_setting = {
"linkai_app_code": "",
"linkai_api_base": "https://api.link-ai.tech",
"cloud_host": "client.link-ai.tech",
+ "cloud_port": None,
"cloud_deployment_id": "",
"minimax_api_key": "",
"Minimax_group_id": "",
diff --git a/docs/en/tools/vision.mdx b/docs/en/tools/vision.mdx
new file mode 100644
index 00000000..cebecbea
--- /dev/null
+++ b/docs/en/tools/vision.mdx
@@ -0,0 +1,72 @@
+---
+title: vision - Image Analysis
+description: Analyze image content (recognition, description, OCR, etc.)
+---
+
+Analyze local images or image URLs using Vision API. Supports content description, text extraction (OCR), object recognition, and more.
+
+## Model Selection
+
+The vision tool uses a multi-level auto-selection strategy with automatic fallback — no manual configuration required:
+
+1. **Main model** — uses the currently configured main model for image recognition (zero extra cost)
+2. **Other configured models** — auto-discovers other models with configured API keys as alternatives
+3. **OpenAI** — uses `open_ai_api_key` to call gpt-4.1-mini
+4. **LinkAI** — uses `linkai_api_key` to call LinkAI vision service
+
+When `use_linkai=true`, LinkAI is promoted to the highest priority.
+
+If the current provider fails, the tool automatically tries the next one until it succeeds or all fail.
+
+### Supported Models
+
+| Vendor | Vision Model | Notes |
+| --- | --- | --- |
+| OpenAI / Compatible | Main model | All OpenAI-compatible multimodal models |
+| Qwen (DashScope) | Main model | Via MultiModalConversation API |
+| Claude | Main model | Anthropic native image format |
+| Gemini | Main model | inlineData format |
+| Doubao | Main model | doubao-seed-2-0 series natively supported |
+| Kimi (Moonshot) | Main model | kimi-k2.5 natively supported |
+| ZhipuAI | glm-5v-turbo | Always uses dedicated vision model |
+| MiniMax | MiniMax-Text-01 | Always uses dedicated vision model |
+
+
+ ZhipuAI and MiniMax text models do not support image understanding, so their dedicated vision models are always used automatically.
+
+
+## Parameters
+
+| Parameter | Type | Required | Description |
+| --- | --- | --- | --- |
+| `image` | string | Yes | Local file path or HTTP(S) image URL |
+| `question` | string | Yes | Question to ask about the image |
+
+Supported image formats: jpg, jpeg, png, gif, webp
+
+## Custom Configuration
+
+To specify a particular model for the vision tool, add to `config.json`:
+
+```json
+{
+ "tool": {
+ "vision": {
+ "model": "gpt-4o"
+ }
+ }
+}
+```
+
+In most cases no configuration is needed. The tool works automatically as long as the main model supports multimodal input or any vision-capable API key is configured.
+
+## Use Cases
+
+- Describe image content
+- Extract text from images (OCR)
+- Identify objects, colors, scenes
+- Analyze screenshots and scanned documents
+
+
+ Images larger than 1MB are automatically compressed (max edge 1536px). All images (including remote URLs) are converted to base64 for transmission to ensure compatibility with all model backends.
+
diff --git a/docs/ja/tools/vision.mdx b/docs/ja/tools/vision.mdx
new file mode 100644
index 00000000..f34bf58a
--- /dev/null
+++ b/docs/ja/tools/vision.mdx
@@ -0,0 +1,72 @@
+---
+title: vision - 画像分析
+description: 画像コンテンツの分析(認識、説明、OCR など)
+---
+
+Vision API を使用してローカル画像や画像 URL を分析します。コンテンツの説明、テキスト抽出(OCR)、オブジェクト認識などに対応しています。
+
+## モデル選択
+
+Vision ツールは多段階の自動選択+自動フォールバック戦略を採用しており、手動設定なしで利用可能です:
+
+1. **メインモデル** — 現在設定されているメインモデルで画像認識を実行(追加コストなし)
+2. **その他の設定済みモデル** — API キーが設定されている他のマルチモーダルモデルを自動検出
+3. **OpenAI** — `open_ai_api_key` を使用して gpt-4.1-mini を呼び出し
+4. **LinkAI** — `linkai_api_key` を使用して LinkAI ビジョンサービスを呼び出し
+
+`use_linkai=true` の場合、LinkAI が最優先になります。
+
+現在のプロバイダーが失敗した場合、成功するかすべて失敗するまで自動的に次のプロバイダーを試行します。
+
+### 対応モデル
+
+| ベンダー | ビジョンモデル | 説明 |
+| --- | --- | --- |
+| OpenAI / 互換プロトコル | メインモデル | すべての OpenAI 互換マルチモーダルモデルに対応 |
+| 通義千問 (DashScope) | メインモデル | MultiModalConversation API 経由 |
+| Claude | メインモデル | Anthropic ネイティブ画像形式 |
+| Gemini | メインモデル | inlineData 形式 |
+| 豆包 (Doubao) | メインモデル | doubao-seed-2-0 シリーズがネイティブ対応 |
+| Kimi (Moonshot) | メインモデル | kimi-k2.5 がネイティブ対応 |
+| 智谱 AI | glm-5v-turbo | 常にビジョン専用モデルを使用 |
+| MiniMax | MiniMax-Text-01 | 常にビジョン専用モデルを使用 |
+
+
+ 智谱 AI と MiniMax のテキストモデルは画像理解に対応していないため、対応するビジョン専用モデルが自動的に使用されます。
+
+
+## パラメータ
+
+| パラメータ | 型 | 必須 | 説明 |
+| --- | --- | --- | --- |
+| `image` | string | はい | ローカルファイルパスまたは HTTP(S) 画像 URL |
+| `question` | string | はい | 画像に対する質問 |
+
+対応画像形式:jpg、jpeg、png、gif、webp
+
+## カスタム設定
+
+Vision ツールで使用するモデルを指定するには、`config.json` に以下を追加します:
+
+```json
+{
+ "tool": {
+ "vision": {
+ "model": "gpt-4o"
+ }
+ }
+}
+```
+
+ほとんどの場合、設定は不要です。メインモデルがマルチモーダルに対応しているか、ビジョン対応の API キーが設定されていれば自動的に動作します。
+
+## ユースケース
+
+- 画像コンテンツの説明
+- 画像からのテキスト抽出(OCR)
+- オブジェクト、色、シーンの識別
+- スクリーンショットやスキャン文書の分析
+
+
+ 1MB を超える画像は自動的に圧縮されます(最大辺 1536px)。すべての画像(リモート URL を含む)は base64 に変換して送信され、すべてのモデルバックエンドとの互換性を確保します。
+
diff --git a/docs/tools/vision.mdx b/docs/tools/vision.mdx
index 839212b3..4e1089e0 100644
--- a/docs/tools/vision.mdx
+++ b/docs/tools/vision.mdx
@@ -5,14 +5,49 @@ description: 分析图片内容(识别、描述、OCR 等)
使用 Vision API 分析本地图片或图片 URL,支持内容描述、文字提取(OCR)、物体识别等。
-## 依赖
+## 模型选择
-需要配置至少一个 API Key(通过 `env_config` 工具或工作空间 `.env` 文件配置):
+Vision 工具采用多级自动选择 + 自动兜底策略,无需手动配置即可使用:
-| 后端 | 环境变量 | 优先级 |
+1. **主模型** — 优先使用当前配置的主模型进行图像识别(需要是多模态模型)
+2. **其他已配置模型** — 自动发现已配置 API Key 的其他多模态模型作为备选
+
+如果当前 provider 调用失败,会自动尝试下一个,直到成功或全部失败。
+
+### 支持的模型
+
+| 厂商 | 视觉模型 | 说明 |
| --- | --- | --- |
-| OpenAI | `OPENAI_API_KEY` | 优先使用 |
-| LinkAI | `LINKAI_API_KEY` | 备选 |
+| OpenAI / 兼容协议 | 使用主模型 | 支持所有 OpenAI 协议兼容的多模态模型 |
+| 通义千问 (DashScope) | 使用主模型 | 例如 qwen3.6-plus 等 |
+| Claude | 使用主模型 | Anthropic 原生图像格式 |
+| Gemini | 使用主模型 | inlineData 格式 |
+| 豆包 (Doubao) | 使用主模型 | doubao-seed-2-0 系列原生支持 |
+| Kimi (Moonshot) | 使用主模型 | kimi-k2.5 原生支持 |
+| 智谱 AI | glm-5v-turbo | 固定使用视觉专用模型 |
+| MiniMax | MiniMax-Text-01 | 固定使用视觉专用模型 |
+
+
+ 智谱和 MiniMax 的文本模型不支持图像理解,因此始终使用对应的视觉专用模型,无需手动指定。
+
+
+> 当 `use_linkai=true` 时,默认使用 LinkAI 的多模态模型进行图像识别。
+
+## 自定义配置
+
+如果希望指定 Vision 使用的模型,可在 `config.json` 中配置,例如:
+
+```json
+{
+ "tool": {
+ "vision": {
+ "model": "gpt-4o"
+ }
+ }
+}
+```
+
+大多数情况下无需配置,主模型支持多模态或配置任意一个支持视觉的 API Key 即可自动工作。
## 参数
@@ -20,17 +55,18 @@ description: 分析图片内容(识别、描述、OCR 等)
| --- | --- | --- | --- |
| `image` | string | 是 | 本地文件路径或 HTTP(S) 图片 URL |
| `question` | string | 是 | 对图片提出的问题 |
-| `model` | string | 否 | 模型名称(默认 gpt-4.1-mini) |
支持的图片格式:jpg、jpeg、png、gif、webp
+
+
## 使用场景
- 描述图片中的内容
- 提取图片中的文字(OCR)
- 识别物体、颜色、场景
-- 分析截图、文档扫描件
+- 分析截图、文档扫描图片等
- 超过 1MB 的图片会自动压缩后上传。如果未配置任何 Vision API Key,该工具不会被加载。
+ 超过 1MB 的图片会自动压缩后上传,所有图片(包括远程 URL)会统一转为 base64 传输,确保兼容所有模型后端。
diff --git a/models/bot.py b/models/bot.py
index ca6e1aa1..f5f72e7d 100644
--- a/models/bot.py
+++ b/models/bot.py
@@ -2,12 +2,27 @@
Auto-replay chat robot abstract class
"""
-
from bridge.context import Context
from bridge.reply import Reply
class Bot(object):
+ """
+ Base class for all chat-bot implementations.
+
+ Subclasses may also implement:
+
+ call_with_tools(messages, tools=None, stream=False, **kwargs)
+ -> dict | generator (OpenAI-compatible format)
+
+ call_vision(image_url, question, model=None, max_tokens=1000)
+ -> dict with keys: model, content, usage (or error/message)
+
+ These are NOT defined here to avoid shadowing concrete implementations
+ provided by mixin classes (e.g. OpenAICompatibleBot) in the MRO.
+ Use ``hasattr(bot, 'call_vision')`` to detect support at runtime.
+ """
+
def reply(self, query, context: Context = None) -> Reply:
"""
bot auto-reply content
diff --git a/models/claudeapi/claude_api_bot.py b/models/claudeapi/claude_api_bot.py
index e7fe8710..49fc3d46 100644
--- a/models/claudeapi/claude_api_bot.py
+++ b/models/claudeapi/claude_api_bot.py
@@ -1,7 +1,10 @@
# encoding:utf-8
+import base64
import json
+import re
import time
+from typing import Optional
import requests
@@ -224,6 +227,79 @@ class ClaudeAPIBot(Bot, OpenAIImage):
return 64000
return 8192
+ @staticmethod
+ def _parse_data_url(data_url: str):
+ """Parse a ``data:<media_type>;base64,<data>`` URL into (media_type, base64_data).
+
+ Returns ``(None, None)`` when the string does not match that shape.
+ """
+ # re.DOTALL lets the payload group span line breaks inside the base64 data.
+ m = re.match(r"^data:([^;]+);base64,(.+)$", data_url, re.DOTALL)
+ if m:
+ return m.group(1), m.group(2)
+ return None, None
+
+ def call_vision(self, image_url: str, question: str,
+ model: Optional[str] = None,
+ max_tokens: int = 1000) -> dict:
+ """Analyze an image using Claude Messages API (native image blocks)."""
+ try:
+ actual_model = model or self._model_mapping(conf().get("model"))
+
+ # Build Claude-native image content block
+ if image_url.startswith("data:"):
+ media_type, b64_data = self._parse_data_url(image_url)
+ if not b64_data:
+ return {"error": True, "message": "Invalid base64 data URL"}
+ image_block = {
+ "type": "image",
+ "source": {"type": "base64",
+ "media_type": media_type or "image/jpeg",
+ "data": b64_data},
+ }
+ else:
+ image_block = {
+ "type": "image",
+ "source": {"type": "url", "url": image_url},
+ }
+
+ data = {
+ "model": actual_model,
+ "max_tokens": max_tokens,
+ "messages": [{
+ "role": "user",
+ "content": [
+ image_block,
+ {"type": "text", "text": question},
+ ],
+ }],
+ }
+
+ headers = {
+ "x-api-key": self.api_key,
+ "anthropic-version": "2023-06-01",
+ "content-type": "application/json",
+ }
+ proxies = {"http": self.proxy, "https": self.proxy} if self.proxy else None
+ resp = requests.post(f"{self.api_base}/messages",
+ headers=headers, json=data, proxies=proxies)
+
+ if resp.status_code != 200:
+ return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}
+
+ body = resp.json()
+ text_parts = [b.get("text", "") for b in body.get("content", [])
+ if b.get("type") == "text"]
+ usage = body.get("usage", {})
+ return {
+ "model": actual_model,
+ "content": "".join(text_parts),
+ "usage": {
+ "prompt_tokens": usage.get("input_tokens", 0),
+ "completion_tokens": usage.get("output_tokens", 0),
+ "total_tokens": usage.get("input_tokens", 0) + usage.get("output_tokens", 0),
+ },
+ }
+ except Exception as e:
+ logger.error(f"[CLAUDE] call_vision error: {e}")
+ return {"error": True, "message": str(e)}
+
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
"""
Call Claude API with tool support for agent integration
diff --git a/models/dashscope/dashscope_bot.py b/models/dashscope/dashscope_bot.py
index 0887751f..4d4d628f 100644
--- a/models/dashscope/dashscope_bot.py
+++ b/models/dashscope/dashscope_bot.py
@@ -1,6 +1,8 @@
# encoding:utf-8
import json
+from typing import Optional
+
from models.bot import Bot
from models.session_manager import SessionManager
from bridge.context import ContextType
@@ -153,6 +155,56 @@ class DashscopeBot(Bot):
else:
return result
+ def call_vision(self, image_url: str, question: str,
+ model: Optional[str] = None,
+ max_tokens: int = 1000) -> dict:
+ """Analyze an image using DashScope MultiModalConversation API.
+
+ Args:
+ image_url: Image reference placed verbatim in the ``image`` content part.
+ question: Text prompt placed in the ``text`` content part.
+ model: Model override; falls back to ``qwen-vl-max`` when omitted.
+ max_tokens: Forwarded to ``MultiModalConversation.call``.
+
+ Returns:
+ On success a dict with ``model``, ``content`` and ``usage``
+ (``prompt_tokens`` / ``completion_tokens`` / ``total_tokens``).
+ On failure ``{"error": True, "message": ...}``; exceptions are
+ logged and reported this way instead of being raised.
+ """
+ try:
+ # The key is assigned on the dashscope module itself before the call.
+ dashscope.api_key = self.api_key
+ vision_model = model or "qwen-vl-max"
+
+ # DashScope multimodal format: {"image": url} + {"text": question}
+ messages = [{
+ "role": "user",
+ "content": [
+ {"image": image_url},
+ {"text": question},
+ ],
+ }]
+
+ response = MultiModalConversation.call(
+ model=vision_model,
+ messages=messages,
+ max_tokens=max_tokens,
+ )
+
+ if response.status_code != HTTPStatus.OK:
+ return {
+ "error": True,
+ "message": f"{response.code} - {response.message}",
+ }
+
+ resp_dict = self._response_to_dict(response)
+ # A missing "output"/"choices" entry raises here and is turned into an
+ # error dict by the except clause below.
+ choice = resp_dict["output"]["choices"][0]
+ content = choice.get("message", {}).get("content", "")
+ # Content may arrive as a list of parts; concatenate the "text" of each dict part.
+ if isinstance(content, list):
+ content = "".join(
+ item.get("text", "") for item in content if isinstance(item, dict)
+ )
+ usage = resp_dict.get("usage", {})
+ return {
+ "model": vision_model,
+ "content": content,
+ "usage": {
+ # Map DashScope's input/output token names onto OpenAI-style keys.
+ "prompt_tokens": usage.get("input_tokens", 0),
+ "completion_tokens": usage.get("output_tokens", 0),
+ "total_tokens": usage.get("total_tokens", 0),
+ },
+ }
+ except Exception as e:
+ logger.error(f"[DASHSCOPE] call_vision error: {e}")
+ return {"error": True, "message": str(e)}
+
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
"""
Call DashScope API with tool support for agent integration
diff --git a/models/doubao/doubao_bot.py b/models/doubao/doubao_bot.py
index cfe4ba5c..b31516ec 100644
--- a/models/doubao/doubao_bot.py
+++ b/models/doubao/doubao_bot.py
@@ -2,6 +2,7 @@
import json
import time
+from typing import Optional
import requests
from models.bot import Bot
@@ -147,6 +148,49 @@ class DoubaoBot(Bot):
else:
return result
+ def call_vision(self, image_url: str, question: str,
+ model: Optional[str] = None,
+ max_tokens: int = 1000) -> dict:
+ """Analyze an image using Doubao (Volcengine Ark) OpenAI-compatible API."""
+ try:
+ vision_model = model or self.args.get("model", "doubao-seed-2-0-pro-260215")
+ payload = {
+ "model": vision_model,
+ "max_tokens": max_tokens,
+ "messages": [{
+ "role": "user",
+ "content": [
+ {"type": "text", "text": question},
+ {"type": "image_url", "image_url": {"url": image_url}},
+ ],
+ }],
+ }
+ headers = {
+ "Authorization": f"Bearer {self.api_key}",
+ "Content-Type": "application/json",
+ }
+ resp = requests.post(f"{self.base_url}/chat/completions",
+ headers=headers, json=payload, timeout=60)
+ if resp.status_code != 200:
+ return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}
+ data = resp.json()
+ if "error" in data:
+ return {"error": True, "message": data["error"].get("message", str(data["error"]))}
+ content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
+ usage = data.get("usage", {})
+ return {
+ "model": vision_model,
+ "content": content,
+ "usage": {
+ "prompt_tokens": usage.get("prompt_tokens", 0),
+ "completion_tokens": usage.get("completion_tokens", 0),
+ "total_tokens": usage.get("total_tokens", 0),
+ },
+ }
+ except Exception as e:
+ logger.error(f"[DOUBAO] call_vision error: {e}")
+ return {"error": True, "message": str(e)}
+
# ==================== Agent mode support ====================
def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
@@ -434,31 +478,37 @@ class DoubaoBot(Bot):
continue
if role == "user":
- text_parts = []
- tool_results = []
+ has_tool_result = any(
+ isinstance(b, dict) and b.get("type") == "tool_result" for b in content
+ )
+ if has_tool_result:
+ text_parts = []
+ tool_results = []
- for block in content:
- if not isinstance(block, dict):
- continue
- if block.get("type") == "text":
- text_parts.append(block.get("text", ""))
- elif block.get("type") == "tool_result":
- tool_call_id = block.get("tool_use_id") or ""
- result_content = block.get("content", "")
- if not isinstance(result_content, str):
- result_content = json.dumps(result_content, ensure_ascii=False)
- tool_results.append({
- "role": "tool",
- "tool_call_id": tool_call_id,
- "content": result_content
- })
+ for block in content:
+ if not isinstance(block, dict):
+ continue
+ if block.get("type") == "text":
+ text_parts.append(block.get("text", ""))
+ elif block.get("type") == "tool_result":
+ tool_call_id = block.get("tool_use_id") or ""
+ result_content = block.get("content", "")
+ if not isinstance(result_content, str):
+ result_content = json.dumps(result_content, ensure_ascii=False)
+ tool_results.append({
+ "role": "tool",
+ "tool_call_id": tool_call_id,
+ "content": result_content
+ })
- # Tool results first (must come right after assistant with tool_calls)
- for tr in tool_results:
- converted.append(tr)
+ for tr in tool_results:
+ converted.append(tr)
- if text_parts:
- converted.append({"role": "user", "content": "\n".join(text_parts)})
+ if text_parts:
+ converted.append({"role": "user", "content": "\n".join(text_parts)})
+ else:
+ # Keep as-is for multimodal content (e.g. image_url blocks)
+ converted.append(msg)
elif role == "assistant":
openai_msg = {"role": "assistant"}
diff --git a/models/gemini/google_gemini_bot.py b/models/gemini/google_gemini_bot.py
index e49a8bf3..aa7199ca 100644
--- a/models/gemini/google_gemini_bot.py
+++ b/models/gemini/google_gemini_bot.py
@@ -12,6 +12,8 @@ import mimetypes
import os
import re
import time
+from typing import Optional
+
import requests
from models.bot import Bot
from models.session_manager import SessionManager
@@ -144,7 +146,12 @@ class GoogleGeminiBot(Bot):
return "", []
pattern = r"\[图片:\s*([^\]]+)\]"
image_paths = [m.strip().strip("'\"") for m in re.findall(pattern, content) if m.strip()]
- cleaned_text = re.sub(pattern, "", content)
+ # Replace markers with path-only hints so the model still knows the
+ # original file location (needed when it calls tools like vision).
+ def _replace_with_hint(m):
+ path = m.group(1).strip().strip("'\"")
+ return f"[attached image: {path}]"
+ cleaned_text = re.sub(pattern, _replace_with_hint, content)
cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text).strip()
return cleaned_text, image_paths
@@ -225,6 +232,57 @@ class GoogleGeminiBot(Bot):
logger.warning(f"[Gemini] Unsupported image URL format: {image_url[:120]}")
return None
+ def call_vision(self, image_url: str, question: str,
+ model: Optional[str] = None,
+ max_tokens: int = 1000) -> dict:
+ """Analyze an image using Gemini REST API.
+
+ Args:
+ image_url: Image reference handed to ``_build_inline_part_from_image_url``.
+ question: Text prompt sent as a ``text`` part after the image part.
+ model: Model override; falls back to ``self.model`` and then to
+ ``gemini-2.0-flash``.
+ max_tokens: Sent as ``generationConfig.maxOutputTokens``.
+
+ Returns:
+ On success a dict with ``model``, ``content`` and ``usage``
+ (``prompt_tokens`` / ``completion_tokens`` / ``total_tokens``).
+ On failure ``{"error": True, "message": ...}``; exceptions are
+ logged and reported this way instead of being raised.
+ """
+ try:
+ model_name = model or self.model or "gemini-2.0-flash"
+ # A falsy result from the helper is treated as an unusable image.
+ image_part = self._build_inline_part_from_image_url({"url": image_url})
+ if not image_part:
+ return {"error": True, "message": f"Cannot process image URL: {image_url[:120]}"}
+
+ payload = {
+ "contents": [{
+ "role": "user",
+ "parts": [image_part, {"text": question}],
+ }],
+ "generationConfig": {"maxOutputTokens": max_tokens},
+ # Every listed harm category is sent with threshold BLOCK_NONE.
+ "safetySettings": [
+ {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
+ {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
+ {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
+ {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
+ ],
+ }
+ endpoint = f"{self.api_base}/v1beta/models/{model_name}:generateContent"
+ headers = {"x-goog-api-key": self.api_key, "Content-Type": "application/json"}
+ resp = requests.post(endpoint, headers=headers, json=payload, timeout=60)
+
+ if resp.status_code != 200:
+ return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}
+
+ body = resp.json()
+ candidates = body.get("candidates", [])
+ text_parts = []
+ # Only the first candidate is read; with no candidates the content stays empty.
+ for part in candidates[0].get("content", {}).get("parts", []) if candidates else []:
+ if "text" in part:
+ text_parts.append(part["text"])
+
+ usage_meta = body.get("usageMetadata", {})
+ return {
+ "model": model_name,
+ "content": "".join(text_parts),
+ "usage": {
+ # Map Gemini's usageMetadata counters onto OpenAI-style keys.
+ "prompt_tokens": usage_meta.get("promptTokenCount", 0),
+ "completion_tokens": usage_meta.get("candidatesTokenCount", 0),
+ "total_tokens": usage_meta.get("totalTokenCount", 0),
+ },
+ }
+ except Exception as e:
+ logger.error(f"[Gemini] call_vision error: {e}")
+ return {"error": True, "message": str(e)}
+
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
"""
Call Gemini API with tool support using REST API (following official docs)
diff --git a/models/minimax/minimax_bot.py b/models/minimax/minimax_bot.py
index 63ca789c..612f54dc 100644
--- a/models/minimax/minimax_bot.py
+++ b/models/minimax/minimax_bot.py
@@ -2,6 +2,8 @@
import time
import json
+from typing import Optional
+
import requests
from models.bot import Bot
@@ -20,7 +22,7 @@ class MinimaxBot(Bot):
def __init__(self):
super().__init__()
self.args = {
- "model": conf().get("model") or "MiniMax-M2.1",
+ "model": conf().get("model") or "MiniMax-M2.7",
"temperature": conf().get("temperature", 0.3),
"top_p": conf().get("top_p", 0.95),
}
@@ -175,6 +177,51 @@ class MinimaxBot(Bot):
else:
return result
+ def call_vision(self, image_url: str, question: str,
+ model: Optional[str] = None,
+ max_tokens: int = 1000) -> dict:
+ """Analyze an image using MiniMax OpenAI-compatible API.
+ Always uses MiniMax-Text-01 — other MiniMax models do not support vision.
+ """
+ try:
+ vision_model = "MiniMax-Text-01"
+ payload = {
+ "model": vision_model,
+ "max_tokens": max_tokens,
+ "messages": [{
+ "role": "user",
+ "content": [
+ {"type": "text", "text": question},
+ {"type": "image_url", "image_url": {"url": image_url}},
+ ],
+ }],
+ }
+ headers = {
+ "Authorization": f"Bearer {self.api_key}",
+ "Content-Type": "application/json",
+ }
+ resp = requests.post(f"{self.api_base}/chat/completions",
+ headers=headers, json=payload, timeout=60)
+ if resp.status_code != 200:
+ return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}
+ data = resp.json()
+ if "error" in data:
+ return {"error": True, "message": data["error"].get("message", str(data["error"]))}
+ content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
+ usage = data.get("usage", {})
+ return {
+ "model": vision_model,
+ "content": content,
+ "usage": {
+ "prompt_tokens": usage.get("prompt_tokens", 0),
+ "completion_tokens": usage.get("completion_tokens", 0),
+ "total_tokens": usage.get("total_tokens", 0),
+ },
+ }
+ except Exception as e:
+ logger.error(f"[MINIMAX] call_vision error: {e}")
+ return {"error": True, "message": str(e)}
+
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
"""
Call MiniMax API with tool support for agent integration
@@ -270,37 +317,41 @@ class MinimaxBot(Bot):
if role == "user":
# Handle user message
if isinstance(content, list):
- # Extract text from content blocks
- text_parts = []
- tool_results = []
+ has_tool_result = any(
+ isinstance(b, dict) and b.get("type") == "tool_result" for b in content
+ )
+ if has_tool_result:
+ text_parts = []
+ tool_results = []
- for block in content:
- if isinstance(block, dict):
- if block.get("type") == "text":
- text_parts.append(block.get("text", ""))
- elif block.get("type") == "tool_result":
- # Tool result should be a separate message with role="tool"
- tool_call_id = block.get("tool_use_id") or ""
- if not tool_call_id:
- logger.warning(f"[MINIMAX] tool_result missing tool_use_id")
- result_content = block.get("content", "")
- if not isinstance(result_content, str):
- result_content = json.dumps(result_content, ensure_ascii=False)
- tool_results.append({
- "role": "tool",
- "tool_call_id": tool_call_id,
- "content": result_content
- })
+ for block in content:
+ if isinstance(block, dict):
+ if block.get("type") == "text":
+ text_parts.append(block.get("text", ""))
+ elif block.get("type") == "tool_result":
+ tool_call_id = block.get("tool_use_id") or ""
+ if not tool_call_id:
+ logger.warning(f"[MINIMAX] tool_result missing tool_use_id")
+ result_content = block.get("content", "")
+ if not isinstance(result_content, str):
+ result_content = json.dumps(result_content, ensure_ascii=False)
+ tool_results.append({
+ "role": "tool",
+ "tool_call_id": tool_call_id,
+ "content": result_content
+ })
- if text_parts:
- converted.append({
- "role": "user",
- "content": "\n".join(text_parts)
- })
+ if text_parts:
+ converted.append({
+ "role": "user",
+ "content": "\n".join(text_parts)
+ })
- # Add all tool results (not just the last one)
- for tool_result in tool_results:
- converted.append(tool_result)
+ for tool_result in tool_results:
+ converted.append(tool_result)
+ else:
+ # Keep as-is for multimodal content (e.g. image_url blocks)
+ converted.append(msg)
else:
# Simple text content
converted.append({
diff --git a/models/moonshot/moonshot_bot.py b/models/moonshot/moonshot_bot.py
index ded011ca..4d35400e 100644
--- a/models/moonshot/moonshot_bot.py
+++ b/models/moonshot/moonshot_bot.py
@@ -2,6 +2,7 @@
import json
import time
+from typing import Optional
import requests
from models.bot import Bot
@@ -147,6 +148,49 @@ class MoonshotBot(Bot):
else:
return result
+    def call_vision(self, image_url: str, question: str,
+                    model: Optional[str] = None,
+                    max_tokens: int = 1000) -> dict:
+        """Analyze an image using Moonshot (Kimi) OpenAI-compatible API.
+
+        Posts a single user message (the question text plus an ``image_url``
+        block) to ``{self.base_url}/chat/completions``.
+
+        Args:
+            image_url: Value sent as the ``url`` of the ``image_url`` block.
+            question: Prompt text sent alongside the image.
+            model: Model override; defaults to ``self.args["model"]`` and then
+                to ``"kimi-k2.5"``.
+            max_tokens: Completion token limit forwarded in the payload.
+
+        Returns:
+            A dict with ``model``, ``content`` and ``usage`` on success, or
+            ``{"error": True, "message": ...}`` on any failure. Exceptions are
+            logged and converted to the error form rather than raised.
+        """
+        try:
+            vision_model = model or self.args.get("model", "kimi-k2.5")
+            payload = {
+                "model": vision_model,
+                "max_tokens": max_tokens,
+                "messages": [{
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": question},
+                        {"type": "image_url", "image_url": {"url": image_url}},
+                    ],
+                }],
+            }
+            headers = {
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json",
+            }
+            resp = requests.post(f"{self.base_url}/chat/completions",
+                                 headers=headers, json=payload, timeout=60)
+            if resp.status_code != 200:
+                return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}
+            data = resp.json()
+            if "error" in data:
+                # ``error`` may not be a dict; fall back to its string form
+                # instead of failing on ``.get``.
+                err = data["error"]
+                message = err.get("message", str(err)) if isinstance(err, dict) else str(err)
+                return {"error": True, "message": message}
+            choices = data.get("choices", [{}])
+            if not choices:
+                return {"error": True, "message": "Response contained no choices"}
+            # "message", "content" and "usage" can be present but null.
+            content = (choices[0].get("message") or {}).get("content") or ""
+            usage = data.get("usage") or {}
+            return {
+                "model": vision_model,
+                "content": content,
+                "usage": {
+                    "prompt_tokens": usage.get("prompt_tokens", 0),
+                    "completion_tokens": usage.get("completion_tokens", 0),
+                    "total_tokens": usage.get("total_tokens", 0),
+                },
+            }
+        except Exception as e:
+            logger.error(f"[MOONSHOT] call_vision error: {e}")
+            return {"error": True, "message": str(e)}
+
# ==================== Agent mode support ====================
def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
@@ -435,31 +479,37 @@ class MoonshotBot(Bot):
continue
if role == "user":
- text_parts = []
- tool_results = []
+ has_tool_result = any(
+ isinstance(b, dict) and b.get("type") == "tool_result" for b in content
+ )
+ if has_tool_result:
+ text_parts = []
+ tool_results = []
- for block in content:
- if not isinstance(block, dict):
- continue
- if block.get("type") == "text":
- text_parts.append(block.get("text", ""))
- elif block.get("type") == "tool_result":
- tool_call_id = block.get("tool_use_id") or ""
- result_content = block.get("content", "")
- if not isinstance(result_content, str):
- result_content = json.dumps(result_content, ensure_ascii=False)
- tool_results.append({
- "role": "tool",
- "tool_call_id": tool_call_id,
- "content": result_content
- })
+ for block in content:
+ if not isinstance(block, dict):
+ continue
+ if block.get("type") == "text":
+ text_parts.append(block.get("text", ""))
+ elif block.get("type") == "tool_result":
+ tool_call_id = block.get("tool_use_id") or ""
+ result_content = block.get("content", "")
+ if not isinstance(result_content, str):
+ result_content = json.dumps(result_content, ensure_ascii=False)
+ tool_results.append({
+ "role": "tool",
+ "tool_call_id": tool_call_id,
+ "content": result_content
+ })
- # Tool results first (must come right after assistant with tool_calls)
- for tr in tool_results:
- converted.append(tr)
+ for tr in tool_results:
+ converted.append(tr)
- if text_parts:
- converted.append({"role": "user", "content": "\n".join(text_parts)})
+ if text_parts:
+ converted.append({"role": "user", "content": "\n".join(text_parts)})
+ else:
+ # Keep as-is for multimodal content (e.g. image_url blocks)
+ converted.append(msg)
elif role == "assistant":
openai_msg = {"role": "assistant"}
diff --git a/models/openai_compatible_bot.py b/models/openai_compatible_bot.py
index baac0681..6d4d314e 100644
--- a/models/openai_compatible_bot.py
+++ b/models/openai_compatible_bot.py
@@ -9,6 +9,8 @@ This includes: OpenAI, LinkAI, Azure OpenAI, and many third-party providers.
import json
import openai
+import requests
+from typing import Optional
from common.log import logger
from agent.protocol.message_utils import drop_orphaned_tool_results_openai
@@ -306,3 +308,51 @@ class OpenAICompatibleBot:
openai_messages.append(msg)
return drop_orphaned_tool_results_openai(openai_messages)
+
+    def call_vision(self, image_url: str, question: str,
+                    model: Optional[str] = None,
+                    max_tokens: int = 1000) -> dict:
+        """Analyze an image using the OpenAI-compatible /chat/completions endpoint.
+
+        Model, API key and base URL come from ``self.get_api_config()``; the
+        base URL falls back to ``https://api.openai.com/v1`` and the model to
+        ``gpt-4o``.
+
+        Args:
+            image_url: Value sent as the ``url`` of the ``image_url`` block.
+            question: Prompt text sent alongside the image.
+            model: Model override; takes precedence over the configured model.
+            max_tokens: Accepted for signature compatibility; it is not added
+                to the request payload. NOTE(review): confirm whether this
+                should be forwarded to the API.
+
+        Returns:
+            A dict with ``model``, ``content`` and ``usage`` on success, or
+            ``{"error": True, "message": ...}`` on any failure. Exceptions are
+            logged and converted to the error form rather than raised.
+        """
+        try:
+            api_config = self.get_api_config()
+            vision_model = model or api_config.get("model", "gpt-4o")
+            api_key = api_config.get("api_key", "")
+            api_base = (api_config.get("api_base") or "https://api.openai.com/v1").rstrip("/")
+
+            payload = {
+                "model": vision_model,
+                "messages": [{
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": question},
+                        {"type": "image_url", "image_url": {"url": image_url}},
+                    ],
+                }],
+            }
+            headers = {
+                "Authorization": f"Bearer {api_key}",
+                "Content-Type": "application/json",
+            }
+            resp = requests.post(
+                f"{api_base}/chat/completions",
+                headers=headers, json=payload, timeout=60,
+            )
+            if resp.status_code != 200:
+                body = resp.text[:500]
+                logger.error(f"[{self.__class__.__name__}] call_vision HTTP {resp.status_code}: {body}")
+                return {"error": True, "message": f"HTTP {resp.status_code}: {body}"}
+            data = resp.json()
+            # Report an error object delivered with HTTP 200 instead of
+            # returning it as an empty successful answer.
+            err = data.get("error")
+            if err:
+                message = err.get("message", str(err)) if isinstance(err, dict) else str(err)
+                return {"error": True, "message": message}
+            choices = data.get("choices", [{}])
+            if not choices:
+                return {"error": True, "message": "Response contained no choices"}
+            # "message", "content" and "usage" can be present but null.
+            content = (choices[0].get("message") or {}).get("content") or ""
+            usage = data.get("usage") or {}
+            return {
+                "model": vision_model,
+                "content": content,
+                "usage": {
+                    "prompt_tokens": usage.get("prompt_tokens", 0),
+                    "completion_tokens": usage.get("completion_tokens", 0),
+                    "total_tokens": usage.get("total_tokens", 0),
+                },
+            }
+        except Exception as e:
+            logger.error(f"[{self.__class__.__name__}] call_vision error: {e}")
+            return {"error": True, "message": str(e)}
diff --git a/models/zhipuai/zhipuai_bot.py b/models/zhipuai/zhipuai_bot.py
index 4733cf9b..98ea5db1 100644
--- a/models/zhipuai/zhipuai_bot.py
+++ b/models/zhipuai/zhipuai_bot.py
@@ -2,6 +2,7 @@
import time
import json
+from typing import Optional
from models.bot import Bot
from models.zhipuai.zhipu_ai_session import ZhipuAISession
@@ -149,6 +150,40 @@ class ZHIPUAIBot(Bot, ZhipuAIImage):
else:
return result
+    def call_vision(self, image_url: str, question: str,
+                    model: Optional[str] = None,
+                    max_tokens: int = 1000) -> dict:
+        """Answer a question about an image through the ZhipuAI client.
+
+        The request always targets ``glm-5v-turbo``; the ``model`` argument is
+        accepted but ignored because the text models (glm-5-turbo etc.) do not
+        support vision.
+
+        Returns a dict with ``model``, ``content`` and ``usage`` on success, or
+        ``{"error": True, "message": ...}`` when anything raises.
+        """
+        vision_model = "glm-5v-turbo"
+        user_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": question},
+                {"type": "image_url", "image_url": {"url": image_url}},
+            ],
+        }
+        try:
+            response = self.client.chat.completions.create(
+                model=vision_model,
+                max_tokens=max_tokens,
+                messages=[user_message],
+            )
+            answer = response.choices[0].message.content
+            if not answer:
+                answer = ""
+            stats = response.usage
+            # getattr with a default makes every counter 0 when a field is absent.
+            token_counts = {
+                field: getattr(stats, field, 0)
+                for field in ("prompt_tokens", "completion_tokens", "total_tokens")
+            }
+            return {"model": vision_model, "content": answer, "usage": token_counts}
+        except Exception as e:
+            logger.error(f"[ZHIPU_AI] call_vision error: {e}")
+            return {"error": True, "message": str(e)}
+
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
"""
Call ZhipuAI API with tool support for agent integration
diff --git a/tests/test_minimax_provider.py b/tests/test_minimax_provider.py
new file mode 100644
index 00000000..cfad7fd7
--- /dev/null
+++ b/tests/test_minimax_provider.py
@@ -0,0 +1,184 @@
+# encoding:utf-8
+"""
+Unit tests for MiniMax provider additions:
+ - MiniMax-M2.7-highspeed constant in const.py
+ - Default model update in MinimaxBot
+ - MinimaxVoice TTS provider
+"""
+import sys
+import os
+import json
+import unittest
+from unittest.mock import MagicMock, patch, PropertyMock
+
+# Add project root to path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+
+class TestMinimaxConst(unittest.TestCase):
+    """Checks that the MiniMax model constants are registered in const.py."""
+
+    def test_m2_7_highspeed_constant_defined(self):
+        """The highspeed constant exists and holds the exact model id."""
+        from common import const as const_module
+        expected = "MiniMax-M2.7-highspeed"
+        self.assertTrue(hasattr(const_module, "MINIMAX_M2_7_HIGHSPEED"))
+        self.assertEqual(const_module.MINIMAX_M2_7_HIGHSPEED, expected)
+
+    def test_m2_7_constant_defined(self):
+        """The base M2.7 constant holds the exact model id."""
+        from common import const as const_module
+        expected = "MiniMax-M2.7"
+        self.assertEqual(const_module.MINIMAX_M2_7, expected)
+
+    def test_m2_7_highspeed_in_model_list(self):
+        """MODEL_LIST contains the highspeed model id."""
+        from common import const as const_module
+        known_models = const_module.MODEL_LIST
+        self.assertIn("MiniMax-M2.7-highspeed", known_models)
+
+    def test_m2_7_in_model_list(self):
+        """MODEL_LIST contains the base M2.7 model id."""
+        from common import const as const_module
+        known_models = const_module.MODEL_LIST
+        self.assertIn("MiniMax-M2.7", known_models)
+
+    def test_minimax_provider_key_defined(self):
+        """The provider key constant is the lowercase string "minimax"."""
+        from common import const as const_module
+        provider_key = const_module.MiniMax
+        self.assertEqual(provider_key, "minimax")
+
+
+class TestMinimaxBotDefaultModel(unittest.TestCase):
+    """Test that MinimaxBot defaults to MiniMax-M2.7."""
+
+    def test_default_model_is_m2_7(self):
+        """With an empty config the fallback model name is MiniMax-M2.7.
+
+        NOTE(review): ``bot.args`` is assigned by this test itself, so the
+        assertion exercises the fallback expression written here rather than
+        MinimaxBot's own initialisation; consider building the bot normally.
+        """
+        # conf() stand-in whose get() always returns the supplied default
+        mock_conf = MagicMock()
+        mock_conf.get = MagicMock(side_effect=lambda key, default=None: default)
+
+        with patch("models.minimax.minimax_bot.conf", return_value=mock_conf):
+            with patch("models.minimax.minimax_bot.SessionManager"):
+                from models.minimax import minimax_bot
+                # Reloading re-executes the module, so conf is patched again below.
+                import importlib
+                importlib.reload(minimax_bot)
+                with patch("models.minimax.minimax_bot.conf", return_value=mock_conf):
+                    # __new__ skips __init__, so no real configuration is read.
+                    bot = minimax_bot.MinimaxBot.__new__(minimax_bot.MinimaxBot)
+                    bot.args = {
+                        "model": mock_conf.get("model") or "MiniMax-M2.7",
+                    }
+                    self.assertEqual(bot.args["model"], "MiniMax-M2.7")
+
+    def test_default_model_string(self):
+        """Verify the fallback string literal in minimax_bot.py is MiniMax-M2.7."""
+        bot_path = os.path.join(os.path.dirname(__file__), "..", "models", "minimax", "minimax_bot.py")
+        # Decode explicitly as UTF-8; the platform default codec may not be
+        # able to read the source file.
+        with open(bot_path, encoding="utf-8") as f:
+            source = f.read()
+        # Verify MiniMax-M2.7 is in the source (not M2.1)
+        self.assertIn("MiniMax-M2.7", source)
+        self.assertNotIn('"MiniMax-M2.1"', source)
+
+
+class TestMinimaxVoice(unittest.TestCase):
+    """Test MinimaxVoice TTS provider."""
+
+    @staticmethod
+    def _mock_conf(api_key="test-key", api_base="https://api.minimax.io"):
+        """Return a conf() stand-in that only knows the MiniMax key and base URL."""
+        values = {
+            "minimax_api_key": api_key,
+            "minimax_api_base": api_base,
+        }
+        mock_conf = MagicMock()
+        mock_conf.get = lambda key, default=None: values.get(key, default)
+        return mock_conf
+
+    def _make_voice(self, api_key="test-key", api_base="https://api.minimax.io/v1"):
+        """Construct a MinimaxVoice while conf() is replaced by the stub."""
+        mock_conf = self._mock_conf(api_key, api_base)
+        with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf):
+            from voice.minimax.minimax_voice import MinimaxVoice
+            return MinimaxVoice()
+
+    def _text_to_voice(self, text, sse_lines):
+        """Call textToVoice(text) with requests.post mocked to stream sse_lines."""
+        mock_response = MagicMock()
+        mock_response.raise_for_status = MagicMock()
+        mock_response.iter_lines.return_value = sse_lines
+        mock_conf = self._mock_conf()
+
+        with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf):
+            with patch("voice.minimax.minimax_voice.requests.post", return_value=mock_response):
+                from voice.minimax import minimax_voice
+                import importlib
+                # Reloading re-executes the module, so conf is patched again below.
+                importlib.reload(minimax_voice)
+                with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf):
+                    voice = minimax_voice.MinimaxVoice()
+                    return voice.textToVoice(text)
+
+    def test_instantiation(self):
+        voice = self._make_voice()
+        self.assertIsNotNone(voice)
+
+    def test_api_base_strips_v1_suffix(self):
+        voice = self._make_voice(api_base="https://api.minimax.io/v1")
+        self.assertEqual(voice.api_base, "https://api.minimax.io")
+
+    def test_api_base_no_trailing_slash(self):
+        voice = self._make_voice(api_base="https://api.minimax.io")
+        self.assertEqual(voice.api_base, "https://api.minimax.io")
+
+    def test_voice_to_text_not_supported(self):
+        voice = self._make_voice()
+        with self.assertRaises(NotImplementedError):
+            voice.voiceToText("dummy.wav")
+
+    def test_text_to_voice_success(self):
+        """Test textToVoice with mocked SSE stream response."""
+        os.makedirs("tmp", exist_ok=True)
+
+        # Build the fake SSE stream as the byte lines iter_lines() would yield
+        audio_hex = bytes([0x49, 0x44, 0x33]).hex()  # "ID3" MP3 magic bytes
+        sse_line = f'data: {{"data": {{"audio": "{audio_hex}", "status": 2}}}}\n\n'
+        done_line = "data: [DONE]\n\n"
+        sse_lines = [
+            line.encode("utf-8") for line in (sse_line + done_line).splitlines() if line
+        ]
+
+        reply = self._text_to_voice("Hello, world!", sse_lines)
+        # Remove the generated audio file so test runs do not pile up in tmp/.
+        if os.path.isfile(reply.content):
+            self.addCleanup(os.remove, reply.content)
+
+        from bridge.reply import ReplyType
+        self.assertEqual(reply.type, ReplyType.VOICE)
+        self.assertTrue(reply.content.endswith(".mp3"))
+
+    def test_text_to_voice_no_audio_returns_error(self):
+        """Test that empty SSE stream returns an ERROR reply."""
+        reply = self._text_to_voice("Hello", [])
+
+        from bridge.reply import ReplyType
+        self.assertEqual(reply.type, ReplyType.ERROR)
+
+
+class TestVoiceFactory(unittest.TestCase):
+    """Checks that the voice factory knows the "minimax" provider."""
+
+    def test_minimax_voice_factory(self):
+        """create_voice("minimax") yields a MinimaxVoice instance."""
+        # Every config lookup answers None while the voice is constructed.
+        empty_conf = MagicMock()
+        empty_conf.get = MagicMock(return_value=None)
+        conf_patch = patch("voice.minimax.minimax_voice.conf", return_value=empty_conf)
+        with conf_patch:
+            from voice.factory import create_voice
+            created = create_voice("minimax")
+            from voice.minimax.minimax_voice import MinimaxVoice
+            self.assertIsInstance(created, MinimaxVoice)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/voice/factory.py b/voice/factory.py
index 8562f634..abe7ba57 100644
--- a/voice/factory.py
+++ b/voice/factory.py
@@ -54,4 +54,8 @@ def create_voice(voice_type):
from voice.tencent.tencent_voice import TencentVoice
return TencentVoice()
+ elif voice_type == "minimax":
+ from voice.minimax.minimax_voice import MinimaxVoice
+
+ return MinimaxVoice()
raise RuntimeError
diff --git a/voice/minimax/__init__.py b/voice/minimax/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/voice/minimax/minimax_voice.py b/voice/minimax/minimax_voice.py
new file mode 100644
index 00000000..1446a3f1
--- /dev/null
+++ b/voice/minimax/minimax_voice.py
@@ -0,0 +1,106 @@
+# encoding:utf-8
+"""
+MiniMax TTS voice service
+"""
+import datetime
+import random
+import requests
+
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from config import conf
+from voice.voice import Voice
+
+
+MINIMAX_TTS_VOICES = [
+ "English_Graceful_Lady",
+ "English_Insightful_Speaker",
+ "English_radiant_girl",
+ "English_Persuasive_Man",
+ "English_Lucky_Robot",
+ "English_expressive_narrator",
+ "Chinese_Warm_Woman",
+ "Chinese_Gentle_Man",
+]
+
+
+class MinimaxVoice(Voice):
+    """Text-to-speech provider that calls the MiniMax ``/v1/t2a_v2`` endpoint."""
+
+    def __init__(self):
+        """Read the API key and base URL from config and normalise the base URL."""
+        self.api_key = conf().get("minimax_api_key")
+        self.api_base = conf().get("minimax_api_base") or "https://api.minimax.io"
+        # Strip trailing /v1 if present so we can always append /v1/t2a_v2
+        self.api_base = self.api_base.rstrip("/")
+        if self.api_base.endswith("/v1"):
+            self.api_base = self.api_base[:-3]
+
+    def voiceToText(self, voice_file):
+        """Not supported by this provider; always raises NotImplementedError."""
+        raise NotImplementedError("MiniMax voice-to-text is not supported")
+
+    def textToVoice(self, text):
+        """Synthesize ``text`` to an MP3 file and return it as a voice reply.
+
+        Streams the request to ``{api_base}/v1/t2a_v2``, decodes the
+        hex-encoded ``data.audio`` field of each SSE ``data:`` event, and
+        writes the joined bytes to ``tmp/<timestamp><random>.mp3``.
+
+        Returns:
+            ``Reply(ReplyType.VOICE, <file path>)`` on success, otherwise a
+            ``Reply(ReplyType.ERROR, ...)``. Exceptions are logged, not raised.
+        """
+        import json
+        import os
+
+        try:
+            model = conf().get("text_to_voice_model") or "speech-2.8-hd"
+            voice_id = conf().get("tts_voice_id") or "English_Graceful_Lady"
+
+            url = f"{self.api_base}/v1/t2a_v2"
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.api_key}",
+            }
+            payload = {
+                "model": model,
+                "text": text,
+                "stream": True,
+                "voice_setting": {
+                    "voice_id": voice_id,
+                    "speed": 1,
+                    "vol": 1,
+                    "pitch": 0,
+                },
+                "audio_setting": {
+                    "sample_rate": 32000,
+                    "bitrate": 128000,
+                    "format": "mp3",
+                    "channel": 1,
+                },
+            }
+
+            # Parse SSE stream and collect hex-encoded audio chunks
+            audio_chunks = []
+            response = requests.post(url, headers=headers, json=payload, stream=True, timeout=60)
+            try:
+                response.raise_for_status()
+                # NOTE(review): every event carrying data.audio is appended. If
+                # the API's final event repeats the complete audio this would
+                # duplicate it; confirm against the MiniMax T2A v2 docs.
+                for raw in response.iter_lines():
+                    if not raw:
+                        continue
+                    line = raw.decode("utf-8") if isinstance(raw, bytes) else raw
+                    if not line.startswith("data:"):
+                        continue
+                    json_str = line[5:].strip()
+                    if not json_str or json_str == "[DONE]":
+                        continue
+                    try:
+                        event_data = json.loads(json_str)
+                        audio_hex = event_data.get("data", {}).get("audio")
+                        if audio_hex:
+                            audio_chunks.append(bytes.fromhex(audio_hex))
+                    except (ValueError, AttributeError, TypeError) as e:
+                        # Best effort: skip an event that is not valid JSON, not
+                        # the expected shape, or not valid hex, but leave a trace.
+                        logger.warning(f"[MINIMAX] skipping malformed TTS event: {e}")
+                        continue
+            finally:
+                # A streamed response holds its connection until it is closed.
+                response.close()
+
+            if not audio_chunks:
+                logger.error("[MINIMAX] TTS returned no audio data")
+                return Reply(ReplyType.ERROR, "语音合成失败,未获取到音频数据")
+
+            audio_data = b"".join(audio_chunks)
+            # The relative tmp/ directory is not guaranteed to exist yet.
+            os.makedirs("tmp", exist_ok=True)
+            file_name = "tmp/" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + str(random.randint(0, 1000)) + ".mp3"
+            with open(file_name, "wb") as f:
+                f.write(audio_data)
+
+            logger.info(f"[MINIMAX] textToVoice success, file={file_name}")
+            return Reply(ReplyType.VOICE, file_name)
+
+        except Exception as e:
+            logger.error(f"[MINIMAX] textToVoice error: {e}")
+            return Reply(ReplyType.ERROR, "遇到了一点小问题,请稍后再试")