feat(vision): prioritize main model for image recognition with multi-provider fallback

- Add call_vision method to all bot implementations (DashScope, Claude,
  Gemini, ZhipuAI, MiniMax, Doubao, Moonshot, OpenAICompatibleBot)
  using each vendor's native multimodal API format
- Remove call_with_tools/call_vision from Bot base class to fix MRO
  shadowing issue with OpenAICompatibleBot mixin
- Refactor vision tool provider resolution: MainModel → other configured
  models (auto-discovered) → OpenAI → LinkAI, with automatic fallback
- Return actual model name used in call_vision responses
- Sync config.json API keys to .env bidirectionally on startup
- Fix bot instance cache to detect bot_type/use_linkai config changes
- Add SSE reconnection support for web console
- Preserve image path hints in Gemini text for correct vision tool calls
- Update docs/tools/vision.mdx
This commit is contained in:
zhayujie
2026-04-11 19:46:11 +08:00
parent 3cd92ccda3
commit 26693acc3f
17 changed files with 1173 additions and 359 deletions

View File

@@ -1,7 +1,13 @@
""" """
Vision tool - Analyze images using OpenAI-compatible Vision API. Vision tool - Analyze images using Vision API.
Supports local files (auto base64-encoded) and HTTP URLs. Supports local files (auto base64-encoded) and HTTP URLs.
Providers are tried in priority order with automatic fallback on failure.
Provider priority (default):
1. Main model via bot.call_vision — zero extra cost
2. Other models whose API key is configured — auto-discovered
3. OpenAI / LinkAI raw HTTP — reliable fallback
When use_linkai=true, LinkAI is promoted to #1.
When tool.vision.model is set, the main-model provider uses that model instead of the configured main model.
""" """
import base64 import base64
@@ -14,10 +20,11 @@ from typing import Any, Dict, List, Optional
import requests import requests
from agent.tools.base_tool import BaseTool, ToolResult from agent.tools.base_tool import BaseTool, ToolResult
from common import const
from common.log import logger from common.log import logger
from config import conf from config import conf
DEFAULT_MODEL = "gpt-4.1-mini" DEFAULT_MODEL = const.GPT_41_MINI
DEFAULT_TIMEOUT = 60 DEFAULT_TIMEOUT = 60
MAX_TOKENS = 1000 MAX_TOKENS = 1000
COMPRESS_THRESHOLD = 1_048_576 # 1 MB COMPRESS_THRESHOLD = 1_048_576 # 1 MB
@@ -30,8 +37,20 @@ SUPPORTED_EXTENSIONS = {
"webp": "image/webp", "webp": "image/webp",
} }
_MAIN_MODEL_PROVIDER_NAME = "MainModel"
OPENAI_COMPATIBLE_BOT_TYPES = {"openai", "openAI", "chatGPT"} # (config_key_for_api_key, bot_type, default_vision_model, provider_display_name)
# Auto-discovered as fallback vision providers when their API key is configured.
# OpenAI and LinkAI are handled separately (raw HTTP providers), so not listed here.
_DISCOVERABLE_MODELS = [
("moonshot_api_key", const.MOONSHOT, const.KIMI_K2_5, "Moonshot"),
("ark_api_key", const.DOUBAO, const.DOUBAO_SEED_2_PRO, "Doubao"),
("dashscope_api_key", const.QWEN_DASHSCOPE, const.QWEN36_PLUS, "DashScope"),
("claude_api_key", const.CLAUDEAPI, const.CLAUDE_4_6_SONNET, "Claude"),
("gemini_api_key", const.GEMINI, const.GEMINI_31_FLASH_LITE_PRE, "Gemini"),
("zhipu_ai_api_key", const.ZHIPU_AI, const.GLM_4_7, "ZhipuAI"),
("minimax_api_key", const.MiniMax, const.MINIMAX_M2_7, "MiniMax"),
]
@dataclass @dataclass
@@ -42,6 +61,8 @@ class VisionProvider:
api_base: str api_base: str
extra_headers: dict = field(default_factory=dict) extra_headers: dict = field(default_factory=dict)
model_override: Optional[str] = None model_override: Optional[str] = None
use_bot: bool = False # When True, call via bot.call_vision instead of raw HTTP
fallback_bot: Any = None # Bot instance for non-main-model providers
class VisionAPIError(Exception): class VisionAPIError(Exception):
@@ -50,13 +71,12 @@ class VisionAPIError(Exception):
class Vision(BaseTool): class Vision(BaseTool):
"""Analyze images using OpenAI-compatible Vision API""" """Analyze images using Vision API"""
name: str = "vision" name: str = "vision"
description: str = ( description: str = (
"Analyze a local image or image URL (jpg/jpeg/png) using Vision API. " "Analyze a local image or image URL (jpg/jpeg/png) using Vision API. "
"Can describe content, extract text, identify objects, colors, etc. " "Can describe content, extract text, identify objects, colors, etc. "
"Requires OPENAI_API_KEY or LINKAI_API_KEY."
) )
params: dict = { params: dict = {
@@ -70,13 +90,6 @@ class Vision(BaseTool):
"type": "string", "type": "string",
"description": "Question to ask about the image", "description": "Question to ask about the image",
}, },
"model": {
"type": "string",
"description": (
f"Vision model to use (default: {DEFAULT_MODEL}). "
"Options: gpt-4.1-mini, gpt-4.1, gpt-4o-mini, gpt-4o"
),
},
}, },
"required": ["image", "question"], "required": ["image", "question"],
} }
@@ -86,15 +99,11 @@ class Vision(BaseTool):
@staticmethod @staticmethod
def is_available() -> bool: def is_available() -> bool:
return bool( return True
conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY")
or conf().get("linkai_api_key") or os.environ.get("LINKAI_API_KEY")
)
def execute(self, args: Dict[str, Any]) -> ToolResult: def execute(self, args: Dict[str, Any]) -> ToolResult:
image = args.get("image", "").strip() image = args.get("image", "").strip()
question = args.get("question", "").strip() question = args.get("question", "").strip()
model = args.get("model", DEFAULT_MODEL).strip() or DEFAULT_MODEL
if not image: if not image:
return ToolResult.fail("Error: 'image' parameter is required") return ToolResult.fail("Error: 'image' parameter is required")
@@ -104,11 +113,12 @@ class Vision(BaseTool):
providers = self._resolve_providers() providers = self._resolve_providers()
if not providers: if not providers:
return ToolResult.fail( return ToolResult.fail(
"Error: No API key configured for Vision.\n" "Error: No model available for Vision.\n"
"Please configure one of the following using env_config tool:\n" "The main model does not support vision and no other API keys are configured.\n"
" 1. OPENAI_API_KEY (preferred): env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n" "Options:\n"
" 2. LINKAI_API_KEY (fallback): env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")\n\n" " 1. Switch to a multimodal model (e.g. qwen3.6-plus, claude-sonnet-4-6, gemini-2.0-flash)\n"
"Get your key at: https://platform.openai.com/api-keys or https://link-ai.tech" " 2. Configure OPENAI_API_KEY: env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n"
" 3. Configure LINKAI_API_KEY: env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")"
) )
try: try:
@@ -116,7 +126,7 @@ class Vision(BaseTool):
except Exception as e: except Exception as e:
return ToolResult.fail(f"Error: {e}") return ToolResult.fail(f"Error: {e}")
return self._call_with_fallback(providers, model, question, image_content) return self._call_with_fallback(providers, DEFAULT_MODEL, question, image_content)
def _call_with_fallback(self, providers: List[VisionProvider], model: str, def _call_with_fallback(self, providers: List[VisionProvider], model: str,
question: str, image_content: dict) -> ToolResult: question: str, image_content: dict) -> ToolResult:
@@ -125,9 +135,14 @@ class Vision(BaseTool):
for i, provider in enumerate(providers): for i, provider in enumerate(providers):
use_model = provider.model_override or model use_model = provider.model_override or model
try: try:
logger.debug(f"[Vision] Trying provider '{provider.name}' " logger.info(f"[Vision] Trying provider '{provider.name}' "
f"with model '{use_model}' ({i + 1}/{len(providers)})") f"with model '{use_model}' ({i + 1}/{len(providers)})")
return self._call_api(provider, use_model, question, image_content) if provider.use_bot:
result = self._call_via_bot(use_model, question, image_content, provider)
else:
result = self._call_api(provider, use_model, question, image_content)
logger.info(f"[Vision] ✅ Success via {provider.name} (model={use_model})")
return result
except VisionAPIError as e: except VisionAPIError as e:
errors.append(f"[{provider.name}/{use_model}] {e}") errors.append(f"[{provider.name}/{use_model}] {e}")
logger.warning(f"[Vision] Provider '{provider.name}' failed: {e}") logger.warning(f"[Vision] Provider '{provider.name}' failed: {e}")
@@ -148,35 +163,113 @@ class Vision(BaseTool):
def _resolve_providers(self) -> List[VisionProvider]: def _resolve_providers(self) -> List[VisionProvider]:
""" """
Build an ordered list of available providers. Build an ordered list of available providers.
Each provider builder returns a VisionProvider or None.
To add a new provider, append a builder method to _PROVIDER_BUILDERS. Priority:
- use_linkai=true → [LinkAI, MainModel, OtherModels…, OpenAI]
- default → [MainModel, OtherModels…, OpenAI, LinkAI]
"OtherModels" are auto-discovered from configured API keys.
The main model's bot_type is excluded from OtherModels to avoid
duplicating the MainModel provider.
""" """
use_linkai = conf().get("use_linkai", False) and conf().get("linkai_api_key")
providers: List[VisionProvider] = [] providers: List[VisionProvider] = []
for builder in self._PROVIDER_BUILDERS:
provider = builder(self) if use_linkai:
if provider: self._append_provider(providers, self._build_linkai_provider)
providers.append(provider) self._append_provider(providers, self._build_main_model_provider)
self._append_other_model_providers(providers)
self._append_provider(providers, self._build_openai_provider)
else:
self._append_provider(providers, self._build_main_model_provider)
self._append_other_model_providers(providers)
self._append_provider(providers, self._build_openai_provider)
self._append_provider(providers, self._build_linkai_provider)
return providers return providers
def _build_custom_model_provider(self) -> Optional[VisionProvider]: @staticmethod
def _append_provider(providers: List[VisionProvider], builder) -> None:
p = builder()
if p:
providers.append(p)
def _append_other_model_providers(self, providers: List[VisionProvider]) -> None:
""" """
When bot_type is openai-compatible and a custom model is configured, Auto-discover other models whose API key is configured.
try the user's own model first — it may already support multimodal input. Skip the main model's own bot_type (already covered by MainModel provider).
Skip bot_types that already have a provider in the list (e.g. OpenAI).
""" """
bot_type = conf().get("bot_type", "") # Determine main model's bot_type so we can skip it
if bot_type not in OPENAI_COMPATIBLE_BOT_TYPES: main_bot_type = None
if self.model and hasattr(self.model, '_resolve_bot_type'):
main_bot_type = self.model._resolve_bot_type(conf().get("model", ""))
existing_names = {p.name for p in providers}
for config_key, bot_type, default_model, display_name in _DISCOVERABLE_MODELS:
if display_name in existing_names:
continue
if bot_type == main_bot_type:
continue
api_key = conf().get(config_key, "")
if not api_key or not api_key.strip():
continue
# Create a bot instance and check if it supports call_vision
try:
from models.bot_factory import create_bot
bot = create_bot(bot_type)
if not hasattr(bot, 'call_vision'):
continue
except Exception:
continue
providers.append(VisionProvider(
name=display_name,
api_key="",
api_base="",
model_override=default_model,
use_bot=True,
fallback_bot=bot,
))
def _resolve_vision_model(self) -> Optional[str]:
    """
    Determine which model name to request for vision.

    Resolution order:
      1. ``tool.vision.model`` from the configuration, when set.
      2. The main configured ``model`` name.

    Returns:
        The resolved model name, or None when neither setting is present.
    """
    tool_conf = conf().get("tool", {})
    # dict.get only falls back to its default when the key is absent, so a
    # present-but-null (or otherwise non-dict) "tool" / "vision" entry has
    # to be checked explicitly before calling .get on it.
    vision_conf = tool_conf.get("vision") if isinstance(tool_conf, dict) else None
    user_vision_model = vision_conf.get("model") if isinstance(vision_conf, dict) else None
    if user_vision_model:
        return user_vision_model

    model_name = conf().get("model", "")
    return model_name or None
def _build_main_model_provider(self) -> Optional[VisionProvider]:
"""
Use the vendor's own model for vision via bot.call_vision.
Only available when the bot class has call_vision.
"""
if not (self.model and hasattr(self.model, 'bot')):
return None return None
custom_model = conf().get("model", "") try:
if not custom_model or custom_model == DEFAULT_MODEL: bot = self.model.bot
if not hasattr(bot, 'call_vision'):
return None
except Exception:
return None return None
api_key = conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY")
if not api_key: vision_model = self._resolve_vision_model()
return None
api_base = (conf().get("open_ai_api_base") or os.environ.get("OPENAI_API_BASE", "")).rstrip("/") \
or "https://api.openai.com/v1"
return VisionProvider( return VisionProvider(
name="CustomModel", api_key=api_key, api_base=self._ensure_v1(api_base), name=_MAIN_MODEL_PROVIDER_NAME,
model_override=custom_model, api_key="",
api_base="",
model_override=vision_model,
use_bot=True,
) )
def _build_openai_provider(self) -> Optional[VisionProvider]: def _build_openai_provider(self) -> Optional[VisionProvider]:
@@ -200,7 +293,54 @@ class Vision(BaseTool):
return VisionProvider(name="LinkAI", api_key=api_key, api_base=self._ensure_v1(api_base), return VisionProvider(name="LinkAI", api_key=api_key, api_base=self._ensure_v1(api_base),
extra_headers=extra) extra_headers=extra)
_PROVIDER_BUILDERS = [_build_custom_model_provider, _build_openai_provider, _build_linkai_provider] def _call_via_bot(self, model: str, question: str, image_content: dict,
provider: Optional[VisionProvider] = None) -> ToolResult:
"""
Call a model's call_vision with vendor-native API format.
Uses the provider's fallback_bot if set, otherwise the main model bot.
Raises VisionAPIError on failure so fallback can proceed.
"""
try:
bot = (provider and provider.fallback_bot) or self.model.bot
except Exception as e:
raise VisionAPIError(f"Cannot access bot: {e}")
# Extract the raw image URL from the OpenAI-format image_content block
image_url = image_content.get("image_url", {}).get("url", "")
if not image_url:
raise VisionAPIError("No image URL in content block")
try:
response = bot.call_vision(
image_url=image_url,
question=question,
model=model,
max_tokens=MAX_TOKENS,
)
except Exception as e:
raise VisionAPIError(f"call_vision failed: {e}")
if response is NotImplemented:
raise VisionAPIError("Bot does not support vision")
if isinstance(response, dict) and response.get("error"):
raise VisionAPIError(f"API error - {response.get('message', 'Unknown')}")
content = response.get("content", "") if isinstance(response, dict) else ""
if not content:
raise VisionAPIError("Empty response from main model")
usage_info = response.get("usage", {}) if isinstance(response, dict) else {}
# Use the actual model name from the bot response if available
actual_model = response.get("model", model) if isinstance(response, dict) else model
provider_name = provider.name if provider else _MAIN_MODEL_PROVIDER_NAME
return ToolResult.success({
"model": actual_model,
"provider": provider_name,
"content": content,
"usage": usage_info,
})
@staticmethod @staticmethod
def _ensure_v1(api_base: str) -> str: def _ensure_v1(api_base: str) -> str:
@@ -213,9 +353,13 @@ class Vision(BaseTool):
return api_base.rstrip("/") + "/v1" return api_base.rstrip("/") + "/v1"
def _build_image_content(self, image: str) -> dict: def _build_image_content(self, image: str) -> dict:
"""Build the image_url content block for the API request.""" """
Build the image_url content block.
Both remote URLs and local files are converted to base64 data URLs
so every bot backend can consume them without extra downloads.
"""
if image.startswith(("http://", "https://")): if image.startswith(("http://", "https://")):
return {"type": "image_url", "image_url": {"url": image}} return self._download_to_data_url(image)
if not os.path.isfile(image): if not os.path.isfile(image):
raise FileNotFoundError(f"Image file not found: {image}") raise FileNotFoundError(f"Image file not found: {image}")
@@ -239,6 +383,19 @@ class Vision(BaseTool):
data_url = f"data:{mime_type};base64,{b64}" data_url = f"data:{mime_type};base64,{b64}"
return {"type": "image_url", "image_url": {"url": data_url}} return {"type": "image_url", "image_url": {"url": data_url}}
@staticmethod
def _download_to_data_url(url: str) -> dict:
    """
    Download a remote image and return it as a base64 data URL block.

    Args:
        url: HTTP(S) address of the image.

    Returns:
        An image_url content block of the form
        {"type": "image_url", "image_url": {"url": "data:<mime>;base64,<payload>"}}.

    Raises:
        VisionAPIError: if the request itself fails, the server answers
            with a non-200 status, or the response body is empty.
    """
    try:
        resp = requests.get(url, timeout=30)
    except requests.RequestException as e:
        # Report connection/timeout problems with the same exception type
        # as the HTTP-status failure below.
        raise VisionAPIError(f"Failed to download image: {e}") from e
    if resp.status_code != 200:
        raise VisionAPIError(f"Failed to download image: HTTP {resp.status_code}")
    if not resp.content:
        # An empty payload would yield a data URL with no image data.
        raise VisionAPIError("Failed to download image: empty response body")

    # Keep only the media type, dropping parameters such as "; charset=...".
    content_type = resp.headers.get("Content-Type", "image/jpeg").split(";")[0].strip()
    if not content_type.startswith("image/"):
        # Non-image or missing media type: fall back to JPEG.
        content_type = "image/jpeg"
    b64 = base64.b64encode(resp.content).decode("ascii")
    data_url = f"data:{content_type};base64,{b64}"
    return {"type": "image_url", "image_url": {"url": data_url}}
@staticmethod @staticmethod
def _maybe_compress(path: str) -> str: def _maybe_compress(path: str) -> str:
"""Compress image to under COMPRESS_THRESHOLD with max long-edge 1536px.""" """Compress image to under COMPRESS_THRESHOLD with max long-edge 1536px."""
@@ -312,7 +469,6 @@ class Vision(BaseTool):
], ],
} }
], ],
"max_completion_tokens": MAX_TOKENS,
} }
headers = { headers = {

View File

@@ -124,14 +124,15 @@ class AgentLLMModel(LLMModel):
@property @property
def bot(self): def bot(self):
"""Lazy load the bot, re-create when model changes""" """Lazy load the bot, re-create when model or bot_type changes"""
from models.bot_factory import create_bot from models.bot_factory import create_bot
cur_model = self.model cur_model = self.model
if self._bot is None or self._bot_model != cur_model: cur_bot_type = self._resolve_bot_type(cur_model)
bot_type = self._resolve_bot_type(cur_model) if self._bot is None or self._bot_model != cur_model or getattr(self, '_bot_type', None) != cur_bot_type:
self._bot = create_bot(bot_type) self._bot = create_bot(cur_bot_type)
self._bot = add_openai_compatible_support(self._bot) self._bot = add_openai_compatible_support(self._bot)
self._bot_model = cur_model self._bot_model = cur_model
self._bot_type = cur_bot_type
return self._bot return self._bot
def call(self, request: LLMRequest): def call(self, request: LLMRequest):
@@ -505,15 +506,15 @@ class AgentBridge:
def _migrate_config_to_env(self, workspace_root: str): def _migrate_config_to_env(self, workspace_root: str):
""" """
Migrate API keys from config.json to .env file if not already set Sync API keys from config.json to .env file.
Adds new keys and updates changed values on each startup.
Args: Args:
workspace_root: Workspace directory path (not used, kept for compatibility) workspace_root: Workspace directory path (not used, kept for compatibility)
""" """
from config import conf from config import conf
import os import os
# Mapping from config.json keys to environment variable names
key_mapping = { key_mapping = {
"open_ai_api_key": "OPENAI_API_KEY", "open_ai_api_key": "OPENAI_API_KEY",
"open_ai_api_base": "OPENAI_API_BASE", "open_ai_api_base": "OPENAI_API_BASE",
@@ -522,10 +523,9 @@ class AgentBridge:
"linkai_api_key": "LINKAI_API_KEY", "linkai_api_key": "LINKAI_API_KEY",
} }
# Use fixed secure location for .env file
env_file = expand_path("~/.cow/.env") env_file = expand_path("~/.cow/.env")
# Read existing env vars from .env file # Read existing env vars (key -> value)
existing_env_vars = {} existing_env_vars = {}
if os.path.exists(env_file): if os.path.exists(env_file):
try: try:
@@ -533,48 +533,46 @@ class AgentBridge:
for line in f: for line in f:
line = line.strip() line = line.strip()
if line and not line.startswith('#') and '=' in line: if line and not line.startswith('#') and '=' in line:
key, _ = line.split('=', 1) key, val = line.split('=', 1)
existing_env_vars[key.strip()] = True existing_env_vars[key.strip()] = val.strip()
except Exception as e: except Exception as e:
logger.warning(f"[AgentBridge] Failed to read .env file: {e}") logger.warning(f"[AgentBridge] Failed to read .env file: {e}")
# Check which keys need to be migrated # Sync config.json values into .env (add/update/remove)
keys_to_migrate = {} updated = False
for config_key, env_key in key_mapping.items(): for config_key, env_key in key_mapping.items():
# Skip if already in .env file raw = conf().get(config_key, "")
if env_key in existing_env_vars: value = raw.strip() if raw else ""
continue old_value = existing_env_vars.get(env_key)
# Get value from config.json if value:
value = conf().get(config_key, "") if old_value == value:
if value and value.strip(): # Only migrate non-empty values continue
keys_to_migrate[env_key] = value.strip() existing_env_vars[env_key] = value
os.environ[env_key] = value
# Log summary if there are keys to skip updated = True
if existing_env_vars: else:
logger.debug(f"[AgentBridge] {len(existing_env_vars)} env vars already in .env") if old_value is None:
continue
# Write new keys to .env file existing_env_vars.pop(env_key, None)
if keys_to_migrate: os.environ.pop(env_key, None)
updated = True
updated = True
if updated:
try: try:
# Ensure ~/.cow directory and .env file exist
env_dir = os.path.dirname(env_file) env_dir = os.path.dirname(env_file)
if not os.path.exists(env_dir): os.makedirs(env_dir, exist_ok=True)
os.makedirs(env_dir, exist_ok=True)
if not os.path.exists(env_file): with open(env_file, 'w', encoding='utf-8') as f:
open(env_file, 'a').close() f.write('# Environment variables for agent\n')
f.write('# Auto-managed - synced from config.json on startup\n\n')
# Append new keys for key, value in sorted(existing_env_vars.items()):
with open(env_file, 'a', encoding='utf-8') as f:
f.write('\n# Auto-migrated from config.json\n')
for key, value in keys_to_migrate.items():
f.write(f'{key}={value}\n') f.write(f'{key}={value}\n')
# Also set in current process
os.environ[key] = value logger.info(f"[AgentBridge] Synced API keys from config.json to .env")
logger.info(f"[AgentBridge] Migrated {len(keys_to_migrate)} API keys from config.json to .env: {list(keys_to_migrate.keys())}")
except Exception as e: except Exception as e:
logger.warning(f"[AgentBridge] Failed to migrate API keys: {e}") logger.warning(f"[AgentBridge] Failed to sync API keys: {e}")
def _persist_messages( def _persist_messages(
self, session_id: str, new_messages: list, channel_type: str = "" self, session_id: str, new_messages: list, channel_type: str = ""

View File

@@ -490,7 +490,7 @@ class AgentInitializer:
env_file = expand_path("~/.cow/.env") env_file = expand_path("~/.cow/.env")
# Read existing env vars # Read existing env vars (key -> value)
existing_env_vars = {} existing_env_vars = {}
if os.path.exists(env_file): if os.path.exists(env_file):
try: try:
@@ -498,38 +498,46 @@ class AgentInitializer:
for line in f: for line in f:
line = line.strip() line = line.strip()
if line and not line.startswith('#') and '=' in line: if line and not line.startswith('#') and '=' in line:
key, _ = line.split('=', 1) key, val = line.split('=', 1)
existing_env_vars[key.strip()] = True existing_env_vars[key.strip()] = val.strip()
except Exception as e: except Exception as e:
logger.warning(f"[AgentInitializer] Failed to read .env file: {e}") logger.warning(f"[AgentInitializer] Failed to read .env file: {e}")
# Check which keys need migration # Sync config.json values into .env (add/update/remove)
keys_to_migrate = {} updated = False
for config_key, env_key in key_mapping.items(): for config_key, env_key in key_mapping.items():
if env_key in existing_env_vars: raw = conf().get(config_key, "")
continue value = raw.strip() if raw else ""
value = conf().get(config_key, "") old_value = existing_env_vars.get(env_key)
if value and value.strip():
keys_to_migrate[env_key] = value.strip() if value:
if old_value == value:
# Write new keys continue
if keys_to_migrate: existing_env_vars[env_key] = value
os.environ[env_key] = value
updated = True
else:
if old_value is None:
continue
existing_env_vars.pop(env_key, None)
os.environ.pop(env_key, None)
updated = True
if updated:
try: try:
env_dir = os.path.dirname(env_file) env_dir = os.path.dirname(env_file)
if not os.path.exists(env_dir): os.makedirs(env_dir, exist_ok=True)
os.makedirs(env_dir, exist_ok=True)
if not os.path.exists(env_file): # Rewrite the entire .env file to ensure consistency
open(env_file, 'a').close() with open(env_file, 'w', encoding='utf-8') as f:
f.write('# Environment variables for agent\n')
with open(env_file, 'a', encoding='utf-8') as f: f.write('# Auto-managed - synced from config.json on startup\n\n')
f.write('\n# Auto-migrated from config.json\n') for key, value in sorted(existing_env_vars.items()):
for key, value in keys_to_migrate.items():
f.write(f'{key}={value}\n') f.write(f'{key}={value}\n')
os.environ[key] = value
logger.info(f"[AgentInitializer] Synced API keys from config.json to .env")
logger.info(f"[AgentInitializer] Migrated {len(keys_to_migrate)} API keys to .env: {list(keys_to_migrate.keys())}")
except Exception as e: except Exception as e:
logger.warning(f"[AgentInitializer] Failed to migrate API keys: {e}") logger.warning(f"[AgentInitializer] Failed to sync API keys: {e}")
def _start_daily_flush_timer(self): def _start_daily_flush_timer(self):
"""Start a background thread that flushes all agents' memory daily at 23:55.""" """Start a background thread that flushes all agents' memory daily at 23:55."""

View File

@@ -806,15 +806,17 @@ function sendMessage() {
} }
function startSSE(requestId, loadingEl, timestamp) { function startSSE(requestId, loadingEl, timestamp) {
const es = new EventSource(`/stream?request_id=${encodeURIComponent(requestId)}`);
activeStreams[requestId] = es;
let botEl = null; let botEl = null;
let stepsEl = null; // .agent-steps (thinking summaries + tool indicators) let stepsEl = null; // .agent-steps (thinking summaries + tool indicators)
let contentEl = null; // .answer-content (final streaming answer) let contentEl = null; // .answer-content (final streaming answer)
let mediaEl = null; // .media-content (images & file attachments) let mediaEl = null; // .media-content (images & file attachments)
let accumulatedText = ''; let accumulatedText = '';
let currentToolEl = null; let currentToolEl = null;
let done = false;
const MAX_RECONNECTS = 10;
const RECONNECT_BASE_MS = 1000;
let reconnectCount = 0;
function ensureBotEl() { function ensureBotEl() {
if (botEl) return; if (botEl) return;
@@ -839,180 +841,204 @@ function startSSE(requestId, loadingEl, timestamp) {
mediaEl = botEl.querySelector('.media-content'); mediaEl = botEl.querySelector('.media-content');
} }
es.onmessage = function(e) { function connect() {
let item; const es = new EventSource(`/stream?request_id=${encodeURIComponent(requestId)}`);
try { item = JSON.parse(e.data); } catch (_) { return; } activeStreams[requestId] = es;
if (item.type === 'delta') { es.onmessage = function(e) {
ensureBotEl(); let item;
accumulatedText += item.content; try { item = JSON.parse(e.data); } catch (_) { return; }
contentEl.innerHTML = renderMarkdown(accumulatedText);
scrollChatToBottom();
} else if (item.type === 'tool_start') { // Successful data received, reset reconnect counter
ensureBotEl(); reconnectCount = 0;
// Save current thinking as a collapsible step if (item.type === 'delta') {
if (accumulatedText.trim()) { ensureBotEl();
const fullText = accumulatedText.trim(); accumulatedText += item.content;
const oneLine = fullText.replace(/\n+/g, ' '); contentEl.innerHTML = renderMarkdown(accumulatedText);
const needsTruncate = oneLine.length > 80; scrollChatToBottom();
const stepEl = document.createElement('div');
stepEl.className = 'agent-step agent-thinking-step' + (needsTruncate ? '' : ' no-expand'); } else if (item.type === 'tool_start') {
if (needsTruncate) { ensureBotEl();
const truncated = oneLine.substring(0, 80) + '…';
stepEl.innerHTML = ` // Save current thinking as a collapsible step
<div class="thinking-header" onclick="this.parentElement.classList.toggle('expanded')"> if (accumulatedText.trim()) {
<i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i> const fullText = accumulatedText.trim();
<span class="thinking-summary">${escapeHtml(truncated)}</span> const oneLine = fullText.replace(/\n+/g, ' ');
<i class="fas fa-chevron-right thinking-chevron"></i> const needsTruncate = oneLine.length > 80;
</div> const stepEl = document.createElement('div');
<div class="thinking-full">${renderMarkdown(fullText)}</div>`; stepEl.className = 'agent-step agent-thinking-step' + (needsTruncate ? '' : ' no-expand');
} else { if (needsTruncate) {
stepEl.innerHTML = ` const truncated = oneLine.substring(0, 80) + '…';
<div class="thinking-header no-toggle"> stepEl.innerHTML = `
<i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i> <div class="thinking-header" onclick="this.parentElement.classList.toggle('expanded')">
<span>${escapeHtml(oneLine)}</span> <i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i>
</div>`; <span class="thinking-summary">${escapeHtml(truncated)}</span>
<i class="fas fa-chevron-right thinking-chevron"></i>
</div>
<div class="thinking-full">${renderMarkdown(fullText)}</div>`;
} else {
stepEl.innerHTML = `
<div class="thinking-header no-toggle">
<i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i>
<span>${escapeHtml(oneLine)}</span>
</div>`;
}
stepsEl.appendChild(stepEl);
} }
stepsEl.appendChild(stepEl); accumulatedText = '';
} contentEl.innerHTML = '';
accumulatedText = '';
contentEl.innerHTML = '';
// Add tool execution indicator (collapsible) // Add tool execution indicator (collapsible)
currentToolEl = document.createElement('div'); currentToolEl = document.createElement('div');
currentToolEl.className = 'agent-step agent-tool-step'; currentToolEl.className = 'agent-step agent-tool-step';
const argsStr = formatToolArgs(item.arguments || {}); const argsStr = formatToolArgs(item.arguments || {});
currentToolEl.innerHTML = ` currentToolEl.innerHTML = `
<div class="tool-header" onclick="this.parentElement.classList.toggle('expanded')"> <div class="tool-header" onclick="this.parentElement.classList.toggle('expanded')">
<i class="fas fa-cog fa-spin text-primary-400 flex-shrink-0 tool-icon"></i> <i class="fas fa-cog fa-spin text-primary-400 flex-shrink-0 tool-icon"></i>
<span class="tool-name">${item.tool}</span> <span class="tool-name">${item.tool}</span>
<i class="fas fa-chevron-right tool-chevron"></i> <i class="fas fa-chevron-right tool-chevron"></i>
</div>
<div class="tool-detail">
<div class="tool-detail-section">
<div class="tool-detail-label">Input</div>
<pre class="tool-detail-content">${argsStr}</pre>
</div> </div>
<div class="tool-detail-section tool-output-section"></div> <div class="tool-detail">
</div>`; <div class="tool-detail-section">
stepsEl.appendChild(currentToolEl); <div class="tool-detail-label">Input</div>
<pre class="tool-detail-content">${argsStr}</pre>
</div>
<div class="tool-detail-section tool-output-section"></div>
</div>`;
stepsEl.appendChild(currentToolEl);
scrollChatToBottom(); scrollChatToBottom();
} else if (item.type === 'tool_end') { } else if (item.type === 'tool_end') {
if (currentToolEl) { if (currentToolEl) {
const isError = item.status !== 'success'; const isError = item.status !== 'success';
const icon = currentToolEl.querySelector('.tool-icon'); const icon = currentToolEl.querySelector('.tool-icon');
icon.className = isError icon.className = isError
? 'fas fa-times text-red-400 flex-shrink-0 tool-icon' ? 'fas fa-times text-red-400 flex-shrink-0 tool-icon'
: 'fas fa-check text-primary-400 flex-shrink-0 tool-icon'; : 'fas fa-check text-primary-400 flex-shrink-0 tool-icon';
// Show execution time // Show execution time
const nameEl = currentToolEl.querySelector('.tool-name'); const nameEl = currentToolEl.querySelector('.tool-name');
if (item.execution_time !== undefined) { if (item.execution_time !== undefined) {
nameEl.innerHTML += ` <span class="tool-time">${item.execution_time}s</span>`; nameEl.innerHTML += ` <span class="tool-time">${item.execution_time}s</span>`;
}
// Fill output section
const outputSection = currentToolEl.querySelector('.tool-output-section');
if (outputSection && item.result) {
outputSection.innerHTML = `
<div class="tool-detail-label">${isError ? 'Error' : 'Output'}</div>
<pre class="tool-detail-content ${isError ? 'tool-error-text' : ''}">${escapeHtml(String(item.result))}</pre>`;
}
if (isError) currentToolEl.classList.add('tool-failed');
currentToolEl = null;
} }
// Fill output section } else if (item.type === 'image') {
const outputSection = currentToolEl.querySelector('.tool-output-section'); ensureBotEl();
if (outputSection && item.result) { const imgEl = document.createElement('img');
outputSection.innerHTML = ` imgEl.src = item.content;
<div class="tool-detail-label">${isError ? 'Error' : 'Output'}</div> imgEl.alt = 'screenshot';
<pre class="tool-detail-content ${isError ? 'tool-error-text' : ''}">${escapeHtml(String(item.result))}</pre>`; imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);';
} imgEl.onclick = () => window.open(item.content, '_blank');
mediaEl.appendChild(imgEl);
scrollChatToBottom();
if (isError) currentToolEl.classList.add('tool-failed'); } else if (item.type === 'text') {
currentToolEl = null; // Intermediate text sent before media items; display it but keep SSE open.
ensureBotEl();
contentEl.classList.remove('sse-streaming');
const textContent = item.content || accumulatedText;
if (textContent) contentEl.innerHTML = renderMarkdown(textContent);
applyHighlighting(botEl);
scrollChatToBottom();
} else if (item.type === 'video') {
ensureBotEl();
const wrapper = document.createElement('div');
wrapper.innerHTML = _buildVideoHtml(item.content);
mediaEl.appendChild(wrapper.firstElementChild || wrapper);
scrollChatToBottom();
} else if (item.type === 'file') {
ensureBotEl();
const fileName = item.file_name || item.content.split('/').pop();
const fileEl = document.createElement('a');
fileEl.href = item.content;
fileEl.download = fileName;
fileEl.target = '_blank';
fileEl.className = 'file-attachment';
fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);';
fileEl.innerHTML = `<i class="fas fa-file-download" style="color:#6b7280;"></i> ${fileName}`;
mediaEl.appendChild(fileEl);
scrollChatToBottom();
} else if (item.type === 'phase') {
// Coarse progress (e.g. cow install-browser); must not close SSE (unlike "done")
ensureBotEl();
const wrap = document.createElement('div');
wrap.className = 'text-xs sm:text-sm text-slate-600 dark:text-slate-400 border-l-2 border-primary-400 pl-2 py-1 my-0.5';
wrap.textContent = String(item.content || '');
stepsEl.appendChild(wrap);
scrollChatToBottom();
} else if (item.type === 'done') {
done = true;
es.close();
delete activeStreams[requestId];
// item.content may be empty when "done" is only a stream-close signal after media.
const finalText = item.content || accumulatedText;
if (!botEl && finalText) {
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
addBotMessage(finalText, new Date((item.timestamp || Date.now() / 1000) * 1000), requestId);
} else if (botEl) {
contentEl.classList.remove('sse-streaming');
// Only update text content when there is something new to show.
if (finalText) contentEl.innerHTML = renderMarkdown(finalText);
applyHighlighting(botEl);
}
scrollChatToBottom();
} else if (item.type === 'error') {
done = true;
es.close();
delete activeStreams[requestId];
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
addBotMessage(t('error_send'), new Date());
} }
};
} else if (item.type === 'image') { es.onerror = function() {
ensureBotEl();
const imgEl = document.createElement('img');
imgEl.src = item.content;
imgEl.alt = 'screenshot';
imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);';
imgEl.onclick = () => window.open(item.content, '_blank');
mediaEl.appendChild(imgEl);
scrollChatToBottom();
} else if (item.type === 'text') {
// Intermediate text sent before media items; display it but keep SSE open.
ensureBotEl();
contentEl.classList.remove('sse-streaming');
const textContent = item.content || accumulatedText;
if (textContent) contentEl.innerHTML = renderMarkdown(textContent);
applyHighlighting(botEl);
scrollChatToBottom();
} else if (item.type === 'video') {
ensureBotEl();
const wrapper = document.createElement('div');
wrapper.innerHTML = _buildVideoHtml(item.content);
mediaEl.appendChild(wrapper.firstElementChild || wrapper);
scrollChatToBottom();
} else if (item.type === 'file') {
ensureBotEl();
const fileName = item.file_name || item.content.split('/').pop();
const fileEl = document.createElement('a');
fileEl.href = item.content;
fileEl.download = fileName;
fileEl.target = '_blank';
fileEl.className = 'file-attachment';
fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);';
fileEl.innerHTML = `<i class="fas fa-file-download" style="color:#6b7280;"></i> ${fileName}`;
mediaEl.appendChild(fileEl);
scrollChatToBottom();
} else if (item.type === 'phase') {
// Coarse progress (e.g. cow install-browser); must not close SSE (unlike "done")
ensureBotEl();
const wrap = document.createElement('div');
wrap.className = 'text-xs sm:text-sm text-slate-600 dark:text-slate-400 border-l-2 border-primary-400 pl-2 py-1 my-0.5';
wrap.textContent = String(item.content || '');
stepsEl.appendChild(wrap);
scrollChatToBottom();
} else if (item.type === 'done') {
es.close(); es.close();
delete activeStreams[requestId]; delete activeStreams[requestId];
// item.content may be empty when "done" is only a stream-close signal after media. if (done) return;
const finalText = item.content || accumulatedText;
if (!botEl && finalText) { if (reconnectCount < MAX_RECONNECTS) {
if (loadingEl) { loadingEl.remove(); loadingEl = null; } reconnectCount++;
addBotMessage(finalText, new Date((item.timestamp || Date.now() / 1000) * 1000), requestId); const delay = Math.min(RECONNECT_BASE_MS * reconnectCount, 5000);
} else if (botEl) { console.warn(`[SSE] connection lost for ${requestId}, reconnecting in ${delay}ms (attempt ${reconnectCount}/${MAX_RECONNECTS})`);
setTimeout(connect, delay);
return;
}
// Exhausted retries, show whatever we have
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
if (!botEl) {
addBotMessage(t('error_send'), new Date());
} else if (accumulatedText) {
contentEl.classList.remove('sse-streaming'); contentEl.classList.remove('sse-streaming');
// Only update text content when there is something new to show. contentEl.innerHTML = renderMarkdown(accumulatedText);
if (finalText) contentEl.innerHTML = renderMarkdown(finalText);
applyHighlighting(botEl); applyHighlighting(botEl);
} }
scrollChatToBottom(); };
}
} else if (item.type === 'error') { connect();
es.close();
delete activeStreams[requestId];
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
addBotMessage(t('error_send'), new Date());
}
};
es.onerror = function() {
es.close();
delete activeStreams[requestId];
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
if (!botEl) {
addBotMessage(t('error_send'), new Date());
} else if (accumulatedText) {
contentEl.classList.remove('sse-streaming');
contentEl.innerHTML = renderMarkdown(accumulatedText);
applyHighlighting(botEl);
}
};
} }
function startPolling() { function startPolling() {

View File

@@ -329,14 +329,18 @@ class WebChannel(ChatChannel):
""" """
SSE generator for a given request_id. SSE generator for a given request_id.
Yields UTF-8 encoded bytes to avoid WSGI Latin-1 mangling. Yields UTF-8 encoded bytes to avoid WSGI Latin-1 mangling.
Supports client reconnection: the queue is only removed after a
"done" event is consumed, so a new GET /stream with the same
request_id can resume reading remaining events.
""" """
if request_id not in self.sse_queues: if request_id not in self.sse_queues:
yield b"data: {\"type\": \"error\", \"message\": \"invalid request_id\"}\n\n" yield b"data: {\"type\": \"error\", \"message\": \"invalid request_id\"}\n\n"
return return
q = self.sse_queues[request_id] q = self.sse_queues[request_id]
timeout = 300 # 5 minutes max idle_timeout = 600 # 10 minutes without any real event
deadline = time.time() + timeout deadline = time.time() + idle_timeout
done = False
try: try:
while time.time() < deadline: while time.time() < deadline:
@@ -346,13 +350,18 @@ class WebChannel(ChatChannel):
yield b": keepalive\n\n" yield b": keepalive\n\n"
continue continue
# Real event received, reset idle deadline
deadline = time.time() + idle_timeout
payload = json.dumps(item, ensure_ascii=False) payload = json.dumps(item, ensure_ascii=False)
yield f"data: {payload}\n\n".encode("utf-8") yield f"data: {payload}\n\n".encode("utf-8")
if item.get("type") == "done": if item.get("type") == "done":
done = True
break break
finally: finally:
self.sse_queues.pop(request_id, None) if done:
self.sse_queues.pop(request_id, None)
def poll_response(self): def poll_response(self):
""" """

72
docs/en/tools/vision.mdx Normal file
View File

@@ -0,0 +1,72 @@
---
title: vision - Image Analysis
description: Analyze image content (recognition, description, OCR, etc.)
---
Analyze local images or image URLs using Vision API. Supports content description, text extraction (OCR), object recognition, and more.
## Model Selection
The vision tool uses a multi-level auto-selection strategy with automatic fallback — no manual configuration required:
1. **Main model** — uses the currently configured main model for image recognition (zero extra cost)
2. **Other configured models** — auto-discovers other models with configured API keys as alternatives
3. **OpenAI** — uses `open_ai_api_key` to call gpt-4.1-mini
4. **LinkAI** — uses `linkai_api_key` to call LinkAI vision service
When `use_linkai=true`, LinkAI is promoted to the highest priority.
If the current provider fails, the tool automatically tries the next one until it succeeds or all fail.
### Supported Models
| Vendor | Vision Model | Notes |
| --- | --- | --- |
| OpenAI / Compatible | Main model | All OpenAI-compatible multimodal models |
| Qwen (DashScope) | Main model | Via MultiModalConversation API |
| Claude | Main model | Anthropic native image format |
| Gemini | Main model | inlineData format |
| Doubao | Main model | doubao-seed-2-0 series natively supported |
| Kimi (Moonshot) | Main model | kimi-k2.5 natively supported |
| ZhipuAI | glm-5v-turbo | Always uses dedicated vision model |
| MiniMax | MiniMax-Text-01 | Always uses dedicated vision model |
<Note>
ZhipuAI and MiniMax text models do not support image understanding, so their dedicated vision models are always used automatically.
</Note>
## Parameters
| Parameter | Type | Required | Description |
| --- | --- | --- | --- |
| `image` | string | Yes | Local file path or HTTP(S) image URL |
| `question` | string | Yes | Question to ask about the image |
Supported image formats: jpg, jpeg, png, gif, webp
## Custom Configuration
To specify a particular model for the vision tool, add to `config.json`:
```json
{
"tool": {
"vision": {
"model": "gpt-4o"
}
}
}
```
In most cases no configuration is needed. The tool works automatically as long as the main model supports multimodal input or any vision-capable API key is configured.
## Use Cases
- Describe image content
- Extract text from images (OCR)
- Identify objects, colors, scenes
- Analyze screenshots and scanned documents
<Note>
Images larger than 1MB are automatically compressed (max edge 1536px). All images (including remote URLs) are converted to base64 for transmission to ensure compatibility with all model backends.
</Note>

72
docs/ja/tools/vision.mdx Normal file
View File

@@ -0,0 +1,72 @@
---
title: vision - 画像分析
description: 画像コンテンツの分析(認識、説明、OCR など)
---
Vision API を使用してローカル画像や画像 URL を分析します。コンテンツの説明、テキスト抽出(OCR)、オブジェクト認識などに対応しています。
## モデル選択
Vision ツールは多段階の自動選択+自動フォールバック戦略を採用しており、手動設定なしで利用可能です:
1. **メインモデル** — 現在設定されているメインモデルで画像認識を実行(追加コストなし)
2. **その他の設定済みモデル** — API キーが設定されている他のマルチモーダルモデルを自動検出
3. **OpenAI** — `open_ai_api_key` を使用して gpt-4.1-mini を呼び出し
4. **LinkAI** — `linkai_api_key` を使用して LinkAI ビジョンサービスを呼び出し
`use_linkai=true` の場合、LinkAI が最優先になります。
現在のプロバイダーが失敗した場合、成功するかすべて失敗するまで自動的に次のプロバイダーを試行します。
### 対応モデル
| ベンダー | ビジョンモデル | 説明 |
| --- | --- | --- |
| OpenAI / 互換プロトコル | メインモデル | すべての OpenAI 互換マルチモーダルモデルに対応 |
| 通義千問 (DashScope) | メインモデル | MultiModalConversation API 経由 |
| Claude | メインモデル | Anthropic ネイティブ画像形式 |
| Gemini | メインモデル | inlineData 形式 |
| 豆包 (Doubao) | メインモデル | doubao-seed-2-0 シリーズがネイティブ対応 |
| Kimi (Moonshot) | メインモデル | kimi-k2.5 がネイティブ対応 |
| 智谱 AI | glm-5v-turbo | 常にビジョン専用モデルを使用 |
| MiniMax | MiniMax-Text-01 | 常にビジョン専用モデルを使用 |
<Note>
智谱 AI と MiniMax のテキストモデルは画像理解に対応していないため、対応するビジョン専用モデルが自動的に使用されます。
</Note>
## パラメータ
| パラメータ | 型 | 必須 | 説明 |
| --- | --- | --- | --- |
| `image` | string | はい | ローカルファイルパスまたは HTTP(S) 画像 URL |
| `question` | string | はい | 画像に対する質問 |
対応画像形式:jpg、jpeg、png、gif、webp
## カスタム設定
Vision ツールで使用するモデルを指定するには、`config.json` に以下を追加します:
```json
{
"tool": {
"vision": {
"model": "gpt-4o"
}
}
}
```
ほとんどの場合、設定は不要です。メインモデルがマルチモーダルに対応しているか、ビジョン対応の API キーが設定されていれば自動的に動作します。
## ユースケース
- 画像コンテンツの説明
- 画像からのテキスト抽出(OCR)
- オブジェクト、色、シーンの識別
- スクリーンショットやスキャン文書の分析
<Note>
1MB を超える画像は自動的に圧縮されます(最大辺 1536px)。すべての画像(リモート URL を含む)は base64 に変換して送信され、すべてのモデルバックエンドとの互換性を確保します。
</Note>

View File

@@ -5,14 +5,49 @@ description: 分析图片内容识别、描述、OCR 等)
使用 Vision API 分析本地图片或图片 URL支持内容描述、文字提取OCR、物体识别等。 使用 Vision API 分析本地图片或图片 URL支持内容描述、文字提取OCR、物体识别等。
## 依赖 ## 模型选择
需要配置至少一个 API Key通过 `env_config` 工具或工作空间 `.env` 文件配置) Vision 工具采用多级自动选择 + 自动兜底策略,无需手动配置即可使用
| 后端 | 环境变量 | 优先级 | 1. **主模型** — 优先使用当前配置的主模型进行图像识别(需要是多模态模型)
2. **其他已配置模型** — 自动发现已配置 API Key 的其他多模态模型作为备选
如果当前 provider 调用失败,会自动尝试下一个,直到成功或全部失败。
### 支持的模型
| 厂商 | 视觉模型 | 说明 |
| --- | --- | --- | | --- | --- | --- |
| OpenAI | `OPENAI_API_KEY` | 优先使用 | | OpenAI / 兼容协议 | 使用主模型 | 支持所有 OpenAI 协议兼容的多模态模型 |
| LinkAI | `LINKAI_API_KEY` | 备选 | | 通义千问 (DashScope) | 使用主模型 | 例如 qwen3.6-plus 等 |
| Claude | 使用主模型 | Anthropic 原生图像格式 |
| Gemini | 使用主模型 | inlineData 格式 |
| 豆包 (Doubao) | 使用主模型 | doubao-seed-2-0 系列原生支持 |
| Kimi (Moonshot) | 使用主模型 | kimi-k2.5 原生支持 |
| 智谱 AI | glm-5v-turbo | 固定使用视觉专用模型 |
| MiniMax | MiniMax-Text-01 | 固定使用视觉专用模型 |
<Note>
智谱和 MiniMax 的文本模型不支持图像理解,因此始终使用对应的视觉专用模型,无需手动指定。
</Note>
> 当 `use_linkai=true` 时,默认使用 LinkAI 的多模态模型进行图像识别。
## 自定义配置
如果希望指定 Vision 使用的模型,可在 `config.json` 中配置,例如:
```json
{
"tool": {
"vision": {
"model": "gpt-4o"
}
}
}
```
大多数情况下无需配置,主模型支持多模态或配置任意一个支持视觉的 API Key 即可自动工作。
## 参数 ## 参数
@@ -20,17 +55,18 @@ description: 分析图片内容识别、描述、OCR 等)
| --- | --- | --- | --- | | --- | --- | --- | --- |
| `image` | string | 是 | 本地文件路径或 HTTP(S) 图片 URL | | `image` | string | 是 | 本地文件路径或 HTTP(S) 图片 URL |
| `question` | string | 是 | 对图片提出的问题 | | `question` | string | 是 | 对图片提出的问题 |
| `model` | string | 否 | 模型名称(默认 gpt-4.1-mini |
支持的图片格式jpg、jpeg、png、gif、webp 支持的图片格式jpg、jpeg、png、gif、webp
## 使用场景 ## 使用场景
- 描述图片中的内容 - 描述图片中的内容
- 提取图片中的文字OCR - 提取图片中的文字OCR
- 识别物体、颜色、场景 - 识别物体、颜色、场景
- 分析截图、文档扫描 - 分析截图、文档扫描图片等
<Note> <Note>
超过 1MB 的图片会自动压缩后上传。如果未配置任何 Vision API Key该工具不会被加载 超过 1MB 的图片会自动压缩后上传,所有图片(包括远程 URL会统一转为 base64 传输,确保兼容所有模型后端
</Note> </Note>

View File

@@ -2,12 +2,27 @@
Auto-replay chat robot abstract class Auto-replay chat robot abstract class
""" """
from bridge.context import Context from bridge.context import Context
from bridge.reply import Reply from bridge.reply import Reply
class Bot(object): class Bot(object):
"""
Base class for all chat-bot implementations.
Subclasses may also implement:
call_with_tools(messages, tools=None, stream=False, **kwargs)
-> dict | generator (OpenAI-compatible format)
call_vision(image_url, question, model=None, max_tokens=1000)
-> dict with keys: model, content, usage (or error/message)
These are NOT defined here to avoid shadowing concrete implementations
provided by mixin classes (e.g. OpenAICompatibleBot) in the MRO.
Use ``hasattr(bot, 'call_vision')`` to detect support at runtime.
"""
def reply(self, query, context: Context = None) -> Reply: def reply(self, query, context: Context = None) -> Reply:
""" """
bot auto-reply content bot auto-reply content

View File

@@ -1,7 +1,10 @@
# encoding:utf-8 # encoding:utf-8
import base64
import json import json
import re
import time import time
from typing import Optional
import requests import requests
@@ -224,6 +227,79 @@ class ClaudeAPIBot(Bot, OpenAIImage):
return 64000 return 64000
return 8192 return 8192
@staticmethod
def _parse_data_url(data_url: str):
    """Split a ``data:<mime>;base64,<data>`` URL into its two components.

    Returns:
        ``(media_type, base64_data)`` when ``data_url`` has that shape,
        otherwise ``(None, None)``.
    """
    # DOTALL lets the payload group span line breaks inside the base64 data.
    parsed = re.match(r"^data:([^;]+);base64,(.+)$", data_url, re.DOTALL)
    if parsed is None:
        return None, None
    media_type, b64_data = parsed.groups()
    return media_type, b64_data
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using Claude Messages API (native image blocks).

    Args:
        image_url: Either a ``data:<mime>;base64,<data>`` URL, sent as a
            base64 image source, or any other URL, sent as a URL image source.
        question: Text prompt sent alongside the image.
        model: Model name to use; when omitted, the configured ``model``
            value is passed through ``self._model_mapping``.
        max_tokens: Value sent as ``max_tokens`` in the request.

    Returns:
        On success, a dict with ``model``, ``content`` (all text blocks of
        the reply concatenated) and ``usage`` (``prompt_tokens``,
        ``completion_tokens``, ``total_tokens``).
        On failure, ``{"error": True, "message": ...}``. Exceptions are
        logged and converted to that error dict, never raised.
    """
    try:
        actual_model = model or self._model_mapping(conf().get("model"))

        # Build Claude-native image content block
        if image_url.startswith("data:"):
            media_type, b64_data = self._parse_data_url(image_url)
            if not b64_data:
                return {"error": True, "message": "Invalid base64 data URL"}
            image_block = {
                "type": "image",
                "source": {"type": "base64",
                           "media_type": media_type or "image/jpeg",
                           "data": b64_data},
            }
        else:
            image_block = {
                "type": "image",
                "source": {"type": "url", "url": image_url},
            }

        data = {
            "model": actual_model,
            "max_tokens": max_tokens,
            "messages": [{
                "role": "user",
                "content": [
                    image_block,
                    {"type": "text", "text": question},
                ],
            }],
        }
        headers = {
            "x-api-key": self.api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
        }
        proxies = {"http": self.proxy, "https": self.proxy} if self.proxy else None

        # requests has no default timeout; without one a stalled connection
        # would block this call forever instead of surfacing as an error.
        resp = requests.post(f"{self.api_base}/messages",
                             headers=headers, json=data, proxies=proxies,
                             timeout=60)
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        body = resp.json()
        text_parts = [b.get("text", "") for b in body.get("content", [])
                      if b.get("type") == "text"]
        usage = body.get("usage", {})
        input_tokens = usage.get("input_tokens", 0)
        output_tokens = usage.get("output_tokens", 0)
        return {
            "model": actual_model,
            "content": "".join(text_parts),
            # Map Anthropic usage field names onto the OpenAI-style keys
            # returned to the caller.
            "usage": {
                "prompt_tokens": input_tokens,
                "completion_tokens": output_tokens,
                "total_tokens": input_tokens + output_tokens,
            },
        }
    except Exception as e:
        logger.error(f"[CLAUDE] call_vision error: {e}")
        return {"error": True, "message": str(e)}
def call_with_tools(self, messages, tools=None, stream=False, **kwargs): def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
""" """
Call Claude API with tool support for agent integration Call Claude API with tool support for agent integration

View File

@@ -1,6 +1,8 @@
# encoding:utf-8 # encoding:utf-8
import json import json
from typing import Optional
from models.bot import Bot from models.bot import Bot
from models.session_manager import SessionManager from models.session_manager import SessionManager
from bridge.context import ContextType from bridge.context import ContextType
@@ -153,6 +155,56 @@ class DashscopeBot(Bot):
else: else:
return result return result
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Ask a DashScope multimodal model a question about one image.

    Args:
        image_url: Image reference placed in the ``image`` content item.
        question: Prompt placed in the ``text`` content item.
        model: Model name; ``"qwen-vl-max"`` is used when omitted.
        max_tokens: Forwarded to ``MultiModalConversation.call``.

    Returns:
        ``{"model", "content", "usage"}`` on success, or
        ``{"error": True, "message": ...}`` on a non-OK status or any
        exception (which is logged, not raised).
    """
    try:
        # The SDK reads the key from module state, so set it before calling.
        dashscope.api_key = self.api_key
        vision_model = model or "qwen-vl-max"

        # DashScope multimodal format: {"image": url} + {"text": question}
        user_turn = {
            "role": "user",
            "content": [{"image": image_url}, {"text": question}],
        }
        response = MultiModalConversation.call(
            model=vision_model,
            messages=[user_turn],
            max_tokens=max_tokens,
        )
        if response.status_code != HTTPStatus.OK:
            failure = f"{response.code} - {response.message}"
            return {"error": True, "message": failure}

        resp_dict = self._response_to_dict(response)
        first_choice = resp_dict["output"]["choices"][0]
        content = first_choice.get("message", {}).get("content", "")
        if isinstance(content, list):
            # List-shaped content: keep only the text of dict items.
            fragments = [part.get("text", "") for part in content if isinstance(part, dict)]
            content = "".join(fragments)

        usage = resp_dict.get("usage", {})
        token_usage = {
            "prompt_tokens": usage.get("input_tokens", 0),
            "completion_tokens": usage.get("output_tokens", 0),
            "total_tokens": usage.get("total_tokens", 0),
        }
        return {"model": vision_model, "content": content, "usage": token_usage}
    except Exception as e:
        logger.error(f"[DASHSCOPE] call_vision error: {e}")
        return {"error": True, "message": str(e)}
def call_with_tools(self, messages, tools=None, stream=False, **kwargs): def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
""" """
Call DashScope API with tool support for agent integration Call DashScope API with tool support for agent integration

View File

@@ -2,6 +2,7 @@
import json import json
import time import time
from typing import Optional
import requests import requests
from models.bot import Bot from models.bot import Bot
@@ -147,6 +148,49 @@ class DoubaoBot(Bot):
else: else:
return result return result
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Ask a Doubao model a question about one image via chat completions.

    Args:
        image_url: URL placed in the ``image_url`` content part.
        question: Prompt placed in the ``text`` content part.
        model: Model name; falls back to ``self.args["model"]`` and then
            to ``"doubao-seed-2-0-pro-260215"``.
        max_tokens: Value sent as ``max_tokens`` in the request.

    Returns:
        ``{"model", "content", "usage"}`` on success, or
        ``{"error": True, "message": ...}`` on a non-200 status, an
        ``error`` field in the response, or any exception (logged, not raised).
    """
    try:
        vision_model = model or self.args.get("model", "doubao-seed-2-0-pro-260215")

        user_content = [
            {"type": "text", "text": question},
            {"type": "image_url", "image_url": {"url": image_url}},
        ]
        payload = {
            "model": vision_model,
            "max_tokens": max_tokens,
            "messages": [{"role": "user", "content": user_content}],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(f"{self.base_url}/chat/completions",
                             headers=headers, json=payload, timeout=60)
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        data = resp.json()
        if "error" in data:
            api_error = data["error"]
            return {"error": True, "message": api_error.get("message", str(api_error))}

        first_choice = data.get("choices", [{}])[0]
        content = first_choice.get("message", {}).get("content", "")
        usage = data.get("usage", {})
        token_usage = {
            "prompt_tokens": usage.get("prompt_tokens", 0),
            "completion_tokens": usage.get("completion_tokens", 0),
            "total_tokens": usage.get("total_tokens", 0),
        }
        return {"model": vision_model, "content": content, "usage": token_usage}
    except Exception as e:
        logger.error(f"[DOUBAO] call_vision error: {e}")
        return {"error": True, "message": str(e)}
# ==================== Agent mode support ==================== # ==================== Agent mode support ====================
def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs): def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
@@ -434,31 +478,37 @@ class DoubaoBot(Bot):
continue continue
if role == "user": if role == "user":
text_parts = [] has_tool_result = any(
tool_results = [] isinstance(b, dict) and b.get("type") == "tool_result" for b in content
)
if has_tool_result:
text_parts = []
tool_results = []
for block in content: for block in content:
if not isinstance(block, dict): if not isinstance(block, dict):
continue continue
if block.get("type") == "text": if block.get("type") == "text":
text_parts.append(block.get("text", "")) text_parts.append(block.get("text", ""))
elif block.get("type") == "tool_result": elif block.get("type") == "tool_result":
tool_call_id = block.get("tool_use_id") or "" tool_call_id = block.get("tool_use_id") or ""
result_content = block.get("content", "") result_content = block.get("content", "")
if not isinstance(result_content, str): if not isinstance(result_content, str):
result_content = json.dumps(result_content, ensure_ascii=False) result_content = json.dumps(result_content, ensure_ascii=False)
tool_results.append({ tool_results.append({
"role": "tool", "role": "tool",
"tool_call_id": tool_call_id, "tool_call_id": tool_call_id,
"content": result_content "content": result_content
}) })
# Tool results first (must come right after assistant with tool_calls) for tr in tool_results:
for tr in tool_results: converted.append(tr)
converted.append(tr)
if text_parts: if text_parts:
converted.append({"role": "user", "content": "\n".join(text_parts)}) converted.append({"role": "user", "content": "\n".join(text_parts)})
else:
# Keep as-is for multimodal content (e.g. image_url blocks)
converted.append(msg)
elif role == "assistant": elif role == "assistant":
openai_msg = {"role": "assistant"} openai_msg = {"role": "assistant"}

View File

@@ -12,6 +12,8 @@ import mimetypes
import os import os
import re import re
import time import time
from typing import Optional
import requests import requests
from models.bot import Bot from models.bot import Bot
from models.session_manager import SessionManager from models.session_manager import SessionManager
@@ -144,7 +146,12 @@ class GoogleGeminiBot(Bot):
return "", [] return "", []
pattern = r"\[图片:\s*([^\]]+)\]" pattern = r"\[图片:\s*([^\]]+)\]"
image_paths = [m.strip().strip("'\"") for m in re.findall(pattern, content) if m.strip()] image_paths = [m.strip().strip("'\"") for m in re.findall(pattern, content) if m.strip()]
cleaned_text = re.sub(pattern, "", content) # Replace markers with path-only hints so the model still knows the
# original file location (needed when it calls tools like vision).
def _replace_with_hint(m):
    """Return the path-only hint that replaces one matched image marker."""
    captured = m.group(1)
    # Drop surrounding whitespace first, then any wrapping quote characters.
    path = captured.strip().strip("'\"")
    return f"[attached image: {path}]"
cleaned_text = re.sub(pattern, _replace_with_hint, content)
cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text).strip() cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text).strip()
return cleaned_text, image_paths return cleaned_text, image_paths
@@ -225,6 +232,57 @@ class GoogleGeminiBot(Bot):
logger.warning(f"[Gemini] Unsupported image URL format: {image_url[:120]}") logger.warning(f"[Gemini] Unsupported image URL format: {image_url[:120]}")
return None return None
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using Gemini REST API.

    Args:
        image_url: Image reference; converted to a request part by
            ``self._build_inline_part_from_image_url``.
        question: Prompt sent as the text part after the image.
        model: Model name; falls back to ``self.model`` and then to
            ``"gemini-2.0-flash"``.
        max_tokens: Sent as ``generationConfig.maxOutputTokens``.

    Returns:
        On success, a dict with ``model``, ``content`` (all text parts of
        the first candidate concatenated) and ``usage``.
        On failure, ``{"error": True, "message": ...}``: the image could
        not be converted, the HTTP status was not 200, the response held
        no candidates, or an exception occurred (logged, not raised).
    """
    try:
        model_name = model or self.model or "gemini-2.0-flash"

        image_part = self._build_inline_part_from_image_url({"url": image_url})
        if not image_part:
            return {"error": True, "message": f"Cannot process image URL: {image_url[:120]}"}

        payload = {
            "contents": [{
                "role": "user",
                "parts": [image_part, {"text": question}],
            }],
            "generationConfig": {"maxOutputTokens": max_tokens},
            # All four harm categories are set to BLOCK_NONE.
            "safetySettings": [
                {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
                {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
                {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
                {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
            ],
        }

        endpoint = f"{self.api_base}/v1beta/models/{model_name}:generateContent"
        headers = {"x-goog-api-key": self.api_key, "Content-Type": "application/json"}
        resp = requests.post(endpoint, headers=headers, json=payload, timeout=60)
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        body = resp.json()
        candidates = body.get("candidates") or []
        if not candidates:
            # A 200 response without candidates carries no answer. Report it
            # as an error rather than a successful empty result, so the
            # caller can tell that this provider produced nothing.
            block_reason = (body.get("promptFeedback") or {}).get("blockReason")
            detail = f" (blockReason: {block_reason})" if block_reason else ""
            return {"error": True, "message": f"Gemini returned no candidates{detail}"}

        parts = candidates[0].get("content", {}).get("parts", [])
        text_parts = [part["text"] for part in parts if "text" in part]

        usage_meta = body.get("usageMetadata", {})
        return {
            "model": model_name,
            "content": "".join(text_parts),
            "usage": {
                "prompt_tokens": usage_meta.get("promptTokenCount", 0),
                "completion_tokens": usage_meta.get("candidatesTokenCount", 0),
                "total_tokens": usage_meta.get("totalTokenCount", 0),
            },
        }
    except Exception as e:
        logger.error(f"[Gemini] call_vision error: {e}")
        return {"error": True, "message": str(e)}
def call_with_tools(self, messages, tools=None, stream=False, **kwargs): def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
""" """
Call Gemini API with tool support using REST API (following official docs) Call Gemini API with tool support using REST API (following official docs)

View File

@@ -2,6 +2,8 @@
import time import time
import json import json
from typing import Optional
import requests import requests
from models.bot import Bot from models.bot import Bot
@@ -175,6 +177,51 @@ class MinimaxBot(Bot):
else: else:
return result return result
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Ask MiniMax a question about one image via chat completions.

    The request always targets ``MiniMax-Text-01``; the ``model`` argument
    is accepted for signature parity with other bots but is not used.

    Args:
        image_url: URL placed in the ``image_url`` content part.
        question: Prompt placed in the ``text`` content part.
        model: Ignored.
        max_tokens: Value sent as ``max_tokens`` in the request.

    Returns:
        ``{"model", "content", "usage"}`` on success, or
        ``{"error": True, "message": ...}`` on a non-200 status, an
        ``error`` field in the response, or any exception (logged, not raised).
    """
    try:
        vision_model = "MiniMax-Text-01"

        text_part = {"type": "text", "text": question}
        image_part = {"type": "image_url", "image_url": {"url": image_url}}
        payload = {
            "model": vision_model,
            "max_tokens": max_tokens,
            "messages": [{"role": "user", "content": [text_part, image_part]}],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(f"{self.api_base}/chat/completions",
                             headers=headers, json=payload, timeout=60)
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        data = resp.json()
        if "error" in data:
            api_error = data["error"]
            return {"error": True, "message": api_error.get("message", str(api_error))}

        reply_message = data.get("choices", [{}])[0].get("message", {})
        usage = data.get("usage", {})
        return {
            "model": vision_model,
            "content": reply_message.get("content", ""),
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
    except Exception as e:
        logger.error(f"[MINIMAX] call_vision error: {e}")
        return {"error": True, "message": str(e)}
def call_with_tools(self, messages, tools=None, stream=False, **kwargs): def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
""" """
Call MiniMax API with tool support for agent integration Call MiniMax API with tool support for agent integration
@@ -273,37 +320,41 @@ class MinimaxBot(Bot):
if role == "user": if role == "user":
# Handle user message # Handle user message
if isinstance(content, list): if isinstance(content, list):
# Extract text from content blocks has_tool_result = any(
text_parts = [] isinstance(b, dict) and b.get("type") == "tool_result" for b in content
tool_results = [] )
if has_tool_result:
text_parts = []
tool_results = []
for block in content: for block in content:
if isinstance(block, dict): if isinstance(block, dict):
if block.get("type") == "text": if block.get("type") == "text":
text_parts.append(block.get("text", "")) text_parts.append(block.get("text", ""))
elif block.get("type") == "tool_result": elif block.get("type") == "tool_result":
# Tool result should be a separate message with role="tool" tool_call_id = block.get("tool_use_id") or ""
tool_call_id = block.get("tool_use_id") or "" if not tool_call_id:
if not tool_call_id: logger.warning(f"[MINIMAX] tool_result missing tool_use_id")
logger.warning(f"[MINIMAX] tool_result missing tool_use_id") result_content = block.get("content", "")
result_content = block.get("content", "") if not isinstance(result_content, str):
if not isinstance(result_content, str): result_content = json.dumps(result_content, ensure_ascii=False)
result_content = json.dumps(result_content, ensure_ascii=False) tool_results.append({
tool_results.append({ "role": "tool",
"role": "tool", "tool_call_id": tool_call_id,
"tool_call_id": tool_call_id, "content": result_content
"content": result_content })
})
if text_parts: if text_parts:
converted.append({ converted.append({
"role": "user", "role": "user",
"content": "\n".join(text_parts) "content": "\n".join(text_parts)
}) })
# Add all tool results (not just the last one) for tool_result in tool_results:
for tool_result in tool_results: converted.append(tool_result)
converted.append(tool_result) else:
# Keep as-is for multimodal content (e.g. image_url blocks)
converted.append(msg)
else: else:
# Simple text content # Simple text content
converted.append({ converted.append({

View File

@@ -2,6 +2,7 @@
import json import json
import time import time
from typing import Optional
import requests import requests
from models.bot import Bot from models.bot import Bot
@@ -147,6 +148,49 @@ class MoonshotBot(Bot):
else: else:
return result return result
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using Moonshot (Kimi) OpenAI-compatible API.

    Posts a single user message containing the question text and the image
    reference to ``{self.base_url}/chat/completions``.

    Args:
        image_url: Value placed in the ``image_url.url`` field of the request.
        question: Text prompt sent together with the image.
        model: Model name to use; when falsy, ``self.args["model"]`` is used,
            falling back to ``"kimi-k2.5"``.
        max_tokens: Sent as the request's ``max_tokens``.

    Returns:
        On success, a dict with ``model`` (the model name that was requested),
        ``content`` (always a string) and ``usage`` (prompt/completion/total
        token counts, 0 when absent). On any failure,
        ``{"error": True, "message": <str>}``. This method does not raise.
    """
    try:
        vision_model = model or self.args.get("model", "kimi-k2.5")
        payload = {
            "model": vision_model,
            "max_tokens": max_tokens,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(f"{self.base_url}/chat/completions",
                             headers=headers, json=payload, timeout=60)
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        data = resp.json()

        # Only a truthy "error" field is a failure: a null/empty one must not
        # turn an otherwise valid response into an error.
        err = data.get("error")
        if err:
            # The error payload may be a dict or a plain string.
            if isinstance(err, dict):
                message = err.get("message", str(err))
            else:
                message = str(err)
            return {"error": True, "message": message}

        # Report a missing/empty choices list explicitly instead of relying on
        # an IndexError or returning an empty "successful" answer.
        choices = data.get("choices") or []
        if not choices:
            return {"error": True, "message": "No choices returned in vision response"}

        # "message", "content" and "usage" may each be null in the JSON body.
        content = (choices[0].get("message") or {}).get("content") or ""
        usage = data.get("usage") or {}
        return {
            "model": vision_model,
            "content": content,
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
    except Exception as e:
        # Boundary handler: callers expect an error dict, never an exception.
        logger.error(f"[MOONSHOT] call_vision error: {e}")
        return {"error": True, "message": str(e)}
# ==================== Agent mode support ==================== # ==================== Agent mode support ====================
def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs): def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
@@ -435,31 +479,37 @@ class MoonshotBot(Bot):
continue continue
if role == "user": if role == "user":
text_parts = [] has_tool_result = any(
tool_results = [] isinstance(b, dict) and b.get("type") == "tool_result" for b in content
)
if has_tool_result:
text_parts = []
tool_results = []
for block in content: for block in content:
if not isinstance(block, dict): if not isinstance(block, dict):
continue continue
if block.get("type") == "text": if block.get("type") == "text":
text_parts.append(block.get("text", "")) text_parts.append(block.get("text", ""))
elif block.get("type") == "tool_result": elif block.get("type") == "tool_result":
tool_call_id = block.get("tool_use_id") or "" tool_call_id = block.get("tool_use_id") or ""
result_content = block.get("content", "") result_content = block.get("content", "")
if not isinstance(result_content, str): if not isinstance(result_content, str):
result_content = json.dumps(result_content, ensure_ascii=False) result_content = json.dumps(result_content, ensure_ascii=False)
tool_results.append({ tool_results.append({
"role": "tool", "role": "tool",
"tool_call_id": tool_call_id, "tool_call_id": tool_call_id,
"content": result_content "content": result_content
}) })
# Tool results first (must come right after assistant with tool_calls) for tr in tool_results:
for tr in tool_results: converted.append(tr)
converted.append(tr)
if text_parts: if text_parts:
converted.append({"role": "user", "content": "\n".join(text_parts)}) converted.append({"role": "user", "content": "\n".join(text_parts)})
else:
# Keep as-is for multimodal content (e.g. image_url blocks)
converted.append(msg)
elif role == "assistant": elif role == "assistant":
openai_msg = {"role": "assistant"} openai_msg = {"role": "assistant"}

View File

@@ -9,6 +9,8 @@ This includes: OpenAI, LinkAI, Azure OpenAI, and many third-party providers.
import json import json
import openai import openai
import requests
from typing import Optional
from common.log import logger from common.log import logger
from agent.protocol.message_utils import drop_orphaned_tool_results_openai from agent.protocol.message_utils import drop_orphaned_tool_results_openai
@@ -306,3 +308,51 @@ class OpenAICompatibleBot:
openai_messages.append(msg) openai_messages.append(msg)
return drop_orphaned_tool_results_openai(openai_messages) return drop_orphaned_tool_results_openai(openai_messages)
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using the OpenAI-compatible /chat/completions endpoint.

    Connection settings are read from ``self.get_api_config()`` (keys used:
    ``model``, ``api_key``, ``api_base``).

    Args:
        image_url: Value placed in the ``image_url.url`` field of the request.
        question: Text prompt sent together with the image.
        model: Model name to use; when falsy, the configured ``model`` is
            used, falling back to ``"gpt-4o"``.
        max_tokens: Currently not included in the request payload.
            NOTE(review): confirm whether omitting it is intentional.

    Returns:
        On success, a dict with ``model`` (the model name that was requested),
        ``content`` (always a string) and ``usage`` (prompt/completion/total
        token counts, 0 when absent). On any failure,
        ``{"error": True, "message": <str>}``. This method does not raise.
    """
    try:
        api_config = self.get_api_config()
        vision_model = model or api_config.get("model", "gpt-4o")
        api_key = api_config.get("api_key", "")
        # Fall back to the public OpenAI endpoint; strip trailing slashes so
        # the joined URL has exactly one separator.
        api_base = (api_config.get("api_base") or "https://api.openai.com/v1").rstrip("/")

        payload = {
            "model": vision_model,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }],
        }
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(
            f"{api_base}/chat/completions",
            headers=headers, json=payload, timeout=60,
        )
        if resp.status_code != 200:
            body = resp.text[:500]
            logger.error(f"[{self.__class__.__name__}] call_vision HTTP {resp.status_code}: {body}")
            return {"error": True, "message": f"HTTP {resp.status_code}: {body}"}

        data = resp.json()

        # An error payload delivered with HTTP 200 must be reported as an
        # error rather than as an empty successful answer.
        err = data.get("error")
        if err:
            if isinstance(err, dict):
                message = err.get("message", str(err))
            else:
                message = str(err)
            logger.error(f"[{self.__class__.__name__}] call_vision API error: {message}")
            return {"error": True, "message": message}

        # Report a missing/empty choices list explicitly instead of relying on
        # an IndexError or returning an empty "successful" answer.
        choices = data.get("choices") or []
        if not choices:
            return {"error": True, "message": "No choices returned in vision response"}

        # "message", "content" and "usage" may each be null in the JSON body.
        content = (choices[0].get("message") or {}).get("content") or ""
        usage = data.get("usage") or {}
        return {
            "model": vision_model,
            "content": content,
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
    except Exception as e:
        # Boundary handler: callers expect an error dict, never an exception.
        logger.error(f"[{self.__class__.__name__}] call_vision error: {e}")
        return {"error": True, "message": str(e)}

View File

@@ -2,6 +2,7 @@
import time import time
import json import json
from typing import Optional
from models.bot import Bot from models.bot import Bot
from models.zhipuai.zhipu_ai_session import ZhipuAISession from models.zhipuai.zhipu_ai_session import ZhipuAISession
@@ -149,6 +150,40 @@ class ZHIPUAIBot(Bot, ZhipuAIImage):
else: else:
return result return result
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Describe an image through the ZhipuAI OpenAI-compatible SDK client.

    The request always targets glm-5v-turbo; the ``model`` argument is
    ignored because the text models (glm-5-turbo etc.) do not support vision.

    Args:
        image_url: Value placed in the ``image_url.url`` field of the request.
        question: Text prompt sent together with the image.
        model: Unused.
        max_tokens: Passed to ``self.client.chat.completions.create``.

    Returns:
        On success, a dict with ``model``, ``content`` (empty string when the
        reply has no content) and ``usage`` token counts (0 when missing).
        On any exception, ``{"error": True, "message": <str>}``.
    """
    try:
        vision_model = "glm-5v-turbo"
        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }
        reply = self.client.chat.completions.create(
            model=vision_model,
            max_tokens=max_tokens,
            messages=[user_message],
        )
        answer = reply.choices[0].message.content
        if not answer:
            answer = ""
        stats = reply.usage
        token_fields = ("prompt_tokens", "completion_tokens", "total_tokens")
        return {
            "model": vision_model,
            "content": answer,
            # getattr with a default also covers a missing (None) usage object.
            "usage": {field: getattr(stats, field, 0) for field in token_fields},
        }
    except Exception as exc:
        logger.error(f"[ZHIPU_AI] call_vision error: {exc}")
        return {"error": True, "message": str(exc)}
def call_with_tools(self, messages, tools=None, stream=False, **kwargs): def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
""" """
Call ZhipuAI API with tool support for agent integration Call ZhipuAI API with tool support for agent integration