mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-06-02 00:57:41 +08:00
feat(vision): prioritize main model for image recognition with multi-provider fallback
- Add call_vision method to all bot implementations (DashScope, Claude, Gemini, ZhipuAI, MiniMax, Doubao, Moonshot, OpenAICompatibleBot) using each vendor's native multimodal API format
- Remove call_with_tools/call_vision from Bot base class to fix MRO shadowing issue with OpenAICompatibleBot mixin
- Refactor vision tool provider resolution: MainModel → other configured models (auto-discovered) → OpenAI → LinkAI, with automatic fallback
- Return actual model name used in call_vision responses
- Sync config.json API keys to .env bidirectionally on startup
- Fix bot instance cache to detect bot_type/use_linkai config changes
- Add SSE reconnection support for web console
- Preserve image path hints in Gemini text for correct vision tool calls
- Update docs/tools/vision.mdx
This commit is contained in:
@@ -1,7 +1,13 @@
|
|||||||
"""
|
"""
|
||||||
Vision tool - Analyze images using OpenAI-compatible Vision API.
|
Vision tool - Analyze images using Vision API.
|
||||||
Supports local files (auto base64-encoded) and HTTP URLs.
|
Supports local files (auto base64-encoded) and HTTP URLs.
|
||||||
Providers are tried in priority order with automatic fallback on failure.
|
|
||||||
|
Provider priority (default):
|
||||||
|
1. Main model via bot.call_vision — zero extra cost
|
||||||
|
2. Other models whose API key is configured — auto-discovered
|
||||||
|
3. OpenAI / LinkAI raw HTTP — reliable fallback
|
||||||
|
When use_linkai=true, LinkAI is promoted to #1.
|
||||||
|
When tool.vision.model is set, that model is used exclusively first.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
@@ -14,10 +20,11 @@ from typing import Any, Dict, List, Optional
|
|||||||
import requests
|
import requests
|
||||||
|
|
||||||
from agent.tools.base_tool import BaseTool, ToolResult
|
from agent.tools.base_tool import BaseTool, ToolResult
|
||||||
|
from common import const
|
||||||
from common.log import logger
|
from common.log import logger
|
||||||
from config import conf
|
from config import conf
|
||||||
|
|
||||||
DEFAULT_MODEL = "gpt-4.1-mini"
|
DEFAULT_MODEL = const.GPT_41_MINI
|
||||||
DEFAULT_TIMEOUT = 60
|
DEFAULT_TIMEOUT = 60
|
||||||
MAX_TOKENS = 1000
|
MAX_TOKENS = 1000
|
||||||
COMPRESS_THRESHOLD = 1_048_576 # 1 MB
|
COMPRESS_THRESHOLD = 1_048_576 # 1 MB
|
||||||
@@ -30,8 +37,20 @@ SUPPORTED_EXTENSIONS = {
|
|||||||
"webp": "image/webp",
|
"webp": "image/webp",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_MAIN_MODEL_PROVIDER_NAME = "MainModel"
|
||||||
|
|
||||||
OPENAI_COMPATIBLE_BOT_TYPES = {"openai", "openAI", "chatGPT"}
|
# (config_key_for_api_key, bot_type, default_vision_model, provider_display_name)
|
||||||
|
# Auto-discovered as fallback vision providers when their API key is configured.
|
||||||
|
# OpenAI and LinkAI are handled separately (raw HTTP providers), so not listed here.
|
||||||
|
_DISCOVERABLE_MODELS = [
|
||||||
|
("moonshot_api_key", const.MOONSHOT, const.KIMI_K2_5, "Moonshot"),
|
||||||
|
("ark_api_key", const.DOUBAO, const.DOUBAO_SEED_2_PRO, "Doubao"),
|
||||||
|
("dashscope_api_key", const.QWEN_DASHSCOPE, const.QWEN36_PLUS, "DashScope"),
|
||||||
|
("claude_api_key", const.CLAUDEAPI, const.CLAUDE_4_6_SONNET, "Claude"),
|
||||||
|
("gemini_api_key", const.GEMINI, const.GEMINI_31_FLASH_LITE_PRE, "Gemini"),
|
||||||
|
("zhipu_ai_api_key", const.ZHIPU_AI, const.GLM_4_7, "ZhipuAI"),
|
||||||
|
("minimax_api_key", const.MiniMax, const.MINIMAX_M2_7, "MiniMax"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -42,6 +61,8 @@ class VisionProvider:
|
|||||||
api_base: str
|
api_base: str
|
||||||
extra_headers: dict = field(default_factory=dict)
|
extra_headers: dict = field(default_factory=dict)
|
||||||
model_override: Optional[str] = None
|
model_override: Optional[str] = None
|
||||||
|
use_bot: bool = False # When True, call via bot.call_vision instead of raw HTTP
|
||||||
|
fallback_bot: Any = None # Bot instance for non-main-model providers
|
||||||
|
|
||||||
|
|
||||||
class VisionAPIError(Exception):
|
class VisionAPIError(Exception):
|
||||||
@@ -50,13 +71,12 @@ class VisionAPIError(Exception):
|
|||||||
|
|
||||||
|
|
||||||
class Vision(BaseTool):
|
class Vision(BaseTool):
|
||||||
"""Analyze images using OpenAI-compatible Vision API"""
|
"""Analyze images using Vision API"""
|
||||||
|
|
||||||
name: str = "vision"
|
name: str = "vision"
|
||||||
description: str = (
|
description: str = (
|
||||||
"Analyze a local image or image URL (jpg/jpeg/png) using Vision API. "
|
"Analyze a local image or image URL (jpg/jpeg/png) using Vision API. "
|
||||||
"Can describe content, extract text, identify objects, colors, etc. "
|
"Can describe content, extract text, identify objects, colors, etc. "
|
||||||
"Requires OPENAI_API_KEY or LINKAI_API_KEY."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
params: dict = {
|
params: dict = {
|
||||||
@@ -70,13 +90,6 @@ class Vision(BaseTool):
|
|||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "Question to ask about the image",
|
"description": "Question to ask about the image",
|
||||||
},
|
},
|
||||||
"model": {
|
|
||||||
"type": "string",
|
|
||||||
"description": (
|
|
||||||
f"Vision model to use (default: {DEFAULT_MODEL}). "
|
|
||||||
"Options: gpt-4.1-mini, gpt-4.1, gpt-4o-mini, gpt-4o"
|
|
||||||
),
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
"required": ["image", "question"],
|
"required": ["image", "question"],
|
||||||
}
|
}
|
||||||
@@ -86,15 +99,11 @@ class Vision(BaseTool):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def is_available() -> bool:
|
def is_available() -> bool:
|
||||||
return bool(
|
return True
|
||||||
conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY")
|
|
||||||
or conf().get("linkai_api_key") or os.environ.get("LINKAI_API_KEY")
|
|
||||||
)
|
|
||||||
|
|
||||||
def execute(self, args: Dict[str, Any]) -> ToolResult:
|
def execute(self, args: Dict[str, Any]) -> ToolResult:
|
||||||
image = args.get("image", "").strip()
|
image = args.get("image", "").strip()
|
||||||
question = args.get("question", "").strip()
|
question = args.get("question", "").strip()
|
||||||
model = args.get("model", DEFAULT_MODEL).strip() or DEFAULT_MODEL
|
|
||||||
|
|
||||||
if not image:
|
if not image:
|
||||||
return ToolResult.fail("Error: 'image' parameter is required")
|
return ToolResult.fail("Error: 'image' parameter is required")
|
||||||
@@ -104,11 +113,12 @@ class Vision(BaseTool):
|
|||||||
providers = self._resolve_providers()
|
providers = self._resolve_providers()
|
||||||
if not providers:
|
if not providers:
|
||||||
return ToolResult.fail(
|
return ToolResult.fail(
|
||||||
"Error: No API key configured for Vision.\n"
|
"Error: No model available for Vision.\n"
|
||||||
"Please configure one of the following using env_config tool:\n"
|
"The main model does not support vision and no other API keys are configured.\n"
|
||||||
" 1. OPENAI_API_KEY (preferred): env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n"
|
"Options:\n"
|
||||||
" 2. LINKAI_API_KEY (fallback): env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")\n\n"
|
" 1. Switch to a multimodal model (e.g. qwen3.6-plus, claude-sonnet-4-6, gemini-2.0-flash)\n"
|
||||||
"Get your key at: https://platform.openai.com/api-keys or https://link-ai.tech"
|
" 2. Configure OPENAI_API_KEY: env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n"
|
||||||
|
" 3. Configure LINKAI_API_KEY: env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")"
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -116,7 +126,7 @@ class Vision(BaseTool):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
return ToolResult.fail(f"Error: {e}")
|
return ToolResult.fail(f"Error: {e}")
|
||||||
|
|
||||||
return self._call_with_fallback(providers, model, question, image_content)
|
return self._call_with_fallback(providers, DEFAULT_MODEL, question, image_content)
|
||||||
|
|
||||||
def _call_with_fallback(self, providers: List[VisionProvider], model: str,
|
def _call_with_fallback(self, providers: List[VisionProvider], model: str,
|
||||||
question: str, image_content: dict) -> ToolResult:
|
question: str, image_content: dict) -> ToolResult:
|
||||||
@@ -125,9 +135,14 @@ class Vision(BaseTool):
|
|||||||
for i, provider in enumerate(providers):
|
for i, provider in enumerate(providers):
|
||||||
use_model = provider.model_override or model
|
use_model = provider.model_override or model
|
||||||
try:
|
try:
|
||||||
logger.debug(f"[Vision] Trying provider '{provider.name}' "
|
logger.info(f"[Vision] Trying provider '{provider.name}' "
|
||||||
f"with model '{use_model}' ({i + 1}/{len(providers)})")
|
f"with model '{use_model}' ({i + 1}/{len(providers)})")
|
||||||
return self._call_api(provider, use_model, question, image_content)
|
if provider.use_bot:
|
||||||
|
result = self._call_via_bot(use_model, question, image_content, provider)
|
||||||
|
else:
|
||||||
|
result = self._call_api(provider, use_model, question, image_content)
|
||||||
|
logger.info(f"[Vision] ✅ Success via {provider.name} (model={use_model})")
|
||||||
|
return result
|
||||||
except VisionAPIError as e:
|
except VisionAPIError as e:
|
||||||
errors.append(f"[{provider.name}/{use_model}] {e}")
|
errors.append(f"[{provider.name}/{use_model}] {e}")
|
||||||
logger.warning(f"[Vision] Provider '{provider.name}' failed: {e}")
|
logger.warning(f"[Vision] Provider '{provider.name}' failed: {e}")
|
||||||
@@ -148,35 +163,113 @@ class Vision(BaseTool):
|
|||||||
def _resolve_providers(self) -> List[VisionProvider]:
|
def _resolve_providers(self) -> List[VisionProvider]:
|
||||||
"""
|
"""
|
||||||
Build an ordered list of available providers.
|
Build an ordered list of available providers.
|
||||||
Each provider builder returns a VisionProvider or None.
|
|
||||||
To add a new provider, append a builder method to _PROVIDER_BUILDERS.
|
Priority:
|
||||||
|
- use_linkai=true → [LinkAI, MainModel, OtherModels…, OpenAI]
|
||||||
|
- default → [MainModel, OtherModels…, OpenAI, LinkAI]
|
||||||
|
|
||||||
|
"OtherModels" are auto-discovered from configured API keys.
|
||||||
|
The main model's bot_type is excluded from OtherModels to avoid
|
||||||
|
duplicating the MainModel provider.
|
||||||
"""
|
"""
|
||||||
|
use_linkai = conf().get("use_linkai", False) and conf().get("linkai_api_key")
|
||||||
providers: List[VisionProvider] = []
|
providers: List[VisionProvider] = []
|
||||||
for builder in self._PROVIDER_BUILDERS:
|
|
||||||
provider = builder(self)
|
if use_linkai:
|
||||||
if provider:
|
self._append_provider(providers, self._build_linkai_provider)
|
||||||
providers.append(provider)
|
self._append_provider(providers, self._build_main_model_provider)
|
||||||
|
self._append_other_model_providers(providers)
|
||||||
|
self._append_provider(providers, self._build_openai_provider)
|
||||||
|
else:
|
||||||
|
self._append_provider(providers, self._build_main_model_provider)
|
||||||
|
self._append_other_model_providers(providers)
|
||||||
|
self._append_provider(providers, self._build_openai_provider)
|
||||||
|
self._append_provider(providers, self._build_linkai_provider)
|
||||||
|
|
||||||
return providers
|
return providers
|
||||||
|
|
||||||
def _build_custom_model_provider(self) -> Optional[VisionProvider]:
|
@staticmethod
|
||||||
|
def _append_provider(providers: List[VisionProvider], builder) -> None:
|
||||||
|
p = builder()
|
||||||
|
if p:
|
||||||
|
providers.append(p)
|
||||||
|
|
||||||
|
def _append_other_model_providers(self, providers: List[VisionProvider]) -> None:
|
||||||
"""
|
"""
|
||||||
When bot_type is openai-compatible and a custom model is configured,
|
Auto-discover other models whose API key is configured.
|
||||||
try the user's own model first — it may already support multimodal input.
|
Skip the main model's own bot_type (already covered by MainModel provider).
|
||||||
|
Skip bot_types that already have a provider in the list (e.g. OpenAI).
|
||||||
"""
|
"""
|
||||||
bot_type = conf().get("bot_type", "")
|
# Determine main model's bot_type so we can skip it
|
||||||
if bot_type not in OPENAI_COMPATIBLE_BOT_TYPES:
|
main_bot_type = None
|
||||||
|
if self.model and hasattr(self.model, '_resolve_bot_type'):
|
||||||
|
main_bot_type = self.model._resolve_bot_type(conf().get("model", ""))
|
||||||
|
|
||||||
|
existing_names = {p.name for p in providers}
|
||||||
|
|
||||||
|
for config_key, bot_type, default_model, display_name in _DISCOVERABLE_MODELS:
|
||||||
|
if display_name in existing_names:
|
||||||
|
continue
|
||||||
|
if bot_type == main_bot_type:
|
||||||
|
continue
|
||||||
|
api_key = conf().get(config_key, "")
|
||||||
|
if not api_key or not api_key.strip():
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Create a bot instance and check if it supports call_vision
|
||||||
|
try:
|
||||||
|
from models.bot_factory import create_bot
|
||||||
|
bot = create_bot(bot_type)
|
||||||
|
if not hasattr(bot, 'call_vision'):
|
||||||
|
continue
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
providers.append(VisionProvider(
|
||||||
|
name=display_name,
|
||||||
|
api_key="",
|
||||||
|
api_base="",
|
||||||
|
model_override=default_model,
|
||||||
|
use_bot=True,
|
||||||
|
fallback_bot=bot,
|
||||||
|
))
|
||||||
|
|
||||||
|
def _resolve_vision_model(self) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Determine which model to use for vision.
|
||||||
|
|
||||||
|
1. User explicit config: tool.vision.model in config.json
|
||||||
|
2. Fallback to the main configured model name
|
||||||
|
"""
|
||||||
|
tool_conf = conf().get("tool", {})
|
||||||
|
user_vision_model = tool_conf.get("vision", {}).get("model") if isinstance(tool_conf, dict) else None
|
||||||
|
if user_vision_model:
|
||||||
|
return user_vision_model
|
||||||
|
model_name = conf().get("model", "")
|
||||||
|
return model_name or None
|
||||||
|
|
||||||
|
def _build_main_model_provider(self) -> Optional[VisionProvider]:
|
||||||
|
"""
|
||||||
|
Use the vendor's own model for vision via bot.call_vision.
|
||||||
|
Only available when the bot class has call_vision.
|
||||||
|
"""
|
||||||
|
if not (self.model and hasattr(self.model, 'bot')):
|
||||||
return None
|
return None
|
||||||
custom_model = conf().get("model", "")
|
try:
|
||||||
if not custom_model or custom_model == DEFAULT_MODEL:
|
bot = self.model.bot
|
||||||
|
if not hasattr(bot, 'call_vision'):
|
||||||
|
return None
|
||||||
|
except Exception:
|
||||||
return None
|
return None
|
||||||
api_key = conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY")
|
|
||||||
if not api_key:
|
vision_model = self._resolve_vision_model()
|
||||||
return None
|
|
||||||
api_base = (conf().get("open_ai_api_base") or os.environ.get("OPENAI_API_BASE", "")).rstrip("/") \
|
|
||||||
or "https://api.openai.com/v1"
|
|
||||||
return VisionProvider(
|
return VisionProvider(
|
||||||
name="CustomModel", api_key=api_key, api_base=self._ensure_v1(api_base),
|
name=_MAIN_MODEL_PROVIDER_NAME,
|
||||||
model_override=custom_model,
|
api_key="",
|
||||||
|
api_base="",
|
||||||
|
model_override=vision_model,
|
||||||
|
use_bot=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _build_openai_provider(self) -> Optional[VisionProvider]:
|
def _build_openai_provider(self) -> Optional[VisionProvider]:
|
||||||
@@ -200,7 +293,54 @@ class Vision(BaseTool):
|
|||||||
return VisionProvider(name="LinkAI", api_key=api_key, api_base=self._ensure_v1(api_base),
|
return VisionProvider(name="LinkAI", api_key=api_key, api_base=self._ensure_v1(api_base),
|
||||||
extra_headers=extra)
|
extra_headers=extra)
|
||||||
|
|
||||||
_PROVIDER_BUILDERS = [_build_custom_model_provider, _build_openai_provider, _build_linkai_provider]
|
def _call_via_bot(self, model: str, question: str, image_content: dict,
|
||||||
|
provider: Optional[VisionProvider] = None) -> ToolResult:
|
||||||
|
"""
|
||||||
|
Call a model's call_vision with vendor-native API format.
|
||||||
|
Uses the provider's _fallback_bot if set, otherwise the main model bot.
|
||||||
|
Raises VisionAPIError on failure so fallback can proceed.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
bot = (provider and provider.fallback_bot) or self.model.bot
|
||||||
|
except Exception as e:
|
||||||
|
raise VisionAPIError(f"Cannot access bot: {e}")
|
||||||
|
|
||||||
|
# Extract the raw image URL from the OpenAI-format image_content block
|
||||||
|
image_url = image_content.get("image_url", {}).get("url", "")
|
||||||
|
if not image_url:
|
||||||
|
raise VisionAPIError("No image URL in content block")
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = bot.call_vision(
|
||||||
|
image_url=image_url,
|
||||||
|
question=question,
|
||||||
|
model=model,
|
||||||
|
max_tokens=MAX_TOKENS,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
raise VisionAPIError(f"call_vision failed: {e}")
|
||||||
|
|
||||||
|
if response is NotImplemented:
|
||||||
|
raise VisionAPIError("Bot does not support vision")
|
||||||
|
|
||||||
|
if isinstance(response, dict) and response.get("error"):
|
||||||
|
raise VisionAPIError(f"API error - {response.get('message', 'Unknown')}")
|
||||||
|
|
||||||
|
content = response.get("content", "") if isinstance(response, dict) else ""
|
||||||
|
if not content:
|
||||||
|
raise VisionAPIError("Empty response from main model")
|
||||||
|
|
||||||
|
usage_info = response.get("usage", {}) if isinstance(response, dict) else {}
|
||||||
|
|
||||||
|
# Use the actual model name from the bot response if available
|
||||||
|
actual_model = response.get("model", model) if isinstance(response, dict) else model
|
||||||
|
provider_name = provider.name if provider else _MAIN_MODEL_PROVIDER_NAME
|
||||||
|
return ToolResult.success({
|
||||||
|
"model": actual_model,
|
||||||
|
"provider": provider_name,
|
||||||
|
"content": content,
|
||||||
|
"usage": usage_info,
|
||||||
|
})
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _ensure_v1(api_base: str) -> str:
|
def _ensure_v1(api_base: str) -> str:
|
||||||
@@ -213,9 +353,13 @@ class Vision(BaseTool):
|
|||||||
return api_base.rstrip("/") + "/v1"
|
return api_base.rstrip("/") + "/v1"
|
||||||
|
|
||||||
def _build_image_content(self, image: str) -> dict:
|
def _build_image_content(self, image: str) -> dict:
|
||||||
"""Build the image_url content block for the API request."""
|
"""
|
||||||
|
Build the image_url content block.
|
||||||
|
Both remote URLs and local files are converted to base64 data URLs
|
||||||
|
so every bot backend can consume them without extra downloads.
|
||||||
|
"""
|
||||||
if image.startswith(("http://", "https://")):
|
if image.startswith(("http://", "https://")):
|
||||||
return {"type": "image_url", "image_url": {"url": image}}
|
return self._download_to_data_url(image)
|
||||||
|
|
||||||
if not os.path.isfile(image):
|
if not os.path.isfile(image):
|
||||||
raise FileNotFoundError(f"Image file not found: {image}")
|
raise FileNotFoundError(f"Image file not found: {image}")
|
||||||
@@ -239,6 +383,19 @@ class Vision(BaseTool):
|
|||||||
data_url = f"data:{mime_type};base64,{b64}"
|
data_url = f"data:{mime_type};base64,{b64}"
|
||||||
return {"type": "image_url", "image_url": {"url": data_url}}
|
return {"type": "image_url", "image_url": {"url": data_url}}
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _download_to_data_url(url: str) -> dict:
|
||||||
|
"""Download a remote image and return it as a base64 data URL."""
|
||||||
|
resp = requests.get(url, timeout=30)
|
||||||
|
if resp.status_code != 200:
|
||||||
|
raise VisionAPIError(f"Failed to download image: HTTP {resp.status_code}")
|
||||||
|
content_type = resp.headers.get("Content-Type", "image/jpeg").split(";")[0].strip()
|
||||||
|
if not content_type.startswith("image/"):
|
||||||
|
content_type = "image/jpeg"
|
||||||
|
b64 = base64.b64encode(resp.content).decode("ascii")
|
||||||
|
data_url = f"data:{content_type};base64,{b64}"
|
||||||
|
return {"type": "image_url", "image_url": {"url": data_url}}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _maybe_compress(path: str) -> str:
|
def _maybe_compress(path: str) -> str:
|
||||||
"""Compress image to under COMPRESS_THRESHOLD with max long-edge 1536px."""
|
"""Compress image to under COMPRESS_THRESHOLD with max long-edge 1536px."""
|
||||||
@@ -312,7 +469,6 @@ class Vision(BaseTool):
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"max_completion_tokens": MAX_TOKENS,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
|
|||||||
@@ -124,14 +124,15 @@ class AgentLLMModel(LLMModel):
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def bot(self):
|
def bot(self):
|
||||||
"""Lazy load the bot, re-create when model changes"""
|
"""Lazy load the bot, re-create when model or bot_type changes"""
|
||||||
from models.bot_factory import create_bot
|
from models.bot_factory import create_bot
|
||||||
cur_model = self.model
|
cur_model = self.model
|
||||||
if self._bot is None or self._bot_model != cur_model:
|
cur_bot_type = self._resolve_bot_type(cur_model)
|
||||||
bot_type = self._resolve_bot_type(cur_model)
|
if self._bot is None or self._bot_model != cur_model or getattr(self, '_bot_type', None) != cur_bot_type:
|
||||||
self._bot = create_bot(bot_type)
|
self._bot = create_bot(cur_bot_type)
|
||||||
self._bot = add_openai_compatible_support(self._bot)
|
self._bot = add_openai_compatible_support(self._bot)
|
||||||
self._bot_model = cur_model
|
self._bot_model = cur_model
|
||||||
|
self._bot_type = cur_bot_type
|
||||||
return self._bot
|
return self._bot
|
||||||
|
|
||||||
def call(self, request: LLMRequest):
|
def call(self, request: LLMRequest):
|
||||||
@@ -505,15 +506,15 @@ class AgentBridge:
|
|||||||
|
|
||||||
def _migrate_config_to_env(self, workspace_root: str):
|
def _migrate_config_to_env(self, workspace_root: str):
|
||||||
"""
|
"""
|
||||||
Migrate API keys from config.json to .env file if not already set
|
Sync API keys from config.json to .env file.
|
||||||
|
Adds new keys and updates changed values on each startup.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
workspace_root: Workspace directory path (not used, kept for compatibility)
|
workspace_root: Workspace directory path (not used, kept for compatibility)
|
||||||
"""
|
"""
|
||||||
from config import conf
|
from config import conf
|
||||||
import os
|
import os
|
||||||
|
|
||||||
# Mapping from config.json keys to environment variable names
|
|
||||||
key_mapping = {
|
key_mapping = {
|
||||||
"open_ai_api_key": "OPENAI_API_KEY",
|
"open_ai_api_key": "OPENAI_API_KEY",
|
||||||
"open_ai_api_base": "OPENAI_API_BASE",
|
"open_ai_api_base": "OPENAI_API_BASE",
|
||||||
@@ -522,10 +523,9 @@ class AgentBridge:
|
|||||||
"linkai_api_key": "LINKAI_API_KEY",
|
"linkai_api_key": "LINKAI_API_KEY",
|
||||||
}
|
}
|
||||||
|
|
||||||
# Use fixed secure location for .env file
|
|
||||||
env_file = expand_path("~/.cow/.env")
|
env_file = expand_path("~/.cow/.env")
|
||||||
|
|
||||||
# Read existing env vars from .env file
|
# Read existing env vars (key -> value)
|
||||||
existing_env_vars = {}
|
existing_env_vars = {}
|
||||||
if os.path.exists(env_file):
|
if os.path.exists(env_file):
|
||||||
try:
|
try:
|
||||||
@@ -533,48 +533,46 @@ class AgentBridge:
|
|||||||
for line in f:
|
for line in f:
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if line and not line.startswith('#') and '=' in line:
|
if line and not line.startswith('#') and '=' in line:
|
||||||
key, _ = line.split('=', 1)
|
key, val = line.split('=', 1)
|
||||||
existing_env_vars[key.strip()] = True
|
existing_env_vars[key.strip()] = val.strip()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"[AgentBridge] Failed to read .env file: {e}")
|
logger.warning(f"[AgentBridge] Failed to read .env file: {e}")
|
||||||
|
|
||||||
# Check which keys need to be migrated
|
# Sync config.json values into .env (add/update/remove)
|
||||||
keys_to_migrate = {}
|
updated = False
|
||||||
for config_key, env_key in key_mapping.items():
|
for config_key, env_key in key_mapping.items():
|
||||||
# Skip if already in .env file
|
raw = conf().get(config_key, "")
|
||||||
if env_key in existing_env_vars:
|
value = raw.strip() if raw else ""
|
||||||
continue
|
old_value = existing_env_vars.get(env_key)
|
||||||
|
|
||||||
# Get value from config.json
|
if value:
|
||||||
value = conf().get(config_key, "")
|
if old_value == value:
|
||||||
if value and value.strip(): # Only migrate non-empty values
|
continue
|
||||||
keys_to_migrate[env_key] = value.strip()
|
existing_env_vars[env_key] = value
|
||||||
|
os.environ[env_key] = value
|
||||||
# Log summary if there are keys to skip
|
updated = True
|
||||||
if existing_env_vars:
|
else:
|
||||||
logger.debug(f"[AgentBridge] {len(existing_env_vars)} env vars already in .env")
|
if old_value is None:
|
||||||
|
continue
|
||||||
# Write new keys to .env file
|
existing_env_vars.pop(env_key, None)
|
||||||
if keys_to_migrate:
|
os.environ.pop(env_key, None)
|
||||||
|
updated = True
|
||||||
|
updated = True
|
||||||
|
|
||||||
|
if updated:
|
||||||
try:
|
try:
|
||||||
# Ensure ~/.cow directory and .env file exist
|
|
||||||
env_dir = os.path.dirname(env_file)
|
env_dir = os.path.dirname(env_file)
|
||||||
if not os.path.exists(env_dir):
|
os.makedirs(env_dir, exist_ok=True)
|
||||||
os.makedirs(env_dir, exist_ok=True)
|
|
||||||
if not os.path.exists(env_file):
|
with open(env_file, 'w', encoding='utf-8') as f:
|
||||||
open(env_file, 'a').close()
|
f.write('# Environment variables for agent\n')
|
||||||
|
f.write('# Auto-managed - synced from config.json on startup\n\n')
|
||||||
# Append new keys
|
for key, value in sorted(existing_env_vars.items()):
|
||||||
with open(env_file, 'a', encoding='utf-8') as f:
|
|
||||||
f.write('\n# Auto-migrated from config.json\n')
|
|
||||||
for key, value in keys_to_migrate.items():
|
|
||||||
f.write(f'{key}={value}\n')
|
f.write(f'{key}={value}\n')
|
||||||
# Also set in current process
|
|
||||||
os.environ[key] = value
|
logger.info(f"[AgentBridge] Synced API keys from config.json to .env")
|
||||||
|
|
||||||
logger.info(f"[AgentBridge] Migrated {len(keys_to_migrate)} API keys from config.json to .env: {list(keys_to_migrate.keys())}")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"[AgentBridge] Failed to migrate API keys: {e}")
|
logger.warning(f"[AgentBridge] Failed to sync API keys: {e}")
|
||||||
|
|
||||||
def _persist_messages(
|
def _persist_messages(
|
||||||
self, session_id: str, new_messages: list, channel_type: str = ""
|
self, session_id: str, new_messages: list, channel_type: str = ""
|
||||||
|
|||||||
@@ -490,7 +490,7 @@ class AgentInitializer:
|
|||||||
|
|
||||||
env_file = expand_path("~/.cow/.env")
|
env_file = expand_path("~/.cow/.env")
|
||||||
|
|
||||||
# Read existing env vars
|
# Read existing env vars (key -> value)
|
||||||
existing_env_vars = {}
|
existing_env_vars = {}
|
||||||
if os.path.exists(env_file):
|
if os.path.exists(env_file):
|
||||||
try:
|
try:
|
||||||
@@ -498,38 +498,46 @@ class AgentInitializer:
|
|||||||
for line in f:
|
for line in f:
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if line and not line.startswith('#') and '=' in line:
|
if line and not line.startswith('#') and '=' in line:
|
||||||
key, _ = line.split('=', 1)
|
key, val = line.split('=', 1)
|
||||||
existing_env_vars[key.strip()] = True
|
existing_env_vars[key.strip()] = val.strip()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"[AgentInitializer] Failed to read .env file: {e}")
|
logger.warning(f"[AgentInitializer] Failed to read .env file: {e}")
|
||||||
|
|
||||||
# Check which keys need migration
|
# Sync config.json values into .env (add/update/remove)
|
||||||
keys_to_migrate = {}
|
updated = False
|
||||||
for config_key, env_key in key_mapping.items():
|
for config_key, env_key in key_mapping.items():
|
||||||
if env_key in existing_env_vars:
|
raw = conf().get(config_key, "")
|
||||||
continue
|
value = raw.strip() if raw else ""
|
||||||
value = conf().get(config_key, "")
|
old_value = existing_env_vars.get(env_key)
|
||||||
if value and value.strip():
|
|
||||||
keys_to_migrate[env_key] = value.strip()
|
if value:
|
||||||
|
if old_value == value:
|
||||||
# Write new keys
|
continue
|
||||||
if keys_to_migrate:
|
existing_env_vars[env_key] = value
|
||||||
|
os.environ[env_key] = value
|
||||||
|
updated = True
|
||||||
|
else:
|
||||||
|
if old_value is None:
|
||||||
|
continue
|
||||||
|
existing_env_vars.pop(env_key, None)
|
||||||
|
os.environ.pop(env_key, None)
|
||||||
|
updated = True
|
||||||
|
|
||||||
|
if updated:
|
||||||
try:
|
try:
|
||||||
env_dir = os.path.dirname(env_file)
|
env_dir = os.path.dirname(env_file)
|
||||||
if not os.path.exists(env_dir):
|
os.makedirs(env_dir, exist_ok=True)
|
||||||
os.makedirs(env_dir, exist_ok=True)
|
|
||||||
if not os.path.exists(env_file):
|
# Rewrite the entire .env file to ensure consistency
|
||||||
open(env_file, 'a').close()
|
with open(env_file, 'w', encoding='utf-8') as f:
|
||||||
|
f.write('# Environment variables for agent\n')
|
||||||
with open(env_file, 'a', encoding='utf-8') as f:
|
f.write('# Auto-managed - synced from config.json on startup\n\n')
|
||||||
f.write('\n# Auto-migrated from config.json\n')
|
for key, value in sorted(existing_env_vars.items()):
|
||||||
for key, value in keys_to_migrate.items():
|
|
||||||
f.write(f'{key}={value}\n')
|
f.write(f'{key}={value}\n')
|
||||||
os.environ[key] = value
|
|
||||||
|
logger.info(f"[AgentInitializer] Synced API keys from config.json to .env")
|
||||||
logger.info(f"[AgentInitializer] Migrated {len(keys_to_migrate)} API keys to .env: {list(keys_to_migrate.keys())}")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"[AgentInitializer] Failed to migrate API keys: {e}")
|
logger.warning(f"[AgentInitializer] Failed to sync API keys: {e}")
|
||||||
|
|
||||||
def _start_daily_flush_timer(self):
|
def _start_daily_flush_timer(self):
|
||||||
"""Start a background thread that flushes all agents' memory daily at 23:55."""
|
"""Start a background thread that flushes all agents' memory daily at 23:55."""
|
||||||
|
|||||||
@@ -806,15 +806,17 @@ function sendMessage() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function startSSE(requestId, loadingEl, timestamp) {
|
function startSSE(requestId, loadingEl, timestamp) {
|
||||||
const es = new EventSource(`/stream?request_id=${encodeURIComponent(requestId)}`);
|
|
||||||
activeStreams[requestId] = es;
|
|
||||||
|
|
||||||
let botEl = null;
|
let botEl = null;
|
||||||
let stepsEl = null; // .agent-steps (thinking summaries + tool indicators)
|
let stepsEl = null; // .agent-steps (thinking summaries + tool indicators)
|
||||||
let contentEl = null; // .answer-content (final streaming answer)
|
let contentEl = null; // .answer-content (final streaming answer)
|
||||||
let mediaEl = null; // .media-content (images & file attachments)
|
let mediaEl = null; // .media-content (images & file attachments)
|
||||||
let accumulatedText = '';
|
let accumulatedText = '';
|
||||||
let currentToolEl = null;
|
let currentToolEl = null;
|
||||||
|
let done = false;
|
||||||
|
|
||||||
|
const MAX_RECONNECTS = 10;
|
||||||
|
const RECONNECT_BASE_MS = 1000;
|
||||||
|
let reconnectCount = 0;
|
||||||
|
|
||||||
function ensureBotEl() {
|
function ensureBotEl() {
|
||||||
if (botEl) return;
|
if (botEl) return;
|
||||||
@@ -839,180 +841,204 @@ function startSSE(requestId, loadingEl, timestamp) {
|
|||||||
mediaEl = botEl.querySelector('.media-content');
|
mediaEl = botEl.querySelector('.media-content');
|
||||||
}
|
}
|
||||||
|
|
||||||
es.onmessage = function(e) {
|
function connect() {
|
||||||
let item;
|
const es = new EventSource(`/stream?request_id=${encodeURIComponent(requestId)}`);
|
||||||
try { item = JSON.parse(e.data); } catch (_) { return; }
|
activeStreams[requestId] = es;
|
||||||
|
|
||||||
if (item.type === 'delta') {
|
es.onmessage = function(e) {
|
||||||
ensureBotEl();
|
let item;
|
||||||
accumulatedText += item.content;
|
try { item = JSON.parse(e.data); } catch (_) { return; }
|
||||||
contentEl.innerHTML = renderMarkdown(accumulatedText);
|
|
||||||
scrollChatToBottom();
|
|
||||||
|
|
||||||
} else if (item.type === 'tool_start') {
|
// Successful data received, reset reconnect counter
|
||||||
ensureBotEl();
|
reconnectCount = 0;
|
||||||
|
|
||||||
// Save current thinking as a collapsible step
|
if (item.type === 'delta') {
|
||||||
if (accumulatedText.trim()) {
|
ensureBotEl();
|
||||||
const fullText = accumulatedText.trim();
|
accumulatedText += item.content;
|
||||||
const oneLine = fullText.replace(/\n+/g, ' ');
|
contentEl.innerHTML = renderMarkdown(accumulatedText);
|
||||||
const needsTruncate = oneLine.length > 80;
|
scrollChatToBottom();
|
||||||
const stepEl = document.createElement('div');
|
|
||||||
stepEl.className = 'agent-step agent-thinking-step' + (needsTruncate ? '' : ' no-expand');
|
} else if (item.type === 'tool_start') {
|
||||||
if (needsTruncate) {
|
ensureBotEl();
|
||||||
const truncated = oneLine.substring(0, 80) + '…';
|
|
||||||
stepEl.innerHTML = `
|
// Save current thinking as a collapsible step
|
||||||
<div class="thinking-header" onclick="this.parentElement.classList.toggle('expanded')">
|
if (accumulatedText.trim()) {
|
||||||
<i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i>
|
const fullText = accumulatedText.trim();
|
||||||
<span class="thinking-summary">${escapeHtml(truncated)}</span>
|
const oneLine = fullText.replace(/\n+/g, ' ');
|
||||||
<i class="fas fa-chevron-right thinking-chevron"></i>
|
const needsTruncate = oneLine.length > 80;
|
||||||
</div>
|
const stepEl = document.createElement('div');
|
||||||
<div class="thinking-full">${renderMarkdown(fullText)}</div>`;
|
stepEl.className = 'agent-step agent-thinking-step' + (needsTruncate ? '' : ' no-expand');
|
||||||
} else {
|
if (needsTruncate) {
|
||||||
stepEl.innerHTML = `
|
const truncated = oneLine.substring(0, 80) + '…';
|
||||||
<div class="thinking-header no-toggle">
|
stepEl.innerHTML = `
|
||||||
<i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i>
|
<div class="thinking-header" onclick="this.parentElement.classList.toggle('expanded')">
|
||||||
<span>${escapeHtml(oneLine)}</span>
|
<i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i>
|
||||||
</div>`;
|
<span class="thinking-summary">${escapeHtml(truncated)}</span>
|
||||||
|
<i class="fas fa-chevron-right thinking-chevron"></i>
|
||||||
|
</div>
|
||||||
|
<div class="thinking-full">${renderMarkdown(fullText)}</div>`;
|
||||||
|
} else {
|
||||||
|
stepEl.innerHTML = `
|
||||||
|
<div class="thinking-header no-toggle">
|
||||||
|
<i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i>
|
||||||
|
<span>${escapeHtml(oneLine)}</span>
|
||||||
|
</div>`;
|
||||||
|
}
|
||||||
|
stepsEl.appendChild(stepEl);
|
||||||
}
|
}
|
||||||
stepsEl.appendChild(stepEl);
|
accumulatedText = '';
|
||||||
}
|
contentEl.innerHTML = '';
|
||||||
accumulatedText = '';
|
|
||||||
contentEl.innerHTML = '';
|
|
||||||
|
|
||||||
// Add tool execution indicator (collapsible)
|
// Add tool execution indicator (collapsible)
|
||||||
currentToolEl = document.createElement('div');
|
currentToolEl = document.createElement('div');
|
||||||
currentToolEl.className = 'agent-step agent-tool-step';
|
currentToolEl.className = 'agent-step agent-tool-step';
|
||||||
const argsStr = formatToolArgs(item.arguments || {});
|
const argsStr = formatToolArgs(item.arguments || {});
|
||||||
currentToolEl.innerHTML = `
|
currentToolEl.innerHTML = `
|
||||||
<div class="tool-header" onclick="this.parentElement.classList.toggle('expanded')">
|
<div class="tool-header" onclick="this.parentElement.classList.toggle('expanded')">
|
||||||
<i class="fas fa-cog fa-spin text-primary-400 flex-shrink-0 tool-icon"></i>
|
<i class="fas fa-cog fa-spin text-primary-400 flex-shrink-0 tool-icon"></i>
|
||||||
<span class="tool-name">${item.tool}</span>
|
<span class="tool-name">${item.tool}</span>
|
||||||
<i class="fas fa-chevron-right tool-chevron"></i>
|
<i class="fas fa-chevron-right tool-chevron"></i>
|
||||||
</div>
|
|
||||||
<div class="tool-detail">
|
|
||||||
<div class="tool-detail-section">
|
|
||||||
<div class="tool-detail-label">Input</div>
|
|
||||||
<pre class="tool-detail-content">${argsStr}</pre>
|
|
||||||
</div>
|
</div>
|
||||||
<div class="tool-detail-section tool-output-section"></div>
|
<div class="tool-detail">
|
||||||
</div>`;
|
<div class="tool-detail-section">
|
||||||
stepsEl.appendChild(currentToolEl);
|
<div class="tool-detail-label">Input</div>
|
||||||
|
<pre class="tool-detail-content">${argsStr}</pre>
|
||||||
|
</div>
|
||||||
|
<div class="tool-detail-section tool-output-section"></div>
|
||||||
|
</div>`;
|
||||||
|
stepsEl.appendChild(currentToolEl);
|
||||||
|
|
||||||
scrollChatToBottom();
|
scrollChatToBottom();
|
||||||
|
|
||||||
} else if (item.type === 'tool_end') {
|
} else if (item.type === 'tool_end') {
|
||||||
if (currentToolEl) {
|
if (currentToolEl) {
|
||||||
const isError = item.status !== 'success';
|
const isError = item.status !== 'success';
|
||||||
const icon = currentToolEl.querySelector('.tool-icon');
|
const icon = currentToolEl.querySelector('.tool-icon');
|
||||||
icon.className = isError
|
icon.className = isError
|
||||||
? 'fas fa-times text-red-400 flex-shrink-0 tool-icon'
|
? 'fas fa-times text-red-400 flex-shrink-0 tool-icon'
|
||||||
: 'fas fa-check text-primary-400 flex-shrink-0 tool-icon';
|
: 'fas fa-check text-primary-400 flex-shrink-0 tool-icon';
|
||||||
|
|
||||||
// Show execution time
|
// Show execution time
|
||||||
const nameEl = currentToolEl.querySelector('.tool-name');
|
const nameEl = currentToolEl.querySelector('.tool-name');
|
||||||
if (item.execution_time !== undefined) {
|
if (item.execution_time !== undefined) {
|
||||||
nameEl.innerHTML += ` <span class="tool-time">${item.execution_time}s</span>`;
|
nameEl.innerHTML += ` <span class="tool-time">${item.execution_time}s</span>`;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fill output section
|
||||||
|
const outputSection = currentToolEl.querySelector('.tool-output-section');
|
||||||
|
if (outputSection && item.result) {
|
||||||
|
outputSection.innerHTML = `
|
||||||
|
<div class="tool-detail-label">${isError ? 'Error' : 'Output'}</div>
|
||||||
|
<pre class="tool-detail-content ${isError ? 'tool-error-text' : ''}">${escapeHtml(String(item.result))}</pre>`;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isError) currentToolEl.classList.add('tool-failed');
|
||||||
|
currentToolEl = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fill output section
|
} else if (item.type === 'image') {
|
||||||
const outputSection = currentToolEl.querySelector('.tool-output-section');
|
ensureBotEl();
|
||||||
if (outputSection && item.result) {
|
const imgEl = document.createElement('img');
|
||||||
outputSection.innerHTML = `
|
imgEl.src = item.content;
|
||||||
<div class="tool-detail-label">${isError ? 'Error' : 'Output'}</div>
|
imgEl.alt = 'screenshot';
|
||||||
<pre class="tool-detail-content ${isError ? 'tool-error-text' : ''}">${escapeHtml(String(item.result))}</pre>`;
|
imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);';
|
||||||
}
|
imgEl.onclick = () => window.open(item.content, '_blank');
|
||||||
|
mediaEl.appendChild(imgEl);
|
||||||
|
scrollChatToBottom();
|
||||||
|
|
||||||
if (isError) currentToolEl.classList.add('tool-failed');
|
} else if (item.type === 'text') {
|
||||||
currentToolEl = null;
|
// Intermediate text sent before media items; display it but keep SSE open.
|
||||||
|
ensureBotEl();
|
||||||
|
contentEl.classList.remove('sse-streaming');
|
||||||
|
const textContent = item.content || accumulatedText;
|
||||||
|
if (textContent) contentEl.innerHTML = renderMarkdown(textContent);
|
||||||
|
applyHighlighting(botEl);
|
||||||
|
scrollChatToBottom();
|
||||||
|
|
||||||
|
} else if (item.type === 'video') {
|
||||||
|
ensureBotEl();
|
||||||
|
const wrapper = document.createElement('div');
|
||||||
|
wrapper.innerHTML = _buildVideoHtml(item.content);
|
||||||
|
mediaEl.appendChild(wrapper.firstElementChild || wrapper);
|
||||||
|
scrollChatToBottom();
|
||||||
|
|
||||||
|
} else if (item.type === 'file') {
|
||||||
|
ensureBotEl();
|
||||||
|
const fileName = item.file_name || item.content.split('/').pop();
|
||||||
|
const fileEl = document.createElement('a');
|
||||||
|
fileEl.href = item.content;
|
||||||
|
fileEl.download = fileName;
|
||||||
|
fileEl.target = '_blank';
|
||||||
|
fileEl.className = 'file-attachment';
|
||||||
|
fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);';
|
||||||
|
fileEl.innerHTML = `<i class="fas fa-file-download" style="color:#6b7280;"></i> ${fileName}`;
|
||||||
|
mediaEl.appendChild(fileEl);
|
||||||
|
scrollChatToBottom();
|
||||||
|
|
||||||
|
} else if (item.type === 'phase') {
|
||||||
|
// Coarse progress (e.g. cow install-browser); must not close SSE (unlike "done")
|
||||||
|
ensureBotEl();
|
||||||
|
const wrap = document.createElement('div');
|
||||||
|
wrap.className = 'text-xs sm:text-sm text-slate-600 dark:text-slate-400 border-l-2 border-primary-400 pl-2 py-1 my-0.5';
|
||||||
|
wrap.textContent = String(item.content || '');
|
||||||
|
stepsEl.appendChild(wrap);
|
||||||
|
scrollChatToBottom();
|
||||||
|
|
||||||
|
} else if (item.type === 'done') {
|
||||||
|
done = true;
|
||||||
|
es.close();
|
||||||
|
delete activeStreams[requestId];
|
||||||
|
|
||||||
|
// item.content may be empty when "done" is only a stream-close signal after media.
|
||||||
|
const finalText = item.content || accumulatedText;
|
||||||
|
|
||||||
|
if (!botEl && finalText) {
|
||||||
|
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
|
||||||
|
addBotMessage(finalText, new Date((item.timestamp || Date.now() / 1000) * 1000), requestId);
|
||||||
|
} else if (botEl) {
|
||||||
|
contentEl.classList.remove('sse-streaming');
|
||||||
|
// Only update text content when there is something new to show.
|
||||||
|
if (finalText) contentEl.innerHTML = renderMarkdown(finalText);
|
||||||
|
applyHighlighting(botEl);
|
||||||
|
}
|
||||||
|
scrollChatToBottom();
|
||||||
|
|
||||||
|
} else if (item.type === 'error') {
|
||||||
|
done = true;
|
||||||
|
es.close();
|
||||||
|
delete activeStreams[requestId];
|
||||||
|
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
|
||||||
|
addBotMessage(t('error_send'), new Date());
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
|
||||||
} else if (item.type === 'image') {
|
es.onerror = function() {
|
||||||
ensureBotEl();
|
|
||||||
const imgEl = document.createElement('img');
|
|
||||||
imgEl.src = item.content;
|
|
||||||
imgEl.alt = 'screenshot';
|
|
||||||
imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);';
|
|
||||||
imgEl.onclick = () => window.open(item.content, '_blank');
|
|
||||||
mediaEl.appendChild(imgEl);
|
|
||||||
scrollChatToBottom();
|
|
||||||
|
|
||||||
} else if (item.type === 'text') {
|
|
||||||
// Intermediate text sent before media items; display it but keep SSE open.
|
|
||||||
ensureBotEl();
|
|
||||||
contentEl.classList.remove('sse-streaming');
|
|
||||||
const textContent = item.content || accumulatedText;
|
|
||||||
if (textContent) contentEl.innerHTML = renderMarkdown(textContent);
|
|
||||||
applyHighlighting(botEl);
|
|
||||||
scrollChatToBottom();
|
|
||||||
|
|
||||||
} else if (item.type === 'video') {
|
|
||||||
ensureBotEl();
|
|
||||||
const wrapper = document.createElement('div');
|
|
||||||
wrapper.innerHTML = _buildVideoHtml(item.content);
|
|
||||||
mediaEl.appendChild(wrapper.firstElementChild || wrapper);
|
|
||||||
scrollChatToBottom();
|
|
||||||
|
|
||||||
} else if (item.type === 'file') {
|
|
||||||
ensureBotEl();
|
|
||||||
const fileName = item.file_name || item.content.split('/').pop();
|
|
||||||
const fileEl = document.createElement('a');
|
|
||||||
fileEl.href = item.content;
|
|
||||||
fileEl.download = fileName;
|
|
||||||
fileEl.target = '_blank';
|
|
||||||
fileEl.className = 'file-attachment';
|
|
||||||
fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);';
|
|
||||||
fileEl.innerHTML = `<i class="fas fa-file-download" style="color:#6b7280;"></i> ${fileName}`;
|
|
||||||
mediaEl.appendChild(fileEl);
|
|
||||||
scrollChatToBottom();
|
|
||||||
|
|
||||||
} else if (item.type === 'phase') {
|
|
||||||
// Coarse progress (e.g. cow install-browser); must not close SSE (unlike "done")
|
|
||||||
ensureBotEl();
|
|
||||||
const wrap = document.createElement('div');
|
|
||||||
wrap.className = 'text-xs sm:text-sm text-slate-600 dark:text-slate-400 border-l-2 border-primary-400 pl-2 py-1 my-0.5';
|
|
||||||
wrap.textContent = String(item.content || '');
|
|
||||||
stepsEl.appendChild(wrap);
|
|
||||||
scrollChatToBottom();
|
|
||||||
|
|
||||||
} else if (item.type === 'done') {
|
|
||||||
es.close();
|
es.close();
|
||||||
delete activeStreams[requestId];
|
delete activeStreams[requestId];
|
||||||
|
|
||||||
// item.content may be empty when "done" is only a stream-close signal after media.
|
if (done) return;
|
||||||
const finalText = item.content || accumulatedText;
|
|
||||||
|
|
||||||
if (!botEl && finalText) {
|
if (reconnectCount < MAX_RECONNECTS) {
|
||||||
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
|
reconnectCount++;
|
||||||
addBotMessage(finalText, new Date((item.timestamp || Date.now() / 1000) * 1000), requestId);
|
const delay = Math.min(RECONNECT_BASE_MS * reconnectCount, 5000);
|
||||||
} else if (botEl) {
|
console.warn(`[SSE] connection lost for ${requestId}, reconnecting in ${delay}ms (attempt ${reconnectCount}/${MAX_RECONNECTS})`);
|
||||||
|
setTimeout(connect, delay);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Exhausted retries, show whatever we have
|
||||||
|
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
|
||||||
|
if (!botEl) {
|
||||||
|
addBotMessage(t('error_send'), new Date());
|
||||||
|
} else if (accumulatedText) {
|
||||||
contentEl.classList.remove('sse-streaming');
|
contentEl.classList.remove('sse-streaming');
|
||||||
// Only update text content when there is something new to show.
|
contentEl.innerHTML = renderMarkdown(accumulatedText);
|
||||||
if (finalText) contentEl.innerHTML = renderMarkdown(finalText);
|
|
||||||
applyHighlighting(botEl);
|
applyHighlighting(botEl);
|
||||||
}
|
}
|
||||||
scrollChatToBottom();
|
};
|
||||||
|
}
|
||||||
|
|
||||||
} else if (item.type === 'error') {
|
connect();
|
||||||
es.close();
|
|
||||||
delete activeStreams[requestId];
|
|
||||||
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
|
|
||||||
addBotMessage(t('error_send'), new Date());
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
es.onerror = function() {
|
|
||||||
es.close();
|
|
||||||
delete activeStreams[requestId];
|
|
||||||
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
|
|
||||||
if (!botEl) {
|
|
||||||
addBotMessage(t('error_send'), new Date());
|
|
||||||
} else if (accumulatedText) {
|
|
||||||
contentEl.classList.remove('sse-streaming');
|
|
||||||
contentEl.innerHTML = renderMarkdown(accumulatedText);
|
|
||||||
applyHighlighting(botEl);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function startPolling() {
|
function startPolling() {
|
||||||
|
|||||||
@@ -329,14 +329,18 @@ class WebChannel(ChatChannel):
|
|||||||
"""
|
"""
|
||||||
SSE generator for a given request_id.
|
SSE generator for a given request_id.
|
||||||
Yields UTF-8 encoded bytes to avoid WSGI Latin-1 mangling.
|
Yields UTF-8 encoded bytes to avoid WSGI Latin-1 mangling.
|
||||||
|
Supports client reconnection: the queue is only removed after a
|
||||||
|
"done" event is consumed, so a new GET /stream with the same
|
||||||
|
request_id can resume reading remaining events.
|
||||||
"""
|
"""
|
||||||
if request_id not in self.sse_queues:
|
if request_id not in self.sse_queues:
|
||||||
yield b"data: {\"type\": \"error\", \"message\": \"invalid request_id\"}\n\n"
|
yield b"data: {\"type\": \"error\", \"message\": \"invalid request_id\"}\n\n"
|
||||||
return
|
return
|
||||||
|
|
||||||
q = self.sse_queues[request_id]
|
q = self.sse_queues[request_id]
|
||||||
timeout = 300 # 5 minutes max
|
idle_timeout = 600 # 10 minutes without any real event
|
||||||
deadline = time.time() + timeout
|
deadline = time.time() + idle_timeout
|
||||||
|
done = False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
while time.time() < deadline:
|
while time.time() < deadline:
|
||||||
@@ -346,13 +350,18 @@ class WebChannel(ChatChannel):
|
|||||||
yield b": keepalive\n\n"
|
yield b": keepalive\n\n"
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Real event received, reset idle deadline
|
||||||
|
deadline = time.time() + idle_timeout
|
||||||
|
|
||||||
payload = json.dumps(item, ensure_ascii=False)
|
payload = json.dumps(item, ensure_ascii=False)
|
||||||
yield f"data: {payload}\n\n".encode("utf-8")
|
yield f"data: {payload}\n\n".encode("utf-8")
|
||||||
|
|
||||||
if item.get("type") == "done":
|
if item.get("type") == "done":
|
||||||
|
done = True
|
||||||
break
|
break
|
||||||
finally:
|
finally:
|
||||||
self.sse_queues.pop(request_id, None)
|
if done:
|
||||||
|
self.sse_queues.pop(request_id, None)
|
||||||
|
|
||||||
def poll_response(self):
|
def poll_response(self):
|
||||||
"""
|
"""
|
||||||
|
|||||||
72
docs/en/tools/vision.mdx
Normal file
72
docs/en/tools/vision.mdx
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
---
|
||||||
|
title: vision - Image Analysis
|
||||||
|
description: Analyze image content (recognition, description, OCR, etc.)
|
||||||
|
---
|
||||||
|
|
||||||
|
Analyze local images or image URLs using a Vision API. Supports content description, text extraction (OCR), object recognition, and more.
|
||||||
|
|
||||||
|
## Model Selection
|
||||||
|
|
||||||
|
The vision tool uses a multi-level auto-selection strategy with automatic fallback — no manual configuration required:
|
||||||
|
|
||||||
|
1. **Main model** — uses the currently configured main model for image recognition (zero extra cost)
|
||||||
|
2. **Other configured models** — auto-discovers other models with configured API keys as alternatives
|
||||||
|
3. **OpenAI** — uses `open_ai_api_key` to call gpt-4.1-mini
|
||||||
|
4. **LinkAI** — uses `linkai_api_key` to call LinkAI vision service
|
||||||
|
|
||||||
|
When `use_linkai=true`, LinkAI is promoted to the highest priority.
|
||||||
|
|
||||||
|
If the current provider fails, the tool automatically tries the next one until it succeeds or all fail.
|
||||||
|
|
||||||
|
### Supported Models
|
||||||
|
|
||||||
|
| Vendor | Vision Model | Notes |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| OpenAI / Compatible | Main model | All OpenAI-compatible multimodal models |
|
||||||
|
| Qwen (DashScope) | Main model | Via MultiModalConversation API |
|
||||||
|
| Claude | Main model | Anthropic native image format |
|
||||||
|
| Gemini | Main model | inlineData format |
|
||||||
|
| Doubao | Main model | doubao-seed-2-0 series natively supported |
|
||||||
|
| Kimi (Moonshot) | Main model | kimi-k2.5 natively supported |
|
||||||
|
| ZhipuAI | glm-5v-turbo | Always uses dedicated vision model |
|
||||||
|
| MiniMax | MiniMax-Text-01 | Always uses dedicated vision model |
|
||||||
|
|
||||||
|
<Note>
|
||||||
|
ZhipuAI and MiniMax text models do not support image understanding, so their dedicated vision models are always used automatically.
|
||||||
|
</Note>
|
||||||
|
|
||||||
|
## Parameters
|
||||||
|
|
||||||
|
| Parameter | Type | Required | Description |
|
||||||
|
| --- | --- | --- | --- |
|
||||||
|
| `image` | string | Yes | Local file path or HTTP(S) image URL |
|
||||||
|
| `question` | string | Yes | Question to ask about the image |
|
||||||
|
|
||||||
|
Supported image formats: jpg, jpeg, png, gif, webp
|
||||||
|
|
||||||
|
## Custom Configuration
|
||||||
|
|
||||||
|
To specify a particular model for the vision tool, add to `config.json`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"tool": {
|
||||||
|
"vision": {
|
||||||
|
"model": "gpt-4o"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
In most cases no configuration is needed. The tool works automatically as long as the main model supports multimodal input or any vision-capable API key is configured.
|
||||||
|
|
||||||
|
## Use Cases
|
||||||
|
|
||||||
|
- Describe image content
|
||||||
|
- Extract text from images (OCR)
|
||||||
|
- Identify objects, colors, scenes
|
||||||
|
- Analyze screenshots and scanned documents
|
||||||
|
|
||||||
|
<Note>
|
||||||
|
Images larger than 1MB are automatically compressed (max edge 1536px). All images (including remote URLs) are converted to base64 for transmission to ensure compatibility with all model backends.
|
||||||
|
</Note>
|
||||||
72
docs/ja/tools/vision.mdx
Normal file
72
docs/ja/tools/vision.mdx
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
---
|
||||||
|
title: vision - 画像分析
|
||||||
|
description: 画像コンテンツの分析(認識、説明、OCR など)
|
||||||
|
---
|
||||||
|
|
||||||
|
Vision API を使用してローカル画像や画像 URL を分析します。コンテンツの説明、テキスト抽出(OCR)、オブジェクト認識などに対応しています。
|
||||||
|
|
||||||
|
## モデル選択
|
||||||
|
|
||||||
|
Vision ツールは多段階の自動選択+自動フォールバック戦略を採用しており、手動設定なしで利用可能です:
|
||||||
|
|
||||||
|
1. **メインモデル** — 現在設定されているメインモデルで画像認識を実行(追加コストなし)
|
||||||
|
2. **その他の設定済みモデル** — API キーが設定されている他のマルチモーダルモデルを自動検出
|
||||||
|
3. **OpenAI** — `open_ai_api_key` を使用して gpt-4.1-mini を呼び出し
|
||||||
|
4. **LinkAI** — `linkai_api_key` を使用して LinkAI ビジョンサービスを呼び出し
|
||||||
|
|
||||||
|
`use_linkai=true` の場合、LinkAI が最優先になります。
|
||||||
|
|
||||||
|
現在のプロバイダーが失敗した場合、成功するかすべて失敗するまで自動的に次のプロバイダーを試行します。
|
||||||
|
|
||||||
|
### 対応モデル
|
||||||
|
|
||||||
|
| ベンダー | ビジョンモデル | 説明 |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| OpenAI / 互換プロトコル | メインモデル | すべての OpenAI 互換マルチモーダルモデルに対応 |
|
||||||
|
| 通義千問 (DashScope) | メインモデル | MultiModalConversation API 経由 |
|
||||||
|
| Claude | メインモデル | Anthropic ネイティブ画像形式 |
|
||||||
|
| Gemini | メインモデル | inlineData 形式 |
|
||||||
|
| 豆包 (Doubao) | メインモデル | doubao-seed-2-0 シリーズがネイティブ対応 |
|
||||||
|
| Kimi (Moonshot) | メインモデル | kimi-k2.5 がネイティブ対応 |
|
||||||
|
| 智谱 AI | glm-5v-turbo | 常にビジョン専用モデルを使用 |
|
||||||
|
| MiniMax | MiniMax-Text-01 | 常にビジョン専用モデルを使用 |
|
||||||
|
|
||||||
|
<Note>
|
||||||
|
智谱 AI と MiniMax のテキストモデルは画像理解に対応していないため、対応するビジョン専用モデルが自動的に使用されます。
|
||||||
|
</Note>
|
||||||
|
|
||||||
|
## パラメータ
|
||||||
|
|
||||||
|
| パラメータ | 型 | 必須 | 説明 |
|
||||||
|
| --- | --- | --- | --- |
|
||||||
|
| `image` | string | はい | ローカルファイルパスまたは HTTP(S) 画像 URL |
|
||||||
|
| `question` | string | はい | 画像に対する質問 |
|
||||||
|
|
||||||
|
対応画像形式:jpg、jpeg、png、gif、webp
|
||||||
|
|
||||||
|
## カスタム設定
|
||||||
|
|
||||||
|
Vision ツールで使用するモデルを指定するには、`config.json` に以下を追加します:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"tool": {
|
||||||
|
"vision": {
|
||||||
|
"model": "gpt-4o"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
ほとんどの場合、設定は不要です。メインモデルがマルチモーダルに対応しているか、ビジョン対応の API キーが設定されていれば自動的に動作します。
|
||||||
|
|
||||||
|
## ユースケース
|
||||||
|
|
||||||
|
- 画像コンテンツの説明
|
||||||
|
- 画像からのテキスト抽出(OCR)
|
||||||
|
- オブジェクト、色、シーンの識別
|
||||||
|
- スクリーンショットやスキャン文書の分析
|
||||||
|
|
||||||
|
<Note>
|
||||||
|
1MB を超える画像は自動的に圧縮されます(最大辺 1536px)。すべての画像(リモート URL を含む)は base64 に変換して送信され、すべてのモデルバックエンドとの互換性を確保します。
|
||||||
|
</Note>
|
||||||
@@ -5,14 +5,49 @@ description: 分析图片内容(识别、描述、OCR 等)
|
|||||||
|
|
||||||
使用 Vision API 分析本地图片或图片 URL,支持内容描述、文字提取(OCR)、物体识别等。
|
使用 Vision API 分析本地图片或图片 URL,支持内容描述、文字提取(OCR)、物体识别等。
|
||||||
|
|
||||||
## 依赖
|
## 模型选择
|
||||||
|
|
||||||
需要配置至少一个 API Key(通过 `env_config` 工具或工作空间 `.env` 文件配置):
|
Vision 工具采用多级自动选择 + 自动兜底策略,无需手动配置即可使用:
|
||||||
|
|
||||||
| 后端 | 环境变量 | 优先级 |
|
1. **主模型** — 优先使用当前配置的主模型进行图像识别(需要是多模态模型)
|
||||||
|
2. **其他已配置模型** — 自动发现已配置 API Key 的其他多模态模型作为备选
|
||||||
|
|
||||||
|
如果当前 provider 调用失败,会自动尝试下一个,直到成功或全部失败。
|
||||||
|
|
||||||
|
### 支持的模型
|
||||||
|
|
||||||
|
| 厂商 | 视觉模型 | 说明 |
|
||||||
| --- | --- | --- |
|
| --- | --- | --- |
|
||||||
| OpenAI | `OPENAI_API_KEY` | 优先使用 |
|
| OpenAI / 兼容协议 | 使用主模型 | 支持所有 OpenAI 协议兼容的多模态模型 |
|
||||||
| LinkAI | `LINKAI_API_KEY` | 备选 |
|
| 通义千问 (DashScope) | 使用主模型 | 例如 qwen3.6-plus 等 |
|
||||||
|
| Claude | 使用主模型 | Anthropic 原生图像格式 |
|
||||||
|
| Gemini | 使用主模型 | inlineData 格式 |
|
||||||
|
| 豆包 (Doubao) | 使用主模型 | doubao-seed-2-0 系列原生支持 |
|
||||||
|
| Kimi (Moonshot) | 使用主模型 | kimi-k2.5 原生支持 |
|
||||||
|
| 智谱 AI | glm-5v-turbo | 固定使用视觉专用模型 |
|
||||||
|
| MiniMax | MiniMax-Text-01 | 固定使用视觉专用模型 |
|
||||||
|
|
||||||
|
<Note>
|
||||||
|
智谱和 MiniMax 的文本模型不支持图像理解,因此始终使用对应的视觉专用模型,无需手动指定。
|
||||||
|
</Note>
|
||||||
|
|
||||||
|
> 当 `use_linkai=true` 时,默认优先使用 LinkAI 的多模态模型进行图像识别。
|
||||||
|
|
||||||
|
## 自定义配置
|
||||||
|
|
||||||
|
如果希望指定 Vision 使用的模型,可在 `config.json` 中配置,例如:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"tool": {
|
||||||
|
"vision": {
|
||||||
|
"model": "gpt-4o"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
大多数情况下无需配置,主模型支持多模态或配置任意一个支持视觉的 API Key 即可自动工作。
|
||||||
|
|
||||||
## 参数
|
## 参数
|
||||||
|
|
||||||
@@ -20,17 +55,18 @@ description: 分析图片内容(识别、描述、OCR 等)
|
|||||||
| --- | --- | --- | --- |
|
| --- | --- | --- | --- |
|
||||||
| `image` | string | 是 | 本地文件路径或 HTTP(S) 图片 URL |
|
| `image` | string | 是 | 本地文件路径或 HTTP(S) 图片 URL |
|
||||||
| `question` | string | 是 | 对图片提出的问题 |
|
| `question` | string | 是 | 对图片提出的问题 |
|
||||||
| `model` | string | 否 | 模型名称(默认 gpt-4.1-mini) |
|
|
||||||
|
|
||||||
支持的图片格式:jpg、jpeg、png、gif、webp
|
支持的图片格式:jpg、jpeg、png、gif、webp
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## 使用场景
|
## 使用场景
|
||||||
|
|
||||||
- 描述图片中的内容
|
- 描述图片中的内容
|
||||||
- 提取图片中的文字(OCR)
|
- 提取图片中的文字(OCR)
|
||||||
- 识别物体、颜色、场景
|
- 识别物体、颜色、场景
|
||||||
- 分析截图、文档扫描件
|
- 分析截图、文档扫描图片等
|
||||||
|
|
||||||
<Note>
|
<Note>
|
||||||
超过 1MB 的图片会自动压缩后上传。如果未配置任何 Vision API Key,该工具不会被加载。
|
超过 1MB 的图片会自动压缩后上传,所有图片(包括远程 URL)会统一转为 base64 传输,确保兼容所有模型后端。
|
||||||
</Note>
|
</Note>
|
||||||
|
|||||||
@@ -2,12 +2,27 @@
|
|||||||
Auto-reply chat robot abstract class
|
Auto-reply chat robot abstract class
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
from bridge.context import Context
|
from bridge.context import Context
|
||||||
from bridge.reply import Reply
|
from bridge.reply import Reply
|
||||||
|
|
||||||
|
|
||||||
class Bot(object):
|
class Bot(object):
|
||||||
|
"""
|
||||||
|
Base class for all chat-bot implementations.
|
||||||
|
|
||||||
|
Subclasses may also implement:
|
||||||
|
|
||||||
|
call_with_tools(messages, tools=None, stream=False, **kwargs)
|
||||||
|
-> dict | generator (OpenAI-compatible format)
|
||||||
|
|
||||||
|
call_vision(image_url, question, model=None, max_tokens=1000)
|
||||||
|
-> dict with keys: model, content, usage (or error/message)
|
||||||
|
|
||||||
|
These are NOT defined here to avoid shadowing concrete implementations
|
||||||
|
provided by mixin classes (e.g. OpenAICompatibleBot) in the MRO.
|
||||||
|
Use ``hasattr(bot, 'call_vision')`` to detect support at runtime.
|
||||||
|
"""
|
||||||
|
|
||||||
def reply(self, query, context: Context = None) -> Reply:
|
def reply(self, query, context: Context = None) -> Reply:
|
||||||
"""
|
"""
|
||||||
bot auto-reply content
|
bot auto-reply content
|
||||||
|
|||||||
@@ -1,7 +1,10 @@
|
|||||||
# encoding:utf-8
|
# encoding:utf-8
|
||||||
|
|
||||||
|
import base64
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
@@ -224,6 +227,79 @@ class ClaudeAPIBot(Bot, OpenAIImage):
|
|||||||
return 64000
|
return 64000
|
||||||
return 8192
|
return 8192
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _parse_data_url(data_url: str):
|
||||||
|
"""Parse a data:<mime>;base64,<data> URL into (media_type, base64_data)."""
|
||||||
|
m = re.match(r"^data:([^;]+);base64,(.+)$", data_url, re.DOTALL)
|
||||||
|
if m:
|
||||||
|
return m.group(1), m.group(2)
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
def call_vision(self, image_url: str, question: str,
|
||||||
|
model: Optional[str] = None,
|
||||||
|
max_tokens: int = 1000) -> dict:
|
||||||
|
"""Analyze an image using Claude Messages API (native image blocks)."""
|
||||||
|
try:
|
||||||
|
actual_model = model or self._model_mapping(conf().get("model"))
|
||||||
|
|
||||||
|
# Build Claude-native image content block
|
||||||
|
if image_url.startswith("data:"):
|
||||||
|
media_type, b64_data = self._parse_data_url(image_url)
|
||||||
|
if not b64_data:
|
||||||
|
return {"error": True, "message": "Invalid base64 data URL"}
|
||||||
|
image_block = {
|
||||||
|
"type": "image",
|
||||||
|
"source": {"type": "base64",
|
||||||
|
"media_type": media_type or "image/jpeg",
|
||||||
|
"data": b64_data},
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
image_block = {
|
||||||
|
"type": "image",
|
||||||
|
"source": {"type": "url", "url": image_url},
|
||||||
|
}
|
||||||
|
|
||||||
|
data = {
|
||||||
|
"model": actual_model,
|
||||||
|
"max_tokens": max_tokens,
|
||||||
|
"messages": [{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
image_block,
|
||||||
|
{"type": "text", "text": question},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
}
|
||||||
|
|
||||||
|
headers = {
|
||||||
|
"x-api-key": self.api_key,
|
||||||
|
"anthropic-version": "2023-06-01",
|
||||||
|
"content-type": "application/json",
|
||||||
|
}
|
||||||
|
proxies = {"http": self.proxy, "https": self.proxy} if self.proxy else None
|
||||||
|
resp = requests.post(f"{self.api_base}/messages",
|
||||||
|
headers=headers, json=data, proxies=proxies)
|
||||||
|
|
||||||
|
if resp.status_code != 200:
|
||||||
|
return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}
|
||||||
|
|
||||||
|
body = resp.json()
|
||||||
|
text_parts = [b.get("text", "") for b in body.get("content", [])
|
||||||
|
if b.get("type") == "text"]
|
||||||
|
usage = body.get("usage", {})
|
||||||
|
return {
|
||||||
|
"model": actual_model,
|
||||||
|
"content": "".join(text_parts),
|
||||||
|
"usage": {
|
||||||
|
"prompt_tokens": usage.get("input_tokens", 0),
|
||||||
|
"completion_tokens": usage.get("output_tokens", 0),
|
||||||
|
"total_tokens": usage.get("input_tokens", 0) + usage.get("output_tokens", 0),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"[CLAUDE] call_vision error: {e}")
|
||||||
|
return {"error": True, "message": str(e)}
|
||||||
|
|
||||||
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
|
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
|
||||||
"""
|
"""
|
||||||
Call Claude API with tool support for agent integration
|
Call Claude API with tool support for agent integration
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
# encoding:utf-8
|
# encoding:utf-8
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from models.bot import Bot
|
from models.bot import Bot
|
||||||
from models.session_manager import SessionManager
|
from models.session_manager import SessionManager
|
||||||
from bridge.context import ContextType
|
from bridge.context import ContextType
|
||||||
@@ -153,6 +155,56 @@ class DashscopeBot(Bot):
|
|||||||
else:
|
else:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using the DashScope MultiModalConversation API.

    Args:
        image_url: Image reference placed in the ``{"image": ...}`` content part.
        question: Prompt text sent alongside the image.
        model: Vision model name; ``"qwen-vl-max"`` is used when empty.
        max_tokens: Forwarded to ``MultiModalConversation.call`` as ``max_tokens``.

    Returns:
        On success a dict with ``model``, ``content`` and ``usage``
        (``prompt_tokens`` / ``completion_tokens`` / ``total_tokens``).
        On any failure ``{"error": True, "message": <str>}``. Exceptions are
        logged and converted into that error dict, never raised.
    """
    try:
        # Set the SDK's module-level key before issuing the call.
        dashscope.api_key = self.api_key
        vision_model = model or "qwen-vl-max"

        # DashScope multimodal format: {"image": url} + {"text": question}
        messages = [{
            "role": "user",
            "content": [
                {"image": image_url},
                {"text": question},
            ],
        }]

        response = MultiModalConversation.call(
            model=vision_model,
            messages=messages,
            max_tokens=max_tokens,
        )

        if response.status_code != HTTPStatus.OK:
            return {
                "error": True,
                "message": f"{response.code} - {response.message}",
            }

        resp_dict = self._response_to_dict(response)

        # Report a missing/empty choice list explicitly instead of letting a
        # KeyError/IndexError surface as an opaque message like "'output'".
        choices = (resp_dict.get("output") or {}).get("choices") or []
        if not choices:
            return {
                "error": True,
                "message": "DashScope response contained no choices",
            }

        content = (choices[0].get("message") or {}).get("content") or ""
        if isinstance(content, list):
            # Multimodal replies may arrive as a list of parts; keep the text ones.
            content = "".join(
                item.get("text") or "" for item in content if isinstance(item, dict)
            )

        # "usage" may be present but null; normalize before reading counters.
        usage = resp_dict.get("usage") or {}
        prompt_tokens = usage.get("input_tokens") or 0
        completion_tokens = usage.get("output_tokens") or 0
        # Derive the total when the API does not report it.
        total_tokens = usage.get("total_tokens") or (prompt_tokens + completion_tokens)
        return {
            "model": vision_model,
            "content": content,
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": total_tokens,
            },
        }
    except Exception as e:
        logger.error(f"[DASHSCOPE] call_vision error: {e}")
        return {"error": True, "message": str(e)}
|
||||||
|
|
||||||
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
|
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
|
||||||
"""
|
"""
|
||||||
Call DashScope API with tool support for agent integration
|
Call DashScope API with tool support for agent integration
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from models.bot import Bot
|
from models.bot import Bot
|
||||||
@@ -147,6 +148,49 @@ class DoubaoBot(Bot):
|
|||||||
else:
|
else:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using Doubao (Volcengine Ark) OpenAI-compatible API.

    Args:
        image_url: Image reference sent as an ``image_url`` content part.
        question: Prompt text sent alongside the image.
        model: Model name; falls back to ``self.args["model"]`` and then to
            ``"doubao-seed-2-0-pro-260215"``.
        max_tokens: Sent as ``max_tokens`` in the request payload.

    Returns:
        On success a dict with ``model``, ``content`` and ``usage``
        (``prompt_tokens`` / ``completion_tokens`` / ``total_tokens``).
        On any failure ``{"error": True, "message": <str>}``. Exceptions are
        logged and converted into that error dict, never raised.
    """
    try:
        vision_model = model or self.args.get("model", "doubao-seed-2-0-pro-260215")
        payload = {
            "model": vision_model,
            "max_tokens": max_tokens,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(f"{self.base_url}/chat/completions",
                             headers=headers, json=payload, timeout=60)
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        data = resp.json()

        # The error payload may be an object or a plain string; handle both
        # rather than crashing on ``.get`` of a str.
        api_error = data.get("error")
        if api_error:
            detail = api_error.get("message") if isinstance(api_error, dict) else None
            return {"error": True, "message": detail or str(api_error)}

        # An empty "choices" list would otherwise raise IndexError and be
        # reported as the unhelpful "list index out of range".
        choices = data.get("choices") or []
        if not choices:
            return {"error": True, "message": "Response contained no choices"}

        # "message"/"content"/"usage" can be null in the JSON body.
        content = (choices[0].get("message") or {}).get("content") or ""
        usage = data.get("usage") or {}
        return {
            "model": vision_model,
            "content": content,
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
    except Exception as e:
        logger.error(f"[DOUBAO] call_vision error: {e}")
        return {"error": True, "message": str(e)}
|
||||||
|
|
||||||
# ==================== Agent mode support ====================
|
# ==================== Agent mode support ====================
|
||||||
|
|
||||||
def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
|
def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
|
||||||
@@ -434,31 +478,37 @@ class DoubaoBot(Bot):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if role == "user":
|
if role == "user":
|
||||||
text_parts = []
|
has_tool_result = any(
|
||||||
tool_results = []
|
isinstance(b, dict) and b.get("type") == "tool_result" for b in content
|
||||||
|
)
|
||||||
|
if has_tool_result:
|
||||||
|
text_parts = []
|
||||||
|
tool_results = []
|
||||||
|
|
||||||
for block in content:
|
for block in content:
|
||||||
if not isinstance(block, dict):
|
if not isinstance(block, dict):
|
||||||
continue
|
continue
|
||||||
if block.get("type") == "text":
|
if block.get("type") == "text":
|
||||||
text_parts.append(block.get("text", ""))
|
text_parts.append(block.get("text", ""))
|
||||||
elif block.get("type") == "tool_result":
|
elif block.get("type") == "tool_result":
|
||||||
tool_call_id = block.get("tool_use_id") or ""
|
tool_call_id = block.get("tool_use_id") or ""
|
||||||
result_content = block.get("content", "")
|
result_content = block.get("content", "")
|
||||||
if not isinstance(result_content, str):
|
if not isinstance(result_content, str):
|
||||||
result_content = json.dumps(result_content, ensure_ascii=False)
|
result_content = json.dumps(result_content, ensure_ascii=False)
|
||||||
tool_results.append({
|
tool_results.append({
|
||||||
"role": "tool",
|
"role": "tool",
|
||||||
"tool_call_id": tool_call_id,
|
"tool_call_id": tool_call_id,
|
||||||
"content": result_content
|
"content": result_content
|
||||||
})
|
})
|
||||||
|
|
||||||
# Tool results first (must come right after assistant with tool_calls)
|
for tr in tool_results:
|
||||||
for tr in tool_results:
|
converted.append(tr)
|
||||||
converted.append(tr)
|
|
||||||
|
|
||||||
if text_parts:
|
if text_parts:
|
||||||
converted.append({"role": "user", "content": "\n".join(text_parts)})
|
converted.append({"role": "user", "content": "\n".join(text_parts)})
|
||||||
|
else:
|
||||||
|
# Keep as-is for multimodal content (e.g. image_url blocks)
|
||||||
|
converted.append(msg)
|
||||||
|
|
||||||
elif role == "assistant":
|
elif role == "assistant":
|
||||||
openai_msg = {"role": "assistant"}
|
openai_msg = {"role": "assistant"}
|
||||||
|
|||||||
@@ -12,6 +12,8 @@ import mimetypes
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from models.bot import Bot
|
from models.bot import Bot
|
||||||
from models.session_manager import SessionManager
|
from models.session_manager import SessionManager
|
||||||
@@ -144,7 +146,12 @@ class GoogleGeminiBot(Bot):
|
|||||||
return "", []
|
return "", []
|
||||||
pattern = r"\[图片:\s*([^\]]+)\]"
|
pattern = r"\[图片:\s*([^\]]+)\]"
|
||||||
image_paths = [m.strip().strip("'\"") for m in re.findall(pattern, content) if m.strip()]
|
image_paths = [m.strip().strip("'\"") for m in re.findall(pattern, content) if m.strip()]
|
||||||
cleaned_text = re.sub(pattern, "", content)
|
# Replace markers with path-only hints so the model still knows the
|
||||||
|
# original file location (needed when it calls tools like vision).
|
||||||
|
def _replace_with_hint(m):
|
||||||
|
path = m.group(1).strip().strip("'\"")
|
||||||
|
return f"[attached image: {path}]"
|
||||||
|
cleaned_text = re.sub(pattern, _replace_with_hint, content)
|
||||||
cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text).strip()
|
cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text).strip()
|
||||||
return cleaned_text, image_paths
|
return cleaned_text, image_paths
|
||||||
|
|
||||||
@@ -225,6 +232,57 @@ class GoogleGeminiBot(Bot):
|
|||||||
logger.warning(f"[Gemini] Unsupported image URL format: {image_url[:120]}")
|
logger.warning(f"[Gemini] Unsupported image URL format: {image_url[:120]}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using the Gemini REST ``generateContent`` endpoint.

    Args:
        image_url: Image reference; converted to an inline part by
            ``self._build_inline_part_from_image_url``.
        question: Prompt text sent alongside the image.
        model: Model name; falls back to ``self.model`` and then to
            ``"gemini-2.0-flash"``.
        max_tokens: Sent as ``generationConfig.maxOutputTokens``.

    Returns:
        On success a dict with ``model``, ``content`` and ``usage``
        (``prompt_tokens`` / ``completion_tokens`` / ``total_tokens``).
        On any failure ``{"error": True, "message": <str>}``. Exceptions are
        logged and converted into that error dict, never raised.
    """
    try:
        model_name = model or self.model or "gemini-2.0-flash"
        image_part = self._build_inline_part_from_image_url({"url": image_url})
        if not image_part:
            return {"error": True, "message": f"Cannot process image URL: {image_url[:120]}"}

        payload = {
            "contents": [{
                "role": "user",
                "parts": [image_part, {"text": question}],
            }],
            "generationConfig": {"maxOutputTokens": max_tokens},
            "safetySettings": [
                {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
                {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
                {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
                {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
            ],
        }
        endpoint = f"{self.api_base}/v1beta/models/{model_name}:generateContent"
        headers = {"x-goog-api-key": self.api_key, "Content-Type": "application/json"}
        resp = requests.post(endpoint, headers=headers, json=payload, timeout=60)

        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        body = resp.json()
        candidates = body.get("candidates") or []
        if not candidates:
            # A 200 response without candidates carries no answer. Report it
            # as an error so the caller can tell it apart from a real (empty)
            # reply, and include the block reason when the API supplies one.
            feedback = body.get("promptFeedback") or {}
            reason = feedback.get("blockReason") or "no candidates returned"
            return {"error": True, "message": f"Gemini returned no content: {reason}"}

        parts = (candidates[0].get("content") or {}).get("parts") or []
        text_parts = [part["text"] for part in parts if "text" in part]

        usage_meta = body.get("usageMetadata") or {}
        return {
            "model": model_name,
            "content": "".join(text_parts),
            "usage": {
                "prompt_tokens": usage_meta.get("promptTokenCount", 0),
                "completion_tokens": usage_meta.get("candidatesTokenCount", 0),
                "total_tokens": usage_meta.get("totalTokenCount", 0),
            },
        }
    except Exception as e:
        logger.error(f"[Gemini] call_vision error: {e}")
        return {"error": True, "message": str(e)}
|
||||||
|
|
||||||
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
|
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
|
||||||
"""
|
"""
|
||||||
Call Gemini API with tool support using REST API (following official docs)
|
Call Gemini API with tool support using REST API (following official docs)
|
||||||
|
|||||||
@@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
import time
|
import time
|
||||||
import json
|
import json
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from models.bot import Bot
|
from models.bot import Bot
|
||||||
@@ -175,6 +177,51 @@ class MinimaxBot(Bot):
|
|||||||
else:
|
else:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using MiniMax OpenAI-compatible API.

    Always uses MiniMax-Text-01 — other MiniMax models do not support vision.

    Args:
        image_url: Image reference sent as an ``image_url`` content part.
        question: Prompt text sent alongside the image.
        model: Accepted for signature parity with the other bots; ignored,
            because the vision model is fixed (see above).
        max_tokens: Sent as ``max_tokens`` in the request payload.

    Returns:
        On success a dict with ``model``, ``content`` and ``usage``
        (``prompt_tokens`` / ``completion_tokens`` / ``total_tokens``).
        On any failure ``{"error": True, "message": <str>}``. Exceptions are
        logged and converted into that error dict, never raised.
    """
    try:
        vision_model = "MiniMax-Text-01"
        payload = {
            "model": vision_model,
            "max_tokens": max_tokens,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(f"{self.api_base}/chat/completions",
                             headers=headers, json=payload, timeout=60)
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        data = resp.json()

        # The error payload may be an object or a plain string; handle both
        # rather than crashing on ``.get`` of a str.
        api_error = data.get("error")
        if api_error:
            detail = api_error.get("message") if isinstance(api_error, dict) else None
            return {"error": True, "message": detail or str(api_error)}

        # An empty "choices" list would otherwise raise IndexError and be
        # reported as the unhelpful "list index out of range".
        choices = data.get("choices") or []
        if not choices:
            return {"error": True, "message": "Response contained no choices"}

        # "message"/"content"/"usage" can be null in the JSON body.
        content = (choices[0].get("message") or {}).get("content") or ""
        usage = data.get("usage") or {}
        return {
            "model": vision_model,
            "content": content,
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
    except Exception as e:
        logger.error(f"[MINIMAX] call_vision error: {e}")
        return {"error": True, "message": str(e)}
|
||||||
|
|
||||||
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
|
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
|
||||||
"""
|
"""
|
||||||
Call MiniMax API with tool support for agent integration
|
Call MiniMax API with tool support for agent integration
|
||||||
@@ -273,37 +320,41 @@ class MinimaxBot(Bot):
|
|||||||
if role == "user":
|
if role == "user":
|
||||||
# Handle user message
|
# Handle user message
|
||||||
if isinstance(content, list):
|
if isinstance(content, list):
|
||||||
# Extract text from content blocks
|
has_tool_result = any(
|
||||||
text_parts = []
|
isinstance(b, dict) and b.get("type") == "tool_result" for b in content
|
||||||
tool_results = []
|
)
|
||||||
|
if has_tool_result:
|
||||||
|
text_parts = []
|
||||||
|
tool_results = []
|
||||||
|
|
||||||
for block in content:
|
for block in content:
|
||||||
if isinstance(block, dict):
|
if isinstance(block, dict):
|
||||||
if block.get("type") == "text":
|
if block.get("type") == "text":
|
||||||
text_parts.append(block.get("text", ""))
|
text_parts.append(block.get("text", ""))
|
||||||
elif block.get("type") == "tool_result":
|
elif block.get("type") == "tool_result":
|
||||||
# Tool result should be a separate message with role="tool"
|
tool_call_id = block.get("tool_use_id") or ""
|
||||||
tool_call_id = block.get("tool_use_id") or ""
|
if not tool_call_id:
|
||||||
if not tool_call_id:
|
logger.warning(f"[MINIMAX] tool_result missing tool_use_id")
|
||||||
logger.warning(f"[MINIMAX] tool_result missing tool_use_id")
|
result_content = block.get("content", "")
|
||||||
result_content = block.get("content", "")
|
if not isinstance(result_content, str):
|
||||||
if not isinstance(result_content, str):
|
result_content = json.dumps(result_content, ensure_ascii=False)
|
||||||
result_content = json.dumps(result_content, ensure_ascii=False)
|
tool_results.append({
|
||||||
tool_results.append({
|
"role": "tool",
|
||||||
"role": "tool",
|
"tool_call_id": tool_call_id,
|
||||||
"tool_call_id": tool_call_id,
|
"content": result_content
|
||||||
"content": result_content
|
})
|
||||||
})
|
|
||||||
|
|
||||||
if text_parts:
|
if text_parts:
|
||||||
converted.append({
|
converted.append({
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": "\n".join(text_parts)
|
"content": "\n".join(text_parts)
|
||||||
})
|
})
|
||||||
|
|
||||||
# Add all tool results (not just the last one)
|
for tool_result in tool_results:
|
||||||
for tool_result in tool_results:
|
converted.append(tool_result)
|
||||||
converted.append(tool_result)
|
else:
|
||||||
|
# Keep as-is for multimodal content (e.g. image_url blocks)
|
||||||
|
converted.append(msg)
|
||||||
else:
|
else:
|
||||||
# Simple text content
|
# Simple text content
|
||||||
converted.append({
|
converted.append({
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from models.bot import Bot
|
from models.bot import Bot
|
||||||
@@ -147,6 +148,49 @@ class MoonshotBot(Bot):
|
|||||||
else:
|
else:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using Moonshot (Kimi) OpenAI-compatible API.

    Args:
        image_url: Image reference sent as an ``image_url`` content part.
        question: Prompt text sent alongside the image.
        model: Model name; falls back to ``self.args["model"]`` and then to
            ``"kimi-k2.5"``.
        max_tokens: Sent as ``max_tokens`` in the request payload.

    Returns:
        On success a dict with ``model``, ``content`` and ``usage``
        (``prompt_tokens`` / ``completion_tokens`` / ``total_tokens``).
        On any failure ``{"error": True, "message": <str>}``. Exceptions are
        logged and converted into that error dict, never raised.
    """
    try:
        vision_model = model or self.args.get("model", "kimi-k2.5")
        payload = {
            "model": vision_model,
            "max_tokens": max_tokens,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(f"{self.base_url}/chat/completions",
                             headers=headers, json=payload, timeout=60)
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        data = resp.json()

        # The error payload may be an object or a plain string; handle both
        # rather than crashing on ``.get`` of a str.
        api_error = data.get("error")
        if api_error:
            detail = api_error.get("message") if isinstance(api_error, dict) else None
            return {"error": True, "message": detail or str(api_error)}

        # An empty "choices" list would otherwise raise IndexError and be
        # reported as the unhelpful "list index out of range".
        choices = data.get("choices") or []
        if not choices:
            return {"error": True, "message": "Response contained no choices"}

        # "message"/"content"/"usage" can be null in the JSON body.
        content = (choices[0].get("message") or {}).get("content") or ""
        usage = data.get("usage") or {}
        return {
            "model": vision_model,
            "content": content,
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
    except Exception as e:
        logger.error(f"[MOONSHOT] call_vision error: {e}")
        return {"error": True, "message": str(e)}
|
||||||
|
|
||||||
# ==================== Agent mode support ====================
|
# ==================== Agent mode support ====================
|
||||||
|
|
||||||
def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
|
def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
|
||||||
@@ -435,31 +479,37 @@ class MoonshotBot(Bot):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if role == "user":
|
if role == "user":
|
||||||
text_parts = []
|
has_tool_result = any(
|
||||||
tool_results = []
|
isinstance(b, dict) and b.get("type") == "tool_result" for b in content
|
||||||
|
)
|
||||||
|
if has_tool_result:
|
||||||
|
text_parts = []
|
||||||
|
tool_results = []
|
||||||
|
|
||||||
for block in content:
|
for block in content:
|
||||||
if not isinstance(block, dict):
|
if not isinstance(block, dict):
|
||||||
continue
|
continue
|
||||||
if block.get("type") == "text":
|
if block.get("type") == "text":
|
||||||
text_parts.append(block.get("text", ""))
|
text_parts.append(block.get("text", ""))
|
||||||
elif block.get("type") == "tool_result":
|
elif block.get("type") == "tool_result":
|
||||||
tool_call_id = block.get("tool_use_id") or ""
|
tool_call_id = block.get("tool_use_id") or ""
|
||||||
result_content = block.get("content", "")
|
result_content = block.get("content", "")
|
||||||
if not isinstance(result_content, str):
|
if not isinstance(result_content, str):
|
||||||
result_content = json.dumps(result_content, ensure_ascii=False)
|
result_content = json.dumps(result_content, ensure_ascii=False)
|
||||||
tool_results.append({
|
tool_results.append({
|
||||||
"role": "tool",
|
"role": "tool",
|
||||||
"tool_call_id": tool_call_id,
|
"tool_call_id": tool_call_id,
|
||||||
"content": result_content
|
"content": result_content
|
||||||
})
|
})
|
||||||
|
|
||||||
# Tool results first (must come right after assistant with tool_calls)
|
for tr in tool_results:
|
||||||
for tr in tool_results:
|
converted.append(tr)
|
||||||
converted.append(tr)
|
|
||||||
|
|
||||||
if text_parts:
|
if text_parts:
|
||||||
converted.append({"role": "user", "content": "\n".join(text_parts)})
|
converted.append({"role": "user", "content": "\n".join(text_parts)})
|
||||||
|
else:
|
||||||
|
# Keep as-is for multimodal content (e.g. image_url blocks)
|
||||||
|
converted.append(msg)
|
||||||
|
|
||||||
elif role == "assistant":
|
elif role == "assistant":
|
||||||
openai_msg = {"role": "assistant"}
|
openai_msg = {"role": "assistant"}
|
||||||
|
|||||||
@@ -9,6 +9,8 @@ This includes: OpenAI, LinkAI, Azure OpenAI, and many third-party providers.
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import openai
|
import openai
|
||||||
|
import requests
|
||||||
|
from typing import Optional
|
||||||
from common.log import logger
|
from common.log import logger
|
||||||
from agent.protocol.message_utils import drop_orphaned_tool_results_openai
|
from agent.protocol.message_utils import drop_orphaned_tool_results_openai
|
||||||
|
|
||||||
@@ -306,3 +308,51 @@ class OpenAICompatibleBot:
|
|||||||
openai_messages.append(msg)
|
openai_messages.append(msg)
|
||||||
|
|
||||||
return drop_orphaned_tool_results_openai(openai_messages)
|
return drop_orphaned_tool_results_openai(openai_messages)
|
||||||
|
|
||||||
|
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using the OpenAI-compatible /chat/completions endpoint.

    Args:
        image_url: Image reference sent as an ``image_url`` content part.
        question: Prompt text sent alongside the image.
        model: Model name; falls back to the ``model`` entry of
            ``self.get_api_config()`` and then to ``"gpt-4o"``.
        max_tokens: Accepted for signature parity with the other bots.
            NOTE(review): it is not included in the request payload —
            presumably because some OpenAI-compatible models reject
            ``max_tokens``; confirm before forwarding it.

    Returns:
        On success a dict with ``model``, ``content`` and ``usage``
        (``prompt_tokens`` / ``completion_tokens`` / ``total_tokens``).
        On any failure ``{"error": True, "message": <str>}``. Exceptions are
        logged and converted into that error dict, never raised.
    """
    try:
        api_config = self.get_api_config()
        vision_model = model or api_config.get("model", "gpt-4o")
        api_key = api_config.get("api_key", "")
        api_base = (api_config.get("api_base") or "https://api.openai.com/v1").rstrip("/")

        payload = {
            "model": vision_model,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }],
        }
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(
            f"{api_base}/chat/completions",
            headers=headers, json=payload, timeout=60,
        )
        if resp.status_code != 200:
            body = resp.text[:500]
            logger.error(f"[{self.__class__.__name__}] call_vision HTTP {resp.status_code}: {body}")
            return {"error": True, "message": f"HTTP {resp.status_code}: {body}"}

        data = resp.json()

        # Some compatible providers report failures inside a 200 body; surface
        # them as errors (object or plain-string form) instead of returning an
        # empty answer.
        api_error = data.get("error")
        if api_error:
            detail = api_error.get("message") if isinstance(api_error, dict) else None
            return {"error": True, "message": detail or str(api_error)}

        # An empty "choices" list would otherwise raise IndexError and be
        # reported as the unhelpful "list index out of range".
        choices = data.get("choices") or []
        if not choices:
            return {"error": True, "message": "Response contained no choices"}

        # "message"/"content"/"usage" can be null in the JSON body.
        content = (choices[0].get("message") or {}).get("content") or ""
        usage = data.get("usage") or {}
        return {
            "model": vision_model,
            "content": content,
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
    except Exception as e:
        logger.error(f"[{self.__class__.__name__}] call_vision error: {e}")
        return {"error": True, "message": str(e)}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
import time
|
import time
|
||||||
import json
|
import json
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from models.bot import Bot
|
from models.bot import Bot
|
||||||
from models.zhipuai.zhipu_ai_session import ZhipuAISession
|
from models.zhipuai.zhipu_ai_session import ZhipuAISession
|
||||||
@@ -149,6 +150,40 @@ class ZHIPUAIBot(Bot, ZhipuAIImage):
|
|||||||
else:
|
else:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using ZhipuAI OpenAI-compatible SDK.

    Always uses glm-5v-turbo — the text models (glm-5-turbo etc.) do not support vision.

    Args:
        image_url: Image reference sent as an ``image_url`` content part.
        question: Prompt text sent alongside the image.
        model: Accepted for signature parity with the other bots; ignored,
            because the vision model is fixed (see above).
        max_tokens: Passed to ``chat.completions.create`` as ``max_tokens``.

    Returns:
        On success a dict with ``model``, ``content`` and ``usage``
        (``prompt_tokens`` / ``completion_tokens`` / ``total_tokens``).
        On any failure ``{"error": True, "message": <str>}``. Exceptions are
        logged and converted into that error dict, never raised.
    """
    try:
        vision_model = "glm-5v-turbo"
        response = self.client.chat.completions.create(
            model=vision_model,
            max_tokens=max_tokens,
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }],
        )

        # Report an empty choice list explicitly instead of surfacing the
        # opaque "list index out of range" from the except branch.
        choices = getattr(response, "choices", None) or []
        if not choices:
            return {"error": True, "message": "ZhipuAI response contained no choices"}

        content = choices[0].message.content or ""
        usage = getattr(response, "usage", None)
        # ``or 0`` covers attributes that exist but are None.
        return {
            "model": vision_model,
            "content": content,
            "usage": {
                "prompt_tokens": getattr(usage, "prompt_tokens", 0) or 0,
                "completion_tokens": getattr(usage, "completion_tokens", 0) or 0,
                "total_tokens": getattr(usage, "total_tokens", 0) or 0,
            },
        }
    except Exception as e:
        logger.error(f"[ZHIPU_AI] call_vision error: {e}")
        return {"error": True, "message": str(e)}
|
||||||
|
|
||||||
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
|
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
|
||||||
"""
|
"""
|
||||||
Call ZhipuAI API with tool support for agent integration
|
Call ZhipuAI API with tool support for agent integration
|
||||||
|
|||||||
Reference in New Issue
Block a user