feat(vision): prioritize main model for image recognition with multi-provider fallback

- Add call_vision method to all bot implementations (DashScope, Claude,
  Gemini, ZhipuAI, MiniMax, Doubao, Moonshot, OpenAICompatibleBot)
  using each vendor's native multimodal API format
- Remove call_with_tools/call_vision from the Bot base class to fix an MRO
  shadowing issue with the OpenAICompatibleBot mixin
- Refactor vision tool provider resolution: MainModel → other configured
  models (auto-discovered) → OpenAI → LinkAI, with automatic fallback
- Return actual model name used in call_vision responses
- Sync config.json API keys to .env bidirectionally on startup
- Fix bot instance cache to detect bot_type/use_linkai config changes
- Add SSE reconnection support for web console
- Preserve image path hints in Gemini text for correct vision tool calls
- Update docs/tools/vision.mdx
This commit is contained in:
zhayujie
2026-04-11 19:46:11 +08:00
parent 3cd92ccda3
commit 26693acc3f
17 changed files with 1173 additions and 359 deletions

View File

@@ -1,6 +1,8 @@
# encoding:utf-8
import json
from typing import Optional
from models.bot import Bot
from models.session_manager import SessionManager
from bridge.context import ContextType
@@ -153,6 +155,56 @@ class DashscopeBot(Bot):
else:
return result
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using DashScope MultiModalConversation API.

    Args:
        image_url: Image reference sent as the ``image`` content part.
        question: Text prompt sent as the ``text`` content part.
        model: Vision model name; ``"qwen-vl-max"`` is used when falsy.
        max_tokens: Forwarded to the API call as ``max_tokens``.

    Returns:
        On success, a dict with ``model`` (the model name actually used),
        ``content`` (str) and ``usage`` (``prompt_tokens``,
        ``completion_tokens``, ``total_tokens``).
        On failure, ``{"error": True, "message": <str>}``. Exceptions
        raised while calling or parsing are caught, logged and reported
        through this error dict instead of propagating.
    """
    try:
        # The key is assigned on the `dashscope` object before every call so
        # the request uses this bot's credentials.
        dashscope.api_key = self.api_key
        vision_model = model or "qwen-vl-max"

        # DashScope multimodal format: {"image": url} + {"text": question}
        messages = [{
            "role": "user",
            "content": [
                {"image": image_url},
                {"text": question},
            ],
        }]
        response = MultiModalConversation.call(
            model=vision_model,
            messages=messages,
            max_tokens=max_tokens,
        )
        if response.status_code != HTTPStatus.OK:
            return {
                "error": True,
                "message": f"{response.code} - {response.message}",
            }

        resp_dict = self._response_to_dict(response)

        # Report a missing/empty choice list explicitly rather than letting a
        # KeyError/IndexError surface as an opaque message such as "'output'".
        choices = (resp_dict.get("output") or {}).get("choices") or []
        if not choices:
            logger.error("[DASHSCOPE] call_vision error: no choices in response")
            return {"error": True, "message": "no choices in response"}

        # `message`/`content` may be present but null; normalise to "".
        content = (choices[0].get("message") or {}).get("content") or ""
        if isinstance(content, list):
            # Multimodal replies arrive as a list of parts; keep only the text.
            content = "".join(
                item.get("text") or ""
                for item in content
                if isinstance(item, dict)
            )

        usage = resp_dict.get("usage") or {}
        prompt_tokens = usage.get("input_tokens") or 0
        completion_tokens = usage.get("output_tokens") or 0
        # Some responses omit `total_tokens`; derive it instead of reporting 0.
        total_tokens = usage.get("total_tokens") or (
            prompt_tokens + completion_tokens
        )
        return {
            "model": vision_model,
            "content": content,
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": total_tokens,
            },
        }
    except Exception as e:
        # Boundary handler: callers rely on an error dict to fall back to
        # another provider, so nothing is re-raised here.
        logger.error(f"[DASHSCOPE] call_vision error: {e}")
        return {"error": True, "message": str(e)}
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
"""
Call DashScope API with tool support for agent integration