mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-06-02 00:57:41 +08:00
feat(vision): prioritize main model for image recognition with multi-provider fallback
- Add call_vision method to all bot implementations (DashScope, Claude, Gemini, ZhipuAI, MiniMax, Doubao, Moonshot, OpenAICompatibleBot) using each vendor's native multimodal API format
- Remove call_with_tools/call_vision from Bot base class to fix MRO shadowing issue with OpenAICompatibleBot mixin
- Refactor vision tool provider resolution: MainModel → other configured models (auto-discovered) → OpenAI → LinkAI, with automatic fallback
- Return actual model name used in call_vision responses
- Sync config.json API keys to .env bidirectionally on startup
- Fix bot instance cache to detect bot_type/use_linkai config changes
- Add SSE reconnection support for web console
- Preserve image path hints in Gemini text for correct vision tool calls
- Update docs/tools/vision.mdx
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
# encoding:utf-8
|
||||
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from models.bot import Bot
|
||||
from models.session_manager import SessionManager
|
||||
from bridge.context import ContextType
|
||||
@@ -153,6 +155,56 @@ class DashscopeBot(Bot):
|
||||
else:
|
||||
return result
|
||||
|
||||
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using DashScope MultiModalConversation API.

    Args:
        image_url: Image reference, forwarded unchanged as the ``image``
            part of the user message.
        question: Text prompt sent alongside the image.
        model: Vision model name; ``"qwen-vl-max"`` is used when falsy.
        max_tokens: Forwarded to the API as ``max_tokens``.

    Returns:
        On success, a dict with ``model`` (the model name actually
        requested), ``content`` (the answer text) and ``usage``
        (``prompt_tokens``, ``completion_tokens``, ``total_tokens``).
        On failure, ``{"error": True, "message": ...}``. Exceptions raised
        while calling or parsing are caught, logged and reported this way
        rather than propagated.
    """
    try:
        # NOTE: this sets the module-level key on the dashscope package,
        # not a per-call credential.
        dashscope.api_key = self.api_key
        vision_model = model or "qwen-vl-max"

        # DashScope multimodal format: {"image": url} + {"text": question}
        messages = [{
            "role": "user",
            "content": [
                {"image": image_url},
                {"text": question},
            ],
        }]

        response = MultiModalConversation.call(
            model=vision_model,
            messages=messages,
            max_tokens=max_tokens,
        )

        if response.status_code != HTTPStatus.OK:
            return {
                "error": True,
                "message": f"{response.code} - {response.message}",
            }

        resp_dict = self._response_to_dict(response)

        # Report a missing/empty choice list explicitly instead of letting
        # it surface as an opaque KeyError/IndexError message.
        choices = (resp_dict.get("output") or {}).get("choices") or []
        if not choices:
            return {
                "error": True,
                "message": "DashScope response contained no choices",
            }

        # "message"/"content" may be present but None; normalise to "".
        content = (choices[0].get("message") or {}).get("content") or ""
        if isinstance(content, list):
            # Multimodal replies arrive as a list of parts; keep text parts.
            content = "".join(
                item.get("text") or ""
                for item in content
                if isinstance(item, dict)
            )

        # A present-but-None "usage" must not turn a successful answer
        # into an error result.
        usage = resp_dict.get("usage") or {}
        prompt_tokens = usage.get("input_tokens") or 0
        completion_tokens = usage.get("output_tokens") or 0
        # Fall back to the sum when the API omits total_tokens.
        total_tokens = usage.get("total_tokens") or (
            prompt_tokens + completion_tokens
        )

        return {
            "model": vision_model,
            "content": content,
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": total_tokens,
            },
        }
    except Exception as e:
        # Boundary handler: callers expect an error dict, never a raise.
        logger.error(f"[DASHSCOPE] call_vision error: {e}")
        return {"error": True, "message": str(e)}
|
||||
|
||||
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
|
||||
"""
|
||||
Call DashScope API with tool support for agent integration
|
||||
|
||||
Reference in New Issue
Block a user