mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-06-02 00:57:41 +08:00
Merge branch 'master' into feat-knowledge
This commit is contained in:
@@ -2,12 +2,27 @@
|
||||
Auto-reply chat robot abstract class
|
||||
"""
|
||||
|
||||
|
||||
from bridge.context import Context
|
||||
from bridge.reply import Reply
|
||||
|
||||
|
||||
class Bot(object):
|
||||
"""
|
||||
Base class for all chat-bot implementations.
|
||||
|
||||
Subclasses may also implement:
|
||||
|
||||
call_with_tools(messages, tools=None, stream=False, **kwargs)
|
||||
-> dict | generator (OpenAI-compatible format)
|
||||
|
||||
call_vision(image_url, question, model=None, max_tokens=1000)
|
||||
-> dict with keys: model, content, usage (or error/message)
|
||||
|
||||
These are NOT defined here to avoid shadowing concrete implementations
|
||||
provided by mixin classes (e.g. OpenAICompatibleBot) in the MRO.
|
||||
Use ``hasattr(bot, 'call_vision')`` to detect support at runtime.
|
||||
"""
|
||||
|
||||
def reply(self, query, context: Context = None) -> Reply:
|
||||
"""
|
||||
bot auto-reply content
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
# encoding:utf-8
|
||||
|
||||
import base64
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
|
||||
@@ -224,6 +227,79 @@ class ClaudeAPIBot(Bot, OpenAIImage):
|
||||
return 64000
|
||||
return 8192
|
||||
|
||||
@staticmethod
def _parse_data_url(data_url: str):
    """Parse a base64 ``data:`` URL into ``(media_type, base64_data)``.

    Accepts ``data:<mime>;base64,<data>`` and, following RFC 2397, also
    tolerates media-type parameters between the MIME type and the
    ``;base64`` marker (``data:image/png;charset=utf-8;base64,<data>``).
    Parameters are discarded; only the bare MIME type is returned.

    Args:
        data_url: The candidate data URL.

    Returns:
        ``(media_type, base64_data)`` when the string is a base64 data URL
        with a non-empty payload, otherwise ``(None, None)``.
    """
    # The optional parameter group is lazy and may not contain ",", so:
    #  * every URL the plain "<mime>;base64,<data>" form matches still
    #    yields exactly the same groups, and
    #  * a non-base64 URL whose payload merely contains ";base64," after
    #    its "," separator is not misread as base64.
    m = re.match(
        r"^data:([^;]+)(?:;[^;,]*)*?;base64,(.+)$",
        data_url,
        re.DOTALL,
    )
    if m:
        return m.group(1), m.group(2)
    return None, None
|
||||
|
||||
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using Claude Messages API (native image blocks).

    Args:
        image_url: A ``data:<mime>;base64,<data>`` URL, or any other URL
            string (sent to the API as a ``url`` image source).
        question: Text prompt sent in the same user message as the image.
        model: Model name override. When falsy, the configured ``model``
            value is resolved through ``self._model_mapping``.
        max_tokens: Value sent as the request's ``max_tokens``.

    Returns:
        On success, a dict with ``model``, ``content`` (all text blocks of
        the reply concatenated) and ``usage`` (``prompt_tokens``,
        ``completion_tokens``, ``total_tokens``).  On failure, a dict
        ``{"error": True, "message": <str>}``; exceptions are logged and
        converted to that shape instead of being propagated.
    """
    try:
        actual_model = model or self._model_mapping(conf().get("model"))

        # Build Claude-native image content block
        if image_url.startswith("data:"):
            media_type, b64_data = self._parse_data_url(image_url)
            if not b64_data:
                return {"error": True, "message": "Invalid base64 data URL"}
            image_block = {
                "type": "image",
                "source": {"type": "base64",
                           "media_type": media_type or "image/jpeg",
                           "data": b64_data},
            }
        else:
            image_block = {
                "type": "image",
                "source": {"type": "url", "url": image_url},
            }

        data = {
            "model": actual_model,
            "max_tokens": max_tokens,
            "messages": [{
                "role": "user",
                "content": [
                    image_block,
                    {"type": "text", "text": question},
                ],
            }],
        }

        headers = {
            "x-api-key": self.api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
        }
        proxies = {"http": self.proxy, "https": self.proxy} if self.proxy else None
        # requests applies no timeout unless one is given, so a stalled
        # connection would otherwise block the caller indefinitely.
        # 60 seconds mirrors the other providers' vision calls.
        resp = requests.post(f"{self.api_base}/messages",
                             headers=headers, json=data, proxies=proxies,
                             timeout=60)

        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        body = resp.json()
        # Only "text" blocks contribute to the answer; other block types
        # in the reply are ignored.
        text_parts = [b.get("text", "") for b in body.get("content", [])
                      if b.get("type") == "text"]
        usage = body.get("usage", {})
        return {
            "model": actual_model,
            "content": "".join(text_parts),
            "usage": {
                # Map Claude's input/output counters onto the
                # prompt/completion key names returned to callers.
                "prompt_tokens": usage.get("input_tokens", 0),
                "completion_tokens": usage.get("output_tokens", 0),
                "total_tokens": usage.get("input_tokens", 0) + usage.get("output_tokens", 0),
            },
        }
    except Exception as e:
        logger.error(f"[CLAUDE] call_vision error: {e}")
        return {"error": True, "message": str(e)}
|
||||
|
||||
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
|
||||
"""
|
||||
Call Claude API with tool support for agent integration
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
# encoding:utf-8
|
||||
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from models.bot import Bot
|
||||
from models.session_manager import SessionManager
|
||||
from bridge.context import ContextType
|
||||
@@ -153,6 +155,56 @@ class DashscopeBot(Bot):
|
||||
else:
|
||||
return result
|
||||
|
||||
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Ask a DashScope multimodal model a question about one image.

    Args:
        image_url: Image reference placed in the message's ``image`` part.
        question: Prompt placed in the message's ``text`` part.
        model: Model name override; ``"qwen-vl-max"`` is used when falsy.
        max_tokens: Forwarded to ``MultiModalConversation.call``.

    Returns:
        On success, a dict with ``model``, ``content`` and ``usage``
        (``prompt_tokens``, ``completion_tokens``, ``total_tokens``).
        On failure, ``{"error": True, "message": <str>}``; exceptions are
        logged and reported in that shape.
    """
    try:
        dashscope.api_key = self.api_key
        chosen_model = model if model else "qwen-vl-max"

        # DashScope's multimodal message carries the image and the prompt
        # as two separate content parts: {"image": ...} then {"text": ...}.
        user_turn = {
            "role": "user",
            "content": [{"image": image_url}, {"text": question}],
        }
        response = MultiModalConversation.call(
            model=chosen_model,
            messages=[user_turn],
            max_tokens=max_tokens,
        )

        if response.status_code != HTTPStatus.OK:
            failure = f"{response.code} - {response.message}"
            return {"error": True, "message": failure}

        parsed = self._response_to_dict(response)
        first_choice = parsed["output"]["choices"][0]
        answer = first_choice.get("message", {}).get("content", "")
        if isinstance(answer, list):
            # List-shaped content: concatenate the "text" of each dict
            # part and skip anything that is not a dict.
            fragments = [
                part.get("text", "") for part in answer if isinstance(part, dict)
            ]
            answer = "".join(fragments)

        token_usage = parsed.get("usage", {})
        usage_summary = {
            "prompt_tokens": token_usage.get("input_tokens", 0),
            "completion_tokens": token_usage.get("output_tokens", 0),
            "total_tokens": token_usage.get("total_tokens", 0),
        }
        return {
            "model": chosen_model,
            "content": answer,
            "usage": usage_summary,
        }
    except Exception as exc:
        logger.error(f"[DASHSCOPE] call_vision error: {exc}")
        return {"error": True, "message": str(exc)}
|
||||
|
||||
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
|
||||
"""
|
||||
Call DashScope API with tool support for agent integration
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
import json
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from models.bot import Bot
|
||||
@@ -147,6 +148,49 @@ class DoubaoBot(Bot):
|
||||
else:
|
||||
return result
|
||||
|
||||
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using Doubao (Volcengine Ark) OpenAI-compatible API.

    Args:
        image_url: Image reference sent as an ``image_url`` content part.
        question: Text prompt sent as the ``text`` content part.
        model: Model name override; falls back to ``self.args["model"]``
            and then to ``"doubao-seed-2-0-pro-260215"``.
        max_tokens: Value sent as the request's ``max_tokens``.

    Returns:
        On success, a dict with ``model``, ``content`` (always a string)
        and ``usage`` (``prompt_tokens``, ``completion_tokens``,
        ``total_tokens``).  On failure, ``{"error": True, "message":
        <str>}``; exceptions are logged and reported in that shape.
    """
    try:
        vision_model = model or self.args.get("model", "doubao-seed-2-0-pro-260215")
        payload = {
            "model": vision_model,
            "max_tokens": max_tokens,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(f"{self.base_url}/chat/completions",
                             headers=headers, json=payload, timeout=60)
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        data = resp.json()
        if "error" in data:
            err = data["error"]
            # The error field is not guaranteed to be an object; fall back
            # to its string form so the real error text is reported
            # instead of an AttributeError from calling .get() on it.
            message = err.get("message", str(err)) if isinstance(err, dict) else str(err)
            return {"error": True, "message": message}

        choices = data.get("choices", [{}])
        if not choices:
            # An empty or null list would otherwise surface as an opaque
            # "list index out of range" message.
            return {"error": True, "message": "Response contained no choices"}
        # A null "content" is normalized to "" so callers always get a str.
        content = choices[0].get("message", {}).get("content", "") or ""
        usage = data.get("usage", {})
        return {
            "model": vision_model,
            "content": content,
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
    except Exception as e:
        logger.error(f"[DOUBAO] call_vision error: {e}")
        return {"error": True, "message": str(e)}
|
||||
|
||||
# ==================== Agent mode support ====================
|
||||
|
||||
def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
|
||||
@@ -434,31 +478,37 @@ class DoubaoBot(Bot):
|
||||
continue
|
||||
|
||||
if role == "user":
|
||||
text_parts = []
|
||||
tool_results = []
|
||||
has_tool_result = any(
|
||||
isinstance(b, dict) and b.get("type") == "tool_result" for b in content
|
||||
)
|
||||
if has_tool_result:
|
||||
text_parts = []
|
||||
tool_results = []
|
||||
|
||||
for block in content:
|
||||
if not isinstance(block, dict):
|
||||
continue
|
||||
if block.get("type") == "text":
|
||||
text_parts.append(block.get("text", ""))
|
||||
elif block.get("type") == "tool_result":
|
||||
tool_call_id = block.get("tool_use_id") or ""
|
||||
result_content = block.get("content", "")
|
||||
if not isinstance(result_content, str):
|
||||
result_content = json.dumps(result_content, ensure_ascii=False)
|
||||
tool_results.append({
|
||||
"role": "tool",
|
||||
"tool_call_id": tool_call_id,
|
||||
"content": result_content
|
||||
})
|
||||
for block in content:
|
||||
if not isinstance(block, dict):
|
||||
continue
|
||||
if block.get("type") == "text":
|
||||
text_parts.append(block.get("text", ""))
|
||||
elif block.get("type") == "tool_result":
|
||||
tool_call_id = block.get("tool_use_id") or ""
|
||||
result_content = block.get("content", "")
|
||||
if not isinstance(result_content, str):
|
||||
result_content = json.dumps(result_content, ensure_ascii=False)
|
||||
tool_results.append({
|
||||
"role": "tool",
|
||||
"tool_call_id": tool_call_id,
|
||||
"content": result_content
|
||||
})
|
||||
|
||||
# Tool results first (must come right after assistant with tool_calls)
|
||||
for tr in tool_results:
|
||||
converted.append(tr)
|
||||
for tr in tool_results:
|
||||
converted.append(tr)
|
||||
|
||||
if text_parts:
|
||||
converted.append({"role": "user", "content": "\n".join(text_parts)})
|
||||
if text_parts:
|
||||
converted.append({"role": "user", "content": "\n".join(text_parts)})
|
||||
else:
|
||||
# Keep as-is for multimodal content (e.g. image_url blocks)
|
||||
converted.append(msg)
|
||||
|
||||
elif role == "assistant":
|
||||
openai_msg = {"role": "assistant"}
|
||||
|
||||
@@ -12,6 +12,8 @@ import mimetypes
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from models.bot import Bot
|
||||
from models.session_manager import SessionManager
|
||||
@@ -144,7 +146,12 @@ class GoogleGeminiBot(Bot):
|
||||
return "", []
|
||||
pattern = r"\[图片:\s*([^\]]+)\]"
|
||||
image_paths = [m.strip().strip("'\"") for m in re.findall(pattern, content) if m.strip()]
|
||||
cleaned_text = re.sub(pattern, "", content)
|
||||
# Replace markers with path-only hints so the model still knows the
|
||||
# original file location (needed when it calls tools like vision).
|
||||
def _replace_with_hint(m):
|
||||
path = m.group(1).strip().strip("'\"")
|
||||
return f"[attached image: {path}]"
|
||||
cleaned_text = re.sub(pattern, _replace_with_hint, content)
|
||||
cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text).strip()
|
||||
return cleaned_text, image_paths
|
||||
|
||||
@@ -225,6 +232,57 @@ class GoogleGeminiBot(Bot):
|
||||
logger.warning(f"[Gemini] Unsupported image URL format: {image_url[:120]}")
|
||||
return None
|
||||
|
||||
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Ask a Gemini model a question about one image via the REST API.

    Args:
        image_url: Image reference; converted to a request part by
            ``self._build_inline_part_from_image_url``.
        question: Text prompt sent after the image part.
        model: Model name override; falls back to ``self.model`` and then
            to ``"gemini-2.0-flash"``.
        max_tokens: Sent as ``generationConfig.maxOutputTokens``.

    Returns:
        On success, a dict with ``model``, ``content`` (text parts of the
        first candidate, concatenated) and ``usage`` (``prompt_tokens``,
        ``completion_tokens``, ``total_tokens``).  On failure,
        ``{"error": True, "message": <str>}``; exceptions are logged and
        reported in that shape.
    """
    try:
        model_name = model or self.model or "gemini-2.0-flash"

        inline_image = self._build_inline_part_from_image_url({"url": image_url})
        if not inline_image:
            return {"error": True, "message": f"Cannot process image URL: {image_url[:120]}"}

        # Every listed harm category is sent with the BLOCK_NONE threshold.
        harm_categories = (
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_HARASSMENT",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
        )
        request_body = {
            "contents": [{
                "role": "user",
                "parts": [inline_image, {"text": question}],
            }],
            "generationConfig": {"maxOutputTokens": max_tokens},
            "safetySettings": [
                {"category": name, "threshold": "BLOCK_NONE"}
                for name in harm_categories
            ],
        }
        url = f"{self.api_base}/v1beta/models/{model_name}:generateContent"
        request_headers = {
            "x-goog-api-key": self.api_key,
            "Content-Type": "application/json",
        }
        resp = requests.post(url, headers=request_headers, json=request_body, timeout=60)

        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        body = resp.json()
        candidates = body.get("candidates", [])
        # Only the first candidate is read; no candidates means no text.
        if candidates:
            reply_parts = candidates[0].get("content", {}).get("parts", [])
        else:
            reply_parts = []
        texts = [piece["text"] for piece in reply_parts if "text" in piece]

        counters = body.get("usageMetadata", {})
        return {
            "model": model_name,
            "content": "".join(texts),
            "usage": {
                "prompt_tokens": counters.get("promptTokenCount", 0),
                "completion_tokens": counters.get("candidatesTokenCount", 0),
                "total_tokens": counters.get("totalTokenCount", 0),
            },
        }
    except Exception as exc:
        logger.error(f"[Gemini] call_vision error: {exc}")
        return {"error": True, "message": str(exc)}
|
||||
|
||||
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
|
||||
"""
|
||||
Call Gemini API with tool support using REST API (following official docs)
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
import time
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
|
||||
from models.bot import Bot
|
||||
@@ -20,7 +22,7 @@ class MinimaxBot(Bot):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.args = {
|
||||
"model": conf().get("model") or "MiniMax-M2.1",
|
||||
"model": conf().get("model") or "MiniMax-M2.7",
|
||||
"temperature": conf().get("temperature", 0.3),
|
||||
"top_p": conf().get("top_p", 0.95),
|
||||
}
|
||||
@@ -175,6 +177,51 @@ class MinimaxBot(Bot):
|
||||
else:
|
||||
return result
|
||||
|
||||
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using MiniMax OpenAI-compatible API.

    Always uses MiniMax-Text-01 — other MiniMax models do not support vision.

    Args:
        image_url: Image reference sent as an ``image_url`` content part.
        question: Text prompt sent as the ``text`` content part.
        model: Accepted for signature parity with the other bots; it is
            not used, the vision model is fixed (see above).
        max_tokens: Value sent as the request's ``max_tokens``.

    Returns:
        On success, a dict with ``model``, ``content`` (always a string)
        and ``usage`` (``prompt_tokens``, ``completion_tokens``,
        ``total_tokens``).  On failure, ``{"error": True, "message":
        <str>}``; exceptions are logged and reported in that shape.
    """
    try:
        vision_model = "MiniMax-Text-01"
        payload = {
            "model": vision_model,
            "max_tokens": max_tokens,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(f"{self.api_base}/chat/completions",
                             headers=headers, json=payload, timeout=60)
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        data = resp.json()
        if "error" in data:
            err = data["error"]
            # The error field is not guaranteed to be an object; fall back
            # to its string form so the real error text is reported
            # instead of an AttributeError from calling .get() on it.
            message = err.get("message", str(err)) if isinstance(err, dict) else str(err)
            return {"error": True, "message": message}

        choices = data.get("choices", [{}])
        if not choices:
            # An empty or null list would otherwise surface as an opaque
            # "list index out of range" message.
            return {"error": True, "message": "Response contained no choices"}
        # A null "content" is normalized to "" so callers always get a str.
        content = choices[0].get("message", {}).get("content", "") or ""
        usage = data.get("usage", {})
        return {
            "model": vision_model,
            "content": content,
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
    except Exception as e:
        logger.error(f"[MINIMAX] call_vision error: {e}")
        return {"error": True, "message": str(e)}
|
||||
|
||||
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
|
||||
"""
|
||||
Call MiniMax API with tool support for agent integration
|
||||
@@ -270,37 +317,41 @@ class MinimaxBot(Bot):
|
||||
if role == "user":
|
||||
# Handle user message
|
||||
if isinstance(content, list):
|
||||
# Extract text from content blocks
|
||||
text_parts = []
|
||||
tool_results = []
|
||||
has_tool_result = any(
|
||||
isinstance(b, dict) and b.get("type") == "tool_result" for b in content
|
||||
)
|
||||
if has_tool_result:
|
||||
text_parts = []
|
||||
tool_results = []
|
||||
|
||||
for block in content:
|
||||
if isinstance(block, dict):
|
||||
if block.get("type") == "text":
|
||||
text_parts.append(block.get("text", ""))
|
||||
elif block.get("type") == "tool_result":
|
||||
# Tool result should be a separate message with role="tool"
|
||||
tool_call_id = block.get("tool_use_id") or ""
|
||||
if not tool_call_id:
|
||||
logger.warning(f"[MINIMAX] tool_result missing tool_use_id")
|
||||
result_content = block.get("content", "")
|
||||
if not isinstance(result_content, str):
|
||||
result_content = json.dumps(result_content, ensure_ascii=False)
|
||||
tool_results.append({
|
||||
"role": "tool",
|
||||
"tool_call_id": tool_call_id,
|
||||
"content": result_content
|
||||
})
|
||||
for block in content:
|
||||
if isinstance(block, dict):
|
||||
if block.get("type") == "text":
|
||||
text_parts.append(block.get("text", ""))
|
||||
elif block.get("type") == "tool_result":
|
||||
tool_call_id = block.get("tool_use_id") or ""
|
||||
if not tool_call_id:
|
||||
logger.warning(f"[MINIMAX] tool_result missing tool_use_id")
|
||||
result_content = block.get("content", "")
|
||||
if not isinstance(result_content, str):
|
||||
result_content = json.dumps(result_content, ensure_ascii=False)
|
||||
tool_results.append({
|
||||
"role": "tool",
|
||||
"tool_call_id": tool_call_id,
|
||||
"content": result_content
|
||||
})
|
||||
|
||||
if text_parts:
|
||||
converted.append({
|
||||
"role": "user",
|
||||
"content": "\n".join(text_parts)
|
||||
})
|
||||
if text_parts:
|
||||
converted.append({
|
||||
"role": "user",
|
||||
"content": "\n".join(text_parts)
|
||||
})
|
||||
|
||||
# Add all tool results (not just the last one)
|
||||
for tool_result in tool_results:
|
||||
converted.append(tool_result)
|
||||
for tool_result in tool_results:
|
||||
converted.append(tool_result)
|
||||
else:
|
||||
# Keep as-is for multimodal content (e.g. image_url blocks)
|
||||
converted.append(msg)
|
||||
else:
|
||||
# Simple text content
|
||||
converted.append({
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
import json
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from models.bot import Bot
|
||||
@@ -147,6 +148,49 @@ class MoonshotBot(Bot):
|
||||
else:
|
||||
return result
|
||||
|
||||
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using Moonshot (Kimi) OpenAI-compatible API.

    Args:
        image_url: Image reference sent as an ``image_url`` content part.
        question: Text prompt sent as the ``text`` content part.
        model: Model name override; falls back to ``self.args["model"]``
            and then to ``"kimi-k2.5"``.
        max_tokens: Value sent as the request's ``max_tokens``.

    Returns:
        On success, a dict with ``model``, ``content`` (always a string)
        and ``usage`` (``prompt_tokens``, ``completion_tokens``,
        ``total_tokens``).  On failure, ``{"error": True, "message":
        <str>}``; exceptions are logged and reported in that shape.
    """
    try:
        vision_model = model or self.args.get("model", "kimi-k2.5")
        payload = {
            "model": vision_model,
            "max_tokens": max_tokens,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(f"{self.base_url}/chat/completions",
                             headers=headers, json=payload, timeout=60)
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        data = resp.json()
        if "error" in data:
            err = data["error"]
            # The error field is not guaranteed to be an object; fall back
            # to its string form so the real error text is reported
            # instead of an AttributeError from calling .get() on it.
            message = err.get("message", str(err)) if isinstance(err, dict) else str(err)
            return {"error": True, "message": message}

        choices = data.get("choices", [{}])
        if not choices:
            # An empty or null list would otherwise surface as an opaque
            # "list index out of range" message.
            return {"error": True, "message": "Response contained no choices"}
        # A null "content" is normalized to "" so callers always get a str.
        content = choices[0].get("message", {}).get("content", "") or ""
        usage = data.get("usage", {})
        return {
            "model": vision_model,
            "content": content,
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
    except Exception as e:
        logger.error(f"[MOONSHOT] call_vision error: {e}")
        return {"error": True, "message": str(e)}
|
||||
|
||||
# ==================== Agent mode support ====================
|
||||
|
||||
def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
|
||||
@@ -435,31 +479,37 @@ class MoonshotBot(Bot):
|
||||
continue
|
||||
|
||||
if role == "user":
|
||||
text_parts = []
|
||||
tool_results = []
|
||||
has_tool_result = any(
|
||||
isinstance(b, dict) and b.get("type") == "tool_result" for b in content
|
||||
)
|
||||
if has_tool_result:
|
||||
text_parts = []
|
||||
tool_results = []
|
||||
|
||||
for block in content:
|
||||
if not isinstance(block, dict):
|
||||
continue
|
||||
if block.get("type") == "text":
|
||||
text_parts.append(block.get("text", ""))
|
||||
elif block.get("type") == "tool_result":
|
||||
tool_call_id = block.get("tool_use_id") or ""
|
||||
result_content = block.get("content", "")
|
||||
if not isinstance(result_content, str):
|
||||
result_content = json.dumps(result_content, ensure_ascii=False)
|
||||
tool_results.append({
|
||||
"role": "tool",
|
||||
"tool_call_id": tool_call_id,
|
||||
"content": result_content
|
||||
})
|
||||
for block in content:
|
||||
if not isinstance(block, dict):
|
||||
continue
|
||||
if block.get("type") == "text":
|
||||
text_parts.append(block.get("text", ""))
|
||||
elif block.get("type") == "tool_result":
|
||||
tool_call_id = block.get("tool_use_id") or ""
|
||||
result_content = block.get("content", "")
|
||||
if not isinstance(result_content, str):
|
||||
result_content = json.dumps(result_content, ensure_ascii=False)
|
||||
tool_results.append({
|
||||
"role": "tool",
|
||||
"tool_call_id": tool_call_id,
|
||||
"content": result_content
|
||||
})
|
||||
|
||||
# Tool results first (must come right after assistant with tool_calls)
|
||||
for tr in tool_results:
|
||||
converted.append(tr)
|
||||
for tr in tool_results:
|
||||
converted.append(tr)
|
||||
|
||||
if text_parts:
|
||||
converted.append({"role": "user", "content": "\n".join(text_parts)})
|
||||
if text_parts:
|
||||
converted.append({"role": "user", "content": "\n".join(text_parts)})
|
||||
else:
|
||||
# Keep as-is for multimodal content (e.g. image_url blocks)
|
||||
converted.append(msg)
|
||||
|
||||
elif role == "assistant":
|
||||
openai_msg = {"role": "assistant"}
|
||||
|
||||
@@ -9,6 +9,8 @@ This includes: OpenAI, LinkAI, Azure OpenAI, and many third-party providers.
|
||||
|
||||
import json
|
||||
import openai
|
||||
import requests
|
||||
from typing import Optional
|
||||
from common.log import logger
|
||||
from agent.protocol.message_utils import drop_orphaned_tool_results_openai
|
||||
|
||||
@@ -306,3 +308,51 @@ class OpenAICompatibleBot:
|
||||
openai_messages.append(msg)
|
||||
|
||||
return drop_orphaned_tool_results_openai(openai_messages)
|
||||
|
||||
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using the OpenAI-compatible /chat/completions endpoint.

    Args:
        image_url: Image reference sent as an ``image_url`` content part.
        question: Text prompt sent as the ``text`` content part.
        model: Model name override; falls back to the ``model`` entry of
            ``self.get_api_config()`` and then to ``"gpt-4o"``.
        max_tokens: Accepted for signature parity with the other bots.
            NOTE(review): it is not placed in the request payload here —
            confirm whether that omission is intentional.

    Returns:
        On success, a dict with ``model``, ``content`` (always a string)
        and ``usage`` (``prompt_tokens``, ``completion_tokens``,
        ``total_tokens``).  On failure, ``{"error": True, "message":
        <str>}``; exceptions are logged and reported in that shape.
    """
    try:
        api_config = self.get_api_config()
        vision_model = model or api_config.get("model", "gpt-4o")
        api_key = api_config.get("api_key", "")
        # Strip trailing slashes so the joined endpoint has exactly one.
        api_base = (api_config.get("api_base") or "https://api.openai.com/v1").rstrip("/")

        payload = {
            "model": vision_model,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }],
        }
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(
            f"{api_base}/chat/completions",
            headers=headers, json=payload, timeout=60,
        )
        if resp.status_code != 200:
            body = resp.text[:500]
            logger.error(f"[{self.__class__.__name__}] call_vision HTTP {resp.status_code}: {body}")
            return {"error": True, "message": f"HTTP {resp.status_code}: {body}"}

        data = resp.json()
        # Report an error body delivered with HTTP 200 as a failure rather
        # than as an empty successful answer.  Truthiness (not key
        # presence) is tested so a null/empty "error" field on a
        # successful response is not treated as a failure.
        err = data.get("error")
        if err:
            message = err.get("message", str(err)) if isinstance(err, dict) else str(err)
            logger.error(f"[{self.__class__.__name__}] call_vision API error: {message}")
            return {"error": True, "message": message}

        choices = data.get("choices", [{}])
        if not choices:
            # An empty or null list would otherwise surface as an opaque
            # "list index out of range" message.
            return {"error": True, "message": "Response contained no choices"}
        # A null "content" is normalized to "" so callers always get a str.
        content = choices[0].get("message", {}).get("content", "") or ""
        usage = data.get("usage", {})
        return {
            "model": vision_model,
            "content": content,
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
    except Exception as e:
        logger.error(f"[{self.__class__.__name__}] call_vision error: {e}")
        return {"error": True, "message": str(e)}
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
import time
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from models.bot import Bot
|
||||
from models.zhipuai.zhipu_ai_session import ZhipuAISession
|
||||
@@ -149,6 +150,40 @@ class ZHIPUAIBot(Bot, ZhipuAIImage):
|
||||
else:
|
||||
return result
|
||||
|
||||
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Ask ZhipuAI's vision model a question about one image.

    The request always targets glm-5v-turbo — the text models
    (glm-5-turbo etc.) do not support vision — so ``model`` is accepted
    but not used.

    Args:
        image_url: Image reference sent as an ``image_url`` content part.
        question: Text prompt sent as the ``text`` content part.
        model: Unused (see above).
        max_tokens: Forwarded to the SDK call as ``max_tokens``.

    Returns:
        On success, a dict with ``model``, ``content`` (always a string)
        and ``usage`` (``prompt_tokens``, ``completion_tokens``,
        ``total_tokens``).  On failure, ``{"error": True, "message":
        <str>}``; exceptions are logged and reported in that shape.
    """
    try:
        vision_model = "glm-5v-turbo"
        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }
        completion = self.client.chat.completions.create(
            model=vision_model,
            max_tokens=max_tokens,
            messages=[user_message],
        )

        answer = completion.choices[0].message.content
        if not answer:
            # Empty or missing content is reported as an empty string.
            answer = ""

        stats = completion.usage
        # getattr with a default keeps the counters at 0 when the usage
        # object lacks a field.
        token_counts = {
            "prompt_tokens": getattr(stats, "prompt_tokens", 0),
            "completion_tokens": getattr(stats, "completion_tokens", 0),
            "total_tokens": getattr(stats, "total_tokens", 0),
        }
        return {
            "model": vision_model,
            "content": answer,
            "usage": token_counts,
        }
    except Exception as exc:
        logger.error(f"[ZHIPU_AI] call_vision error: {exc}")
        return {"error": True, "message": str(exc)}
|
||||
|
||||
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
|
||||
"""
|
||||
Call ZhipuAI API with tool support for agent integration
|
||||
|
||||
Reference in New Issue
Block a user