From 3cd92ccda3e746f7315fe1f8db9a71962041dafa Mon Sep 17 00:00:00 2001 From: zhayujie Date: Thu, 9 Apr 2026 21:29:53 +0800 Subject: [PATCH 1/4] feat: add port config --- common/cloud_client.py | 6 +++--- config.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/common/cloud_client.py b/common/cloud_client.py index 656c1604..c71b02fe 100644 --- a/common/cloud_client.py +++ b/common/cloud_client.py @@ -47,8 +47,8 @@ CREDENTIAL_MAP = { class CloudClient(LinkAIClient): - def __init__(self, api_key: str, channel, host: str = ""): - super().__init__(api_key, host) + def __init__(self, api_key: str, channel, host: str = "", port=None): + super().__init__(api_key, host, port=port) self.channel = channel self.client_type = channel.channel_type self.channel_mgr = None @@ -733,7 +733,7 @@ def start(channel, channel_mgr=None): return global chat_client - chat_client = CloudClient(api_key=conf().get("linkai_api_key"), host=conf().get("cloud_host", ""), channel=channel) + chat_client = CloudClient(api_key=conf().get("linkai_api_key"), host=conf().get("cloud_host", ""), port=conf().get("cloud_port"), channel=channel) chat_client.channel_mgr = channel_mgr chat_client.config = _build_config() chat_client.start() diff --git a/config.py b/config.py index 6edd9c04..2ccd505b 100644 --- a/config.py +++ b/config.py @@ -189,6 +189,7 @@ available_setting = { "linkai_app_code": "", "linkai_api_base": "https://api.link-ai.tech", # linkAI服务地址 "cloud_host": "client.link-ai.tech", + "cloud_port": None, "cloud_deployment_id": "", "minimax_api_key": "", "Minimax_group_id": "", From 90d18353534ed8baff2562ad3c3bd9020e3b0ce8 Mon Sep 17 00:00:00 2001 From: 6vision Date: Sat, 11 Apr 2026 15:45:34 +0800 Subject: [PATCH 2/4] fix: send generic file types (tar.gz, zip, etc.) 
as FILE instead of TEXT Previously, files with extensions not in the known categories (image, document, video, audio) fell through to a fallback that returned ReplyType.TEXT, causing the file to never actually be sent to the user. Now the fallback uses ReplyType.FILE so all file types are delivered. Made-with: Cursor --- bridge/agent_bridge.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bridge/agent_bridge.py b/bridge/agent_bridge.py index 84b7aad6..665abd22 100644 --- a/bridge/agent_bridge.py +++ b/bridge/agent_bridge.py @@ -498,10 +498,14 @@ class AgentBridge: reply.text_content = text_response return reply - # For other unknown file types, return text with file info - message = text_response or file_info.get("message", "文件已准备") - message += f"\n\n[文件: {file_info.get('file_name', file_path)}]" - return Reply(ReplyType.TEXT, message) + # For all other file types (tar.gz, zip, etc.), also use FILE type + file_url = f"file://{file_path}" + logger.info(f"[AgentBridge] Sending generic file: {file_url}") + reply = Reply(ReplyType.FILE, file_url) + reply.file_name = file_info.get("file_name", os.path.basename(file_path)) + if text_response: + reply.text_content = text_response + return reply def _migrate_config_to_env(self, workspace_root: str): """ From c34308cbd4679110b1be98dfd4825af98552c66e Mon Sep 17 00:00:00 2001 From: octo-patch Date: Sat, 11 Apr 2026 17:03:44 +0800 Subject: [PATCH 3/4] feat: add MiniMax-M2.7-highspeed model and MiniMax TTS support - Add MiniMax-M2.7-highspeed constant to const.py and MODEL_LIST - Update MinimaxBot default model from MiniMax-M2.1 to MiniMax-M2.7 - Add MinimaxVoice TTS provider (voice/minimax/minimax_voice.py) - Supports speech-2.8-hd and speech-2.8-turbo models - SSE streaming with hex-decoded audio chunks - Reuses MINIMAX_API_KEY - Register MinimaxVoice in voice factory - Add unit tests (14 tests, all passing) - Update README with MiniMax-M2.7-highspeed and TTS configuration --- README.md | 5 
+- common/const.py | 3 +- models/minimax/minimax_bot.py | 2 +- tests/test_minimax_provider.py | 184 +++++++++++++++++++++++++++++++++ voice/factory.py | 4 + voice/minimax/__init__.py | 0 voice/minimax/minimax_voice.py | 106 +++++++++++++++++++ 7 files changed, 300 insertions(+), 4 deletions(-) create mode 100644 tests/test_minimax_provider.py create mode 100644 voice/minimax/__init__.py create mode 100644 voice/minimax/minimax_voice.py diff --git a/README.md b/README.md index 7478b99d..59609ecc 100644 --- a/README.md +++ b/README.md @@ -213,6 +213,7 @@ cow install-browser + 添加 `"speech_recognition": true` 将开启语音识别,默认使用 openai 的 whisper 模型识别为文字,同时以文字回复,该参数仅支持私聊 (注意由于语音消息无法匹配前缀,一旦开启将对所有语音自动回复,支持语音触发画图); + 添加 `"group_speech_recognition": true` 将开启群组语音识别,默认使用 openai 的 whisper 模型识别为文字,同时以文字回复,参数仅支持群聊 (会匹配 group_chat_prefix 和 group_chat_keyword, 支持语音触发画图); + 添加 `"voice_reply_voice": true` 将开启语音回复语音(同时作用于私聊和群聊) ++ 使用 MiniMax TTS:设置 `"text_to_voice": "minimax"`,并配置 `minimax_api_key`;可通过 `"tts_voice_id"` 指定发音人(如 `English_Graceful_Lady`),`"text_to_voice_model"` 指定模型(如 `speech-2.8-hd`、`speech-2.8-turbo`)
@@ -357,7 +358,7 @@ sudo docker logs -f chatgpt-on-wechat "minimax_api_key": "" } ``` - - `model`: 可填写 `MiniMax-M2.7、MiniMax-M2.5、MiniMax-M2.1、MiniMax-M2.1-lightning、MiniMax-M2、abab6.5-chat` 等 + - `model`: 可填写 `MiniMax-M2.7、MiniMax-M2.7-highspeed、MiniMax-M2.5、MiniMax-M2.1、MiniMax-M2.1-lightning、MiniMax-M2、abab6.5-chat` 等 - `minimax_api_key`:MiniMax 平台的 API-KEY,在 [控制台](https://platform.minimaxi.com/user-center/basic-information/interface-key) 创建 方式二:OpenAI 兼容方式接入,配置如下: @@ -370,7 +371,7 @@ sudo docker logs -f chatgpt-on-wechat } ``` - `bot_type`: OpenAI 兼容方式 -- `model`: 可填 `MiniMax-M2.7、MiniMax-M2.5、MiniMax-M2.1、MiniMax-M2.1-lightning、MiniMax-M2`,参考[API文档](https://platform.minimaxi.com/document/%E5%AF%B9%E8%AF%9D?key=66701d281d57f38758d581d0#QklxsNSbaf6kM4j6wjO5eEek) +- `model`: 可填 `MiniMax-M2.7、MiniMax-M2.7-highspeed、MiniMax-M2.5、MiniMax-M2.1、MiniMax-M2.1-lightning、MiniMax-M2`,参考[API文档](https://platform.minimaxi.com/document/%E5%AF%B9%E8%AF%9D?key=66701d281d57f38758d581d0#QklxsNSbaf6kM4j6wjO5eEek) - `open_ai_api_base`: MiniMax 平台 API 的 BASE URL - `open_ai_api_key`: MiniMax 平台的 API-KEY
diff --git a/common/const.py b/common/const.py index f7e67e52..ecaf5b0f 100644 --- a/common/const.py +++ b/common/const.py @@ -93,6 +93,7 @@ QWQ_PLUS = "qwq-plus" # MiniMax MINIMAX_M2_7 = "MiniMax-M2.7" # MiniMax M2.7 - Latest +MINIMAX_M2_7_HIGHSPEED = "MiniMax-M2.7-highspeed" # MiniMax M2.7 highspeed MINIMAX_M2_5 = "MiniMax-M2.5" # MiniMax M2.5 MINIMAX_M2_1 = "MiniMax-M2.1" # MiniMax M2.1 MINIMAX_M2_1_LIGHTNING = "MiniMax-M2.1-lightning" # MiniMax M2.1 极速版 @@ -175,7 +176,7 @@ MODEL_LIST = [ QWEN36_PLUS, QWEN35_PLUS, QWEN3_MAX, QWEN_MAX, QWEN_PLUS, QWEN_TURBO, QWEN_LONG, # MiniMax - MiniMax, MINIMAX_M2_7, MINIMAX_M2_5, MINIMAX_M2_1, MINIMAX_M2_1_LIGHTNING, MINIMAX_M2, MINIMAX_ABAB6_5, + MiniMax, MINIMAX_M2_7, MINIMAX_M2_7_HIGHSPEED, MINIMAX_M2_5, MINIMAX_M2_1, MINIMAX_M2_1_LIGHTNING, MINIMAX_M2, MINIMAX_ABAB6_5, # GLM ZHIPU_AI, GLM_5_TURBO, GLM_5, GLM_4, GLM_4_PLUS, GLM_4_flash, GLM_4_LONG, GLM_4_ALLTOOLS, diff --git a/models/minimax/minimax_bot.py b/models/minimax/minimax_bot.py index af80e795..0fd45e66 100644 --- a/models/minimax/minimax_bot.py +++ b/models/minimax/minimax_bot.py @@ -20,7 +20,7 @@ class MinimaxBot(Bot): def __init__(self): super().__init__() self.args = { - "model": conf().get("model") or "MiniMax-M2.1", + "model": conf().get("model") or "MiniMax-M2.7", "temperature": conf().get("temperature", 0.3), "top_p": conf().get("top_p", 0.95), } diff --git a/tests/test_minimax_provider.py b/tests/test_minimax_provider.py new file mode 100644 index 00000000..cfad7fd7 --- /dev/null +++ b/tests/test_minimax_provider.py @@ -0,0 +1,184 @@ +# encoding:utf-8 +""" +Unit tests for MiniMax provider additions: + - MiniMax-M2.7-highspeed constant in const.py + - Default model update in MinimaxBot + - MinimaxVoice TTS provider +""" +import sys +import os +import json +import unittest +from unittest.mock import MagicMock, patch, PropertyMock + +# Add project root to path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + + +class 
TestMinimaxConst(unittest.TestCase): + """Test that MiniMax-M2.7-highspeed is properly registered in const.py.""" + + def test_m2_7_highspeed_constant_defined(self): + from common import const + self.assertTrue(hasattr(const, "MINIMAX_M2_7_HIGHSPEED")) + self.assertEqual(const.MINIMAX_M2_7_HIGHSPEED, "MiniMax-M2.7-highspeed") + + def test_m2_7_constant_defined(self): + from common import const + self.assertEqual(const.MINIMAX_M2_7, "MiniMax-M2.7") + + def test_m2_7_highspeed_in_model_list(self): + from common import const + self.assertIn("MiniMax-M2.7-highspeed", const.MODEL_LIST) + + def test_m2_7_in_model_list(self): + from common import const + self.assertIn("MiniMax-M2.7", const.MODEL_LIST) + + def test_minimax_provider_key_defined(self): + from common import const + self.assertEqual(const.MiniMax, "minimax") + + +class TestMinimaxBotDefaultModel(unittest.TestCase): + """Test that MinimaxBot defaults to MiniMax-M2.7.""" + + def test_default_model_is_m2_7(self): + # Patch conf() to return empty config + mock_conf = MagicMock() + mock_conf.get = MagicMock(side_effect=lambda key, default=None: default) + + with patch("models.minimax.minimax_bot.conf", return_value=mock_conf): + with patch("models.minimax.minimax_bot.SessionManager"): + from models.minimax import minimax_bot + # Reload to pick up patches + import importlib + importlib.reload(minimax_bot) + with patch("models.minimax.minimax_bot.conf", return_value=mock_conf): + bot = minimax_bot.MinimaxBot.__new__(minimax_bot.MinimaxBot) + bot.args = { + "model": mock_conf.get("model") or "MiniMax-M2.7", + } + self.assertEqual(bot.args["model"], "MiniMax-M2.7") + + def test_default_model_string(self): + """Verify the fallback string literal in minimax_bot.py is MiniMax-M2.7.""" + import ast + bot_path = os.path.join(os.path.dirname(__file__), "..", "models", "minimax", "minimax_bot.py") + with open(bot_path) as f: + source = f.read() + # Verify MiniMax-M2.7 is in the source (not M2.1) + 
self.assertIn("MiniMax-M2.7", source) + self.assertNotIn('"MiniMax-M2.1"', source) + + +class TestMinimaxVoice(unittest.TestCase): + """Test MinimaxVoice TTS provider.""" + + def _make_voice(self, api_key="test-key", api_base="https://api.minimax.io/v1"): + mock_conf = MagicMock() + def conf_get(key, default=None): + return { + "minimax_api_key": api_key, + "minimax_api_base": api_base, + }.get(key, default) + mock_conf.get = conf_get + with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf): + from voice.minimax.minimax_voice import MinimaxVoice + return MinimaxVoice() + + def test_instantiation(self): + voice = self._make_voice() + self.assertIsNotNone(voice) + + def test_api_base_strips_v1_suffix(self): + voice = self._make_voice(api_base="https://api.minimax.io/v1") + self.assertEqual(voice.api_base, "https://api.minimax.io") + + def test_api_base_no_trailing_slash(self): + voice = self._make_voice(api_base="https://api.minimax.io") + self.assertEqual(voice.api_base, "https://api.minimax.io") + + def test_voice_to_text_not_supported(self): + voice = self._make_voice() + with self.assertRaises(NotImplementedError): + voice.voiceToText("dummy.wav") + + def test_text_to_voice_success(self): + """Test textToVoice with mocked SSE stream response.""" + import os + os.makedirs("tmp", exist_ok=True) + + # Build fake SSE stream bytes + audio_hex = bytes([0x49, 0x44, 0x33]).hex() # "ID3" MP3 magic bytes + sse_line = f'data: {{"data": {{"audio": "{audio_hex}", "status": 2}}}}\n\n' + done_line = "data: [DONE]\n\n" + fake_body = (sse_line + done_line).encode("utf-8") + + mock_response = MagicMock() + mock_response.raise_for_status = MagicMock() + mock_response.iter_lines.return_value = [ + line.encode("utf-8") for line in (sse_line + done_line).splitlines() if line + ] + + mock_conf = MagicMock() + def conf_get(key, default=None): + return { + "minimax_api_key": "test-key", + "minimax_api_base": "https://api.minimax.io", + }.get(key, default) + mock_conf.get 
= conf_get + + with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf): + with patch("voice.minimax.minimax_voice.requests.post", return_value=mock_response): + from voice.minimax import minimax_voice + import importlib + importlib.reload(minimax_voice) + with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf): + voice = minimax_voice.MinimaxVoice() + from bridge.reply import ReplyType + reply = voice.textToVoice("Hello, world!") + self.assertEqual(reply.type, ReplyType.VOICE) + self.assertTrue(reply.content.endswith(".mp3")) + + def test_text_to_voice_no_audio_returns_error(self): + """Test that empty SSE stream returns an ERROR reply.""" + mock_response = MagicMock() + mock_response.raise_for_status = MagicMock() + mock_response.iter_lines.return_value = [] + + mock_conf = MagicMock() + def conf_get(key, default=None): + return { + "minimax_api_key": "test-key", + "minimax_api_base": "https://api.minimax.io", + }.get(key, default) + mock_conf.get = conf_get + + with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf): + with patch("voice.minimax.minimax_voice.requests.post", return_value=mock_response): + from voice.minimax import minimax_voice + import importlib + importlib.reload(minimax_voice) + with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf): + voice = minimax_voice.MinimaxVoice() + from bridge.reply import ReplyType + reply = voice.textToVoice("Hello") + self.assertEqual(reply.type, ReplyType.ERROR) + + +class TestVoiceFactory(unittest.TestCase): + """Test that minimax is registered in the voice factory.""" + + def test_minimax_voice_factory(self): + mock_conf = MagicMock() + mock_conf.get = MagicMock(return_value=None) + with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf): + from voice.factory import create_voice + voice = create_voice("minimax") + from voice.minimax.minimax_voice import MinimaxVoice + self.assertIsInstance(voice, MinimaxVoice) + + +if __name__ == 
"__main__": + unittest.main() diff --git a/voice/factory.py b/voice/factory.py index 8562f634..abe7ba57 100644 --- a/voice/factory.py +++ b/voice/factory.py @@ -54,4 +54,8 @@ def create_voice(voice_type): from voice.tencent.tencent_voice import TencentVoice return TencentVoice() + elif voice_type == "minimax": + from voice.minimax.minimax_voice import MinimaxVoice + + return MinimaxVoice() raise RuntimeError diff --git a/voice/minimax/__init__.py b/voice/minimax/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/voice/minimax/minimax_voice.py b/voice/minimax/minimax_voice.py new file mode 100644 index 00000000..1446a3f1 --- /dev/null +++ b/voice/minimax/minimax_voice.py @@ -0,0 +1,106 @@ +# encoding:utf-8 +""" +MiniMax TTS voice service +""" +import datetime +import random +import requests + +from bridge.reply import Reply, ReplyType +from common.log import logger +from config import conf +from voice.voice import Voice + + +MINIMAX_TTS_VOICES = [ + "English_Graceful_Lady", + "English_Insightful_Speaker", + "English_radiant_girl", + "English_Persuasive_Man", + "English_Lucky_Robot", + "English_expressive_narrator", + "Chinese_Warm_Woman", + "Chinese_Gentle_Man", +] + + +class MinimaxVoice(Voice): + def __init__(self): + self.api_key = conf().get("minimax_api_key") + self.api_base = conf().get("minimax_api_base") or "https://api.minimax.io" + # Strip trailing /v1 if present so we can always append /v1/t2a_v2 + self.api_base = self.api_base.rstrip("/") + if self.api_base.endswith("/v1"): + self.api_base = self.api_base[:-3] + + def voiceToText(self, voice_file): + """MiniMax does not provide an ASR endpoint; raise NotImplementedError.""" + raise NotImplementedError("MiniMax voice-to-text is not supported") + + def textToVoice(self, text): + try: + model = conf().get("text_to_voice_model") or "speech-2.8-hd" + voice_id = conf().get("tts_voice_id") or "English_Graceful_Lady" + + url = f"{self.api_base}/v1/t2a_v2" + headers = { + "Content-Type": 
"application/json", + "Authorization": f"Bearer {self.api_key}", + } + payload = { + "model": model, + "text": text, + "stream": True, + "voice_setting": { + "voice_id": voice_id, + "speed": 1, + "vol": 1, + "pitch": 0, + }, + "audio_setting": { + "sample_rate": 32000, + "bitrate": 128000, + "format": "mp3", + "channel": 1, + }, + } + + response = requests.post(url, headers=headers, json=payload, stream=True, timeout=60) + response.raise_for_status() + + # Parse SSE stream and collect hex-encoded audio chunks + audio_chunks = [] + buffer = "" + for raw in response.iter_lines(): + if not raw: + continue + line = raw.decode("utf-8") if isinstance(raw, bytes) else raw + if not line.startswith("data:"): + continue + json_str = line[5:].strip() + if not json_str or json_str == "[DONE]": + continue + try: + import json + event_data = json.loads(json_str) + audio_hex = event_data.get("data", {}).get("audio") + if audio_hex: + audio_chunks.append(bytes.fromhex(audio_hex)) + except Exception: + continue + + if not audio_chunks: + logger.error("[MINIMAX] TTS returned no audio data") + return Reply(ReplyType.ERROR, "语音合成失败,未获取到音频数据") + + audio_data = b"".join(audio_chunks) + file_name = "tmp/" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + str(random.randint(0, 1000)) + ".mp3" + with open(file_name, "wb") as f: + f.write(audio_data) + + logger.info(f"[MINIMAX] textToVoice success, file={file_name}") + return Reply(ReplyType.VOICE, file_name) + + except Exception as e: + logger.error(f"[MINIMAX] textToVoice error: {e}") + return Reply(ReplyType.ERROR, "遇到了一点小问题,请稍后再试") From 26693acc3f8d299983bfbb63694ca3fe4b8ba323 Mon Sep 17 00:00:00 2001 From: zhayujie Date: Sat, 11 Apr 2026 19:46:11 +0800 Subject: [PATCH 4/4] feat(vision): prioritize main model for image recognition with multi-provider fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add call_vision method to all bot implementations (DashScope, Claude, Gemini, 
ZhipuAI, MiniMax, Doubao, Moonshot, OpenAICompatibleBot) using each vendor's native multimodal API format - Remove call_with_tools/call_vision from Bot base class to fix MRO shadowing issue with OpenAICompatibleBot mixin - Refactor vision tool provider resolution: MainModel → other configured models (auto-discovered) → OpenAI → LinkAI, with automatic fallback - Return actual model name used in call_vision responses - Sync config.json API keys to .env bidirectionally on startup - Fix bot instance cache to detect bot_type/use_linkai config changes - Add SSE reconnection support for web console - Preserve image path hints in Gemini text for correct vision tool calls - Update docs/tools/vision.mdx --- agent/tools/vision/vision.py | 258 +++++++++++++++++----- bridge/agent_bridge.py | 84 ++++---- bridge/agent_initializer.py | 58 ++--- channel/web/static/js/console.js | 336 ++++++++++++++++------------- channel/web/web_channel.py | 15 +- docs/en/tools/vision.mdx | 72 +++++++ docs/ja/tools/vision.mdx | 72 +++++++ docs/tools/vision.mdx | 52 ++++- models/bot.py | 17 +- models/claudeapi/claude_api_bot.py | 76 +++++++ models/dashscope/dashscope_bot.py | 52 +++++ models/doubao/doubao_bot.py | 94 ++++++-- models/gemini/google_gemini_bot.py | 60 +++++- models/minimax/minimax_bot.py | 107 ++++++--- models/moonshot/moonshot_bot.py | 94 ++++++-- models/openai_compatible_bot.py | 50 +++++ models/zhipuai/zhipuai_bot.py | 35 +++ 17 files changed, 1173 insertions(+), 359 deletions(-) create mode 100644 docs/en/tools/vision.mdx create mode 100644 docs/ja/tools/vision.mdx diff --git a/agent/tools/vision/vision.py b/agent/tools/vision/vision.py index 3f8ad308..8a2756c2 100644 --- a/agent/tools/vision/vision.py +++ b/agent/tools/vision/vision.py @@ -1,7 +1,13 @@ """ -Vision tool - Analyze images using OpenAI-compatible Vision API. +Vision tool - Analyze images using Vision API. Supports local files (auto base64-encoded) and HTTP URLs. 
-Providers are tried in priority order with automatic fallback on failure. + +Provider priority (default): + 1. Main model via bot.call_vision — zero extra cost + 2. Other models whose API key is configured — auto-discovered + 3. OpenAI / LinkAI raw HTTP — reliable fallback + When use_linkai=true, LinkAI is promoted to #1. + When tool.vision.model is set, the MainModel provider uses that model instead of the main model name. """ import base64 @@ -14,10 +20,11 @@ from typing import Any, Dict, List, Optional import requests from agent.tools.base_tool import BaseTool, ToolResult +from common import const from common.log import logger from config import conf -DEFAULT_MODEL = "gpt-4.1-mini" +DEFAULT_MODEL = const.GPT_41_MINI DEFAULT_TIMEOUT = 60 MAX_TOKENS = 1000 COMPRESS_THRESHOLD = 1_048_576 # 1 MB @@ -30,8 +37,20 @@ SUPPORTED_EXTENSIONS = { "webp": "image/webp", } +_MAIN_MODEL_PROVIDER_NAME = "MainModel" -OPENAI_COMPATIBLE_BOT_TYPES = {"openai", "openAI", "chatGPT"} +# (config_key_for_api_key, bot_type, default_vision_model, provider_display_name) +# Auto-discovered as fallback vision providers when their API key is configured. +# OpenAI and LinkAI are handled separately (raw HTTP providers), so not listed here. 
+_DISCOVERABLE_MODELS = [ + ("moonshot_api_key", const.MOONSHOT, const.KIMI_K2_5, "Moonshot"), + ("ark_api_key", const.DOUBAO, const.DOUBAO_SEED_2_PRO, "Doubao"), + ("dashscope_api_key", const.QWEN_DASHSCOPE, const.QWEN36_PLUS, "DashScope"), + ("claude_api_key", const.CLAUDEAPI, const.CLAUDE_4_6_SONNET, "Claude"), + ("gemini_api_key", const.GEMINI, const.GEMINI_31_FLASH_LITE_PRE, "Gemini"), + ("zhipu_ai_api_key", const.ZHIPU_AI, const.GLM_4_7, "ZhipuAI"), + ("minimax_api_key", const.MiniMax, const.MINIMAX_M2_7, "MiniMax"), +] @dataclass @@ -42,6 +61,8 @@ class VisionProvider: api_base: str extra_headers: dict = field(default_factory=dict) model_override: Optional[str] = None + use_bot: bool = False # When True, call via bot.call_vision instead of raw HTTP + fallback_bot: Any = None # Bot instance for non-main-model providers class VisionAPIError(Exception): @@ -50,13 +71,12 @@ class VisionAPIError(Exception): class Vision(BaseTool): - """Analyze images using OpenAI-compatible Vision API""" + """Analyze images using Vision API""" name: str = "vision" description: str = ( "Analyze a local image or image URL (jpg/jpeg/png) using Vision API. " "Can describe content, extract text, identify objects, colors, etc. " - "Requires OPENAI_API_KEY or LINKAI_API_KEY." ) params: dict = { @@ -70,13 +90,6 @@ class Vision(BaseTool): "type": "string", "description": "Question to ask about the image", }, - "model": { - "type": "string", - "description": ( - f"Vision model to use (default: {DEFAULT_MODEL}). 
" - "Options: gpt-4.1-mini, gpt-4.1, gpt-4o-mini, gpt-4o" - ), - }, }, "required": ["image", "question"], } @@ -86,15 +99,11 @@ class Vision(BaseTool): @staticmethod def is_available() -> bool: - return bool( - conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY") - or conf().get("linkai_api_key") or os.environ.get("LINKAI_API_KEY") - ) + return True def execute(self, args: Dict[str, Any]) -> ToolResult: image = args.get("image", "").strip() question = args.get("question", "").strip() - model = args.get("model", DEFAULT_MODEL).strip() or DEFAULT_MODEL if not image: return ToolResult.fail("Error: 'image' parameter is required") @@ -104,11 +113,12 @@ class Vision(BaseTool): providers = self._resolve_providers() if not providers: return ToolResult.fail( - "Error: No API key configured for Vision.\n" - "Please configure one of the following using env_config tool:\n" - " 1. OPENAI_API_KEY (preferred): env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n" - " 2. LINKAI_API_KEY (fallback): env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")\n\n" - "Get your key at: https://platform.openai.com/api-keys or https://link-ai.tech" + "Error: No model available for Vision.\n" + "The main model does not support vision and no other API keys are configured.\n" + "Options:\n" + " 1. Switch to a multimodal model (e.g. qwen3.6-plus, claude-sonnet-4-6, gemini-2.0-flash)\n" + " 2. Configure OPENAI_API_KEY: env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n" + " 3. 
Configure LINKAI_API_KEY: env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")" ) try: @@ -116,7 +126,7 @@ class Vision(BaseTool): except Exception as e: return ToolResult.fail(f"Error: {e}") - return self._call_with_fallback(providers, model, question, image_content) + return self._call_with_fallback(providers, DEFAULT_MODEL, question, image_content) def _call_with_fallback(self, providers: List[VisionProvider], model: str, question: str, image_content: dict) -> ToolResult: @@ -125,9 +135,14 @@ class Vision(BaseTool): for i, provider in enumerate(providers): use_model = provider.model_override or model try: - logger.debug(f"[Vision] Trying provider '{provider.name}' " - f"with model '{use_model}' ({i + 1}/{len(providers)})") - return self._call_api(provider, use_model, question, image_content) + logger.info(f"[Vision] Trying provider '{provider.name}' " + f"with model '{use_model}' ({i + 1}/{len(providers)})") + if provider.use_bot: + result = self._call_via_bot(use_model, question, image_content, provider) + else: + result = self._call_api(provider, use_model, question, image_content) + logger.info(f"[Vision] ✅ Success via {provider.name} (model={use_model})") + return result except VisionAPIError as e: errors.append(f"[{provider.name}/{use_model}] {e}") logger.warning(f"[Vision] Provider '{provider.name}' failed: {e}") @@ -148,35 +163,113 @@ class Vision(BaseTool): def _resolve_providers(self) -> List[VisionProvider]: """ Build an ordered list of available providers. - Each provider builder returns a VisionProvider or None. - To add a new provider, append a builder method to _PROVIDER_BUILDERS. + + Priority: + - use_linkai=true → [LinkAI, MainModel, OtherModels…, OpenAI] + - default → [MainModel, OtherModels…, OpenAI, LinkAI] + + "OtherModels" are auto-discovered from configured API keys. + The main model's bot_type is excluded from OtherModels to avoid + duplicating the MainModel provider. 
""" + use_linkai = conf().get("use_linkai", False) and conf().get("linkai_api_key") providers: List[VisionProvider] = [] - for builder in self._PROVIDER_BUILDERS: - provider = builder(self) - if provider: - providers.append(provider) + + if use_linkai: + self._append_provider(providers, self._build_linkai_provider) + self._append_provider(providers, self._build_main_model_provider) + self._append_other_model_providers(providers) + self._append_provider(providers, self._build_openai_provider) + else: + self._append_provider(providers, self._build_main_model_provider) + self._append_other_model_providers(providers) + self._append_provider(providers, self._build_openai_provider) + self._append_provider(providers, self._build_linkai_provider) + return providers - def _build_custom_model_provider(self) -> Optional[VisionProvider]: + @staticmethod + def _append_provider(providers: List[VisionProvider], builder) -> None: + p = builder() + if p: + providers.append(p) + + def _append_other_model_providers(self, providers: List[VisionProvider]) -> None: """ - When bot_type is openai-compatible and a custom model is configured, - try the user's own model first — it may already support multimodal input. + Auto-discover other models whose API key is configured. + Skip the main model's own bot_type (already covered by MainModel provider). + Skip bot_types that already have a provider in the list (e.g. OpenAI). 
""" - bot_type = conf().get("bot_type", "") - if bot_type not in OPENAI_COMPATIBLE_BOT_TYPES: + # Determine main model's bot_type so we can skip it + main_bot_type = None + if self.model and hasattr(self.model, '_resolve_bot_type'): + main_bot_type = self.model._resolve_bot_type(conf().get("model", "")) + + existing_names = {p.name for p in providers} + + for config_key, bot_type, default_model, display_name in _DISCOVERABLE_MODELS: + if display_name in existing_names: + continue + if bot_type == main_bot_type: + continue + api_key = conf().get(config_key, "") + if not api_key or not api_key.strip(): + continue + + # Create a bot instance and check if it supports call_vision + try: + from models.bot_factory import create_bot + bot = create_bot(bot_type) + if not hasattr(bot, 'call_vision'): + continue + except Exception: + continue + + providers.append(VisionProvider( + name=display_name, + api_key="", + api_base="", + model_override=default_model, + use_bot=True, + fallback_bot=bot, + )) + + def _resolve_vision_model(self) -> Optional[str]: + """ + Determine which model to use for vision. + + 1. User explicit config: tool.vision.model in config.json + 2. Fallback to the main configured model name + """ + tool_conf = conf().get("tool", {}) + user_vision_model = tool_conf.get("vision", {}).get("model") if isinstance(tool_conf, dict) else None + if user_vision_model: + return user_vision_model + model_name = conf().get("model", "") + return model_name or None + + def _build_main_model_provider(self) -> Optional[VisionProvider]: + """ + Use the vendor's own model for vision via bot.call_vision. + Only available when the bot class has call_vision. 
+ """ + if not (self.model and hasattr(self.model, 'bot')): return None - custom_model = conf().get("model", "") - if not custom_model or custom_model == DEFAULT_MODEL: + try: + bot = self.model.bot + if not hasattr(bot, 'call_vision'): + return None + except Exception: return None - api_key = conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY") - if not api_key: - return None - api_base = (conf().get("open_ai_api_base") or os.environ.get("OPENAI_API_BASE", "")).rstrip("/") \ - or "https://api.openai.com/v1" + + vision_model = self._resolve_vision_model() + return VisionProvider( - name="CustomModel", api_key=api_key, api_base=self._ensure_v1(api_base), - model_override=custom_model, + name=_MAIN_MODEL_PROVIDER_NAME, + api_key="", + api_base="", + model_override=vision_model, + use_bot=True, ) def _build_openai_provider(self) -> Optional[VisionProvider]: @@ -200,7 +293,54 @@ class Vision(BaseTool): return VisionProvider(name="LinkAI", api_key=api_key, api_base=self._ensure_v1(api_base), extra_headers=extra) - _PROVIDER_BUILDERS = [_build_custom_model_provider, _build_openai_provider, _build_linkai_provider] + def _call_via_bot(self, model: str, question: str, image_content: dict, + provider: Optional[VisionProvider] = None) -> ToolResult: + """ + Call a model's call_vision with vendor-native API format. + Uses the provider's fallback_bot if set, otherwise the main model bot. + Raises VisionAPIError on failure so fallback can proceed. 
+ """ + try: + bot = (provider and provider.fallback_bot) or self.model.bot + except Exception as e: + raise VisionAPIError(f"Cannot access bot: {e}") + + # Extract the raw image URL from the OpenAI-format image_content block + image_url = image_content.get("image_url", {}).get("url", "") + if not image_url: + raise VisionAPIError("No image URL in content block") + + try: + response = bot.call_vision( + image_url=image_url, + question=question, + model=model, + max_tokens=MAX_TOKENS, + ) + except Exception as e: + raise VisionAPIError(f"call_vision failed: {e}") + + if response is NotImplemented: + raise VisionAPIError("Bot does not support vision") + + if isinstance(response, dict) and response.get("error"): + raise VisionAPIError(f"API error - {response.get('message', 'Unknown')}") + + content = response.get("content", "") if isinstance(response, dict) else "" + if not content: + raise VisionAPIError("Empty response from main model") + + usage_info = response.get("usage", {}) if isinstance(response, dict) else {} + + # Use the actual model name from the bot response if available + actual_model = response.get("model", model) if isinstance(response, dict) else model + provider_name = provider.name if provider else _MAIN_MODEL_PROVIDER_NAME + return ToolResult.success({ + "model": actual_model, + "provider": provider_name, + "content": content, + "usage": usage_info, + }) @staticmethod def _ensure_v1(api_base: str) -> str: @@ -213,9 +353,13 @@ class Vision(BaseTool): return api_base.rstrip("/") + "/v1" def _build_image_content(self, image: str) -> dict: - """Build the image_url content block for the API request.""" + """ + Build the image_url content block. + Both remote URLs and local files are converted to base64 data URLs + so every bot backend can consume them without extra downloads. 
+ """ if image.startswith(("http://", "https://")): - return {"type": "image_url", "image_url": {"url": image}} + return self._download_to_data_url(image) if not os.path.isfile(image): raise FileNotFoundError(f"Image file not found: {image}") @@ -239,6 +383,19 @@ class Vision(BaseTool): data_url = f"data:{mime_type};base64,{b64}" return {"type": "image_url", "image_url": {"url": data_url}} + @staticmethod + def _download_to_data_url(url: str) -> dict: + """Download a remote image and return it as a base64 data URL.""" + resp = requests.get(url, timeout=30) + if resp.status_code != 200: + raise VisionAPIError(f"Failed to download image: HTTP {resp.status_code}") + content_type = resp.headers.get("Content-Type", "image/jpeg").split(";")[0].strip() + if not content_type.startswith("image/"): + content_type = "image/jpeg" + b64 = base64.b64encode(resp.content).decode("ascii") + data_url = f"data:{content_type};base64,{b64}" + return {"type": "image_url", "image_url": {"url": data_url}} + @staticmethod def _maybe_compress(path: str) -> str: """Compress image to under COMPRESS_THRESHOLD with max long-edge 1536px.""" @@ -312,7 +469,6 @@ class Vision(BaseTool): ], } ], - "max_completion_tokens": MAX_TOKENS, } headers = { diff --git a/bridge/agent_bridge.py b/bridge/agent_bridge.py index 84b7aad6..cc54f52e 100644 --- a/bridge/agent_bridge.py +++ b/bridge/agent_bridge.py @@ -124,14 +124,15 @@ class AgentLLMModel(LLMModel): @property def bot(self): - """Lazy load the bot, re-create when model changes""" + """Lazy load the bot, re-create when model or bot_type changes""" from models.bot_factory import create_bot cur_model = self.model - if self._bot is None or self._bot_model != cur_model: - bot_type = self._resolve_bot_type(cur_model) - self._bot = create_bot(bot_type) + cur_bot_type = self._resolve_bot_type(cur_model) + if self._bot is None or self._bot_model != cur_model or getattr(self, '_bot_type', None) != cur_bot_type: + self._bot = create_bot(cur_bot_type) self._bot = 
add_openai_compatible_support(self._bot) self._bot_model = cur_model + self._bot_type = cur_bot_type return self._bot def call(self, request: LLMRequest): @@ -505,15 +506,15 @@ class AgentBridge: def _migrate_config_to_env(self, workspace_root: str): """ - Migrate API keys from config.json to .env file if not already set - + Sync API keys from config.json to .env file. + Adds new keys and updates changed values on each startup. + Args: workspace_root: Workspace directory path (not used, kept for compatibility) """ from config import conf import os - # Mapping from config.json keys to environment variable names key_mapping = { "open_ai_api_key": "OPENAI_API_KEY", "open_ai_api_base": "OPENAI_API_BASE", @@ -522,10 +523,9 @@ class AgentBridge: "linkai_api_key": "LINKAI_API_KEY", } - # Use fixed secure location for .env file env_file = expand_path("~/.cow/.env") - # Read existing env vars from .env file + # Read existing env vars (key -> value) existing_env_vars = {} if os.path.exists(env_file): try: @@ -533,48 +533,46 @@ class AgentBridge: for line in f: line = line.strip() if line and not line.startswith('#') and '=' in line: - key, _ = line.split('=', 1) - existing_env_vars[key.strip()] = True + key, val = line.split('=', 1) + existing_env_vars[key.strip()] = val.strip() except Exception as e: logger.warning(f"[AgentBridge] Failed to read .env file: {e}") - # Check which keys need to be migrated - keys_to_migrate = {} + # Sync config.json values into .env (add/update/remove) + updated = False for config_key, env_key in key_mapping.items(): - # Skip if already in .env file - if env_key in existing_env_vars: - continue - - # Get value from config.json - value = conf().get(config_key, "") - if value and value.strip(): # Only migrate non-empty values - keys_to_migrate[env_key] = value.strip() - - # Log summary if there are keys to skip - if existing_env_vars: - logger.debug(f"[AgentBridge] {len(existing_env_vars)} env vars already in .env") - - # Write new keys to .env 
file - if keys_to_migrate: + raw = conf().get(config_key, "") + value = raw.strip() if raw else "" + old_value = existing_env_vars.get(env_key) + + if value: + if old_value == value: + continue + existing_env_vars[env_key] = value + os.environ[env_key] = value + updated = True + else: + if old_value is None: + continue + existing_env_vars.pop(env_key, None) + os.environ.pop(env_key, None) + updated = True + updated = True + + if updated: try: - # Ensure ~/.cow directory and .env file exist env_dir = os.path.dirname(env_file) - if not os.path.exists(env_dir): - os.makedirs(env_dir, exist_ok=True) - if not os.path.exists(env_file): - open(env_file, 'a').close() - - # Append new keys - with open(env_file, 'a', encoding='utf-8') as f: - f.write('\n# Auto-migrated from config.json\n') - for key, value in keys_to_migrate.items(): + os.makedirs(env_dir, exist_ok=True) + + with open(env_file, 'w', encoding='utf-8') as f: + f.write('# Environment variables for agent\n') + f.write('# Auto-managed - synced from config.json on startup\n\n') + for key, value in sorted(existing_env_vars.items()): f.write(f'{key}={value}\n') - # Also set in current process - os.environ[key] = value - - logger.info(f"[AgentBridge] Migrated {len(keys_to_migrate)} API keys from config.json to .env: {list(keys_to_migrate.keys())}") + + logger.info(f"[AgentBridge] Synced API keys from config.json to .env") except Exception as e: - logger.warning(f"[AgentBridge] Failed to migrate API keys: {e}") + logger.warning(f"[AgentBridge] Failed to sync API keys: {e}") def _persist_messages( self, session_id: str, new_messages: list, channel_type: str = "" diff --git a/bridge/agent_initializer.py b/bridge/agent_initializer.py index 58bbbfb3..5e0fe01b 100644 --- a/bridge/agent_initializer.py +++ b/bridge/agent_initializer.py @@ -490,7 +490,7 @@ class AgentInitializer: env_file = expand_path("~/.cow/.env") - # Read existing env vars + # Read existing env vars (key -> value) existing_env_vars = {} if 
os.path.exists(env_file): try: @@ -498,38 +498,46 @@ class AgentInitializer: for line in f: line = line.strip() if line and not line.startswith('#') and '=' in line: - key, _ = line.split('=', 1) - existing_env_vars[key.strip()] = True + key, val = line.split('=', 1) + existing_env_vars[key.strip()] = val.strip() except Exception as e: logger.warning(f"[AgentInitializer] Failed to read .env file: {e}") - # Check which keys need migration - keys_to_migrate = {} + # Sync config.json values into .env (add/update/remove) + updated = False for config_key, env_key in key_mapping.items(): - if env_key in existing_env_vars: - continue - value = conf().get(config_key, "") - if value and value.strip(): - keys_to_migrate[env_key] = value.strip() - - # Write new keys - if keys_to_migrate: + raw = conf().get(config_key, "") + value = raw.strip() if raw else "" + old_value = existing_env_vars.get(env_key) + + if value: + if old_value == value: + continue + existing_env_vars[env_key] = value + os.environ[env_key] = value + updated = True + else: + if old_value is None: + continue + existing_env_vars.pop(env_key, None) + os.environ.pop(env_key, None) + updated = True + + if updated: try: env_dir = os.path.dirname(env_file) - if not os.path.exists(env_dir): - os.makedirs(env_dir, exist_ok=True) - if not os.path.exists(env_file): - open(env_file, 'a').close() - - with open(env_file, 'a', encoding='utf-8') as f: - f.write('\n# Auto-migrated from config.json\n') - for key, value in keys_to_migrate.items(): + os.makedirs(env_dir, exist_ok=True) + + # Rewrite the entire .env file to ensure consistency + with open(env_file, 'w', encoding='utf-8') as f: + f.write('# Environment variables for agent\n') + f.write('# Auto-managed - synced from config.json on startup\n\n') + for key, value in sorted(existing_env_vars.items()): f.write(f'{key}={value}\n') - os.environ[key] = value - - logger.info(f"[AgentInitializer] Migrated {len(keys_to_migrate)} API keys to .env: 
{list(keys_to_migrate.keys())}") + + logger.info(f"[AgentInitializer] Synced API keys from config.json to .env") except Exception as e: - logger.warning(f"[AgentInitializer] Failed to migrate API keys: {e}") + logger.warning(f"[AgentInitializer] Failed to sync API keys: {e}") def _start_daily_flush_timer(self): """Start a background thread that flushes all agents' memory daily at 23:55.""" diff --git a/channel/web/static/js/console.js b/channel/web/static/js/console.js index 24e120be..0f6c2a29 100644 --- a/channel/web/static/js/console.js +++ b/channel/web/static/js/console.js @@ -806,15 +806,17 @@ function sendMessage() { } function startSSE(requestId, loadingEl, timestamp) { - const es = new EventSource(`/stream?request_id=${encodeURIComponent(requestId)}`); - activeStreams[requestId] = es; - let botEl = null; let stepsEl = null; // .agent-steps (thinking summaries + tool indicators) let contentEl = null; // .answer-content (final streaming answer) let mediaEl = null; // .media-content (images & file attachments) let accumulatedText = ''; let currentToolEl = null; + let done = false; + + const MAX_RECONNECTS = 10; + const RECONNECT_BASE_MS = 1000; + let reconnectCount = 0; function ensureBotEl() { if (botEl) return; @@ -839,180 +841,204 @@ function startSSE(requestId, loadingEl, timestamp) { mediaEl = botEl.querySelector('.media-content'); } - es.onmessage = function(e) { - let item; - try { item = JSON.parse(e.data); } catch (_) { return; } + function connect() { + const es = new EventSource(`/stream?request_id=${encodeURIComponent(requestId)}`); + activeStreams[requestId] = es; - if (item.type === 'delta') { - ensureBotEl(); - accumulatedText += item.content; - contentEl.innerHTML = renderMarkdown(accumulatedText); - scrollChatToBottom(); + es.onmessage = function(e) { + let item; + try { item = JSON.parse(e.data); } catch (_) { return; } - } else if (item.type === 'tool_start') { - ensureBotEl(); + // Successful data received, reset reconnect counter + 
reconnectCount = 0; - // Save current thinking as a collapsible step - if (accumulatedText.trim()) { - const fullText = accumulatedText.trim(); - const oneLine = fullText.replace(/\n+/g, ' '); - const needsTruncate = oneLine.length > 80; - const stepEl = document.createElement('div'); - stepEl.className = 'agent-step agent-thinking-step' + (needsTruncate ? '' : ' no-expand'); - if (needsTruncate) { - const truncated = oneLine.substring(0, 80) + '…'; - stepEl.innerHTML = ` -
- - ${escapeHtml(truncated)} - -
-
${renderMarkdown(fullText)}
`; - } else { - stepEl.innerHTML = ` -
- - ${escapeHtml(oneLine)} -
`; + if (item.type === 'delta') { + ensureBotEl(); + accumulatedText += item.content; + contentEl.innerHTML = renderMarkdown(accumulatedText); + scrollChatToBottom(); + + } else if (item.type === 'tool_start') { + ensureBotEl(); + + // Save current thinking as a collapsible step + if (accumulatedText.trim()) { + const fullText = accumulatedText.trim(); + const oneLine = fullText.replace(/\n+/g, ' '); + const needsTruncate = oneLine.length > 80; + const stepEl = document.createElement('div'); + stepEl.className = 'agent-step agent-thinking-step' + (needsTruncate ? '' : ' no-expand'); + if (needsTruncate) { + const truncated = oneLine.substring(0, 80) + '…'; + stepEl.innerHTML = ` +
+ + ${escapeHtml(truncated)} + +
+
${renderMarkdown(fullText)}
`; + } else { + stepEl.innerHTML = ` +
+ + ${escapeHtml(oneLine)} +
`; + } + stepsEl.appendChild(stepEl); } - stepsEl.appendChild(stepEl); - } - accumulatedText = ''; - contentEl.innerHTML = ''; + accumulatedText = ''; + contentEl.innerHTML = ''; - // Add tool execution indicator (collapsible) - currentToolEl = document.createElement('div'); - currentToolEl.className = 'agent-step agent-tool-step'; - const argsStr = formatToolArgs(item.arguments || {}); - currentToolEl.innerHTML = ` -
- - ${item.tool} - -
-
-
-
Input
-
${argsStr}
+ // Add tool execution indicator (collapsible) + currentToolEl = document.createElement('div'); + currentToolEl.className = 'agent-step agent-tool-step'; + const argsStr = formatToolArgs(item.arguments || {}); + currentToolEl.innerHTML = ` +
+ + ${item.tool} +
-
-
`; - stepsEl.appendChild(currentToolEl); +
+
+
Input
+
${argsStr}
+
+
+
`; + stepsEl.appendChild(currentToolEl); - scrollChatToBottom(); + scrollChatToBottom(); - } else if (item.type === 'tool_end') { - if (currentToolEl) { - const isError = item.status !== 'success'; - const icon = currentToolEl.querySelector('.tool-icon'); - icon.className = isError - ? 'fas fa-times text-red-400 flex-shrink-0 tool-icon' - : 'fas fa-check text-primary-400 flex-shrink-0 tool-icon'; + } else if (item.type === 'tool_end') { + if (currentToolEl) { + const isError = item.status !== 'success'; + const icon = currentToolEl.querySelector('.tool-icon'); + icon.className = isError + ? 'fas fa-times text-red-400 flex-shrink-0 tool-icon' + : 'fas fa-check text-primary-400 flex-shrink-0 tool-icon'; - // Show execution time - const nameEl = currentToolEl.querySelector('.tool-name'); - if (item.execution_time !== undefined) { - nameEl.innerHTML += ` ${item.execution_time}s`; + // Show execution time + const nameEl = currentToolEl.querySelector('.tool-name'); + if (item.execution_time !== undefined) { + nameEl.innerHTML += ` ${item.execution_time}s`; + } + + // Fill output section + const outputSection = currentToolEl.querySelector('.tool-output-section'); + if (outputSection && item.result) { + outputSection.innerHTML = ` +
${isError ? 'Error' : 'Output'}
+
${escapeHtml(String(item.result))}
`; + } + + if (isError) currentToolEl.classList.add('tool-failed'); + currentToolEl = null; } - // Fill output section - const outputSection = currentToolEl.querySelector('.tool-output-section'); - if (outputSection && item.result) { - outputSection.innerHTML = ` -
${isError ? 'Error' : 'Output'}
-
${escapeHtml(String(item.result))}
`; - } + } else if (item.type === 'image') { + ensureBotEl(); + const imgEl = document.createElement('img'); + imgEl.src = item.content; + imgEl.alt = 'screenshot'; + imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);'; + imgEl.onclick = () => window.open(item.content, '_blank'); + mediaEl.appendChild(imgEl); + scrollChatToBottom(); - if (isError) currentToolEl.classList.add('tool-failed'); - currentToolEl = null; + } else if (item.type === 'text') { + // Intermediate text sent before media items; display it but keep SSE open. + ensureBotEl(); + contentEl.classList.remove('sse-streaming'); + const textContent = item.content || accumulatedText; + if (textContent) contentEl.innerHTML = renderMarkdown(textContent); + applyHighlighting(botEl); + scrollChatToBottom(); + + } else if (item.type === 'video') { + ensureBotEl(); + const wrapper = document.createElement('div'); + wrapper.innerHTML = _buildVideoHtml(item.content); + mediaEl.appendChild(wrapper.firstElementChild || wrapper); + scrollChatToBottom(); + + } else if (item.type === 'file') { + ensureBotEl(); + const fileName = item.file_name || item.content.split('/').pop(); + const fileEl = document.createElement('a'); + fileEl.href = item.content; + fileEl.download = fileName; + fileEl.target = '_blank'; + fileEl.className = 'file-attachment'; + fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);'; + fileEl.innerHTML = ` ${fileName}`; + mediaEl.appendChild(fileEl); + scrollChatToBottom(); + + } else if (item.type === 'phase') { + // Coarse progress (e.g. 
cow install-browser); must not close SSE (unlike "done") + ensureBotEl(); + const wrap = document.createElement('div'); + wrap.className = 'text-xs sm:text-sm text-slate-600 dark:text-slate-400 border-l-2 border-primary-400 pl-2 py-1 my-0.5'; + wrap.textContent = String(item.content || ''); + stepsEl.appendChild(wrap); + scrollChatToBottom(); + + } else if (item.type === 'done') { + done = true; + es.close(); + delete activeStreams[requestId]; + + // item.content may be empty when "done" is only a stream-close signal after media. + const finalText = item.content || accumulatedText; + + if (!botEl && finalText) { + if (loadingEl) { loadingEl.remove(); loadingEl = null; } + addBotMessage(finalText, new Date((item.timestamp || Date.now() / 1000) * 1000), requestId); + } else if (botEl) { + contentEl.classList.remove('sse-streaming'); + // Only update text content when there is something new to show. + if (finalText) contentEl.innerHTML = renderMarkdown(finalText); + applyHighlighting(botEl); + } + scrollChatToBottom(); + + } else if (item.type === 'error') { + done = true; + es.close(); + delete activeStreams[requestId]; + if (loadingEl) { loadingEl.remove(); loadingEl = null; } + addBotMessage(t('error_send'), new Date()); } + }; - } else if (item.type === 'image') { - ensureBotEl(); - const imgEl = document.createElement('img'); - imgEl.src = item.content; - imgEl.alt = 'screenshot'; - imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);'; - imgEl.onclick = () => window.open(item.content, '_blank'); - mediaEl.appendChild(imgEl); - scrollChatToBottom(); - - } else if (item.type === 'text') { - // Intermediate text sent before media items; display it but keep SSE open. 
- ensureBotEl(); - contentEl.classList.remove('sse-streaming'); - const textContent = item.content || accumulatedText; - if (textContent) contentEl.innerHTML = renderMarkdown(textContent); - applyHighlighting(botEl); - scrollChatToBottom(); - - } else if (item.type === 'video') { - ensureBotEl(); - const wrapper = document.createElement('div'); - wrapper.innerHTML = _buildVideoHtml(item.content); - mediaEl.appendChild(wrapper.firstElementChild || wrapper); - scrollChatToBottom(); - - } else if (item.type === 'file') { - ensureBotEl(); - const fileName = item.file_name || item.content.split('/').pop(); - const fileEl = document.createElement('a'); - fileEl.href = item.content; - fileEl.download = fileName; - fileEl.target = '_blank'; - fileEl.className = 'file-attachment'; - fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);'; - fileEl.innerHTML = ` ${fileName}`; - mediaEl.appendChild(fileEl); - scrollChatToBottom(); - - } else if (item.type === 'phase') { - // Coarse progress (e.g. cow install-browser); must not close SSE (unlike "done") - ensureBotEl(); - const wrap = document.createElement('div'); - wrap.className = 'text-xs sm:text-sm text-slate-600 dark:text-slate-400 border-l-2 border-primary-400 pl-2 py-1 my-0.5'; - wrap.textContent = String(item.content || ''); - stepsEl.appendChild(wrap); - scrollChatToBottom(); - - } else if (item.type === 'done') { + es.onerror = function() { es.close(); delete activeStreams[requestId]; - // item.content may be empty when "done" is only a stream-close signal after media. 
- const finalText = item.content || accumulatedText; + if (done) return; - if (!botEl && finalText) { - if (loadingEl) { loadingEl.remove(); loadingEl = null; } - addBotMessage(finalText, new Date((item.timestamp || Date.now() / 1000) * 1000), requestId); - } else if (botEl) { + if (reconnectCount < MAX_RECONNECTS) { + reconnectCount++; + const delay = Math.min(RECONNECT_BASE_MS * reconnectCount, 5000); + console.warn(`[SSE] connection lost for ${requestId}, reconnecting in ${delay}ms (attempt ${reconnectCount}/${MAX_RECONNECTS})`); + setTimeout(connect, delay); + return; + } + + // Exhausted retries, show whatever we have + if (loadingEl) { loadingEl.remove(); loadingEl = null; } + if (!botEl) { + addBotMessage(t('error_send'), new Date()); + } else if (accumulatedText) { contentEl.classList.remove('sse-streaming'); - // Only update text content when there is something new to show. - if (finalText) contentEl.innerHTML = renderMarkdown(finalText); + contentEl.innerHTML = renderMarkdown(accumulatedText); applyHighlighting(botEl); } - scrollChatToBottom(); + }; + } - } else if (item.type === 'error') { - es.close(); - delete activeStreams[requestId]; - if (loadingEl) { loadingEl.remove(); loadingEl = null; } - addBotMessage(t('error_send'), new Date()); - } - }; - - es.onerror = function() { - es.close(); - delete activeStreams[requestId]; - if (loadingEl) { loadingEl.remove(); loadingEl = null; } - if (!botEl) { - addBotMessage(t('error_send'), new Date()); - } else if (accumulatedText) { - contentEl.classList.remove('sse-streaming'); - contentEl.innerHTML = renderMarkdown(accumulatedText); - applyHighlighting(botEl); - } - }; + connect(); } function startPolling() { diff --git a/channel/web/web_channel.py b/channel/web/web_channel.py index 32b27062..bd686f9f 100644 --- a/channel/web/web_channel.py +++ b/channel/web/web_channel.py @@ -329,14 +329,18 @@ class WebChannel(ChatChannel): """ SSE generator for a given request_id. 
Yields UTF-8 encoded bytes to avoid WSGI Latin-1 mangling. + Supports client reconnection: the queue is only removed after a + "done" event is consumed, so a new GET /stream with the same + request_id can resume reading remaining events. """ if request_id not in self.sse_queues: yield b"data: {\"type\": \"error\", \"message\": \"invalid request_id\"}\n\n" return q = self.sse_queues[request_id] - timeout = 300 # 5 minutes max - deadline = time.time() + timeout + idle_timeout = 600 # 10 minutes without any real event + deadline = time.time() + idle_timeout + done = False try: while time.time() < deadline: @@ -346,13 +350,18 @@ class WebChannel(ChatChannel): yield b": keepalive\n\n" continue + # Real event received, reset idle deadline + deadline = time.time() + idle_timeout + payload = json.dumps(item, ensure_ascii=False) yield f"data: {payload}\n\n".encode("utf-8") if item.get("type") == "done": + done = True break finally: - self.sse_queues.pop(request_id, None) + if done: + self.sse_queues.pop(request_id, None) def poll_response(self): """ diff --git a/docs/en/tools/vision.mdx b/docs/en/tools/vision.mdx new file mode 100644 index 00000000..cebecbea --- /dev/null +++ b/docs/en/tools/vision.mdx @@ -0,0 +1,72 @@ +--- +title: vision - Image Analysis +description: Analyze image content (recognition, description, OCR, etc.) +--- + +Analyze local images or image URLs using Vision API. Supports content description, text extraction (OCR), object recognition, and more. + +## Model Selection + +The vision tool uses a multi-level auto-selection strategy with automatic fallback — no manual configuration required: + +1. **Main model** — uses the currently configured main model for image recognition (zero extra cost) +2. **Other configured models** — auto-discovers other models with configured API keys as alternatives +3. **OpenAI** — uses `open_ai_api_key` to call gpt-4.1-mini +4. 
**LinkAI** — uses `linkai_api_key` to call LinkAI vision service + +When `use_linkai=true`, LinkAI is promoted to the highest priority. + +If the current provider fails, the tool automatically tries the next one until it succeeds or all fail. + +### Supported Models + +| Vendor | Vision Model | Notes | +| --- | --- | --- | +| OpenAI / Compatible | Main model | All OpenAI-compatible multimodal models | +| Qwen (DashScope) | Main model | Via MultiModalConversation API | +| Claude | Main model | Anthropic native image format | +| Gemini | Main model | inlineData format | +| Doubao | Main model | doubao-seed-2-0 series natively supported | +| Kimi (Moonshot) | Main model | kimi-k2.5 natively supported | +| ZhipuAI | glm-5v-turbo | Always uses dedicated vision model | +| MiniMax | MiniMax-Text-01 | Always uses dedicated vision model | + + + ZhipuAI and MiniMax text models do not support image understanding, so their dedicated vision models are always used automatically. + + +## Parameters + +| Parameter | Type | Required | Description | +| --- | --- | --- | --- | +| `image` | string | Yes | Local file path or HTTP(S) image URL | +| `question` | string | Yes | Question to ask about the image | + +Supported image formats: jpg, jpeg, png, gif, webp + +## Custom Configuration + +To specify a particular model for the vision tool, add to `config.json`: + +```json +{ + "tool": { + "vision": { + "model": "gpt-4o" + } + } +} +``` + +In most cases no configuration is needed. The tool works automatically as long as the main model supports multimodal input or any vision-capable API key is configured. + +## Use Cases + +- Describe image content +- Extract text from images (OCR) +- Identify objects, colors, scenes +- Analyze screenshots and scanned documents + + + Images larger than 1MB are automatically compressed (max edge 1536px). All images (including remote URLs) are converted to base64 for transmission to ensure compatibility with all model backends. 
+ diff --git a/docs/ja/tools/vision.mdx b/docs/ja/tools/vision.mdx new file mode 100644 index 00000000..f34bf58a --- /dev/null +++ b/docs/ja/tools/vision.mdx @@ -0,0 +1,72 @@ +--- +title: vision - 画像分析 +description: 画像コンテンツの分析(認識、説明、OCR など) +--- + +Vision API を使用してローカル画像や画像 URL を分析します。コンテンツの説明、テキスト抽出(OCR)、オブジェクト認識などに対応しています。 + +## モデル選択 + +Vision ツールは多段階の自動選択+自動フォールバック戦略を採用しており、手動設定なしで利用可能です: + +1. **メインモデル** — 現在設定されているメインモデルで画像認識を実行(追加コストなし) +2. **その他の設定済みモデル** — API キーが設定されている他のマルチモーダルモデルを自動検出 +3. **OpenAI** — `open_ai_api_key` を使用して gpt-4.1-mini を呼び出し +4. **LinkAI** — `linkai_api_key` を使用して LinkAI ビジョンサービスを呼び出し + +`use_linkai=true` の場合、LinkAI が最優先になります。 + +現在のプロバイダーが失敗した場合、成功するかすべて失敗するまで自動的に次のプロバイダーを試行します。 + +### 対応モデル + +| ベンダー | ビジョンモデル | 説明 | +| --- | --- | --- | +| OpenAI / 互換プロトコル | メインモデル | すべての OpenAI 互換マルチモーダルモデルに対応 | +| 通義千問 (DashScope) | メインモデル | MultiModalConversation API 経由 | +| Claude | メインモデル | Anthropic ネイティブ画像形式 | +| Gemini | メインモデル | inlineData 形式 | +| 豆包 (Doubao) | メインモデル | doubao-seed-2-0 シリーズがネイティブ対応 | +| Kimi (Moonshot) | メインモデル | kimi-k2.5 がネイティブ対応 | +| 智谱 AI | glm-5v-turbo | 常にビジョン専用モデルを使用 | +| MiniMax | MiniMax-Text-01 | 常にビジョン専用モデルを使用 | + + + 智谱 AI と MiniMax のテキストモデルは画像理解に対応していないため、対応するビジョン専用モデルが自動的に使用されます。 + + +## パラメータ + +| パラメータ | 型 | 必須 | 説明 | +| --- | --- | --- | --- | +| `image` | string | はい | ローカルファイルパスまたは HTTP(S) 画像 URL | +| `question` | string | はい | 画像に対する質問 | + +対応画像形式:jpg、jpeg、png、gif、webp + +## カスタム設定 + +Vision ツールで使用するモデルを指定するには、`config.json` に以下を追加します: + +```json +{ + "tool": { + "vision": { + "model": "gpt-4o" + } + } +} +``` + +ほとんどの場合、設定は不要です。メインモデルがマルチモーダルに対応しているか、ビジョン対応の API キーが設定されていれば自動的に動作します。 + +## ユースケース + +- 画像コンテンツの説明 +- 画像からのテキスト抽出(OCR) +- オブジェクト、色、シーンの識別 +- スクリーンショットやスキャン文書の分析 + + + 1MB を超える画像は自動的に圧縮されます(最大辺 1536px)。すべての画像(リモート URL を含む)は base64 に変換して送信され、すべてのモデルバックエンドとの互換性を確保します。 + diff --git a/docs/tools/vision.mdx b/docs/tools/vision.mdx index 839212b3..4e1089e0 100644 --- a/docs/tools/vision.mdx +++ 
b/docs/tools/vision.mdx @@ -5,14 +5,49 @@ description: 分析图片内容(识别、描述、OCR 等) 使用 Vision API 分析本地图片或图片 URL,支持内容描述、文字提取(OCR)、物体识别等。 -## 依赖 +## 模型选择 -需要配置至少一个 API Key(通过 `env_config` 工具或工作空间 `.env` 文件配置): +Vision 工具采用多级自动选择 + 自动兜底策略,无需手动配置即可使用: -| 后端 | 环境变量 | 优先级 | +1. **主模型** — 优先使用当前配置的主模型进行图像识别(需要是多模态模型) +2. **其他已配置模型** — 自动发现已配置 API Key 的其他多模态模型作为备选 + +如果当前 provider 调用失败,会自动尝试下一个,直到成功或全部失败。 + +### 支持的模型 + +| 厂商 | 视觉模型 | 说明 | | --- | --- | --- | -| OpenAI | `OPENAI_API_KEY` | 优先使用 | -| LinkAI | `LINKAI_API_KEY` | 备选 | +| OpenAI / 兼容协议 | 使用主模型 | 支持所有 OpenAI 协议兼容的多模态模型 | +| 通义千问 (DashScope) | 使用主模型 | 例如 qwen3.6-plus 等 | +| Claude | 使用主模型 | Anthropic 原生图像格式 | +| Gemini | 使用主模型 | inlineData 格式 | +| 豆包 (Doubao) | 使用主模型 | doubao-seed-2-0 系列原生支持 | +| Kimi (Moonshot) | 使用主模型 | kimi-k2.5 原生支持 | +| 智谱 AI | glm-5v-turbo | 固定使用视觉专用模型 | +| MiniMax | MiniMax-Text-01 | 固定使用视觉专用模型 | + + + 智谱和 MiniMax 的文本模型不支持图像理解,因此始终使用对应的视觉专用模型,无需手动指定。 + + +> 当 `use_linkai=true` 时,默认使用 LinkAI 的多模态模型进行图像识别。 + +## 自定义配置 + +如果希望指定 Vision 使用的模型,可在 `config.json` 中配置,例如: + +```json +{ + "tool": { + "vision": { + "model": "gpt-4o" + } + } +} +``` + +大多数情况下无需配置,主模型支持多模态或配置任意一个支持视觉的 API Key 即可自动工作。 ## 参数 @@ -20,17 +55,18 @@ description: 分析图片内容(识别、描述、OCR 等) | --- | --- | --- | --- | | `image` | string | 是 | 本地文件路径或 HTTP(S) 图片 URL | | `question` | string | 是 | 对图片提出的问题 | -| `model` | string | 否 | 模型名称(默认 gpt-4.1-mini) | 支持的图片格式:jpg、jpeg、png、gif、webp + + ## 使用场景 - 描述图片中的内容 - 提取图片中的文字(OCR) - 识别物体、颜色、场景 -- 分析截图、文档扫描件 +- 分析截图、文档扫描图片等 - 超过 1MB 的图片会自动压缩后上传。如果未配置任何 Vision API Key,该工具不会被加载。 + 超过 1MB 的图片会自动压缩后上传,所有图片(包括远程 URL)会统一转为 base64 传输,确保兼容所有模型后端。 diff --git a/models/bot.py b/models/bot.py index ca6e1aa1..f5f72e7d 100644 --- a/models/bot.py +++ b/models/bot.py @@ -2,12 +2,27 @@ Auto-replay chat robot abstract class """ - from bridge.context import Context from bridge.reply import Reply class Bot(object): + """ + Base class for all chat-bot implementations. 
+ + Subclasses may also implement: + + call_with_tools(messages, tools=None, stream=False, **kwargs) + -> dict | generator (OpenAI-compatible format) + + call_vision(image_url, question, model=None, max_tokens=1000) + -> dict with keys: model, content, usage (or error/message) + + These are NOT defined here to avoid shadowing concrete implementations + provided by mixin classes (e.g. OpenAICompatibleBot) in the MRO. + Use ``hasattr(bot, 'call_vision')`` to detect support at runtime. + """ + def reply(self, query, context: Context = None) -> Reply: """ bot auto-reply content diff --git a/models/claudeapi/claude_api_bot.py b/models/claudeapi/claude_api_bot.py index 5dcf9173..ffbb74dd 100644 --- a/models/claudeapi/claude_api_bot.py +++ b/models/claudeapi/claude_api_bot.py @@ -1,7 +1,10 @@ # encoding:utf-8 +import base64 import json +import re import time +from typing import Optional import requests @@ -224,6 +227,79 @@ class ClaudeAPIBot(Bot, OpenAIImage): return 64000 return 8192 + @staticmethod + def _parse_data_url(data_url: str): + """Parse a data:;base64, URL into (media_type, base64_data).""" + m = re.match(r"^data:([^;]+);base64,(.+)$", data_url, re.DOTALL) + if m: + return m.group(1), m.group(2) + return None, None + + def call_vision(self, image_url: str, question: str, + model: Optional[str] = None, + max_tokens: int = 1000) -> dict: + """Analyze an image using Claude Messages API (native image blocks).""" + try: + actual_model = model or self._model_mapping(conf().get("model")) + + # Build Claude-native image content block + if image_url.startswith("data:"): + media_type, b64_data = self._parse_data_url(image_url) + if not b64_data: + return {"error": True, "message": "Invalid base64 data URL"} + image_block = { + "type": "image", + "source": {"type": "base64", + "media_type": media_type or "image/jpeg", + "data": b64_data}, + } + else: + image_block = { + "type": "image", + "source": {"type": "url", "url": image_url}, + } + + data = { + "model": 
actual_model, + "max_tokens": max_tokens, + "messages": [{ + "role": "user", + "content": [ + image_block, + {"type": "text", "text": question}, + ], + }], + } + + headers = { + "x-api-key": self.api_key, + "anthropic-version": "2023-06-01", + "content-type": "application/json", + } + proxies = {"http": self.proxy, "https": self.proxy} if self.proxy else None + resp = requests.post(f"{self.api_base}/messages", + headers=headers, json=data, proxies=proxies) + + if resp.status_code != 200: + return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"} + + body = resp.json() + text_parts = [b.get("text", "") for b in body.get("content", []) + if b.get("type") == "text"] + usage = body.get("usage", {}) + return { + "model": actual_model, + "content": "".join(text_parts), + "usage": { + "prompt_tokens": usage.get("input_tokens", 0), + "completion_tokens": usage.get("output_tokens", 0), + "total_tokens": usage.get("input_tokens", 0) + usage.get("output_tokens", 0), + }, + } + except Exception as e: + logger.error(f"[CLAUDE] call_vision error: {e}") + return {"error": True, "message": str(e)} + def call_with_tools(self, messages, tools=None, stream=False, **kwargs): """ Call Claude API with tool support for agent integration diff --git a/models/dashscope/dashscope_bot.py b/models/dashscope/dashscope_bot.py index 0887751f..4d4d628f 100644 --- a/models/dashscope/dashscope_bot.py +++ b/models/dashscope/dashscope_bot.py @@ -1,6 +1,8 @@ # encoding:utf-8 import json +from typing import Optional + from models.bot import Bot from models.session_manager import SessionManager from bridge.context import ContextType @@ -153,6 +155,56 @@ class DashscopeBot(Bot): else: return result + def call_vision(self, image_url: str, question: str, + model: Optional[str] = None, + max_tokens: int = 1000) -> dict: + """Analyze an image using DashScope MultiModalConversation API.""" + try: + dashscope.api_key = self.api_key + vision_model = model or "qwen-vl-max" + + # DashScope 
multimodal format: {"image": url} + {"text": question} + messages = [{ + "role": "user", + "content": [ + {"image": image_url}, + {"text": question}, + ], + }] + + response = MultiModalConversation.call( + model=vision_model, + messages=messages, + max_tokens=max_tokens, + ) + + if response.status_code != HTTPStatus.OK: + return { + "error": True, + "message": f"{response.code} - {response.message}", + } + + resp_dict = self._response_to_dict(response) + choice = resp_dict["output"]["choices"][0] + content = choice.get("message", {}).get("content", "") + if isinstance(content, list): + content = "".join( + item.get("text", "") for item in content if isinstance(item, dict) + ) + usage = resp_dict.get("usage", {}) + return { + "model": vision_model, + "content": content, + "usage": { + "prompt_tokens": usage.get("input_tokens", 0), + "completion_tokens": usage.get("output_tokens", 0), + "total_tokens": usage.get("total_tokens", 0), + }, + } + except Exception as e: + logger.error(f"[DASHSCOPE] call_vision error: {e}") + return {"error": True, "message": str(e)} + def call_with_tools(self, messages, tools=None, stream=False, **kwargs): """ Call DashScope API with tool support for agent integration diff --git a/models/doubao/doubao_bot.py b/models/doubao/doubao_bot.py index cfe4ba5c..b31516ec 100644 --- a/models/doubao/doubao_bot.py +++ b/models/doubao/doubao_bot.py @@ -2,6 +2,7 @@ import json import time +from typing import Optional import requests from models.bot import Bot @@ -147,6 +148,49 @@ class DoubaoBot(Bot): else: return result + def call_vision(self, image_url: str, question: str, + model: Optional[str] = None, + max_tokens: int = 1000) -> dict: + """Analyze an image using Doubao (Volcengine Ark) OpenAI-compatible API.""" + try: + vision_model = model or self.args.get("model", "doubao-seed-2-0-pro-260215") + payload = { + "model": vision_model, + "max_tokens": max_tokens, + "messages": [{ + "role": "user", + "content": [ + {"type": "text", "text": question}, 
+ {"type": "image_url", "image_url": {"url": image_url}}, + ], + }], + } + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + resp = requests.post(f"{self.base_url}/chat/completions", + headers=headers, json=payload, timeout=60) + if resp.status_code != 200: + return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"} + data = resp.json() + if "error" in data: + return {"error": True, "message": data["error"].get("message", str(data["error"]))} + content = data.get("choices", [{}])[0].get("message", {}).get("content", "") + usage = data.get("usage", {}) + return { + "model": vision_model, + "content": content, + "usage": { + "prompt_tokens": usage.get("prompt_tokens", 0), + "completion_tokens": usage.get("completion_tokens", 0), + "total_tokens": usage.get("total_tokens", 0), + }, + } + except Exception as e: + logger.error(f"[DOUBAO] call_vision error: {e}") + return {"error": True, "message": str(e)} + # ==================== Agent mode support ==================== def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs): @@ -434,31 +478,37 @@ class DoubaoBot(Bot): continue if role == "user": - text_parts = [] - tool_results = [] + has_tool_result = any( + isinstance(b, dict) and b.get("type") == "tool_result" for b in content + ) + if has_tool_result: + text_parts = [] + tool_results = [] - for block in content: - if not isinstance(block, dict): - continue - if block.get("type") == "text": - text_parts.append(block.get("text", "")) - elif block.get("type") == "tool_result": - tool_call_id = block.get("tool_use_id") or "" - result_content = block.get("content", "") - if not isinstance(result_content, str): - result_content = json.dumps(result_content, ensure_ascii=False) - tool_results.append({ - "role": "tool", - "tool_call_id": tool_call_id, - "content": result_content - }) + for block in content: + if not isinstance(block, dict): + continue + if block.get("type") == 
"text": + text_parts.append(block.get("text", "")) + elif block.get("type") == "tool_result": + tool_call_id = block.get("tool_use_id") or "" + result_content = block.get("content", "") + if not isinstance(result_content, str): + result_content = json.dumps(result_content, ensure_ascii=False) + tool_results.append({ + "role": "tool", + "tool_call_id": tool_call_id, + "content": result_content + }) - # Tool results first (must come right after assistant with tool_calls) - for tr in tool_results: - converted.append(tr) + for tr in tool_results: + converted.append(tr) - if text_parts: - converted.append({"role": "user", "content": "\n".join(text_parts)}) + if text_parts: + converted.append({"role": "user", "content": "\n".join(text_parts)}) + else: + # Keep as-is for multimodal content (e.g. image_url blocks) + converted.append(msg) elif role == "assistant": openai_msg = {"role": "assistant"} diff --git a/models/gemini/google_gemini_bot.py b/models/gemini/google_gemini_bot.py index e49a8bf3..aa7199ca 100644 --- a/models/gemini/google_gemini_bot.py +++ b/models/gemini/google_gemini_bot.py @@ -12,6 +12,8 @@ import mimetypes import os import re import time +from typing import Optional + import requests from models.bot import Bot from models.session_manager import SessionManager @@ -144,7 +146,12 @@ class GoogleGeminiBot(Bot): return "", [] pattern = r"\[图片:\s*([^\]]+)\]" image_paths = [m.strip().strip("'\"") for m in re.findall(pattern, content) if m.strip()] - cleaned_text = re.sub(pattern, "", content) + # Replace markers with path-only hints so the model still knows the + # original file location (needed when it calls tools like vision). 
+ def _replace_with_hint(m): + path = m.group(1).strip().strip("'\"") + return f"[attached image: {path}]" + cleaned_text = re.sub(pattern, _replace_with_hint, content) cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text).strip() return cleaned_text, image_paths @@ -225,6 +232,57 @@ class GoogleGeminiBot(Bot): logger.warning(f"[Gemini] Unsupported image URL format: {image_url[:120]}") return None + def call_vision(self, image_url: str, question: str, + model: Optional[str] = None, + max_tokens: int = 1000) -> dict: + """Analyze an image using Gemini REST API.""" + try: + model_name = model or self.model or "gemini-2.0-flash" + image_part = self._build_inline_part_from_image_url({"url": image_url}) + if not image_part: + return {"error": True, "message": f"Cannot process image URL: {image_url[:120]}"} + + payload = { + "contents": [{ + "role": "user", + "parts": [image_part, {"text": question}], + }], + "generationConfig": {"maxOutputTokens": max_tokens}, + "safetySettings": [ + {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"}, + {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"}, + {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"}, + {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"}, + ], + } + endpoint = f"{self.api_base}/v1beta/models/{model_name}:generateContent" + headers = {"x-goog-api-key": self.api_key, "Content-Type": "application/json"} + resp = requests.post(endpoint, headers=headers, json=payload, timeout=60) + + if resp.status_code != 200: + return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"} + + body = resp.json() + candidates = body.get("candidates", []) + text_parts = [] + for part in candidates[0].get("content", {}).get("parts", []) if candidates else []: + if "text" in part: + text_parts.append(part["text"]) + + usage_meta = body.get("usageMetadata", {}) + return { + "model": model_name, + "content": "".join(text_parts), + "usage": 
{ + "prompt_tokens": usage_meta.get("promptTokenCount", 0), + "completion_tokens": usage_meta.get("candidatesTokenCount", 0), + "total_tokens": usage_meta.get("totalTokenCount", 0), + }, + } + except Exception as e: + logger.error(f"[Gemini] call_vision error: {e}") + return {"error": True, "message": str(e)} + def call_with_tools(self, messages, tools=None, stream=False, **kwargs): """ Call Gemini API with tool support using REST API (following official docs) diff --git a/models/minimax/minimax_bot.py b/models/minimax/minimax_bot.py index af80e795..983a4132 100644 --- a/models/minimax/minimax_bot.py +++ b/models/minimax/minimax_bot.py @@ -2,6 +2,8 @@ import time import json +from typing import Optional + import requests from models.bot import Bot @@ -175,6 +177,51 @@ class MinimaxBot(Bot): else: return result + def call_vision(self, image_url: str, question: str, + model: Optional[str] = None, + max_tokens: int = 1000) -> dict: + """Analyze an image using MiniMax OpenAI-compatible API. + Always uses MiniMax-Text-01 — other MiniMax models do not support vision. 
+ """ + try: + vision_model = "MiniMax-Text-01" + payload = { + "model": vision_model, + "max_tokens": max_tokens, + "messages": [{ + "role": "user", + "content": [ + {"type": "text", "text": question}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], + }], + } + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + resp = requests.post(f"{self.api_base}/chat/completions", + headers=headers, json=payload, timeout=60) + if resp.status_code != 200: + return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"} + data = resp.json() + if "error" in data: + return {"error": True, "message": data["error"].get("message", str(data["error"]))} + content = data.get("choices", [{}])[0].get("message", {}).get("content", "") + usage = data.get("usage", {}) + return { + "model": vision_model, + "content": content, + "usage": { + "prompt_tokens": usage.get("prompt_tokens", 0), + "completion_tokens": usage.get("completion_tokens", 0), + "total_tokens": usage.get("total_tokens", 0), + }, + } + except Exception as e: + logger.error(f"[MINIMAX] call_vision error: {e}") + return {"error": True, "message": str(e)} + def call_with_tools(self, messages, tools=None, stream=False, **kwargs): """ Call MiniMax API with tool support for agent integration @@ -273,37 +320,41 @@ class MinimaxBot(Bot): if role == "user": # Handle user message if isinstance(content, list): - # Extract text from content blocks - text_parts = [] - tool_results = [] + has_tool_result = any( + isinstance(b, dict) and b.get("type") == "tool_result" for b in content + ) + if has_tool_result: + text_parts = [] + tool_results = [] - for block in content: - if isinstance(block, dict): - if block.get("type") == "text": - text_parts.append(block.get("text", "")) - elif block.get("type") == "tool_result": - # Tool result should be a separate message with role="tool" - tool_call_id = block.get("tool_use_id") or "" - if not tool_call_id: - 
logger.warning(f"[MINIMAX] tool_result missing tool_use_id") - result_content = block.get("content", "") - if not isinstance(result_content, str): - result_content = json.dumps(result_content, ensure_ascii=False) - tool_results.append({ - "role": "tool", - "tool_call_id": tool_call_id, - "content": result_content - }) + for block in content: + if isinstance(block, dict): + if block.get("type") == "text": + text_parts.append(block.get("text", "")) + elif block.get("type") == "tool_result": + tool_call_id = block.get("tool_use_id") or "" + if not tool_call_id: + logger.warning(f"[MINIMAX] tool_result missing tool_use_id") + result_content = block.get("content", "") + if not isinstance(result_content, str): + result_content = json.dumps(result_content, ensure_ascii=False) + tool_results.append({ + "role": "tool", + "tool_call_id": tool_call_id, + "content": result_content + }) - if text_parts: - converted.append({ - "role": "user", - "content": "\n".join(text_parts) - }) + if text_parts: + converted.append({ + "role": "user", + "content": "\n".join(text_parts) + }) - # Add all tool results (not just the last one) - for tool_result in tool_results: - converted.append(tool_result) + for tool_result in tool_results: + converted.append(tool_result) + else: + # Keep as-is for multimodal content (e.g. 
image_url blocks) + converted.append(msg) else: # Simple text content converted.append({ diff --git a/models/moonshot/moonshot_bot.py b/models/moonshot/moonshot_bot.py index ded011ca..4d35400e 100644 --- a/models/moonshot/moonshot_bot.py +++ b/models/moonshot/moonshot_bot.py @@ -2,6 +2,7 @@ import json import time +from typing import Optional import requests from models.bot import Bot @@ -147,6 +148,49 @@ class MoonshotBot(Bot): else: return result + def call_vision(self, image_url: str, question: str, + model: Optional[str] = None, + max_tokens: int = 1000) -> dict: + """Analyze an image using Moonshot (Kimi) OpenAI-compatible API.""" + try: + vision_model = model or self.args.get("model", "kimi-k2.5") + payload = { + "model": vision_model, + "max_tokens": max_tokens, + "messages": [{ + "role": "user", + "content": [ + {"type": "text", "text": question}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], + }], + } + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + resp = requests.post(f"{self.base_url}/chat/completions", + headers=headers, json=payload, timeout=60) + if resp.status_code != 200: + return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"} + data = resp.json() + if "error" in data: + return {"error": True, "message": data["error"].get("message", str(data["error"]))} + content = data.get("choices", [{}])[0].get("message", {}).get("content", "") + usage = data.get("usage", {}) + return { + "model": vision_model, + "content": content, + "usage": { + "prompt_tokens": usage.get("prompt_tokens", 0), + "completion_tokens": usage.get("completion_tokens", 0), + "total_tokens": usage.get("total_tokens", 0), + }, + } + except Exception as e: + logger.error(f"[MOONSHOT] call_vision error: {e}") + return {"error": True, "message": str(e)} + # ==================== Agent mode support ==================== def call_with_tools(self, messages, tools=None, stream: bool = False, 
**kwargs): @@ -435,31 +479,37 @@ class MoonshotBot(Bot): continue if role == "user": - text_parts = [] - tool_results = [] + has_tool_result = any( + isinstance(b, dict) and b.get("type") == "tool_result" for b in content + ) + if has_tool_result: + text_parts = [] + tool_results = [] - for block in content: - if not isinstance(block, dict): - continue - if block.get("type") == "text": - text_parts.append(block.get("text", "")) - elif block.get("type") == "tool_result": - tool_call_id = block.get("tool_use_id") or "" - result_content = block.get("content", "") - if not isinstance(result_content, str): - result_content = json.dumps(result_content, ensure_ascii=False) - tool_results.append({ - "role": "tool", - "tool_call_id": tool_call_id, - "content": result_content - }) + for block in content: + if not isinstance(block, dict): + continue + if block.get("type") == "text": + text_parts.append(block.get("text", "")) + elif block.get("type") == "tool_result": + tool_call_id = block.get("tool_use_id") or "" + result_content = block.get("content", "") + if not isinstance(result_content, str): + result_content = json.dumps(result_content, ensure_ascii=False) + tool_results.append({ + "role": "tool", + "tool_call_id": tool_call_id, + "content": result_content + }) - # Tool results first (must come right after assistant with tool_calls) - for tr in tool_results: - converted.append(tr) + for tr in tool_results: + converted.append(tr) - if text_parts: - converted.append({"role": "user", "content": "\n".join(text_parts)}) + if text_parts: + converted.append({"role": "user", "content": "\n".join(text_parts)}) + else: + # Keep as-is for multimodal content (e.g. 
image_url blocks) + converted.append(msg) elif role == "assistant": openai_msg = {"role": "assistant"} diff --git a/models/openai_compatible_bot.py b/models/openai_compatible_bot.py index baac0681..6d4d314e 100644 --- a/models/openai_compatible_bot.py +++ b/models/openai_compatible_bot.py @@ -9,6 +9,8 @@ This includes: OpenAI, LinkAI, Azure OpenAI, and many third-party providers. import json import openai +import requests +from typing import Optional from common.log import logger from agent.protocol.message_utils import drop_orphaned_tool_results_openai @@ -306,3 +308,51 @@ class OpenAICompatibleBot: openai_messages.append(msg) return drop_orphaned_tool_results_openai(openai_messages) + + def call_vision(self, image_url: str, question: str, + model: Optional[str] = None, + max_tokens: int = 1000) -> dict: + """Analyze an image using the OpenAI-compatible /chat/completions endpoint.""" + try: + api_config = self.get_api_config() + vision_model = model or api_config.get("model", "gpt-4o") + api_key = api_config.get("api_key", "") + api_base = (api_config.get("api_base") or "https://api.openai.com/v1").rstrip("/") + + payload = { + "model": vision_model, + "messages": [{ + "role": "user", + "content": [ + {"type": "text", "text": question}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], + }], + } + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + } + resp = requests.post( + f"{api_base}/chat/completions", + headers=headers, json=payload, timeout=60, + ) + if resp.status_code != 200: + body = resp.text[:500] + logger.error(f"[{self.__class__.__name__}] call_vision HTTP {resp.status_code}: {body}") + return {"error": True, "message": f"HTTP {resp.status_code}: {body}"} + data = resp.json() + content = data.get("choices", [{}])[0].get("message", {}).get("content", "") + usage = data.get("usage", {}) + return { + "model": vision_model, + "content": content, + "usage": { + "prompt_tokens": 
usage.get("prompt_tokens", 0), + "completion_tokens": usage.get("completion_tokens", 0), + "total_tokens": usage.get("total_tokens", 0), + }, + } + except Exception as e: + logger.error(f"[{self.__class__.__name__}] call_vision error: {e}") + return {"error": True, "message": str(e)} diff --git a/models/zhipuai/zhipuai_bot.py b/models/zhipuai/zhipuai_bot.py index 4733cf9b..98ea5db1 100644 --- a/models/zhipuai/zhipuai_bot.py +++ b/models/zhipuai/zhipuai_bot.py @@ -2,6 +2,7 @@ import time import json +from typing import Optional from models.bot import Bot from models.zhipuai.zhipu_ai_session import ZhipuAISession @@ -149,6 +150,40 @@ class ZHIPUAIBot(Bot, ZhipuAIImage): else: return result + def call_vision(self, image_url: str, question: str, + model: Optional[str] = None, + max_tokens: int = 1000) -> dict: + """Analyze an image using ZhipuAI OpenAI-compatible SDK. + Always uses glm-5v-turbo — the text models (glm-5-turbo etc.) do not support vision. + """ + try: + vision_model = "glm-5v-turbo" + response = self.client.chat.completions.create( + model=vision_model, + max_tokens=max_tokens, + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": question}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], + }], + ) + content = response.choices[0].message.content or "" + usage = response.usage + return { + "model": vision_model, + "content": content, + "usage": { + "prompt_tokens": getattr(usage, "prompt_tokens", 0), + "completion_tokens": getattr(usage, "completion_tokens", 0), + "total_tokens": getattr(usage, "total_tokens", 0), + }, + } + except Exception as e: + logger.error(f"[ZHIPU_AI] call_vision error: {e}") + return {"error": True, "message": str(e)} + def call_with_tools(self, messages, tools=None, stream=False, **kwargs): """ Call ZhipuAI API with tool support for agent integration