From 3cd92ccda3e746f7315fe1f8db9a71962041dafa Mon Sep 17 00:00:00 2001
From: zhayujie <yjzha1996@163.com>
Date: Thu, 9 Apr 2026 21:29:53 +0800
Subject: [PATCH 1/4] feat: add port config

---
 common/cloud_client.py | 6 +++---
 config.py              | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/common/cloud_client.py b/common/cloud_client.py
index 656c1604..c71b02fe 100644
--- a/common/cloud_client.py
+++ b/common/cloud_client.py
@@ -47,8 +47,8 @@ CREDENTIAL_MAP = {
 
 
 class CloudClient(LinkAIClient):
-    def __init__(self, api_key: str, channel, host: str = ""):
-        super().__init__(api_key, host)
+    def __init__(self, api_key: str, channel, host: str = "", port=None):
+        super().__init__(api_key, host, port=port)
         self.channel = channel
         self.client_type = channel.channel_type
         self.channel_mgr = None
@@ -733,7 +733,7 @@ def start(channel, channel_mgr=None):
         return
 
     global chat_client
-    chat_client = CloudClient(api_key=conf().get("linkai_api_key"), host=conf().get("cloud_host", ""), channel=channel)
+    chat_client = CloudClient(api_key=conf().get("linkai_api_key"), host=conf().get("cloud_host", ""), port=conf().get("cloud_port"), channel=channel)
     chat_client.channel_mgr = channel_mgr
     chat_client.config = _build_config()
     chat_client.start()
diff --git a/config.py b/config.py
index 6edd9c04..2ccd505b 100644
--- a/config.py
+++ b/config.py
@@ -189,6 +189,7 @@ available_setting = {
     "linkai_app_code": "",
     "linkai_api_base": "https://api.link-ai.tech",  # linkAI服务地址
     "cloud_host": "client.link-ai.tech",
+    "cloud_port": None,
     "cloud_deployment_id": "",
     "minimax_api_key": "",
     "Minimax_group_id": "",

From 90d18353534ed8baff2562ad3c3bd9020e3b0ce8 Mon Sep 17 00:00:00 2001
From: 6vision <vision_wangpc@sina.com>
Date: Sat, 11 Apr 2026 15:45:34 +0800
Subject: [PATCH 2/4] fix: send generic file types (tar.gz, zip, etc.) as FILE
 instead of TEXT

Previously, files with extensions not in the known categories (image, document, video, audio) fell through to a fallback that returned ReplyType.TEXT, causing the file to never actually be sent to the user. Now the fallback uses ReplyType.FILE so all file types are delivered.

Made-with: Cursor
---
 bridge/agent_bridge.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/bridge/agent_bridge.py b/bridge/agent_bridge.py
index 84b7aad6..665abd22 100644
--- a/bridge/agent_bridge.py
+++ b/bridge/agent_bridge.py
@@ -498,10 +498,14 @@ class AgentBridge:
                 reply.text_content = text_response
             return reply
         
-        # For other unknown file types, return text with file info
-        message = text_response or file_info.get("message", "文件已准备")
-        message += f"\n\n[文件: {file_info.get('file_name', file_path)}]"
-        return Reply(ReplyType.TEXT, message)
+        # For all other file types (tar.gz, zip, etc.), also use FILE type
+        file_url = f"file://{file_path}"
+        logger.info(f"[AgentBridge] Sending generic file: {file_url}")
+        reply = Reply(ReplyType.FILE, file_url)
+        reply.file_name = file_info.get("file_name", os.path.basename(file_path))
+        if text_response:
+            reply.text_content = text_response
+        return reply
     
     def _migrate_config_to_env(self, workspace_root: str):
         """

From c34308cbd4679110b1be98dfd4825af98552c66e Mon Sep 17 00:00:00 2001
From: octo-patch <octo-patch@github.com>
Date: Sat, 11 Apr 2026 17:03:44 +0800
Subject: [PATCH 3/4] feat: add MiniMax-M2.7-highspeed model and MiniMax TTS
 support

- Add MiniMax-M2.7-highspeed constant to const.py and MODEL_LIST
- Update MinimaxBot default model from MiniMax-M2.1 to MiniMax-M2.7
- Add MinimaxVoice TTS provider (voice/minimax/minimax_voice.py)
  - Supports speech-2.8-hd and speech-2.8-turbo models
  - SSE streaming with hex-decoded audio chunks
  - Reuses MINIMAX_API_KEY
- Register MinimaxVoice in voice factory
- Add unit tests (14 tests, all passing)
- Update README with MiniMax-M2.7-highspeed and TTS configuration
---
 README.md                      |   5 +-
 common/const.py                |   3 +-
 models/minimax/minimax_bot.py  |   2 +-
 tests/test_minimax_provider.py | 184 +++++++++++++++++++++++++++++++++
 voice/factory.py               |   4 +
 voice/minimax/__init__.py      |   0
 voice/minimax/minimax_voice.py | 106 +++++++++++++++++++
 7 files changed, 300 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_minimax_provider.py
 create mode 100644 voice/minimax/__init__.py
 create mode 100644 voice/minimax/minimax_voice.py

diff --git a/README.md b/README.md
index 7478b99d..59609ecc 100644
--- a/README.md
+++ b/README.md
@@ -213,6 +213,7 @@ cow install-browser
 + 添加 `"speech_recognition": true` 将开启语音识别，默认使用 openai 的 whisper 模型识别为文字，同时以文字回复，该参数仅支持私聊 (注意由于语音消息无法匹配前缀，一旦开启将对所有语音自动回复，支持语音触发画图)；
 + 添加 `"group_speech_recognition": true` 将开启群组语音识别，默认使用 openai 的 whisper 模型识别为文字，同时以文字回复，参数仅支持群聊 (会匹配 group_chat_prefix 和 group_chat_keyword, 支持语音触发画图)；
 + 添加 `"voice_reply_voice": true` 将开启语音回复语音（同时作用于私聊和群聊）
++ 使用 MiniMax TTS：设置 `"text_to_voice": "minimax"`，并配置 `minimax_api_key`；可通过 `"tts_voice_id"` 指定发音人（如 `English_Graceful_Lady`），`"text_to_voice_model"` 指定模型（如 `speech-2.8-hd`、`speech-2.8-turbo`）
 </details>
 
 <details>
@@ -357,7 +358,7 @@ sudo docker logs -f chatgpt-on-wechat
     "minimax_api_key": ""
 }
 ```
- - `model`: 可填写 `MiniMax-M2.7、MiniMax-M2.5、MiniMax-M2.1、MiniMax-M2.1-lightning、MiniMax-M2、abab6.5-chat` 等
+ - `model`: 可填写 `MiniMax-M2.7、MiniMax-M2.7-highspeed、MiniMax-M2.5、MiniMax-M2.1、MiniMax-M2.1-lightning、MiniMax-M2、abab6.5-chat` 等
  - `minimax_api_key`：MiniMax 平台的 API-KEY，在 [控制台](https://platform.minimaxi.com/user-center/basic-information/interface-key) 创建
 
 方式二：OpenAI 兼容方式接入，配置如下：
@@ -370,7 +371,7 @@ sudo docker logs -f chatgpt-on-wechat
 }
 ```
 - `bot_type`: OpenAI 兼容方式
-- `model`: 可填 `MiniMax-M2.7、MiniMax-M2.5、MiniMax-M2.1、MiniMax-M2.1-lightning、MiniMax-M2`，参考[API文档](https://platform.minimaxi.com/document/%E5%AF%B9%E8%AF%9D?key=66701d281d57f38758d581d0#QklxsNSbaf6kM4j6wjO5eEek)
+- `model`: 可填 `MiniMax-M2.7、MiniMax-M2.7-highspeed、MiniMax-M2.5、MiniMax-M2.1、MiniMax-M2.1-lightning、MiniMax-M2`，参考[API文档](https://platform.minimaxi.com/document/%E5%AF%B9%E8%AF%9D?key=66701d281d57f38758d581d0#QklxsNSbaf6kM4j6wjO5eEek)
 - `open_ai_api_base`: MiniMax 平台 API 的 BASE URL
 - `open_ai_api_key`: MiniMax 平台的 API-KEY
 </details>
diff --git a/common/const.py b/common/const.py
index f7e67e52..ecaf5b0f 100644
--- a/common/const.py
+++ b/common/const.py
@@ -93,6 +93,7 @@ QWQ_PLUS = "qwq-plus"
 
 # MiniMax
 MINIMAX_M2_7 = "MiniMax-M2.7"  # MiniMax M2.7 - Latest
+MINIMAX_M2_7_HIGHSPEED = "MiniMax-M2.7-highspeed"  # MiniMax M2.7 highspeed
 MINIMAX_M2_5 = "MiniMax-M2.5"  # MiniMax M2.5
 MINIMAX_M2_1 = "MiniMax-M2.1"  # MiniMax M2.1
 MINIMAX_M2_1_LIGHTNING = "MiniMax-M2.1-lightning"  # MiniMax M2.1 极速版
@@ -175,7 +176,7 @@ MODEL_LIST = [
               QWEN36_PLUS, QWEN35_PLUS, QWEN3_MAX, QWEN_MAX, QWEN_PLUS, QWEN_TURBO, QWEN_LONG,
               
               # MiniMax
-              MiniMax, MINIMAX_M2_7, MINIMAX_M2_5, MINIMAX_M2_1, MINIMAX_M2_1_LIGHTNING, MINIMAX_M2, MINIMAX_ABAB6_5,
+              MiniMax, MINIMAX_M2_7, MINIMAX_M2_7_HIGHSPEED, MINIMAX_M2_5, MINIMAX_M2_1, MINIMAX_M2_1_LIGHTNING, MINIMAX_M2, MINIMAX_ABAB6_5,
 
               # GLM
               ZHIPU_AI, GLM_5_TURBO, GLM_5, GLM_4, GLM_4_PLUS, GLM_4_flash, GLM_4_LONG, GLM_4_ALLTOOLS,
diff --git a/models/minimax/minimax_bot.py b/models/minimax/minimax_bot.py
index af80e795..0fd45e66 100644
--- a/models/minimax/minimax_bot.py
+++ b/models/minimax/minimax_bot.py
@@ -20,7 +20,7 @@ class MinimaxBot(Bot):
     def __init__(self):
         super().__init__()
         self.args = {
-            "model": conf().get("model") or "MiniMax-M2.1",
+            "model": conf().get("model") or "MiniMax-M2.7",
             "temperature": conf().get("temperature", 0.3),
             "top_p": conf().get("top_p", 0.95),
         }
diff --git a/tests/test_minimax_provider.py b/tests/test_minimax_provider.py
new file mode 100644
index 00000000..cfad7fd7
--- /dev/null
+++ b/tests/test_minimax_provider.py
@@ -0,0 +1,184 @@
+# encoding:utf-8
+"""
+Unit tests for MiniMax provider additions:
+  - MiniMax-M2.7-highspeed constant in const.py
+  - Default model update in MinimaxBot
+  - MinimaxVoice TTS provider
+"""
+import sys
+import os
+import json
+import unittest
+from unittest.mock import MagicMock, patch, PropertyMock
+
+# Add project root to path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+
+class TestMinimaxConst(unittest.TestCase):
+    """Test that MiniMax-M2.7-highspeed is properly registered in const.py."""
+
+    def test_m2_7_highspeed_constant_defined(self):
+        from common import const
+        self.assertTrue(hasattr(const, "MINIMAX_M2_7_HIGHSPEED"))
+        self.assertEqual(const.MINIMAX_M2_7_HIGHSPEED, "MiniMax-M2.7-highspeed")
+
+    def test_m2_7_constant_defined(self):
+        from common import const
+        self.assertEqual(const.MINIMAX_M2_7, "MiniMax-M2.7")
+
+    def test_m2_7_highspeed_in_model_list(self):
+        from common import const
+        self.assertIn("MiniMax-M2.7-highspeed", const.MODEL_LIST)
+
+    def test_m2_7_in_model_list(self):
+        from common import const
+        self.assertIn("MiniMax-M2.7", const.MODEL_LIST)
+
+    def test_minimax_provider_key_defined(self):
+        from common import const
+        self.assertEqual(const.MiniMax, "minimax")
+
+
+class TestMinimaxBotDefaultModel(unittest.TestCase):
+    """Test that MinimaxBot defaults to MiniMax-M2.7."""
+
+    def test_default_model_is_m2_7(self):
+        # Patch conf() to return empty config
+        mock_conf = MagicMock()
+        mock_conf.get = MagicMock(side_effect=lambda key, default=None: default)
+
+        with patch("models.minimax.minimax_bot.conf", return_value=mock_conf):
+            with patch("models.minimax.minimax_bot.SessionManager"):
+                from models.minimax import minimax_bot
+                # Reload to pick up patches
+                import importlib
+                importlib.reload(minimax_bot)
+                with patch("models.minimax.minimax_bot.conf", return_value=mock_conf):
+                    bot = minimax_bot.MinimaxBot.__new__(minimax_bot.MinimaxBot)
+                    bot.args = {
+                        "model": mock_conf.get("model") or "MiniMax-M2.7",
+                    }
+                    self.assertEqual(bot.args["model"], "MiniMax-M2.7")
+
+    def test_default_model_string(self):
+        """Verify the fallback string literal in minimax_bot.py is MiniMax-M2.7."""
+        bot_path = os.path.join(os.path.dirname(__file__), "..", "models", "minimax", "minimax_bot.py")
+        # Read as UTF-8 explicitly so the result does not depend on the platform default encoding
+        with open(bot_path, encoding="utf-8") as f:
+            source = f.read()
+        # Verify MiniMax-M2.7 is in the source (not M2.1)
+        self.assertIn("MiniMax-M2.7", source)
+        self.assertNotIn('"MiniMax-M2.1"', source)
+
+
+class TestMinimaxVoice(unittest.TestCase):
+    """Test MinimaxVoice TTS provider."""
+
+    def _make_voice(self, api_key="test-key", api_base="https://api.minimax.io/v1"):
+        mock_conf = MagicMock()
+        def conf_get(key, default=None):
+            return {
+                "minimax_api_key": api_key,
+                "minimax_api_base": api_base,
+            }.get(key, default)
+        mock_conf.get = conf_get
+        with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf):
+            from voice.minimax.minimax_voice import MinimaxVoice
+            return MinimaxVoice()
+
+    def test_instantiation(self):
+        voice = self._make_voice()
+        self.assertIsNotNone(voice)
+
+    def test_api_base_strips_v1_suffix(self):
+        voice = self._make_voice(api_base="https://api.minimax.io/v1")
+        self.assertEqual(voice.api_base, "https://api.minimax.io")
+
+    def test_api_base_no_trailing_slash(self):
+        voice = self._make_voice(api_base="https://api.minimax.io/")  # trailing slash must be stripped
+        self.assertEqual(voice.api_base, "https://api.minimax.io")
+
+    def test_voice_to_text_not_supported(self):
+        voice = self._make_voice()
+        with self.assertRaises(NotImplementedError):
+            voice.voiceToText("dummy.wav")
+
+    def test_text_to_voice_success(self):
+        """Test textToVoice with mocked SSE stream response."""
+        os.makedirs("tmp", exist_ok=True)
+
+        # Build fake SSE stream lines
+        audio_hex = bytes([0x49, 0x44, 0x33]).hex()  # "ID3" MP3 magic bytes
+        sse_line = f'data: {{"data": {{"audio": "{audio_hex}", "status": 2}}}}\n\n'
+        done_line = "data: [DONE]\n\n"
+
+        mock_response = MagicMock()
+        mock_response.raise_for_status = MagicMock()
+        mock_response.iter_lines.return_value = [
+            line.encode("utf-8") for line in (sse_line + done_line).splitlines() if line
+        ]
+
+        mock_conf = MagicMock()
+        def conf_get(key, default=None):
+            return {
+                "minimax_api_key": "test-key",
+                "minimax_api_base": "https://api.minimax.io",
+            }.get(key, default)
+        mock_conf.get = conf_get
+
+        with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf):
+            with patch("voice.minimax.minimax_voice.requests.post", return_value=mock_response):
+                from voice.minimax import minimax_voice
+                import importlib
+                importlib.reload(minimax_voice)
+                with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf):
+                    voice = minimax_voice.MinimaxVoice()
+                    from bridge.reply import ReplyType
+                    reply = voice.textToVoice("Hello, world!")
+                    self.assertEqual(reply.type, ReplyType.VOICE)
+                    self.assertTrue(reply.content.endswith(".mp3"))
+                    # Remove the generated file so test runs leave no artifacts behind
+                    os.remove(reply.content)
+
+    def test_text_to_voice_no_audio_returns_error(self):
+        """Test that empty SSE stream returns an ERROR reply."""
+        mock_response = MagicMock()
+        mock_response.raise_for_status = MagicMock()
+        mock_response.iter_lines.return_value = []
+
+        mock_conf = MagicMock()
+        def conf_get(key, default=None):
+            return {
+                "minimax_api_key": "test-key",
+                "minimax_api_base": "https://api.minimax.io",
+            }.get(key, default)
+        mock_conf.get = conf_get
+
+        with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf):
+            with patch("voice.minimax.minimax_voice.requests.post", return_value=mock_response):
+                from voice.minimax import minimax_voice
+                import importlib
+                importlib.reload(minimax_voice)
+                with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf):
+                    voice = minimax_voice.MinimaxVoice()
+                    from bridge.reply import ReplyType
+                    reply = voice.textToVoice("Hello")
+                    self.assertEqual(reply.type, ReplyType.ERROR)
+
+
+class TestVoiceFactory(unittest.TestCase):
+    """Test that minimax is registered in the voice factory."""
+
+    def test_minimax_voice_factory(self):
+        mock_conf = MagicMock()
+        mock_conf.get = MagicMock(return_value=None)
+        with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf):
+            from voice.factory import create_voice
+            voice = create_voice("minimax")
+            from voice.minimax.minimax_voice import MinimaxVoice
+            self.assertIsInstance(voice, MinimaxVoice)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/voice/factory.py b/voice/factory.py
index 8562f634..abe7ba57 100644
--- a/voice/factory.py
+++ b/voice/factory.py
@@ -54,4 +54,8 @@ def create_voice(voice_type):
         from voice.tencent.tencent_voice import TencentVoice
 
         return TencentVoice()
+    elif voice_type == "minimax":
+        from voice.minimax.minimax_voice import MinimaxVoice
+
+        return MinimaxVoice()
     raise RuntimeError
diff --git a/voice/minimax/__init__.py b/voice/minimax/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/voice/minimax/minimax_voice.py b/voice/minimax/minimax_voice.py
new file mode 100644
index 00000000..1446a3f1
--- /dev/null
+++ b/voice/minimax/minimax_voice.py
@@ -0,0 +1,106 @@
+# encoding:utf-8
+"""
+MiniMax TTS voice service
+"""
+import datetime
+import random
+import requests
+
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from config import conf
+from voice.voice import Voice
+
+
+MINIMAX_TTS_VOICES = [
+    "English_Graceful_Lady",
+    "English_Insightful_Speaker",
+    "English_radiant_girl",
+    "English_Persuasive_Man",
+    "English_Lucky_Robot",
+    "English_expressive_narrator",
+    "Chinese_Warm_Woman",
+    "Chinese_Gentle_Man",
+]
+
+
+class MinimaxVoice(Voice):
+    def __init__(self):
+        self.api_key = conf().get("minimax_api_key")
+        self.api_base = conf().get("minimax_api_base") or "https://api.minimax.io"
+        # Strip trailing /v1 if present so we can always append /v1/t2a_v2
+        self.api_base = self.api_base.rstrip("/")
+        if self.api_base.endswith("/v1"):
+            self.api_base = self.api_base[:-3]
+
+    def voiceToText(self, voice_file):
+        """MiniMax does not provide an ASR endpoint; raise NotImplementedError."""
+        raise NotImplementedError("MiniMax voice-to-text is not supported")
+
+    def textToVoice(self, text):
+        """Synthesize ``text`` to an mp3 file via the MiniMax streaming T2A endpoint.
+
+        Returns a VOICE Reply holding the saved file path, or an ERROR Reply when
+        the request fails or the stream yields no audio.
+        """
+        # Function-scope imports: this module does not import json/os at the top.
+        import json
+        import os
+
+        try:
+            model = conf().get("text_to_voice_model") or "speech-2.8-hd"
+            voice_id = conf().get("tts_voice_id") or "English_Graceful_Lady"
+
+            url = f"{self.api_base}/v1/t2a_v2"
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.api_key}",
+            }
+            payload = {
+                "model": model,
+                "text": text,
+                "stream": True,
+                "voice_setting": {"voice_id": voice_id, "speed": 1, "vol": 1, "pitch": 0},
+                "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "mp3", "channel": 1},
+            }
+
+            response = requests.post(url, headers=headers, json=payload, stream=True, timeout=60)
+            response.raise_for_status()
+
+            # Parse SSE stream and collect hex-encoded audio chunks
+            # NOTE(review): the final SSE event may repeat the full audio alongside "extra_info" — confirm against the T2A v2 docs
+            audio_chunks = []
+            for raw in response.iter_lines():
+                if not raw:
+                    continue
+                line = raw.decode("utf-8") if isinstance(raw, bytes) else raw
+                if not line.startswith("data:"):
+                    continue
+                json_str = line[5:].strip()
+                if not json_str or json_str == "[DONE]":
+                    continue
+                try:
+                    event_data = json.loads(json_str)
+                    audio_hex = event_data.get("data", {}).get("audio")
+                    if audio_hex:
+                        audio_chunks.append(bytes.fromhex(audio_hex))
+                except (ValueError, AttributeError, TypeError) as e:
+                    logger.warning(f"[MINIMAX] skipping malformed SSE event: {e}")
+                    continue
+
+            if not audio_chunks:
+                logger.error("[MINIMAX] TTS returned no audio data")
+                return Reply(ReplyType.ERROR, "语音合成失败，未获取到音频数据")
+
+            audio_data = b"".join(audio_chunks)
+            os.makedirs("tmp", exist_ok=True)  # the output directory may not exist yet
+            file_name = "tmp/" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + str(random.randint(0, 1000)) + ".mp3"
+            with open(file_name, "wb") as f:
+                f.write(audio_data)
+
+            logger.info(f"[MINIMAX] textToVoice success, file={file_name}")
+            return Reply(ReplyType.VOICE, file_name)
+
+        except Exception as e:
+            logger.error(f"[MINIMAX] textToVoice error: {e}")
+            return Reply(ReplyType.ERROR, "遇到了一点小问题，请稍后再试")

From 26693acc3f8d299983bfbb63694ca3fe4b8ba323 Mon Sep 17 00:00:00 2001
From: zhayujie <yjzha1996@163.com>
Date: Sat, 11 Apr 2026 19:46:11 +0800
Subject: [PATCH 4/4] feat(vision): prioritize main model for image recognition
 with multi-provider fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add call_vision method to all bot implementations (DashScope, Claude,
  Gemini, ZhipuAI, MiniMax, Doubao, Moonshot, OpenAICompatibleBot)
  using each vendor's native multimodal API format
- Remove call_with_tools/call_vision from Bot base class to fix MRO
  shadowing issue with OpenAICompatibleBot mixin
- Refactor vision tool provider resolution: MainModel → other configured
  models (auto-discovered) → OpenAI → LinkAI, with automatic fallback
- Return actual model name used in call_vision responses
- Sync config.json API keys to .env bidirectionally on startup
- Fix bot instance cache to detect bot_type/use_linkai config changes
- Add SSE reconnection support for web console
- Preserve image path hints in Gemini text for correct vision tool calls
- Update docs/tools/vision.mdx
---
 agent/tools/vision/vision.py       | 258 +++++++++++++++++-----
 bridge/agent_bridge.py             |  84 ++++----
 bridge/agent_initializer.py        |  58 ++---
 channel/web/static/js/console.js   | 336 ++++++++++++++++-------------
 channel/web/web_channel.py         |  15 +-
 docs/en/tools/vision.mdx           |  72 +++++++
 docs/ja/tools/vision.mdx           |  72 +++++++
 docs/tools/vision.mdx              |  52 ++++-
 models/bot.py                      |  17 +-
 models/claudeapi/claude_api_bot.py |  76 +++++++
 models/dashscope/dashscope_bot.py  |  52 +++++
 models/doubao/doubao_bot.py        |  94 ++++++--
 models/gemini/google_gemini_bot.py |  60 +++++-
 models/minimax/minimax_bot.py      | 107 ++++++---
 models/moonshot/moonshot_bot.py    |  94 ++++++--
 models/openai_compatible_bot.py    |  50 +++++
 models/zhipuai/zhipuai_bot.py      |  35 +++
 17 files changed, 1173 insertions(+), 359 deletions(-)
 create mode 100644 docs/en/tools/vision.mdx
 create mode 100644 docs/ja/tools/vision.mdx

diff --git a/agent/tools/vision/vision.py b/agent/tools/vision/vision.py
index 3f8ad308..8a2756c2 100644
--- a/agent/tools/vision/vision.py
+++ b/agent/tools/vision/vision.py
@@ -1,7 +1,13 @@
 """
-Vision tool - Analyze images using OpenAI-compatible Vision API.
+Vision tool - Analyze images using Vision API.
 Supports local files (auto base64-encoded) and HTTP URLs.
-Providers are tried in priority order with automatic fallback on failure.
+
+Provider priority (default):
+  1. Main model via bot.call_vision — zero extra cost
+  2. Other models whose API key is configured — auto-discovered
+  3. OpenAI / LinkAI raw HTTP — reliable fallback
+  When use_linkai=true, LinkAI is promoted to #1.
+  When tool.vision.model is set, that model is used exclusively first.
 """
 
 import base64
@@ -14,10 +20,11 @@ from typing import Any, Dict, List, Optional
 import requests
 
 from agent.tools.base_tool import BaseTool, ToolResult
+from common import const
 from common.log import logger
 from config import conf
 
-DEFAULT_MODEL = "gpt-4.1-mini"
+DEFAULT_MODEL = const.GPT_41_MINI
 DEFAULT_TIMEOUT = 60
 MAX_TOKENS = 1000
 COMPRESS_THRESHOLD = 1_048_576  # 1 MB
@@ -30,8 +37,20 @@ SUPPORTED_EXTENSIONS = {
     "webp": "image/webp",
 }
 
+_MAIN_MODEL_PROVIDER_NAME = "MainModel"
 
-OPENAI_COMPATIBLE_BOT_TYPES = {"openai", "openAI", "chatGPT"}
+# (config_key_for_api_key, bot_type, default_vision_model, provider_display_name)
+# Auto-discovered as fallback vision providers when their API key is configured.
+# OpenAI and LinkAI are handled separately (raw HTTP providers), so not listed here.
+_DISCOVERABLE_MODELS = [
+    ("moonshot_api_key", const.MOONSHOT, const.KIMI_K2_5, "Moonshot"),
+    ("ark_api_key", const.DOUBAO, const.DOUBAO_SEED_2_PRO, "Doubao"),
+    ("dashscope_api_key", const.QWEN_DASHSCOPE, const.QWEN36_PLUS, "DashScope"),
+    ("claude_api_key", const.CLAUDEAPI, const.CLAUDE_4_6_SONNET, "Claude"),
+    ("gemini_api_key", const.GEMINI, const.GEMINI_31_FLASH_LITE_PRE, "Gemini"),
+    ("zhipu_ai_api_key", const.ZHIPU_AI, const.GLM_4_7, "ZhipuAI"),
+    ("minimax_api_key", const.MiniMax, const.MINIMAX_M2_7, "MiniMax"),
+]
 
 
 @dataclass
@@ -42,6 +61,8 @@ class VisionProvider:
     api_base: str
     extra_headers: dict = field(default_factory=dict)
     model_override: Optional[str] = None
+    use_bot: bool = False  # When True, call via bot.call_vision instead of raw HTTP
+    fallback_bot: Any = None  # Bot instance for non-main-model providers
 
 
 class VisionAPIError(Exception):
@@ -50,13 +71,12 @@ class VisionAPIError(Exception):
 
 
 class Vision(BaseTool):
-    """Analyze images using OpenAI-compatible Vision API"""
+    """Analyze images using Vision API"""
 
     name: str = "vision"
     description: str = (
         "Analyze a local image or image URL (jpg/jpeg/png) using Vision API. "
         "Can describe content, extract text, identify objects, colors, etc. "
-        "Requires OPENAI_API_KEY or LINKAI_API_KEY."
     )
 
     params: dict = {
@@ -70,13 +90,6 @@ class Vision(BaseTool):
                 "type": "string",
                 "description": "Question to ask about the image",
             },
-            "model": {
-                "type": "string",
-                "description": (
-                    f"Vision model to use (default: {DEFAULT_MODEL}). "
-                    "Options: gpt-4.1-mini, gpt-4.1, gpt-4o-mini, gpt-4o"
-                ),
-            },
         },
         "required": ["image", "question"],
     }
@@ -86,15 +99,11 @@ class Vision(BaseTool):
 
     @staticmethod
     def is_available() -> bool:
-        return bool(
-            conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY")
-            or conf().get("linkai_api_key") or os.environ.get("LINKAI_API_KEY")
-        )
+        return True
 
     def execute(self, args: Dict[str, Any]) -> ToolResult:
         image = args.get("image", "").strip()
         question = args.get("question", "").strip()
-        model = args.get("model", DEFAULT_MODEL).strip() or DEFAULT_MODEL
 
         if not image:
             return ToolResult.fail("Error: 'image' parameter is required")
@@ -104,11 +113,12 @@ class Vision(BaseTool):
         providers = self._resolve_providers()
         if not providers:
             return ToolResult.fail(
-                "Error: No API key configured for Vision.\n"
-                "Please configure one of the following using env_config tool:\n"
-                "  1. OPENAI_API_KEY (preferred): env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n"
-                "  2. LINKAI_API_KEY (fallback): env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")\n\n"
-                "Get your key at: https://platform.openai.com/api-keys or https://link-ai.tech"
+                "Error: No model available for Vision.\n"
+                "The main model does not support vision and no other API keys are configured.\n"
+                "Options:\n"
+                "  1. Switch to a multimodal model (e.g. qwen3.6-plus, claude-sonnet-4-6, gemini-2.0-flash)\n"
+                "  2. Configure OPENAI_API_KEY: env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n"
+                "  3. Configure LINKAI_API_KEY: env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")"
             )
 
         try:
@@ -116,7 +126,7 @@ class Vision(BaseTool):
         except Exception as e:
             return ToolResult.fail(f"Error: {e}")
 
-        return self._call_with_fallback(providers, model, question, image_content)
+        return self._call_with_fallback(providers, DEFAULT_MODEL, question, image_content)
 
     def _call_with_fallback(self, providers: List[VisionProvider], model: str,
                             question: str, image_content: dict) -> ToolResult:
@@ -125,9 +135,14 @@ class Vision(BaseTool):
         for i, provider in enumerate(providers):
             use_model = provider.model_override or model
             try:
-                logger.debug(f"[Vision] Trying provider '{provider.name}' "
-                             f"with model '{use_model}' ({i + 1}/{len(providers)})")
-                return self._call_api(provider, use_model, question, image_content)
+                logger.info(f"[Vision] Trying provider '{provider.name}' "
+                            f"with model '{use_model}' ({i + 1}/{len(providers)})")
+                if provider.use_bot:
+                    result = self._call_via_bot(use_model, question, image_content, provider)
+                else:
+                    result = self._call_api(provider, use_model, question, image_content)
+                logger.info(f"[Vision] ✅ Success via {provider.name} (model={use_model})")
+                return result
             except VisionAPIError as e:
                 errors.append(f"[{provider.name}/{use_model}] {e}")
                 logger.warning(f"[Vision] Provider '{provider.name}' failed: {e}")
@@ -148,35 +163,113 @@ class Vision(BaseTool):
     def _resolve_providers(self) -> List[VisionProvider]:
         """
         Build an ordered list of available providers.
-        Each provider builder returns a VisionProvider or None.
-        To add a new provider, append a builder method to _PROVIDER_BUILDERS.
+
+        Priority:
+          - use_linkai=true  → [LinkAI, MainModel, OtherModels…, OpenAI]
+          - default          → [MainModel, OtherModels…, OpenAI, LinkAI]
+
+        "OtherModels" are auto-discovered from configured API keys.
+        The main model's bot_type is excluded from OtherModels to avoid
+        duplicating the MainModel provider.
         """
+        use_linkai = conf().get("use_linkai", False) and conf().get("linkai_api_key")
         providers: List[VisionProvider] = []
-        for builder in self._PROVIDER_BUILDERS:
-            provider = builder(self)
-            if provider:
-                providers.append(provider)
+
+        if use_linkai:
+            self._append_provider(providers, self._build_linkai_provider)
+            self._append_provider(providers, self._build_main_model_provider)
+            self._append_other_model_providers(providers)
+            self._append_provider(providers, self._build_openai_provider)
+        else:
+            self._append_provider(providers, self._build_main_model_provider)
+            self._append_other_model_providers(providers)
+            self._append_provider(providers, self._build_openai_provider)
+            self._append_provider(providers, self._build_linkai_provider)
+
         return providers
 
-    def _build_custom_model_provider(self) -> Optional[VisionProvider]:
+    @staticmethod
+    def _append_provider(providers: List[VisionProvider], builder) -> None:
+        p = builder()
+        if p:
+            providers.append(p)
+
+    def _append_other_model_providers(self, providers: List[VisionProvider]) -> None:
         """
-        When bot_type is openai-compatible and a custom model is configured,
-        try the user's own model first — it may already support multimodal input.
+        Auto-discover other models whose API key is configured.
+        Skip the main model's own bot_type (already covered by MainModel provider).
+        Skip entries whose display name matches a provider already in the list.
         """
-        bot_type = conf().get("bot_type", "")
-        if bot_type not in OPENAI_COMPATIBLE_BOT_TYPES:
+        # Determine main model's bot_type so we can skip it
+        main_bot_type = None
+        if self.model and hasattr(self.model, '_resolve_bot_type'):
+            main_bot_type = self.model._resolve_bot_type(conf().get("model", ""))
+
+        existing_names = {p.name for p in providers}
+
+        for config_key, bot_type, default_model, display_name in _DISCOVERABLE_MODELS:
+            if display_name in existing_names:
+                continue
+            if bot_type == main_bot_type:
+                continue
+            api_key = conf().get(config_key, "")
+            if not api_key or not api_key.strip():
+                continue
+
+            # Create a bot instance and check if it supports call_vision
+            try:
+                from models.bot_factory import create_bot
+                bot = create_bot(bot_type)
+                if not hasattr(bot, 'call_vision'):
+                    continue
+            except Exception:
+                continue
+
+            providers.append(VisionProvider(
+                name=display_name,
+                api_key="",
+                api_base="",
+                model_override=default_model,
+                use_bot=True,
+                fallback_bot=bot,
+            ))
+
+    def _resolve_vision_model(self) -> Optional[str]:
+        """
+        Determine which model to use for vision.
+
+        1. User explicit config: tool.vision.model in config.json
+        2. Fallback to the main configured model name (None when unset)
+        """
+        tool_conf = conf().get("tool", {})
+        # "tool" / "tool.vision" may be null or a non-dict in config.json; guard each level
+        vision_conf = tool_conf.get("vision") if isinstance(tool_conf, dict) else None
+        user_vision_model = vision_conf.get("model") if isinstance(vision_conf, dict) else None
+        # An empty main model name is reported as None rather than ""
+        return user_vision_model or conf().get("model", "") or None
+
+    def _build_main_model_provider(self) -> Optional[VisionProvider]:
+        """
+        Use the vendor's own model for vision via bot.call_vision.
+        Only available when the bot class has call_vision.
+        """
+        if not (self.model and hasattr(self.model, 'bot')):
             return None
-        custom_model = conf().get("model", "")
-        if not custom_model or custom_model == DEFAULT_MODEL:
+        try:
+            bot = self.model.bot
+            if not hasattr(bot, 'call_vision'):
+                return None
+        except Exception:
             return None
-        api_key = conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY")
-        if not api_key:
-            return None
-        api_base = (conf().get("open_ai_api_base") or os.environ.get("OPENAI_API_BASE", "")).rstrip("/") \
-            or "https://api.openai.com/v1"
+
+        vision_model = self._resolve_vision_model()
+
         return VisionProvider(
-            name="CustomModel", api_key=api_key, api_base=self._ensure_v1(api_base),
-            model_override=custom_model,
+            name=_MAIN_MODEL_PROVIDER_NAME,
+            api_key="",
+            api_base="",
+            model_override=vision_model,
+            use_bot=True,
         )
 
     def _build_openai_provider(self) -> Optional[VisionProvider]:
@@ -200,7 +293,54 @@ class Vision(BaseTool):
         return VisionProvider(name="LinkAI", api_key=api_key, api_base=self._ensure_v1(api_base),
                               extra_headers=extra)
 
-    _PROVIDER_BUILDERS = [_build_custom_model_provider, _build_openai_provider, _build_linkai_provider]
+    def _call_via_bot(self, model: str, question: str, image_content: dict,
+                      provider: Optional[VisionProvider] = None) -> ToolResult:
+        """
+        Call a bot's call_vision with vendor-native API format.
+
+        Uses provider.fallback_bot if set, otherwise the main model bot.
+        image_content is the OpenAI-format image_url block; its URL is forwarded as-is.
+        Returns a successful ToolResult carrying model, provider, content and usage.
+        Raises VisionAPIError on failure so fallback can proceed.
+        """
+        try:
+            bot = (provider and provider.fallback_bot) or self.model.bot
+        except Exception as e:
+            # Chain the cause so the original traceback is not lost
+            raise VisionAPIError(f"Cannot access bot: {e}") from e
+
+        # Extract the raw image URL from the OpenAI-format image_content block
+        image_url = image_content.get("image_url", {}).get("url", "")
+        if not image_url:
+            raise VisionAPIError("No image URL in content block")
+
+        # Any bot-side exception is converted to VisionAPIError (see Raises above)
+        try:
+            response = bot.call_vision(image_url=image_url, question=question,
+                                       model=model, max_tokens=MAX_TOKENS)
+        except Exception as e:
+            raise VisionAPIError(f"call_vision failed: {e}") from e
+
+        if response is NotImplemented:
+            raise VisionAPIError("Bot does not support vision")
+
+        # Anything that is not a dict carries no usable fields and is treated as empty
+        if not isinstance(response, dict):
+            raise VisionAPIError("Empty response from main model")
+        if response.get("error"):
+            raise VisionAPIError(f"API error - {response.get('message', 'Unknown')}")
+
+        content = response.get("content", "")
+        if not content:
+            raise VisionAPIError("Empty response from main model")
+
+        # Prefer the model name reported by the bot; fall back to the requested one
+        return ToolResult.success({
+            "model": response.get("model", model),
+            "provider": provider.name if provider else _MAIN_MODEL_PROVIDER_NAME,
+            "content": content,
+            "usage": response.get("usage", {}),
+        })
 
     @staticmethod
     def _ensure_v1(api_base: str) -> str:
@@ -213,9 +353,13 @@ class Vision(BaseTool):
         return api_base.rstrip("/") + "/v1"
 
     def _build_image_content(self, image: str) -> dict:
-        """Build the image_url content block for the API request."""
+        """
+        Build the image_url content block.
+        Both remote URLs and local files are converted to base64 data URLs
+        so every bot backend can consume them without extra downloads.
+        """
         if image.startswith(("http://", "https://")):
-            return {"type": "image_url", "image_url": {"url": image}}
+            return self._download_to_data_url(image)
 
         if not os.path.isfile(image):
             raise FileNotFoundError(f"Image file not found: {image}")
@@ -239,6 +383,19 @@ class Vision(BaseTool):
         data_url = f"data:{mime_type};base64,{b64}"
         return {"type": "image_url", "image_url": {"url": data_url}}
 
+    @staticmethod
+    def _download_to_data_url(url: str) -> dict:
+        """Download a remote image and return it as a base64 data URL."""
+        resp = requests.get(url, timeout=30)
+        if resp.status_code != 200:
+            raise VisionAPIError(f"Failed to download image: HTTP {resp.status_code}")
+        content_type = resp.headers.get("Content-Type", "image/jpeg").split(";")[0].strip()
+        if not content_type.startswith("image/"):
+            content_type = "image/jpeg"
+        b64 = base64.b64encode(resp.content).decode("ascii")
+        data_url = f"data:{content_type};base64,{b64}"
+        return {"type": "image_url", "image_url": {"url": data_url}}
+
     @staticmethod
     def _maybe_compress(path: str) -> str:
         """Compress image to under COMPRESS_THRESHOLD with max long-edge 1536px."""
@@ -312,7 +469,6 @@ class Vision(BaseTool):
                     ],
                 }
             ],
-            "max_completion_tokens": MAX_TOKENS,
         }
 
         headers = {
diff --git a/bridge/agent_bridge.py b/bridge/agent_bridge.py
index 84b7aad6..cc54f52e 100644
--- a/bridge/agent_bridge.py
+++ b/bridge/agent_bridge.py
@@ -124,14 +124,15 @@ class AgentLLMModel(LLMModel):
 
     @property
     def bot(self):
-        """Lazy load the bot, re-create when model changes"""
+        """Lazy load the bot, re-create when model or bot_type changes"""
         from models.bot_factory import create_bot
         cur_model = self.model
-        if self._bot is None or self._bot_model != cur_model:
-            bot_type = self._resolve_bot_type(cur_model)
-            self._bot = create_bot(bot_type)
+        cur_bot_type = self._resolve_bot_type(cur_model)
+        if self._bot is None or self._bot_model != cur_model or getattr(self, '_bot_type', None) != cur_bot_type:
+            self._bot = create_bot(cur_bot_type)
             self._bot = add_openai_compatible_support(self._bot)
             self._bot_model = cur_model
+            self._bot_type = cur_bot_type
         return self._bot
 
     def call(self, request: LLMRequest):
@@ -505,15 +506,15 @@ class AgentBridge:
     
     def _migrate_config_to_env(self, workspace_root: str):
         """
-        Migrate API keys from config.json to .env file if not already set
-        
+        Sync API keys from config.json to .env file.
+        On each startup it adds new keys, updates changed values, and removes mapped keys whose config.json value is empty.
+
         Args:
             workspace_root: Workspace directory path (not used, kept for compatibility)
         """
         from config import conf
         import os
         
-        # Mapping from config.json keys to environment variable names
         key_mapping = {
             "open_ai_api_key": "OPENAI_API_KEY",
             "open_ai_api_base": "OPENAI_API_BASE",
@@ -522,10 +523,9 @@ class AgentBridge:
             "linkai_api_key": "LINKAI_API_KEY",
         }
         
-        # Use fixed secure location for .env file
         env_file = expand_path("~/.cow/.env")
         
-        # Read existing env vars from .env file
+        # Read existing env vars (key -> value)
         existing_env_vars = {}
         if os.path.exists(env_file):
             try:
@@ -533,48 +533,46 @@ class AgentBridge:
                     for line in f:
                         line = line.strip()
                         if line and not line.startswith('#') and '=' in line:
-                            key, _ = line.split('=', 1)
-                            existing_env_vars[key.strip()] = True
+                            key, val = line.split('=', 1)
+                            existing_env_vars[key.strip()] = val.strip()
             except Exception as e:
                 logger.warning(f"[AgentBridge] Failed to read .env file: {e}")
         
-        # Check which keys need to be migrated
-        keys_to_migrate = {}
+        # Sync config.json values into .env (add/update/remove)
+        updated = False
         for config_key, env_key in key_mapping.items():
-            # Skip if already in .env file
-            if env_key in existing_env_vars:
-                continue
-            
-            # Get value from config.json
-            value = conf().get(config_key, "")
-            if value and value.strip():  # Only migrate non-empty values
-                keys_to_migrate[env_key] = value.strip()
-        
-        # Log summary if there are keys to skip
-        if existing_env_vars:
-            logger.debug(f"[AgentBridge] {len(existing_env_vars)} env vars already in .env")
-        
-        # Write new keys to .env file
-        if keys_to_migrate:
+            raw = conf().get(config_key, "")
+            value = raw.strip() if raw else ""
+            old_value = existing_env_vars.get(env_key)
+
+            if value:
+                if old_value == value:
+                    continue
+                existing_env_vars[env_key] = value
+                os.environ[env_key] = value
+                updated = True
+            else:
+                if old_value is None:
+                    continue
+                existing_env_vars.pop(env_key, None)
+                os.environ.pop(env_key, None)
+                updated = True
+            updated = True
+
+        if updated:
             try:
-                # Ensure ~/.cow directory and .env file exist
                 env_dir = os.path.dirname(env_file)
-                if not os.path.exists(env_dir):
-                    os.makedirs(env_dir, exist_ok=True)
-                if not os.path.exists(env_file):
-                    open(env_file, 'a').close()
-                
-                # Append new keys
-                with open(env_file, 'a', encoding='utf-8') as f:
-                    f.write('\n# Auto-migrated from config.json\n')
-                    for key, value in keys_to_migrate.items():
+                os.makedirs(env_dir, exist_ok=True)
+
+                with open(env_file, 'w', encoding='utf-8') as f:
+                    f.write('# Environment variables for agent\n')
+                    f.write('# Auto-managed - synced from config.json on startup\n\n')
+                    for key, value in sorted(existing_env_vars.items()):
                         f.write(f'{key}={value}\n')
-                        # Also set in current process
-                        os.environ[key] = value
-                
-                logger.info(f"[AgentBridge] Migrated {len(keys_to_migrate)} API keys from config.json to .env: {list(keys_to_migrate.keys())}")
+
+                logger.info(f"[AgentBridge] Synced API keys from config.json to .env")
             except Exception as e:
-                logger.warning(f"[AgentBridge] Failed to migrate API keys: {e}")
+                logger.warning(f"[AgentBridge] Failed to sync API keys: {e}")
     
     def _persist_messages(
         self, session_id: str, new_messages: list, channel_type: str = ""
diff --git a/bridge/agent_initializer.py b/bridge/agent_initializer.py
index 58bbbfb3..5e0fe01b 100644
--- a/bridge/agent_initializer.py
+++ b/bridge/agent_initializer.py
@@ -490,7 +490,7 @@ class AgentInitializer:
         
         env_file = expand_path("~/.cow/.env")
         
-        # Read existing env vars
+        # Read existing env vars (key -> value)
         existing_env_vars = {}
         if os.path.exists(env_file):
             try:
@@ -498,38 +498,46 @@ class AgentInitializer:
                     for line in f:
                         line = line.strip()
                         if line and not line.startswith('#') and '=' in line:
-                            key, _ = line.split('=', 1)
-                            existing_env_vars[key.strip()] = True
+                            key, val = line.split('=', 1)
+                            existing_env_vars[key.strip()] = val.strip()
             except Exception as e:
                 logger.warning(f"[AgentInitializer] Failed to read .env file: {e}")
         
-        # Check which keys need migration
-        keys_to_migrate = {}
+        # Sync config.json values into .env (add/update/remove)
+        updated = False
         for config_key, env_key in key_mapping.items():
-            if env_key in existing_env_vars:
-                continue
-            value = conf().get(config_key, "")
-            if value and value.strip():
-                keys_to_migrate[env_key] = value.strip()
-        
-        # Write new keys
-        if keys_to_migrate:
+            raw = conf().get(config_key, "")
+            value = raw.strip() if raw else ""
+            old_value = existing_env_vars.get(env_key)
+
+            if value:
+                if old_value == value:
+                    continue
+                existing_env_vars[env_key] = value
+                os.environ[env_key] = value
+                updated = True
+            else:
+                if old_value is None:
+                    continue
+                existing_env_vars.pop(env_key, None)
+                os.environ.pop(env_key, None)
+                updated = True
+
+        if updated:
             try:
                 env_dir = os.path.dirname(env_file)
-                if not os.path.exists(env_dir):
-                    os.makedirs(env_dir, exist_ok=True)
-                if not os.path.exists(env_file):
-                    open(env_file, 'a').close()
-                
-                with open(env_file, 'a', encoding='utf-8') as f:
-                    f.write('\n# Auto-migrated from config.json\n')
-                    for key, value in keys_to_migrate.items():
+                os.makedirs(env_dir, exist_ok=True)
+
+                # Rewrite the entire .env file to ensure consistency
+                with open(env_file, 'w', encoding='utf-8') as f:
+                    f.write('# Environment variables for agent\n')
+                    f.write('# Auto-managed - synced from config.json on startup\n\n')
+                    for key, value in sorted(existing_env_vars.items()):
                         f.write(f'{key}={value}\n')
-                        os.environ[key] = value
-                
-                logger.info(f"[AgentInitializer] Migrated {len(keys_to_migrate)} API keys to .env: {list(keys_to_migrate.keys())}")
+
+                logger.info(f"[AgentInitializer] Synced API keys from config.json to .env")
             except Exception as e:
-                logger.warning(f"[AgentInitializer] Failed to migrate API keys: {e}")
+                logger.warning(f"[AgentInitializer] Failed to sync API keys: {e}")
 
     def _start_daily_flush_timer(self):
         """Start a background thread that flushes all agents' memory daily at 23:55."""
diff --git a/channel/web/static/js/console.js b/channel/web/static/js/console.js
index 24e120be..0f6c2a29 100644
--- a/channel/web/static/js/console.js
+++ b/channel/web/static/js/console.js
@@ -806,15 +806,17 @@ function sendMessage() {
 }
 
 function startSSE(requestId, loadingEl, timestamp) {
-    const es = new EventSource(`/stream?request_id=${encodeURIComponent(requestId)}`);
-    activeStreams[requestId] = es;
-
     let botEl = null;
     let stepsEl = null;    // .agent-steps  (thinking summaries + tool indicators)
     let contentEl = null;  // .answer-content (final streaming answer)
     let mediaEl = null;    // .media-content (images & file attachments)
     let accumulatedText = '';
     let currentToolEl = null;
+    let done = false;
+
+    const MAX_RECONNECTS = 10;
+    const RECONNECT_BASE_MS = 1000;
+    let reconnectCount = 0;
 
     function ensureBotEl() {
         if (botEl) return;
@@ -839,180 +841,204 @@ function startSSE(requestId, loadingEl, timestamp) {
         mediaEl = botEl.querySelector('.media-content');
     }
 
-    es.onmessage = function(e) {
-        let item;
-        try { item = JSON.parse(e.data); } catch (_) { return; }
+    function connect() {
+        const es = new EventSource(`/stream?request_id=${encodeURIComponent(requestId)}`);
+        activeStreams[requestId] = es;
 
-        if (item.type === 'delta') {
-            ensureBotEl();
-            accumulatedText += item.content;
-            contentEl.innerHTML = renderMarkdown(accumulatedText);
-            scrollChatToBottom();
+        es.onmessage = function(e) {
+            let item;
+            try { item = JSON.parse(e.data); } catch (_) { return; }
 
-        } else if (item.type === 'tool_start') {
-            ensureBotEl();
+            // Successful data received, reset reconnect counter
+            reconnectCount = 0;
 
-            // Save current thinking as a collapsible step
-            if (accumulatedText.trim()) {
-                const fullText = accumulatedText.trim();
-                const oneLine = fullText.replace(/\n+/g, ' ');
-                const needsTruncate = oneLine.length > 80;
-                const stepEl = document.createElement('div');
-                stepEl.className = 'agent-step agent-thinking-step' + (needsTruncate ? '' : ' no-expand');
-                if (needsTruncate) {
-                    const truncated = oneLine.substring(0, 80) + '…';
-                    stepEl.innerHTML = `
-                        <div class="thinking-header" onclick="this.parentElement.classList.toggle('expanded')">
-                            <i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i>
-                            <span class="thinking-summary">${escapeHtml(truncated)}</span>
-                            <i class="fas fa-chevron-right thinking-chevron"></i>
-                        </div>
-                        <div class="thinking-full">${renderMarkdown(fullText)}</div>`;
-                } else {
-                    stepEl.innerHTML = `
-                        <div class="thinking-header no-toggle">
-                            <i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i>
-                            <span>${escapeHtml(oneLine)}</span>
-                        </div>`;
+            if (item.type === 'delta') {
+                ensureBotEl();
+                accumulatedText += item.content;
+                contentEl.innerHTML = renderMarkdown(accumulatedText);
+                scrollChatToBottom();
+
+            } else if (item.type === 'tool_start') {
+                ensureBotEl();
+
+                // Save current thinking as a collapsible step
+                if (accumulatedText.trim()) {
+                    const fullText = accumulatedText.trim();
+                    const oneLine = fullText.replace(/\n+/g, ' ');
+                    const needsTruncate = oneLine.length > 80;
+                    const stepEl = document.createElement('div');
+                    stepEl.className = 'agent-step agent-thinking-step' + (needsTruncate ? '' : ' no-expand');
+                    if (needsTruncate) {
+                        const truncated = oneLine.substring(0, 80) + '…';
+                        stepEl.innerHTML = `
+                            <div class="thinking-header" onclick="this.parentElement.classList.toggle('expanded')">
+                                <i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i>
+                                <span class="thinking-summary">${escapeHtml(truncated)}</span>
+                                <i class="fas fa-chevron-right thinking-chevron"></i>
+                            </div>
+                            <div class="thinking-full">${renderMarkdown(fullText)}</div>`;
+                    } else {
+                        stepEl.innerHTML = `
+                            <div class="thinking-header no-toggle">
+                                <i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i>
+                                <span>${escapeHtml(oneLine)}</span>
+                            </div>`;
+                    }
+                    stepsEl.appendChild(stepEl);
                 }
-                stepsEl.appendChild(stepEl);
-            }
-            accumulatedText = '';
-            contentEl.innerHTML = '';
+                accumulatedText = '';
+                contentEl.innerHTML = '';
 
-            // Add tool execution indicator (collapsible)
-            currentToolEl = document.createElement('div');
-            currentToolEl.className = 'agent-step agent-tool-step';
-            const argsStr = formatToolArgs(item.arguments || {});
-            currentToolEl.innerHTML = `
-                <div class="tool-header" onclick="this.parentElement.classList.toggle('expanded')">
-                    <i class="fas fa-cog fa-spin text-primary-400 flex-shrink-0 tool-icon"></i>
-                    <span class="tool-name">${item.tool}</span>
-                    <i class="fas fa-chevron-right tool-chevron"></i>
-                </div>
-                <div class="tool-detail">
-                    <div class="tool-detail-section">
-                        <div class="tool-detail-label">Input</div>
-                        <pre class="tool-detail-content">${argsStr}</pre>
+                // Add tool execution indicator (collapsible)
+                currentToolEl = document.createElement('div');
+                currentToolEl.className = 'agent-step agent-tool-step';
+                const argsStr = formatToolArgs(item.arguments || {});
+                currentToolEl.innerHTML = `
+                    <div class="tool-header" onclick="this.parentElement.classList.toggle('expanded')">
+                        <i class="fas fa-cog fa-spin text-primary-400 flex-shrink-0 tool-icon"></i>
+                        <span class="tool-name">${item.tool}</span>
+                        <i class="fas fa-chevron-right tool-chevron"></i>
                     </div>
-                    <div class="tool-detail-section tool-output-section"></div>
-                </div>`;
-            stepsEl.appendChild(currentToolEl);
+                    <div class="tool-detail">
+                        <div class="tool-detail-section">
+                            <div class="tool-detail-label">Input</div>
+                            <pre class="tool-detail-content">${argsStr}</pre>
+                        </div>
+                        <div class="tool-detail-section tool-output-section"></div>
+                    </div>`;
+                stepsEl.appendChild(currentToolEl);
 
-            scrollChatToBottom();
+                scrollChatToBottom();
 
-        } else if (item.type === 'tool_end') {
-            if (currentToolEl) {
-                const isError = item.status !== 'success';
-                const icon = currentToolEl.querySelector('.tool-icon');
-                icon.className = isError
-                    ? 'fas fa-times text-red-400 flex-shrink-0 tool-icon'
-                    : 'fas fa-check text-primary-400 flex-shrink-0 tool-icon';
+            } else if (item.type === 'tool_end') {
+                if (currentToolEl) {
+                    const isError = item.status !== 'success';
+                    const icon = currentToolEl.querySelector('.tool-icon');
+                    icon.className = isError
+                        ? 'fas fa-times text-red-400 flex-shrink-0 tool-icon'
+                        : 'fas fa-check text-primary-400 flex-shrink-0 tool-icon';
 
-                // Show execution time
-                const nameEl = currentToolEl.querySelector('.tool-name');
-                if (item.execution_time !== undefined) {
-                    nameEl.innerHTML += ` <span class="tool-time">${item.execution_time}s</span>`;
+                    // Show execution time
+                    const nameEl = currentToolEl.querySelector('.tool-name');
+                    if (item.execution_time !== undefined) {
+                        nameEl.innerHTML += ` <span class="tool-time">${item.execution_time}s</span>`;
+                    }
+
+                    // Fill output section
+                    const outputSection = currentToolEl.querySelector('.tool-output-section');
+                    if (outputSection && item.result) {
+                        outputSection.innerHTML = `
+                            <div class="tool-detail-label">${isError ? 'Error' : 'Output'}</div>
+                            <pre class="tool-detail-content ${isError ? 'tool-error-text' : ''}">${escapeHtml(String(item.result))}</pre>`;
+                    }
+
+                    if (isError) currentToolEl.classList.add('tool-failed');
+                    currentToolEl = null;
                 }
 
-                // Fill output section
-                const outputSection = currentToolEl.querySelector('.tool-output-section');
-                if (outputSection && item.result) {
-                    outputSection.innerHTML = `
-                        <div class="tool-detail-label">${isError ? 'Error' : 'Output'}</div>
-                        <pre class="tool-detail-content ${isError ? 'tool-error-text' : ''}">${escapeHtml(String(item.result))}</pre>`;
-                }
+            } else if (item.type === 'image') {
+                ensureBotEl();
+                const imgEl = document.createElement('img');
+                imgEl.src = item.content;
+                imgEl.alt = 'screenshot';
+                imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);';
+                imgEl.onclick = () => window.open(item.content, '_blank');
+                mediaEl.appendChild(imgEl);
+                scrollChatToBottom();
 
-                if (isError) currentToolEl.classList.add('tool-failed');
-                currentToolEl = null;
+            } else if (item.type === 'text') {
+                // Intermediate text sent before media items; display it but keep SSE open.
+                ensureBotEl();
+                contentEl.classList.remove('sse-streaming');
+                const textContent = item.content || accumulatedText;
+                if (textContent) contentEl.innerHTML = renderMarkdown(textContent);
+                applyHighlighting(botEl);
+                scrollChatToBottom();
+
+            } else if (item.type === 'video') {
+                ensureBotEl();
+                const wrapper = document.createElement('div');
+                wrapper.innerHTML = _buildVideoHtml(item.content);
+                mediaEl.appendChild(wrapper.firstElementChild || wrapper);
+                scrollChatToBottom();
+
+            } else if (item.type === 'file') {
+                ensureBotEl();
+                const fileName = item.file_name || item.content.split('/').pop();
+                const fileEl = document.createElement('a');
+                fileEl.href = item.content;
+                fileEl.download = fileName;
+                fileEl.target = '_blank';
+                fileEl.className = 'file-attachment';
+                fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);';
+                fileEl.innerHTML = `<i class="fas fa-file-download" style="color:#6b7280;"></i> ${fileName}`;
+                mediaEl.appendChild(fileEl);
+                scrollChatToBottom();
+
+            } else if (item.type === 'phase') {
+                // Coarse progress (e.g. cow install-browser); must not close SSE (unlike "done")
+                ensureBotEl();
+                const wrap = document.createElement('div');
+                wrap.className = 'text-xs sm:text-sm text-slate-600 dark:text-slate-400 border-l-2 border-primary-400 pl-2 py-1 my-0.5';
+                wrap.textContent = String(item.content || '');
+                stepsEl.appendChild(wrap);
+                scrollChatToBottom();
+
+            } else if (item.type === 'done') {
+                done = true;
+                es.close();
+                delete activeStreams[requestId];
+
+                // item.content may be empty when "done" is only a stream-close signal after media.
+                const finalText = item.content || accumulatedText;
+
+                if (!botEl && finalText) {
+                    if (loadingEl) { loadingEl.remove(); loadingEl = null; }
+                    addBotMessage(finalText, new Date((item.timestamp || Date.now() / 1000) * 1000), requestId);
+                } else if (botEl) {
+                    contentEl.classList.remove('sse-streaming');
+                    // Only update text content when there is something new to show.
+                    if (finalText) contentEl.innerHTML = renderMarkdown(finalText);
+                    applyHighlighting(botEl);
+                }
+                scrollChatToBottom();
+
+            } else if (item.type === 'error') {
+                done = true;
+                es.close();
+                delete activeStreams[requestId];
+                if (loadingEl) { loadingEl.remove(); loadingEl = null; }
+                addBotMessage(t('error_send'), new Date());
             }
+        };
 
-        } else if (item.type === 'image') {
-            ensureBotEl();
-            const imgEl = document.createElement('img');
-            imgEl.src = item.content;
-            imgEl.alt = 'screenshot';
-            imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);';
-            imgEl.onclick = () => window.open(item.content, '_blank');
-            mediaEl.appendChild(imgEl);
-            scrollChatToBottom();
-
-        } else if (item.type === 'text') {
-            // Intermediate text sent before media items; display it but keep SSE open.
-            ensureBotEl();
-            contentEl.classList.remove('sse-streaming');
-            const textContent = item.content || accumulatedText;
-            if (textContent) contentEl.innerHTML = renderMarkdown(textContent);
-            applyHighlighting(botEl);
-            scrollChatToBottom();
-
-        } else if (item.type === 'video') {
-            ensureBotEl();
-            const wrapper = document.createElement('div');
-            wrapper.innerHTML = _buildVideoHtml(item.content);
-            mediaEl.appendChild(wrapper.firstElementChild || wrapper);
-            scrollChatToBottom();
-
-        } else if (item.type === 'file') {
-            ensureBotEl();
-            const fileName = item.file_name || item.content.split('/').pop();
-            const fileEl = document.createElement('a');
-            fileEl.href = item.content;
-            fileEl.download = fileName;
-            fileEl.target = '_blank';
-            fileEl.className = 'file-attachment';
-            fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);';
-            fileEl.innerHTML = `<i class="fas fa-file-download" style="color:#6b7280;"></i> ${fileName}`;
-            mediaEl.appendChild(fileEl);
-            scrollChatToBottom();
-
-        } else if (item.type === 'phase') {
-            // Coarse progress (e.g. cow install-browser); must not close SSE (unlike "done")
-            ensureBotEl();
-            const wrap = document.createElement('div');
-            wrap.className = 'text-xs sm:text-sm text-slate-600 dark:text-slate-400 border-l-2 border-primary-400 pl-2 py-1 my-0.5';
-            wrap.textContent = String(item.content || '');
-            stepsEl.appendChild(wrap);
-            scrollChatToBottom();
-
-        } else if (item.type === 'done') {
+        es.onerror = function() {
             es.close();
             delete activeStreams[requestId];
 
-            // item.content may be empty when "done" is only a stream-close signal after media.
-            const finalText = item.content || accumulatedText;
+            if (done) return;
 
-            if (!botEl && finalText) {
-                if (loadingEl) { loadingEl.remove(); loadingEl = null; }
-                addBotMessage(finalText, new Date((item.timestamp || Date.now() / 1000) * 1000), requestId);
-            } else if (botEl) {
+            if (reconnectCount < MAX_RECONNECTS) {
+                reconnectCount++;
+                const delay = Math.min(RECONNECT_BASE_MS * reconnectCount, 5000);
+                console.warn(`[SSE] connection lost for ${requestId}, reconnecting in ${delay}ms (attempt ${reconnectCount}/${MAX_RECONNECTS})`);
+                setTimeout(connect, delay);
+                return;
+            }
+
+            // Exhausted retries, show whatever we have
+            if (loadingEl) { loadingEl.remove(); loadingEl = null; }
+            if (!botEl) {
+                addBotMessage(t('error_send'), new Date());
+            } else if (accumulatedText) {
                 contentEl.classList.remove('sse-streaming');
-                // Only update text content when there is something new to show.
-                if (finalText) contentEl.innerHTML = renderMarkdown(finalText);
+                contentEl.innerHTML = renderMarkdown(accumulatedText);
                 applyHighlighting(botEl);
             }
-            scrollChatToBottom();
+        };
+    }
 
-        } else if (item.type === 'error') {
-            es.close();
-            delete activeStreams[requestId];
-            if (loadingEl) { loadingEl.remove(); loadingEl = null; }
-            addBotMessage(t('error_send'), new Date());
-        }
-    };
-
-    es.onerror = function() {
-        es.close();
-        delete activeStreams[requestId];
-        if (loadingEl) { loadingEl.remove(); loadingEl = null; }
-        if (!botEl) {
-            addBotMessage(t('error_send'), new Date());
-        } else if (accumulatedText) {
-            contentEl.classList.remove('sse-streaming');
-            contentEl.innerHTML = renderMarkdown(accumulatedText);
-            applyHighlighting(botEl);
-        }
-    };
+    connect();
 }
 
 function startPolling() {
diff --git a/channel/web/web_channel.py b/channel/web/web_channel.py
index 32b27062..bd686f9f 100644
--- a/channel/web/web_channel.py
+++ b/channel/web/web_channel.py
@@ -329,14 +329,18 @@ class WebChannel(ChatChannel):
         """
         SSE generator for a given request_id.
         Yields UTF-8 encoded bytes to avoid WSGI Latin-1 mangling.
+        Supports client reconnection: the queue is only removed after a
+        "done" event is consumed, so a new GET /stream with the same
+        request_id can resume reading remaining events.
         """
         if request_id not in self.sse_queues:
             yield b"data: {\"type\": \"error\", \"message\": \"invalid request_id\"}\n\n"
             return
 
         q = self.sse_queues[request_id]
-        timeout = 300  # 5 minutes max
-        deadline = time.time() + timeout
+        idle_timeout = 600  # 10 minutes without any real event
+        deadline = time.time() + idle_timeout
+        done = False
 
         try:
             while time.time() < deadline:
@@ -346,13 +350,18 @@ class WebChannel(ChatChannel):
                     yield b": keepalive\n\n"
                     continue
 
+                # Real event received, reset idle deadline
+                deadline = time.time() + idle_timeout
+
                 payload = json.dumps(item, ensure_ascii=False)
                 yield f"data: {payload}\n\n".encode("utf-8")
 
                 if item.get("type") == "done":
+                    done = True
                     break
         finally:
-            self.sse_queues.pop(request_id, None)
+            if done:
+                self.sse_queues.pop(request_id, None)
 
     def poll_response(self):
         """
diff --git a/docs/en/tools/vision.mdx b/docs/en/tools/vision.mdx
new file mode 100644
index 00000000..cebecbea
--- /dev/null
+++ b/docs/en/tools/vision.mdx
@@ -0,0 +1,72 @@
+---
+title: vision - Image Analysis
+description: Analyze image content (recognition, description, OCR, etc.)
+---
+
+Analyze local images or image URLs using Vision API. Supports content description, text extraction (OCR), object recognition, and more.
+
+## Model Selection
+
+The vision tool uses a multi-level auto-selection strategy with automatic fallback — no manual configuration required:
+
+1. **Main model** — uses the currently configured main model for image recognition (zero extra cost)
+2. **Other configured models** — auto-discovers other models with configured API keys as alternatives
+3. **OpenAI** — uses `open_ai_api_key` to call gpt-4.1-mini
+4. **LinkAI** — uses `linkai_api_key` to call LinkAI vision service
+
+When `use_linkai=true`, LinkAI is promoted to the highest priority.
+
+If the current provider fails, the tool automatically tries the next one until it succeeds or all fail.
+
+### Supported Models
+
+| Vendor | Vision Model | Notes |
+| --- | --- | --- |
+| OpenAI / Compatible | Main model | All OpenAI-compatible multimodal models |
+| Qwen (DashScope) | Main model | Via MultiModalConversation API |
+| Claude | Main model | Anthropic native image format |
+| Gemini | Main model | inlineData format |
+| Doubao | Main model | doubao-seed-2-0 series natively supported |
+| Kimi (Moonshot) | Main model | kimi-k2.5 natively supported |
+| ZhipuAI | glm-5v-turbo | Always uses dedicated vision model |
+| MiniMax | MiniMax-Text-01 | Always uses dedicated vision model |
+
+<Note>
+  ZhipuAI and MiniMax text models do not support image understanding, so their dedicated vision models are always used automatically.
+</Note>
+
+## Parameters
+
+| Parameter | Type | Required | Description |
+| --- | --- | --- | --- |
+| `image` | string | Yes | Local file path or HTTP(S) image URL |
+| `question` | string | Yes | Question to ask about the image |
+
+Supported image formats: jpg, jpeg, png, gif, webp
+
+## Custom Configuration
+
+To specify a particular model for the vision tool, add to `config.json`:
+
+```json
+{
+    "tool": {
+        "vision": {
+            "model": "gpt-4o"
+        }
+    }
+}
+```
+
+In most cases no configuration is needed. The tool works automatically as long as the main model supports multimodal input or any vision-capable API key is configured.
+
+## Use Cases
+
+- Describe image content
+- Extract text from images (OCR)
+- Identify objects, colors, scenes
+- Analyze screenshots and scanned documents
+
+<Note>
+  Images larger than 1MB are automatically compressed (max edge 1536px). All images (including remote URLs) are converted to base64 for transmission to ensure compatibility with all model backends.
+</Note>
diff --git a/docs/ja/tools/vision.mdx b/docs/ja/tools/vision.mdx
new file mode 100644
index 00000000..f34bf58a
--- /dev/null
+++ b/docs/ja/tools/vision.mdx
@@ -0,0 +1,72 @@
+---
+title: vision - 画像分析
+description: 画像コンテンツの分析（認識、説明、OCR など）
+---
+
+Vision API を使用してローカル画像や画像 URL を分析します。コンテンツの説明、テキスト抽出（OCR）、オブジェクト認識などに対応しています。
+
+## モデル選択
+
+Vision ツールは多段階の自動選択＋自動フォールバック戦略を採用しており、手動設定なしで利用可能です：
+
+1. **メインモデル** — 現在設定されているメインモデルで画像認識を実行（追加コストなし）
+2. **その他の設定済みモデル** — API キーが設定されている他のマルチモーダルモデルを自動検出
+3. **OpenAI** — `open_ai_api_key` を使用して gpt-4.1-mini を呼び出し
+4. **LinkAI** — `linkai_api_key` を使用して LinkAI ビジョンサービスを呼び出し
+
+`use_linkai=true` の場合、LinkAI が最優先になります。
+
+現在のプロバイダーが失敗した場合、成功するかすべて失敗するまで自動的に次のプロバイダーを試行します。
+
+### 対応モデル
+
+| ベンダー | ビジョンモデル | 説明 |
+| --- | --- | --- |
+| OpenAI / 互換プロトコル | メインモデル | すべての OpenAI 互換マルチモーダルモデルに対応 |
+| 通義千問 (DashScope) | メインモデル | MultiModalConversation API 経由 |
+| Claude | メインモデル | Anthropic ネイティブ画像形式 |
+| Gemini | メインモデル | inlineData 形式 |
+| 豆包 (Doubao) | メインモデル | doubao-seed-2-0 シリーズがネイティブ対応 |
+| Kimi (Moonshot) | メインモデル | kimi-k2.5 がネイティブ対応 |
+| 智谱 AI | glm-5v-turbo | 常にビジョン専用モデルを使用 |
+| MiniMax | MiniMax-Text-01 | 常にビジョン専用モデルを使用 |
+
+<Note>
+  智谱 AI と MiniMax のテキストモデルは画像理解に対応していないため、対応するビジョン専用モデルが自動的に使用されます。
+</Note>
+
+## パラメータ
+
+| パラメータ | 型 | 必須 | 説明 |
+| --- | --- | --- | --- |
+| `image` | string | はい | ローカルファイルパスまたは HTTP(S) 画像 URL |
+| `question` | string | はい | 画像に対する質問 |
+
+対応画像形式：jpg、jpeg、png、gif、webp
+
+## カスタム設定
+
+Vision ツールで使用するモデルを指定するには、`config.json` に以下を追加します：
+
+```json
+{
+    "tool": {
+        "vision": {
+            "model": "gpt-4o"
+        }
+    }
+}
+```
+
+ほとんどの場合、設定は不要です。メインモデルがマルチモーダルに対応しているか、ビジョン対応の API キーが設定されていれば自動的に動作します。
+
+## ユースケース
+
+- 画像コンテンツの説明
+- 画像からのテキスト抽出（OCR）
+- オブジェクト、色、シーンの識別
+- スクリーンショットやスキャン文書の分析
+
+<Note>
+  1MB を超える画像は自動的に圧縮されます（最大辺 1536px）。すべての画像（リモート URL を含む）は base64 に変換して送信され、すべてのモデルバックエンドとの互換性を確保します。
+</Note>
diff --git a/docs/tools/vision.mdx b/docs/tools/vision.mdx
index 839212b3..4e1089e0 100644
--- a/docs/tools/vision.mdx
+++ b/docs/tools/vision.mdx
@@ -5,14 +5,49 @@ description: 分析图片内容（识别、描述、OCR 等）
 
 使用 Vision API 分析本地图片或图片 URL，支持内容描述、文字提取（OCR）、物体识别等。
 
-## 依赖
+## 模型选择
 
-需要配置至少一个 API Key（通过 `env_config` 工具或工作空间 `.env` 文件配置）：
+Vision 工具采用多级自动选择 + 自动兜底策略，无需手动配置即可使用：
 
-| 后端 | 环境变量 | 优先级 |
+1. **主模型** — 优先使用当前配置的主模型进行图像识别（需要是多模态模型）
+2. **其他已配置模型** — 自动发现已配置 API Key 的其他多模态模型作为备选
+
+如果当前 provider 调用失败，会自动尝试下一个，直到成功或全部失败。
+
+### 支持的模型
+
+| 厂商 | 视觉模型 | 说明 |
 | --- | --- | --- |
-| OpenAI | `OPENAI_API_KEY` | 优先使用 |
-| LinkAI | `LINKAI_API_KEY` | 备选 |
+| OpenAI / 兼容协议 | 使用主模型 | 支持所有 OpenAI 协议兼容的多模态模型 |
+| 通义千问 (DashScope) | 使用主模型 | 例如 qwen3.6-plus 等 |
+| Claude | 使用主模型 | Anthropic 原生图像格式 |
+| Gemini | 使用主模型 | inlineData 格式 |
+| 豆包 (Doubao) | 使用主模型 | doubao-seed-2-0 系列原生支持 |
+| Kimi (Moonshot) | 使用主模型 | kimi-k2.5 原生支持 |
+| 智谱 AI | glm-5v-turbo | 固定使用视觉专用模型 |
+| MiniMax | MiniMax-Text-01 | 固定使用视觉专用模型 |
+
+<Note>
+  智谱和 MiniMax 的文本模型不支持图像理解，因此始终使用对应的视觉专用模型，无需手动指定。
+</Note>
+
+> 当 `use_linkai=true` 时，默认使用 LinkAI 的多模态模型进行图像识别。
+
+## 自定义配置
+
+如果希望指定 Vision 使用的模型，可在 `config.json` 中配置，例如：
+
+```json
+{
+    "tool": {
+        "vision": {
+            "model": "gpt-4o"
+        }
+    }
+}
+```
+
+大多数情况下无需配置，主模型支持多模态或配置任意一个支持视觉的 API Key 即可自动工作。
 
 ## 参数
 
@@ -20,17 +55,18 @@ description: 分析图片内容（识别、描述、OCR 等）
 | --- | --- | --- | --- |
 | `image` | string | 是 | 本地文件路径或 HTTP(S) 图片 URL |
 | `question` | string | 是 | 对图片提出的问题 |
-| `model` | string | 否 | 模型名称（默认 gpt-4.1-mini） |
 
 支持的图片格式：jpg、jpeg、png、gif、webp
 
+
+
 ## 使用场景
 
 - 描述图片中的内容
 - 提取图片中的文字（OCR）
 - 识别物体、颜色、场景
-- 分析截图、文档扫描件
+- 分析截图、文档扫描图片等
 
 <Note>
-  超过 1MB 的图片会自动压缩后上传。如果未配置任何 Vision API Key，该工具不会被加载。
+  超过 1MB 的图片会自动压缩后上传，所有图片（包括远程 URL）会统一转为 base64 传输，确保兼容所有模型后端。
 </Note>
diff --git a/models/bot.py b/models/bot.py
index ca6e1aa1..f5f72e7d 100644
--- a/models/bot.py
+++ b/models/bot.py
@@ -2,12 +2,27 @@
 Auto-replay chat robot abstract class
 """
 
-
 from bridge.context import Context
 from bridge.reply import Reply
 
 
 class Bot(object):
+    """
+    Base class for all chat-bot implementations.
+
+    Subclasses may also implement:
+
+        call_with_tools(messages, tools=None, stream=False, **kwargs)
+            -> dict | generator  (OpenAI-compatible format)
+
+        call_vision(image_url, question, model=None, max_tokens=1000)
+            -> dict with keys: model, content, usage  (or error/message)
+
+    These are NOT defined here to avoid shadowing concrete implementations
+    provided by mixin classes (e.g. OpenAICompatibleBot) in the MRO.
+    Use ``hasattr(bot, 'call_vision')`` to detect support at runtime.
+    """
+
     def reply(self, query, context: Context = None) -> Reply:
         """
         bot auto-reply content
diff --git a/models/claudeapi/claude_api_bot.py b/models/claudeapi/claude_api_bot.py
index 5dcf9173..ffbb74dd 100644
--- a/models/claudeapi/claude_api_bot.py
+++ b/models/claudeapi/claude_api_bot.py
@@ -1,7 +1,10 @@
 # encoding:utf-8
 
+import base64
 import json
+import re
 import time
+from typing import Optional
 
 import requests
 
@@ -224,6 +227,79 @@ class ClaudeAPIBot(Bot, OpenAIImage):
             return 64000
         return 8192
 
+    @staticmethod
+    def _parse_data_url(data_url: str):
+        """Parse a data:<mime>;base64,<data> URL into (media_type, base64_data)."""
+        m = re.match(r"^data:([^;]+);base64,(.+)$", data_url, re.DOTALL)  # DOTALL: the payload group may span newlines
+        if m:
+            return m.group(1), m.group(2)
+        return None, None  # input did not match the data:<mime>;base64,<data> shape
+
+    def call_vision(self, image_url: str, question: str,
+                    model: Optional[str] = None,
+                    max_tokens: int = 1000) -> dict:
+        """Analyze an image with the Claude Messages API (native image blocks); returns model/content/usage, or error/message on failure."""
+        try:
+            actual_model = model or self._model_mapping(conf().get("model"))  # explicit model wins, else map the configured "model"
+
+            # Build Claude-native image content block
+            if image_url.startswith("data:"):
+                media_type, b64_data = self._parse_data_url(image_url)
+                if not b64_data:
+                    return {"error": True, "message": "Invalid base64 data URL"}
+                image_block = {
+                    "type": "image",
+                    "source": {"type": "base64",
+                               "media_type": media_type or "image/jpeg",
+                               "data": b64_data},
+                }
+            else:
+                image_block = {
+                    "type": "image",
+                    "source": {"type": "url", "url": image_url},
+                }
+
+            data = {
+                "model": actual_model,
+                "max_tokens": max_tokens,
+                "messages": [{
+                    "role": "user",
+                    "content": [
+                        image_block,
+                        {"type": "text", "text": question},
+                    ],
+                }],
+            }
+
+            headers = {
+                "x-api-key": self.api_key,
+                "anthropic-version": "2023-06-01",
+                "content-type": "application/json",
+            }
+            proxies = {"http": self.proxy, "https": self.proxy} if self.proxy else None
+            resp = requests.post(f"{self.api_base}/messages",
+                                 headers=headers, json=data, proxies=proxies, timeout=60)  # without a timeout requests can block indefinitely
+
+            if resp.status_code != 200:
+                return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}
+
+            body = resp.json()
+            text_parts = [b.get("text", "") for b in body.get("content", [])
+                          if b.get("type") == "text"]  # keep only text blocks of the reply
+            usage = body.get("usage", {})
+            return {
+                "model": actual_model,
+                "content": "".join(text_parts),
+                "usage": {
+                    "prompt_tokens": usage.get("input_tokens", 0),
+                    "completion_tokens": usage.get("output_tokens", 0),
+                    "total_tokens": usage.get("input_tokens", 0) + usage.get("output_tokens", 0),
+                },
+            }
+        except Exception as e:  # failures (including a timeout) are reported in the returned dict, not raised
+            logger.error(f"[CLAUDE] call_vision error: {e}")
+            return {"error": True, "message": str(e)}
+
     def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
         """
         Call Claude API with tool support for agent integration
diff --git a/models/dashscope/dashscope_bot.py b/models/dashscope/dashscope_bot.py
index 0887751f..4d4d628f 100644
--- a/models/dashscope/dashscope_bot.py
+++ b/models/dashscope/dashscope_bot.py
@@ -1,6 +1,8 @@
 # encoding:utf-8
 
 import json
+from typing import Optional
+
 from models.bot import Bot
 from models.session_manager import SessionManager
 from bridge.context import ContextType
@@ -153,6 +155,56 @@ class DashscopeBot(Bot):
             else:
                 return result
 
+    def call_vision(self, image_url: str, question: str,
+                    model: Optional[str] = None,
+                    max_tokens: int = 1000) -> dict:
+        """Analyze an image using DashScope MultiModalConversation API."""
+        try:
+            dashscope.api_key = self.api_key  # assigned on the dashscope object itself, so it persists beyond this call
+            vision_model = model or "qwen-vl-max"  # literal default; no configured model is read here
+
+            # DashScope multimodal format: {"image": url} + {"text": question}
+            messages = [{
+                "role": "user",
+                "content": [
+                    {"image": image_url},
+                    {"text": question},
+                ],
+            }]
+
+            response = MultiModalConversation.call(
+                model=vision_model,
+                messages=messages,
+                max_tokens=max_tokens,
+            )
+
+            if response.status_code != HTTPStatus.OK:
+                return {
+                    "error": True,
+                    "message": f"{response.code} - {response.message}",
+                }
+
+            resp_dict = self._response_to_dict(response)
+            choice = resp_dict["output"]["choices"][0]  # a missing key or empty list raises and is handled by the except below
+            content = choice.get("message", {}).get("content", "")
+            if isinstance(content, list):  # list content: concatenate the "text" value of each dict item
+                content = "".join(
+                    item.get("text", "") for item in content if isinstance(item, dict)
+                )
+            usage = resp_dict.get("usage", {})
+            return {
+                "model": vision_model,
+                "content": content,
+                "usage": {
+                    "prompt_tokens": usage.get("input_tokens", 0),
+                    "completion_tokens": usage.get("output_tokens", 0),
+                    "total_tokens": usage.get("total_tokens", 0),
+                },
+            }
+        except Exception as e:  # failures are reported in the returned dict, not raised
+            logger.error(f"[DASHSCOPE] call_vision error: {e}")
+            return {"error": True, "message": str(e)}
+
     def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
         """
         Call DashScope API with tool support for agent integration
diff --git a/models/doubao/doubao_bot.py b/models/doubao/doubao_bot.py
index cfe4ba5c..b31516ec 100644
--- a/models/doubao/doubao_bot.py
+++ b/models/doubao/doubao_bot.py
@@ -2,6 +2,7 @@
 
 import json
 import time
+from typing import Optional
 
 import requests
 from models.bot import Bot
@@ -147,6 +148,49 @@ class DoubaoBot(Bot):
             else:
                 return result
 
+    def call_vision(self, image_url: str, question: str,
+                    model: Optional[str] = None,
+                    max_tokens: int = 1000) -> dict:
+        """Analyze an image using Doubao (Volcengine Ark) OpenAI-compatible API."""
+        try:
+            vision_model = model or self.args.get("model", "doubao-seed-2-0-pro-260215")  # precedence: argument, self.args["model"], literal default
+            payload = {
+                "model": vision_model,
+                "max_tokens": max_tokens,
+                "messages": [{
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": question},
+                        {"type": "image_url", "image_url": {"url": image_url}},
+                    ],
+                }],
+            }
+            headers = {
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json",
+            }
+            resp = requests.post(f"{self.base_url}/chat/completions",
+                                 headers=headers, json=payload, timeout=60)
+            if resp.status_code != 200:
+                return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}
+            data = resp.json()
+            if "error" in data:  # error object delivered with HTTP 200
+                return {"error": True, "message": data["error"].get("message", str(data["error"]))}
+            content = data.get("choices", [{}])[0].get("message", {}).get("content", "")  # an empty "choices" list raises IndexError, handled by the except below
+            usage = data.get("usage", {})
+            return {
+                "model": vision_model,
+                "content": content,
+                "usage": {
+                    "prompt_tokens": usage.get("prompt_tokens", 0),
+                    "completion_tokens": usage.get("completion_tokens", 0),
+                    "total_tokens": usage.get("total_tokens", 0),
+                },
+            }
+        except Exception as e:  # failures are reported in the returned dict, not raised
+            logger.error(f"[DOUBAO] call_vision error: {e}")
+            return {"error": True, "message": str(e)}
+
     # ==================== Agent mode support ====================
 
     def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
@@ -434,31 +478,37 @@ class DoubaoBot(Bot):
                 continue
 
             if role == "user":
-                text_parts = []
-                tool_results = []
+                has_tool_result = any(
+                    isinstance(b, dict) and b.get("type") == "tool_result" for b in content
+                )
+                if has_tool_result:
+                    text_parts = []
+                    tool_results = []
 
-                for block in content:
-                    if not isinstance(block, dict):
-                        continue
-                    if block.get("type") == "text":
-                        text_parts.append(block.get("text", ""))
-                    elif block.get("type") == "tool_result":
-                        tool_call_id = block.get("tool_use_id") or ""
-                        result_content = block.get("content", "")
-                        if not isinstance(result_content, str):
-                            result_content = json.dumps(result_content, ensure_ascii=False)
-                        tool_results.append({
-                            "role": "tool",
-                            "tool_call_id": tool_call_id,
-                            "content": result_content
-                        })
+                    for block in content:
+                        if not isinstance(block, dict):
+                            continue
+                        if block.get("type") == "text":
+                            text_parts.append(block.get("text", ""))
+                        elif block.get("type") == "tool_result":
+                            tool_call_id = block.get("tool_use_id") or ""
+                            result_content = block.get("content", "")
+                            if not isinstance(result_content, str):
+                                result_content = json.dumps(result_content, ensure_ascii=False)
+                            tool_results.append({
+                                "role": "tool",
+                                "tool_call_id": tool_call_id,
+                                "content": result_content
+                            })
 
-                # Tool results first (must come right after assistant with tool_calls)
-                for tr in tool_results:
-                    converted.append(tr)
+                    for tr in tool_results:
+                        converted.append(tr)
 
-                if text_parts:
-                    converted.append({"role": "user", "content": "\n".join(text_parts)})
+                    if text_parts:
+                        converted.append({"role": "user", "content": "\n".join(text_parts)})
+                else:
+                    # Keep as-is for multimodal content (e.g. image_url blocks)
+                    converted.append(msg)
 
             elif role == "assistant":
                 openai_msg = {"role": "assistant"}
diff --git a/models/gemini/google_gemini_bot.py b/models/gemini/google_gemini_bot.py
index e49a8bf3..aa7199ca 100644
--- a/models/gemini/google_gemini_bot.py
+++ b/models/gemini/google_gemini_bot.py
@@ -12,6 +12,8 @@ import mimetypes
 import os
 import re
 import time
+from typing import Optional
+
 import requests
 from models.bot import Bot
 from models.session_manager import SessionManager
@@ -144,7 +146,12 @@ class GoogleGeminiBot(Bot):
             return "", []
         pattern = r"\[图片:\s*([^\]]+)\]"
         image_paths = [m.strip().strip("'\"") for m in re.findall(pattern, content) if m.strip()]
-        cleaned_text = re.sub(pattern, "", content)
+        # Replace markers with path-only hints so the model still knows the
+        # original file location (needed when it calls tools like vision).
+        def _replace_with_hint(m):
+            path = m.group(1).strip().strip("'\"")
+            return f"[attached image: {path}]"
+        cleaned_text = re.sub(pattern, _replace_with_hint, content)
         cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text).strip()
         return cleaned_text, image_paths
 
@@ -225,6 +232,57 @@ class GoogleGeminiBot(Bot):
         logger.warning(f"[Gemini] Unsupported image URL format: {image_url[:120]}")
         return None
 
+    def call_vision(self, image_url: str, question: str,
+                    model: Optional[str] = None,
+                    max_tokens: int = 1000) -> dict:
+        """Analyze an image using Gemini REST API."""
+        try:
+            model_name = model or self.model or "gemini-2.0-flash"  # precedence: argument, self.model, literal default
+            image_part = self._build_inline_part_from_image_url({"url": image_url})
+            if not image_part:  # helper produced no usable part for this URL
+                return {"error": True, "message": f"Cannot process image URL: {image_url[:120]}"}
+
+            payload = {
+                "contents": [{
+                    "role": "user",
+                    "parts": [image_part, {"text": question}],
+                }],
+                "generationConfig": {"maxOutputTokens": max_tokens},
+                "safetySettings": [
+                    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
+                    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
+                    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
+                    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
+                ],
+            }
+            endpoint = f"{self.api_base}/v1beta/models/{model_name}:generateContent"
+            headers = {"x-goog-api-key": self.api_key, "Content-Type": "application/json"}
+            resp = requests.post(endpoint, headers=headers, json=payload, timeout=60)
+
+            if resp.status_code != 200:
+                return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}
+
+            body = resp.json()
+            candidates = body.get("candidates", [])
+            text_parts = []
+            for part in candidates[0].get("content", {}).get("parts", []) if candidates else []:  # only the first candidate is read
+                if "text" in part:
+                    text_parts.append(part["text"])
+
+            usage_meta = body.get("usageMetadata", {})
+            return {  # NOTE(review): with no candidates this still returns content "" and no "error" key
+                "model": model_name,
+                "content": "".join(text_parts),
+                "usage": {
+                    "prompt_tokens": usage_meta.get("promptTokenCount", 0),
+                    "completion_tokens": usage_meta.get("candidatesTokenCount", 0),
+                    "total_tokens": usage_meta.get("totalTokenCount", 0),
+                },
+            }
+        except Exception as e:  # failures are reported in the returned dict, not raised
+            logger.error(f"[Gemini] call_vision error: {e}")
+            return {"error": True, "message": str(e)}
+
     def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
         """
         Call Gemini API with tool support using REST API (following official docs)
diff --git a/models/minimax/minimax_bot.py b/models/minimax/minimax_bot.py
index af80e795..983a4132 100644
--- a/models/minimax/minimax_bot.py
+++ b/models/minimax/minimax_bot.py
@@ -2,6 +2,8 @@
 
 import time
 import json
+from typing import Optional
+
 import requests
 
 from models.bot import Bot
@@ -175,6 +177,51 @@ class MinimaxBot(Bot):
             else:
                 return result
 
+    def call_vision(self, image_url: str, question: str,
+                    model: Optional[str] = None, max_tokens: int = 1000) -> dict:
+        """Analyze an image through the MiniMax OpenAI-compatible chat endpoint.
+
+        ``model`` is accepted but ignored: MiniMax-Text-01 is always used because other
+        MiniMax models do not support vision. Returns a dict with ``model``, ``content``
+        and ``usage``, or ``{"error": True, "message": ...}`` on any failure.
+        """
+        try:
+            vision_model = "MiniMax-Text-01"
+            user_content = [
+                {"type": "text", "text": question},
+                {"type": "image_url", "image_url": {"url": image_url}},
+            ]
+            payload = {
+                "model": vision_model,
+                "max_tokens": max_tokens,
+                "messages": [{"role": "user", "content": user_content}],
+            }
+            headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
+            resp = requests.post(f"{self.api_base}/chat/completions",
+                                 headers=headers, json=payload, timeout=60)
+            if resp.status_code != 200:
+                return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}
+            data = resp.json()
+            err = data.get("error")
+            if err:  # a null/empty "error" field is treated as "no error"
+                detail = err.get("message", str(err)) if isinstance(err, dict) else str(err)
+                return {"error": True, "message": detail}
+            choices = data.get("choices") or [{}]  # tolerate a null or empty list
+            content = (choices[0].get("message") or {}).get("content") or ""
+            usage = data.get("usage") or {}
+            return {
+                "model": vision_model,
+                "content": content,
+                "usage": {
+                    "prompt_tokens": usage.get("prompt_tokens", 0),
+                    "completion_tokens": usage.get("completion_tokens", 0),
+                    "total_tokens": usage.get("total_tokens", 0),
+                },
+            }
+        except Exception as e:
+            logger.error(f"[MINIMAX] call_vision error: {e}")
+            return {"error": True, "message": str(e)}
+
     def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
         """
         Call MiniMax API with tool support for agent integration
@@ -273,37 +320,41 @@ class MinimaxBot(Bot):
             if role == "user":
                 # Handle user message
                 if isinstance(content, list):
-                    # Extract text from content blocks
-                    text_parts = []
-                    tool_results = []
+                    has_tool_result = any(
+                        isinstance(b, dict) and b.get("type") == "tool_result" for b in content
+                    )
+                    if has_tool_result:
+                        text_parts = []
+                        tool_results = []
 
-                    for block in content:
-                        if isinstance(block, dict):
-                            if block.get("type") == "text":
-                                text_parts.append(block.get("text", ""))
-                            elif block.get("type") == "tool_result":
-                                # Tool result should be a separate message with role="tool"
-                                tool_call_id = block.get("tool_use_id") or ""
-                                if not tool_call_id:
-                                    logger.warning(f"[MINIMAX] tool_result missing tool_use_id")
-                                result_content = block.get("content", "")
-                                if not isinstance(result_content, str):
-                                    result_content = json.dumps(result_content, ensure_ascii=False)
-                                tool_results.append({
-                                    "role": "tool",
-                                    "tool_call_id": tool_call_id,
-                                    "content": result_content
-                                })
+                        for block in content:
+                            if isinstance(block, dict):
+                                if block.get("type") == "text":
+                                    text_parts.append(block.get("text", ""))
+                                elif block.get("type") == "tool_result":
+                                    tool_call_id = block.get("tool_use_id") or ""
+                                    if not tool_call_id:
+                                        logger.warning(f"[MINIMAX] tool_result missing tool_use_id")
+                                    result_content = block.get("content", "")
+                                    if not isinstance(result_content, str):
+                                        result_content = json.dumps(result_content, ensure_ascii=False)
+                                    tool_results.append({
+                                        "role": "tool",
+                                        "tool_call_id": tool_call_id,
+                                        "content": result_content
+                                    })
 
-                    if text_parts:
-                        converted.append({
-                            "role": "user",
-                            "content": "\n".join(text_parts)
-                        })
+                        if text_parts:
+                            converted.append({
+                                "role": "user",
+                                "content": "\n".join(text_parts)
+                            })
 
-                    # Add all tool results (not just the last one)
-                    for tool_result in tool_results:
-                        converted.append(tool_result)
+                        for tool_result in tool_results:
+                            converted.append(tool_result)
+                    else:
+                        # Keep as-is for multimodal content (e.g. image_url blocks)
+                        converted.append(msg)
                 else:
                     # Simple text content
                     converted.append({
diff --git a/models/moonshot/moonshot_bot.py b/models/moonshot/moonshot_bot.py
index ded011ca..4d35400e 100644
--- a/models/moonshot/moonshot_bot.py
+++ b/models/moonshot/moonshot_bot.py
@@ -2,6 +2,7 @@
 
 import json
 import time
+from typing import Optional
 
 import requests
 from models.bot import Bot
@@ -147,6 +148,49 @@ class MoonshotBot(Bot):
             else:
                 return result
 
+    def call_vision(self, image_url: str, question: str,
+                    model: Optional[str] = None, max_tokens: int = 1000) -> dict:
+        """Analyze an image through the Moonshot (Kimi) OpenAI-compatible chat endpoint.
+        Uses ``model`` if given, else ``self.args["model"]`` (fallback "kimi-k2.5"). Returns
+        ``model``/``content``/``usage``, or ``{"error": True, "message": ...}`` on failure.
+        """
+        try:
+            vision_model = model or self.args.get("model", "kimi-k2.5")
+            user_content = [
+                {"type": "text", "text": question},
+                {"type": "image_url", "image_url": {"url": image_url}},
+            ]
+            payload = {
+                "model": vision_model,
+                "max_tokens": max_tokens,
+                "messages": [{"role": "user", "content": user_content}],
+            }
+            headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
+            resp = requests.post(f"{self.base_url}/chat/completions",
+                                 headers=headers, json=payload, timeout=60)
+            if resp.status_code != 200:
+                return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}
+            data = resp.json()
+            err = data.get("error")
+            if err:  # a null/empty "error" field is treated as "no error"
+                detail = err.get("message", str(err)) if isinstance(err, dict) else str(err)
+                return {"error": True, "message": detail}
+            choices = data.get("choices") or [{}]  # tolerate a null or empty list
+            content = (choices[0].get("message") or {}).get("content") or ""
+            usage = data.get("usage") or {}
+            return {
+                "model": vision_model,
+                "content": content,
+                "usage": {
+                    "prompt_tokens": usage.get("prompt_tokens", 0),
+                    "completion_tokens": usage.get("completion_tokens", 0),
+                    "total_tokens": usage.get("total_tokens", 0),
+                },
+            }
+        except Exception as e:
+            logger.error(f"[MOONSHOT] call_vision error: {e}")
+            return {"error": True, "message": str(e)}
+
     # ==================== Agent mode support ====================
 
     def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
@@ -435,31 +479,37 @@ class MoonshotBot(Bot):
                 continue
 
             if role == "user":
-                text_parts = []
-                tool_results = []
+                has_tool_result = any(
+                    isinstance(b, dict) and b.get("type") == "tool_result" for b in content
+                )
+                if has_tool_result:
+                    text_parts = []
+                    tool_results = []
 
-                for block in content:
-                    if not isinstance(block, dict):
-                        continue
-                    if block.get("type") == "text":
-                        text_parts.append(block.get("text", ""))
-                    elif block.get("type") == "tool_result":
-                        tool_call_id = block.get("tool_use_id") or ""
-                        result_content = block.get("content", "")
-                        if not isinstance(result_content, str):
-                            result_content = json.dumps(result_content, ensure_ascii=False)
-                        tool_results.append({
-                            "role": "tool",
-                            "tool_call_id": tool_call_id,
-                            "content": result_content
-                        })
+                    for block in content:
+                        if not isinstance(block, dict):
+                            continue
+                        if block.get("type") == "text":
+                            text_parts.append(block.get("text", ""))
+                        elif block.get("type") == "tool_result":
+                            tool_call_id = block.get("tool_use_id") or ""
+                            result_content = block.get("content", "")
+                            if not isinstance(result_content, str):
+                                result_content = json.dumps(result_content, ensure_ascii=False)
+                            tool_results.append({
+                                "role": "tool",
+                                "tool_call_id": tool_call_id,
+                                "content": result_content
+                            })
 
-                # Tool results first (must come right after assistant with tool_calls)
-                for tr in tool_results:
-                    converted.append(tr)
+                    for tr in tool_results:
+                        converted.append(tr)
 
-                if text_parts:
-                    converted.append({"role": "user", "content": "\n".join(text_parts)})
+                    if text_parts:
+                        converted.append({"role": "user", "content": "\n".join(text_parts)})
+                else:
+                    # Keep as-is for multimodal content (e.g. image_url blocks)
+                    converted.append(msg)
 
             elif role == "assistant":
                 openai_msg = {"role": "assistant"}
diff --git a/models/openai_compatible_bot.py b/models/openai_compatible_bot.py
index baac0681..6d4d314e 100644
--- a/models/openai_compatible_bot.py
+++ b/models/openai_compatible_bot.py
@@ -9,6 +9,8 @@ This includes: OpenAI, LinkAI, Azure OpenAI, and many third-party providers.
 
 import json
 import openai
+import requests
+from typing import Optional
 from common.log import logger
 from agent.protocol.message_utils import drop_orphaned_tool_results_openai
 
@@ -306,3 +308,51 @@ class OpenAICompatibleBot:
                 openai_messages.append(msg)
 
         return drop_orphaned_tool_results_openai(openai_messages)
+
+    def call_vision(self, image_url: str, question: str,
+                    model: Optional[str] = None, max_tokens: int = 1000) -> dict:
+        """Analyze an image using the OpenAI-compatible /chat/completions endpoint.
+        Uses ``model`` if given, else the "model" entry of ``get_api_config()`` (fallback
+        "gpt-4o"). ``max_tokens`` is accepted but is not included in the request. Returns
+        ``model``/``content``/``usage``, or ``{"error": True, "message": ...}`` on failure.
+        """
+        try:
+            api_config = self.get_api_config()
+            vision_model = model or api_config.get("model", "gpt-4o")
+            api_key = api_config.get("api_key", "")
+            api_base = (api_config.get("api_base") or "https://api.openai.com/v1").rstrip("/")
+            user_content = [
+                {"type": "text", "text": question},
+                {"type": "image_url", "image_url": {"url": image_url}},
+            ]
+            payload = {
+                "model": vision_model,
+                "messages": [{"role": "user", "content": user_content}],
+            }
+            headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
+            resp = requests.post(f"{api_base}/chat/completions",
+                                 headers=headers, json=payload, timeout=60)
+            if resp.status_code != 200:
+                body = resp.text[:500]
+                logger.error(f"[{self.__class__.__name__}] call_vision HTTP {resp.status_code}: {body}")
+                return {"error": True, "message": f"HTTP {resp.status_code}: {body}"}
+            data = resp.json()
+            err = data.get("error")
+            if err:  # some providers report failures in the body of an HTTP 200 reply
+                detail = err.get("message", str(err)) if isinstance(err, dict) else str(err)
+                return {"error": True, "message": detail}
+            choices = data.get("choices") or [{}]  # tolerate a null or empty list
+            content = (choices[0].get("message") or {}).get("content") or ""
+            usage = data.get("usage") or {}
+            return {
+                "model": vision_model,
+                "content": content,
+                "usage": {
+                    "prompt_tokens": usage.get("prompt_tokens", 0),
+                    "completion_tokens": usage.get("completion_tokens", 0),
+                    "total_tokens": usage.get("total_tokens", 0),
+                },
+            }
+        except Exception as e:
+            logger.error(f"[{self.__class__.__name__}] call_vision error: {e}")
+            return {"error": True, "message": str(e)}
diff --git a/models/zhipuai/zhipuai_bot.py b/models/zhipuai/zhipuai_bot.py
index 4733cf9b..98ea5db1 100644
--- a/models/zhipuai/zhipuai_bot.py
+++ b/models/zhipuai/zhipuai_bot.py
@@ -2,6 +2,7 @@
 
 import time
 import json
+from typing import Optional
 
 from models.bot import Bot
 from models.zhipuai.zhipu_ai_session import ZhipuAISession
@@ -149,6 +150,40 @@ class ZHIPUAIBot(Bot, ZhipuAIImage):
             else:
                 return result
 
+    def call_vision(self, image_url: str, question: str,
+                    model: Optional[str] = None, max_tokens: int = 1000) -> dict:
+        """Answer a question about an image through the ZhipuAI SDK client.
+
+        ``model`` is accepted but ignored: glm-5v-turbo is always used because the text
+        models (glm-5-turbo etc.) do not support vision. Returns a dict with ``model``,
+        ``content`` and ``usage``, or ``{"error": True, "message": ...}`` on failure.
+        """
+        vision_model = "glm-5v-turbo"
+        user_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": question},
+                {"type": "image_url", "image_url": {"url": image_url}},
+            ],
+        }
+        try:
+            response = self.client.chat.completions.create(
+                model=vision_model,
+                max_tokens=max_tokens,
+                messages=[user_message],
+            )
+            answer = response.choices[0].message.content or ""
+            token_usage = response.usage
+            usage_keys = ("prompt_tokens", "completion_tokens", "total_tokens")
+            return {
+                "model": vision_model,
+                "content": answer,
+                "usage": {key: getattr(token_usage, key, 0) for key in usage_keys},
+            }
+        except Exception as err:
+            logger.error(f"[ZHIPU_AI] call_vision error: {err}")
+            return {"error": True, "message": str(err)}
+
     def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
         """
         Call ZhipuAI API with tool support for agent integration