Merge branch 'master' into feat-knowledge

This commit is contained in:
zhayujie
2026-04-12 16:46:38 +08:00
25 changed files with 1502 additions and 390 deletions

View File

@@ -214,6 +214,7 @@ cow install-browser
+ 添加 `"speech_recognition": true` 将开启语音识别,默认使用 openai 的 whisper 模型识别为文字,同时以文字回复,该参数仅支持私聊 (注意由于语音消息无法匹配前缀,一旦开启将对所有语音自动回复,支持语音触发画图)
+ 添加 `"group_speech_recognition": true` 将开启群组语音识别,默认使用 openai 的 whisper 模型识别为文字,同时以文字回复,参数仅支持群聊 (会匹配 group_chat_prefix 和 group_chat_keyword, 支持语音触发画图)
+ 添加 `"voice_reply_voice": true` 将开启语音回复语音(同时作用于私聊和群聊)
+ 使用 MiniMax TTS:设置 `"text_to_voice": "minimax"`,并配置 `minimax_api_key`;可通过 `"tts_voice_id"` 指定发音人(如 `English_Graceful_Lady`),`"text_to_voice_model"` 指定模型(如 `speech-2.8-hd`、`speech-2.8-turbo`)
</details>
<details>
@@ -358,7 +359,7 @@ sudo docker logs -f chatgpt-on-wechat
"minimax_api_key": ""
}
```
- `model`: 可填写 `MiniMax-M2.7、MiniMax-M2.5、MiniMax-M2.1、MiniMax-M2.1-lightning、MiniMax-M2、abab6.5-chat`
- `model`: 可填写 `MiniMax-M2.7、MiniMax-M2.7-highspeed、MiniMax-M2.5、MiniMax-M2.1、MiniMax-M2.1-lightning、MiniMax-M2、abab6.5-chat`
- `minimax_api_key`: MiniMax 平台的 API-KEY,在 [控制台](https://platform.minimaxi.com/user-center/basic-information/interface-key) 创建
方式二:OpenAI 兼容方式接入,配置如下:
@@ -371,7 +372,7 @@ sudo docker logs -f chatgpt-on-wechat
}
```
- `bot_type`: OpenAI 兼容方式
- `model`: 可填 `MiniMax-M2.7、MiniMax-M2.5、MiniMax-M2.1、MiniMax-M2.1-lightning、MiniMax-M2`,参考[API文档](https://platform.minimaxi.com/document/%E5%AF%B9%E8%AF%9D?key=66701d281d57f38758d581d0#QklxsNSbaf6kM4j6wjO5eEek)
- `model`: 可填 `MiniMax-M2.7、MiniMax-M2.7-highspeed、MiniMax-M2.5、MiniMax-M2.1、MiniMax-M2.1-lightning、MiniMax-M2`,参考[API文档](https://platform.minimaxi.com/document/%E5%AF%B9%E8%AF%9D?key=66701d281d57f38758d581d0#QklxsNSbaf6kM4j6wjO5eEek)
- `open_ai_api_base`: MiniMax 平台 API 的 BASE URL
- `open_ai_api_key`: MiniMax 平台的 API-KEY
</details>

View File

@@ -1,7 +1,13 @@
"""
Vision tool - Analyze images using OpenAI-compatible Vision API.
Vision tool - Analyze images using Vision API.
Supports local files (auto base64-encoded) and HTTP URLs.
Providers are tried in priority order with automatic fallback on failure.
Provider priority (default):
1. Main model via bot.call_vision — zero extra cost
2. Other models whose API key is configured — auto-discovered
3. OpenAI / LinkAI raw HTTP — reliable fallback
When use_linkai=true, LinkAI is promoted to #1.
When tool.vision.model is set, that model is used exclusively first.
"""
import base64
@@ -14,10 +20,11 @@ from typing import Any, Dict, List, Optional
import requests
from agent.tools.base_tool import BaseTool, ToolResult
from common import const
from common.log import logger
from config import conf
DEFAULT_MODEL = "gpt-4.1-mini"
DEFAULT_MODEL = const.GPT_41_MINI
DEFAULT_TIMEOUT = 60
MAX_TOKENS = 1000
COMPRESS_THRESHOLD = 1_048_576 # 1 MB
@@ -30,8 +37,20 @@ SUPPORTED_EXTENSIONS = {
"webp": "image/webp",
}
_MAIN_MODEL_PROVIDER_NAME = "MainModel"
OPENAI_COMPATIBLE_BOT_TYPES = {"openai", "openAI", "chatGPT"}
# (config_key_for_api_key, bot_type, default_vision_model, provider_display_name)
# Auto-discovered as fallback vision providers when their API key is configured.
# OpenAI and LinkAI are handled separately (raw HTTP providers), so not listed here.
_DISCOVERABLE_MODELS = [
("moonshot_api_key", const.MOONSHOT, const.KIMI_K2_5, "Moonshot"),
("ark_api_key", const.DOUBAO, const.DOUBAO_SEED_2_PRO, "Doubao"),
("dashscope_api_key", const.QWEN_DASHSCOPE, const.QWEN36_PLUS, "DashScope"),
("claude_api_key", const.CLAUDEAPI, const.CLAUDE_4_6_SONNET, "Claude"),
("gemini_api_key", const.GEMINI, const.GEMINI_31_FLASH_LITE_PRE, "Gemini"),
("zhipu_ai_api_key", const.ZHIPU_AI, const.GLM_4_7, "ZhipuAI"),
("minimax_api_key", const.MiniMax, const.MINIMAX_M2_7, "MiniMax"),
]
@dataclass
@@ -42,6 +61,8 @@ class VisionProvider:
api_base: str
extra_headers: dict = field(default_factory=dict)
model_override: Optional[str] = None
use_bot: bool = False # When True, call via bot.call_vision instead of raw HTTP
fallback_bot: Any = None # Bot instance for non-main-model providers
class VisionAPIError(Exception):
@@ -50,13 +71,12 @@ class VisionAPIError(Exception):
class Vision(BaseTool):
"""Analyze images using OpenAI-compatible Vision API"""
"""Analyze images using Vision API"""
name: str = "vision"
description: str = (
"Analyze a local image or image URL (jpg/jpeg/png) using Vision API. "
"Can describe content, extract text, identify objects, colors, etc. "
"Requires OPENAI_API_KEY or LINKAI_API_KEY."
)
params: dict = {
@@ -70,13 +90,6 @@ class Vision(BaseTool):
"type": "string",
"description": "Question to ask about the image",
},
"model": {
"type": "string",
"description": (
f"Vision model to use (default: {DEFAULT_MODEL}). "
"Options: gpt-4.1-mini, gpt-4.1, gpt-4o-mini, gpt-4o"
),
},
},
"required": ["image", "question"],
}
@@ -86,15 +99,11 @@ class Vision(BaseTool):
@staticmethod
def is_available() -> bool:
return bool(
conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY")
or conf().get("linkai_api_key") or os.environ.get("LINKAI_API_KEY")
)
return True
def execute(self, args: Dict[str, Any]) -> ToolResult:
image = args.get("image", "").strip()
question = args.get("question", "").strip()
model = args.get("model", DEFAULT_MODEL).strip() or DEFAULT_MODEL
if not image:
return ToolResult.fail("Error: 'image' parameter is required")
@@ -104,11 +113,12 @@ class Vision(BaseTool):
providers = self._resolve_providers()
if not providers:
return ToolResult.fail(
"Error: No API key configured for Vision.\n"
"Please configure one of the following using env_config tool:\n"
" 1. OPENAI_API_KEY (preferred): env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n"
" 2. LINKAI_API_KEY (fallback): env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")\n\n"
"Get your key at: https://platform.openai.com/api-keys or https://link-ai.tech"
"Error: No model available for Vision.\n"
"The main model does not support vision and no other API keys are configured.\n"
"Options:\n"
" 1. Switch to a multimodal model (e.g. qwen3.6-plus, claude-sonnet-4-6, gemini-2.0-flash)\n"
" 2. Configure OPENAI_API_KEY: env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n"
" 3. Configure LINKAI_API_KEY: env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")"
)
try:
@@ -116,7 +126,7 @@ class Vision(BaseTool):
except Exception as e:
return ToolResult.fail(f"Error: {e}")
return self._call_with_fallback(providers, model, question, image_content)
return self._call_with_fallback(providers, DEFAULT_MODEL, question, image_content)
def _call_with_fallback(self, providers: List[VisionProvider], model: str,
question: str, image_content: dict) -> ToolResult:
@@ -125,9 +135,14 @@ class Vision(BaseTool):
for i, provider in enumerate(providers):
use_model = provider.model_override or model
try:
logger.debug(f"[Vision] Trying provider '{provider.name}' "
logger.info(f"[Vision] Trying provider '{provider.name}' "
f"with model '{use_model}' ({i + 1}/{len(providers)})")
return self._call_api(provider, use_model, question, image_content)
if provider.use_bot:
result = self._call_via_bot(use_model, question, image_content, provider)
else:
result = self._call_api(provider, use_model, question, image_content)
logger.info(f"[Vision] ✅ Success via {provider.name} (model={use_model})")
return result
except VisionAPIError as e:
errors.append(f"[{provider.name}/{use_model}] {e}")
logger.warning(f"[Vision] Provider '{provider.name}' failed: {e}")
@@ -148,35 +163,113 @@ class Vision(BaseTool):
def _resolve_providers(self) -> List[VisionProvider]:
"""
Build an ordered list of available providers.
Each provider builder returns a VisionProvider or None.
To add a new provider, append a builder method to _PROVIDER_BUILDERS.
Priority:
- use_linkai=true → [LinkAI, MainModel, OtherModels…, OpenAI]
- default → [MainModel, OtherModels…, OpenAI, LinkAI]
"OtherModels" are auto-discovered from configured API keys.
The main model's bot_type is excluded from OtherModels to avoid
duplicating the MainModel provider.
"""
use_linkai = conf().get("use_linkai", False) and conf().get("linkai_api_key")
providers: List[VisionProvider] = []
for builder in self._PROVIDER_BUILDERS:
provider = builder(self)
if provider:
providers.append(provider)
if use_linkai:
self._append_provider(providers, self._build_linkai_provider)
self._append_provider(providers, self._build_main_model_provider)
self._append_other_model_providers(providers)
self._append_provider(providers, self._build_openai_provider)
else:
self._append_provider(providers, self._build_main_model_provider)
self._append_other_model_providers(providers)
self._append_provider(providers, self._build_openai_provider)
self._append_provider(providers, self._build_linkai_provider)
return providers
def _build_custom_model_provider(self) -> Optional[VisionProvider]:
@staticmethod
def _append_provider(providers: List[VisionProvider], builder) -> None:
    """
    Run a provider builder and keep its result only when it produced one.

    Args:
        providers: List that receives the built provider (mutated in place).
        builder: Zero-argument callable returning a provider or a falsy value.
    """
    built = builder()
    if not built:
        # Builder signalled "not available"; leave the list untouched.
        return
    providers.append(built)
def _append_other_model_providers(self, providers: List[VisionProvider]) -> None:
"""
When bot_type is openai-compatible and a custom model is configured,
try the user's own model first — it may already support multimodal input.
Auto-discover other models whose API key is configured.
Skip the main model's own bot_type (already covered by MainModel provider).
Skip bot_types that already have a provider in the list (e.g. OpenAI).
"""
bot_type = conf().get("bot_type", "")
if bot_type not in OPENAI_COMPATIBLE_BOT_TYPES:
# Determine main model's bot_type so we can skip it
main_bot_type = None
if self.model and hasattr(self.model, '_resolve_bot_type'):
main_bot_type = self.model._resolve_bot_type(conf().get("model", ""))
existing_names = {p.name for p in providers}
for config_key, bot_type, default_model, display_name in _DISCOVERABLE_MODELS:
if display_name in existing_names:
continue
if bot_type == main_bot_type:
continue
api_key = conf().get(config_key, "")
if not api_key or not api_key.strip():
continue
# Create a bot instance and check if it supports call_vision
try:
from models.bot_factory import create_bot
bot = create_bot(bot_type)
if not hasattr(bot, 'call_vision'):
continue
except Exception:
continue
providers.append(VisionProvider(
name=display_name,
api_key="",
api_base="",
model_override=default_model,
use_bot=True,
fallback_bot=bot,
))
def _resolve_vision_model(self) -> Optional[str]:
    """
    Determine which model name to request for vision calls.

    Resolution order:
      1. ``tool.vision.model`` from the configuration, when set to a
         non-empty value.
      2. The main ``model`` configuration value.

    Returns:
        The resolved model name, or None when neither setting is present.
    """
    tool_conf = conf().get("tool", {})
    # Guard every level of the nested lookup: a malformed config such as
    # {"tool": {"vision": null}} must fall back to the main model instead
    # of raising AttributeError on ``None.get``.
    vision_conf = tool_conf.get("vision") if isinstance(tool_conf, dict) else None
    user_vision_model = vision_conf.get("model") if isinstance(vision_conf, dict) else None
    if user_vision_model:
        return user_vision_model

    return conf().get("model", "") or None
def _build_main_model_provider(self) -> Optional[VisionProvider]:
"""
Use the vendor's own model for vision via bot.call_vision.
Only available when the bot class has call_vision.
"""
if not (self.model and hasattr(self.model, 'bot')):
return None
custom_model = conf().get("model", "")
if not custom_model or custom_model == DEFAULT_MODEL:
try:
bot = self.model.bot
if not hasattr(bot, 'call_vision'):
return None
api_key = conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY")
if not api_key:
except Exception:
return None
api_base = (conf().get("open_ai_api_base") or os.environ.get("OPENAI_API_BASE", "")).rstrip("/") \
or "https://api.openai.com/v1"
vision_model = self._resolve_vision_model()
return VisionProvider(
name="CustomModel", api_key=api_key, api_base=self._ensure_v1(api_base),
model_override=custom_model,
name=_MAIN_MODEL_PROVIDER_NAME,
api_key="",
api_base="",
model_override=vision_model,
use_bot=True,
)
def _build_openai_provider(self) -> Optional[VisionProvider]:
@@ -200,7 +293,54 @@ class Vision(BaseTool):
return VisionProvider(name="LinkAI", api_key=api_key, api_base=self._ensure_v1(api_base),
extra_headers=extra)
_PROVIDER_BUILDERS = [_build_custom_model_provider, _build_openai_provider, _build_linkai_provider]
def _call_via_bot(self, model: str, question: str, image_content: dict,
                  provider: Optional[VisionProvider] = None) -> ToolResult:
    """
    Run a vision request through a bot's ``call_vision`` method.

    The bot is taken from ``provider.fallback_bot`` when that is set,
    otherwise from the main model (``self.model.bot``).

    Args:
        model: Model name forwarded to ``call_vision``.
        question: Question to ask about the image.
        image_content: OpenAI-format content block; the image URL is read
            from ``image_content["image_url"]["url"]``.
        provider: Provider being tried; None means the main model.

    Returns:
        A successful ToolResult carrying ``model``, ``provider``,
        ``content`` and ``usage``.

    Raises:
        VisionAPIError: On any failure, so the caller's fallback loop can
            move on to the next provider.
    """
    try:
        bot = (provider and provider.fallback_bot) or self.model.bot
    except Exception as e:
        raise VisionAPIError(f"Cannot access bot: {e}") from e

    provider_name = provider.name if provider else _MAIN_MODEL_PROVIDER_NAME

    # Extract the raw image URL from the OpenAI-format image_content block
    image_url = image_content.get("image_url", {}).get("url", "")
    if not image_url:
        raise VisionAPIError("No image URL in content block")

    try:
        response = bot.call_vision(
            image_url=image_url,
            question=question,
            model=model,
            max_tokens=MAX_TOKENS,
        )
    except Exception as e:
        raise VisionAPIError(f"call_vision failed: {e}") from e

    if response is NotImplemented:
        raise VisionAPIError("Bot does not support vision")
    # Everything below reads dict keys; report any other shape explicitly
    # rather than as a misleading "empty response".
    if not isinstance(response, dict):
        raise VisionAPIError(
            f"Unexpected response type from {provider_name}: {type(response).__name__}"
        )
    if response.get("error"):
        raise VisionAPIError(f"API error - {response.get('message', 'Unknown')}")

    content = response.get("content", "")
    if not content:
        # Name the provider actually used; it is not always the main model.
        raise VisionAPIError(f"Empty response from {provider_name}")

    return ToolResult.success({
        # Prefer the model name reported by the bot, if it returned one.
        "model": response.get("model", model),
        "provider": provider_name,
        "content": content,
        "usage": response.get("usage", {}),
    })
@staticmethod
def _ensure_v1(api_base: str) -> str:
@@ -213,9 +353,13 @@ class Vision(BaseTool):
return api_base.rstrip("/") + "/v1"
def _build_image_content(self, image: str) -> dict:
"""Build the image_url content block for the API request."""
"""
Build the image_url content block.
Both remote URLs and local files are converted to base64 data URLs
so every bot backend can consume them without extra downloads.
"""
if image.startswith(("http://", "https://")):
return {"type": "image_url", "image_url": {"url": image}}
return self._download_to_data_url(image)
if not os.path.isfile(image):
raise FileNotFoundError(f"Image file not found: {image}")
@@ -239,6 +383,19 @@ class Vision(BaseTool):
data_url = f"data:{mime_type};base64,{b64}"
return {"type": "image_url", "image_url": {"url": data_url}}
@staticmethod
def _download_to_data_url(url: str) -> dict:
    """
    Fetch a remote image and wrap it as an ``image_url`` content block
    whose URL is a base64 ``data:`` URL.

    Raises:
        VisionAPIError: When the server answers with a non-200 status.
    """
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        raise VisionAPIError(f"Failed to download image: HTTP {response.status_code}")

    # Keep only the media type, dropping parameters such as "; charset=...".
    header_value = response.headers.get("Content-Type", "image/jpeg")
    mime_type = header_value.split(";")[0].strip()
    if not mime_type.startswith("image/"):
        # Non-image or unusable header: label the payload as JPEG.
        mime_type = "image/jpeg"

    encoded = base64.b64encode(response.content).decode("ascii")
    return {
        "type": "image_url",
        "image_url": {"url": f"data:{mime_type};base64,{encoded}"},
    }
@staticmethod
def _maybe_compress(path: str) -> str:
"""Compress image to under COMPRESS_THRESHOLD with max long-edge 1536px."""
@@ -312,7 +469,6 @@ class Vision(BaseTool):
],
}
],
"max_completion_tokens": MAX_TOKENS,
}
headers = {

View File

@@ -124,14 +124,15 @@ class AgentLLMModel(LLMModel):
@property
def bot(self):
"""Lazy load the bot, re-create when model changes"""
"""Lazy load the bot, re-create when model or bot_type changes"""
from models.bot_factory import create_bot
cur_model = self.model
if self._bot is None or self._bot_model != cur_model:
bot_type = self._resolve_bot_type(cur_model)
self._bot = create_bot(bot_type)
cur_bot_type = self._resolve_bot_type(cur_model)
if self._bot is None or self._bot_model != cur_model or getattr(self, '_bot_type', None) != cur_bot_type:
self._bot = create_bot(cur_bot_type)
self._bot = add_openai_compatible_support(self._bot)
self._bot_model = cur_model
self._bot_type = cur_bot_type
return self._bot
def call(self, request: LLMRequest):
@@ -498,14 +499,19 @@ class AgentBridge:
reply.text_content = text_response
return reply
# For other unknown file types, return text with file info
message = text_response or file_info.get("message", "文件已准备")
message += f"\n\n[文件: {file_info.get('file_name', file_path)}]"
return Reply(ReplyType.TEXT, message)
# For all other file types (tar.gz, zip, etc.), also use FILE type
file_url = f"file://{file_path}"
logger.info(f"[AgentBridge] Sending generic file: {file_url}")
reply = Reply(ReplyType.FILE, file_url)
reply.file_name = file_info.get("file_name", os.path.basename(file_path))
if text_response:
reply.text_content = text_response
return reply
def _migrate_config_to_env(self, workspace_root: str):
"""
Migrate API keys from config.json to .env file if not already set
Sync API keys from config.json to .env file.
Adds new keys and updates changed values on each startup.
Args:
workspace_root: Workspace directory path (not used, kept for compatibility)
@@ -513,7 +519,6 @@ class AgentBridge:
from config import conf
import os
# Mapping from config.json keys to environment variable names
key_mapping = {
"open_ai_api_key": "OPENAI_API_KEY",
"open_ai_api_base": "OPENAI_API_BASE",
@@ -522,10 +527,9 @@ class AgentBridge:
"linkai_api_key": "LINKAI_API_KEY",
}
# Use fixed secure location for .env file
env_file = expand_path("~/.cow/.env")
# Read existing env vars from .env file
# Read existing env vars (key -> value)
existing_env_vars = {}
if os.path.exists(env_file):
try:
@@ -533,48 +537,46 @@ class AgentBridge:
for line in f:
line = line.strip()
if line and not line.startswith('#') and '=' in line:
key, _ = line.split('=', 1)
existing_env_vars[key.strip()] = True
key, val = line.split('=', 1)
existing_env_vars[key.strip()] = val.strip()
except Exception as e:
logger.warning(f"[AgentBridge] Failed to read .env file: {e}")
# Check which keys need to be migrated
keys_to_migrate = {}
# Sync config.json values into .env (add/update/remove)
updated = False
for config_key, env_key in key_mapping.items():
# Skip if already in .env file
if env_key in existing_env_vars:
raw = conf().get(config_key, "")
value = raw.strip() if raw else ""
old_value = existing_env_vars.get(env_key)
if value:
if old_value == value:
continue
existing_env_vars[env_key] = value
os.environ[env_key] = value
updated = True
else:
if old_value is None:
continue
existing_env_vars.pop(env_key, None)
os.environ.pop(env_key, None)
updated = True
updated = True
# Get value from config.json
value = conf().get(config_key, "")
if value and value.strip(): # Only migrate non-empty values
keys_to_migrate[env_key] = value.strip()
# Log summary if there are keys to skip
if existing_env_vars:
logger.debug(f"[AgentBridge] {len(existing_env_vars)} env vars already in .env")
# Write new keys to .env file
if keys_to_migrate:
if updated:
try:
# Ensure ~/.cow directory and .env file exist
env_dir = os.path.dirname(env_file)
if not os.path.exists(env_dir):
os.makedirs(env_dir, exist_ok=True)
if not os.path.exists(env_file):
open(env_file, 'a').close()
# Append new keys
with open(env_file, 'a', encoding='utf-8') as f:
f.write('\n# Auto-migrated from config.json\n')
for key, value in keys_to_migrate.items():
with open(env_file, 'w', encoding='utf-8') as f:
f.write('# Environment variables for agent\n')
f.write('# Auto-managed - synced from config.json on startup\n\n')
for key, value in sorted(existing_env_vars.items()):
f.write(f'{key}={value}\n')
# Also set in current process
os.environ[key] = value
logger.info(f"[AgentBridge] Migrated {len(keys_to_migrate)} API keys from config.json to .env: {list(keys_to_migrate.keys())}")
logger.info(f"[AgentBridge] Synced API keys from config.json to .env")
except Exception as e:
logger.warning(f"[AgentBridge] Failed to migrate API keys: {e}")
logger.warning(f"[AgentBridge] Failed to sync API keys: {e}")
def _persist_messages(
self, session_id: str, new_messages: list, channel_type: str = ""

View File

@@ -490,7 +490,7 @@ class AgentInitializer:
env_file = expand_path("~/.cow/.env")
# Read existing env vars
# Read existing env vars (key -> value)
existing_env_vars = {}
if os.path.exists(env_file):
try:
@@ -498,38 +498,46 @@ class AgentInitializer:
for line in f:
line = line.strip()
if line and not line.startswith('#') and '=' in line:
key, _ = line.split('=', 1)
existing_env_vars[key.strip()] = True
key, val = line.split('=', 1)
existing_env_vars[key.strip()] = val.strip()
except Exception as e:
logger.warning(f"[AgentInitializer] Failed to read .env file: {e}")
# Check which keys need migration
keys_to_migrate = {}
# Sync config.json values into .env (add/update/remove)
updated = False
for config_key, env_key in key_mapping.items():
if env_key in existing_env_vars:
continue
value = conf().get(config_key, "")
if value and value.strip():
keys_to_migrate[env_key] = value.strip()
raw = conf().get(config_key, "")
value = raw.strip() if raw else ""
old_value = existing_env_vars.get(env_key)
# Write new keys
if keys_to_migrate:
if value:
if old_value == value:
continue
existing_env_vars[env_key] = value
os.environ[env_key] = value
updated = True
else:
if old_value is None:
continue
existing_env_vars.pop(env_key, None)
os.environ.pop(env_key, None)
updated = True
if updated:
try:
env_dir = os.path.dirname(env_file)
if not os.path.exists(env_dir):
os.makedirs(env_dir, exist_ok=True)
if not os.path.exists(env_file):
open(env_file, 'a').close()
with open(env_file, 'a', encoding='utf-8') as f:
f.write('\n# Auto-migrated from config.json\n')
for key, value in keys_to_migrate.items():
# Rewrite the entire .env file to ensure consistency
with open(env_file, 'w', encoding='utf-8') as f:
f.write('# Environment variables for agent\n')
f.write('# Auto-managed - synced from config.json on startup\n\n')
for key, value in sorted(existing_env_vars.items()):
f.write(f'{key}={value}\n')
os.environ[key] = value
logger.info(f"[AgentInitializer] Migrated {len(keys_to_migrate)} API keys to .env: {list(keys_to_migrate.keys())}")
logger.info(f"[AgentInitializer] Synced API keys from config.json to .env")
except Exception as e:
logger.warning(f"[AgentInitializer] Failed to migrate API keys: {e}")
logger.warning(f"[AgentInitializer] Failed to sync API keys: {e}")
def _start_daily_flush_timer(self):
"""Start a background thread that flushes all agents' memory daily at 23:55."""

View File

@@ -823,9 +823,6 @@ function sendMessage() {
}
function startSSE(requestId, loadingEl, timestamp) {
const es = new EventSource(`/stream?request_id=${encodeURIComponent(requestId)}`);
activeStreams[requestId] = es;
let botEl = null;
let stepsEl = null; // .agent-steps (thinking summaries + tool indicators)
let contentEl = null; // .answer-content (final streaming answer)
@@ -834,6 +831,11 @@ function startSSE(requestId, loadingEl, timestamp) {
let currentToolEl = null;
let currentReasoningEl = null; // live reasoning bubble
let reasoningText = '';
let done = false;
const MAX_RECONNECTS = 10;
const RECONNECT_BASE_MS = 1000;
let reconnectCount = 0;
function ensureBotEl() {
if (botEl) return;
@@ -858,10 +860,17 @@ function startSSE(requestId, loadingEl, timestamp) {
mediaEl = botEl.querySelector('.media-content');
}
function connect() {
const es = new EventSource(`/stream?request_id=${encodeURIComponent(requestId)}`);
activeStreams[requestId] = es;
es.onmessage = function(e) {
let item;
try { item = JSON.parse(e.data); } catch (_) { return; }
// Successful data received, reset reconnect counter
reconnectCount = 0;
if (item.type === 'reasoning') {
ensureBotEl();
reasoningText += item.content;
@@ -877,7 +886,6 @@ function startSSE(requestId, loadingEl, timestamp) {
<div class="thinking-full"></div>`;
stepsEl.appendChild(currentReasoningEl);
}
// Stream reasoning as a single-line summary (collapsed); full text available on expand
const oneLine = reasoningText.trim().replace(/\n+/g, ' ');
currentReasoningEl.querySelector('.thinking-summary').textContent =
oneLine.length > 80 ? oneLine.substring(0, 80) + '…' : oneLine;
@@ -897,8 +905,6 @@ function startSSE(requestId, loadingEl, timestamp) {
scrollChatToBottom();
} else if (item.type === 'message_end') {
// Backend already strips reasoning_content; all deltas are real content.
// Freeze accumulated text as visible content before tool execution begins.
if (item.has_tool_calls && accumulatedText.trim()) {
ensureBotEl();
const frozenEl = document.createElement('div');
@@ -1017,16 +1023,10 @@ function startSSE(requestId, loadingEl, timestamp) {
scrollChatToBottom();
} else if (item.type === 'done') {
done = true;
es.close();
delete activeStreams[requestId];
if (currentReasoningEl) {
if (reasoningText.trim().replace(/\n+/g, ' ').length <= 80)
currentReasoningEl.classList.add('no-expand');
currentReasoningEl = null;
reasoningText = '';
}
// item.content may be empty when "done" is only a stream-close signal after media.
const finalText = item.content || accumulatedText;
@@ -1038,11 +1038,11 @@ function startSSE(requestId, loadingEl, timestamp) {
// Only update text content when there is something new to show.
if (finalText) contentEl.innerHTML = renderMarkdown(finalText);
applyHighlighting(botEl);
bindChatKnowledgeLinks(botEl);
}
scrollChatToBottom();
} else if (item.type === 'error') {
done = true;
es.close();
delete activeStreams[requestId];
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
@@ -1053,6 +1053,25 @@ function startSSE(requestId, loadingEl, timestamp) {
es.onerror = function() {
es.close();
delete activeStreams[requestId];
if (done) return;
if (currentReasoningEl) {
if (reasoningText.trim().replace(/\n+/g, ' ').length <= 80)
currentReasoningEl.classList.add('no-expand');
currentReasoningEl = null;
reasoningText = '';
}
if (reconnectCount < MAX_RECONNECTS) {
reconnectCount++;
const delay = Math.min(RECONNECT_BASE_MS * reconnectCount, 5000);
console.warn(`[SSE] connection lost for ${requestId}, reconnecting in ${delay}ms (attempt ${reconnectCount}/${MAX_RECONNECTS})`);
setTimeout(connect, delay);
return;
}
// Exhausted retries, show whatever we have
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
if (!botEl) {
addBotMessage(t('error_send'), new Date());
@@ -1060,10 +1079,14 @@ function startSSE(requestId, loadingEl, timestamp) {
contentEl.classList.remove('sse-streaming');
contentEl.innerHTML = renderMarkdown(accumulatedText);
applyHighlighting(botEl);
bindChatKnowledgeLinks(botEl);
}
};
}
connect();
}
function startPolling() {
if (isPolling) return;
isPolling = true;

View File

@@ -339,14 +339,18 @@ class WebChannel(ChatChannel):
"""
SSE generator for a given request_id.
Yields UTF-8 encoded bytes to avoid WSGI Latin-1 mangling.
Supports client reconnection: the queue is only removed after a
"done" event is consumed, so a new GET /stream with the same
request_id can resume reading remaining events.
"""
if request_id not in self.sse_queues:
yield b"data: {\"type\": \"error\", \"message\": \"invalid request_id\"}\n\n"
return
q = self.sse_queues[request_id]
timeout = 300 # 5 minutes max
deadline = time.time() + timeout
idle_timeout = 600 # 10 minutes without any real event
deadline = time.time() + idle_timeout
done = False
try:
while time.time() < deadline:
@@ -356,12 +360,17 @@ class WebChannel(ChatChannel):
yield b": keepalive\n\n"
continue
# Real event received, reset idle deadline
deadline = time.time() + idle_timeout
payload = json.dumps(item, ensure_ascii=False)
yield f"data: {payload}\n\n".encode("utf-8")
if item.get("type") == "done":
done = True
break
finally:
if done:
self.sse_queues.pop(request_id, None)
def poll_response(self):

View File

@@ -47,8 +47,8 @@ CREDENTIAL_MAP = {
class CloudClient(LinkAIClient):
def __init__(self, api_key: str, channel, host: str = ""):
super().__init__(api_key, host)
def __init__(self, api_key: str, channel, host: str = "", port=None):
super().__init__(api_key, host, port=port)
self.channel = channel
self.client_type = channel.channel_type
self.channel_mgr = None
@@ -770,7 +770,7 @@ def start(channel, channel_mgr=None):
return
global chat_client
chat_client = CloudClient(api_key=conf().get("linkai_api_key"), host=conf().get("cloud_host", ""), channel=channel)
chat_client = CloudClient(api_key=conf().get("linkai_api_key"), host=conf().get("cloud_host", ""), port=conf().get("cloud_port"), channel=channel)
chat_client.channel_mgr = channel_mgr
chat_client.config = _build_config()
chat_client.start()

View File

@@ -93,6 +93,7 @@ QWQ_PLUS = "qwq-plus"
# MiniMax
MINIMAX_M2_7 = "MiniMax-M2.7" # MiniMax M2.7 - Latest
MINIMAX_M2_7_HIGHSPEED = "MiniMax-M2.7-highspeed" # MiniMax M2.7 highspeed
MINIMAX_M2_5 = "MiniMax-M2.5" # MiniMax M2.5
MINIMAX_M2_1 = "MiniMax-M2.1" # MiniMax M2.1
MINIMAX_M2_1_LIGHTNING = "MiniMax-M2.1-lightning" # MiniMax M2.1 极速版
@@ -175,7 +176,7 @@ MODEL_LIST = [
QWEN36_PLUS, QWEN35_PLUS, QWEN3_MAX, QWEN_MAX, QWEN_PLUS, QWEN_TURBO, QWEN_LONG,
# MiniMax
MiniMax, MINIMAX_M2_7, MINIMAX_M2_5, MINIMAX_M2_1, MINIMAX_M2_1_LIGHTNING, MINIMAX_M2, MINIMAX_ABAB6_5,
MiniMax, MINIMAX_M2_7, MINIMAX_M2_7_HIGHSPEED, MINIMAX_M2_5, MINIMAX_M2_1, MINIMAX_M2_1_LIGHTNING, MINIMAX_M2, MINIMAX_ABAB6_5,
# GLM
ZHIPU_AI, GLM_5_TURBO, GLM_5, GLM_4, GLM_4_PLUS, GLM_4_flash, GLM_4_LONG, GLM_4_ALLTOOLS,

View File

@@ -189,6 +189,7 @@ available_setting = {
"linkai_app_code": "",
"linkai_api_base": "https://api.link-ai.tech",
"cloud_host": "client.link-ai.tech",
"cloud_port": None,
"cloud_deployment_id": "",
"minimax_api_key": "",
"Minimax_group_id": "",

72
docs/en/tools/vision.mdx Normal file
View File

@@ -0,0 +1,72 @@
---
title: vision - Image Analysis
description: Analyze image content (recognition, description, OCR, etc.)
---
Analyze local images or image URLs using Vision API. Supports content description, text extraction (OCR), object recognition, and more.
## Model Selection
The vision tool uses a multi-level auto-selection strategy with automatic fallback — no manual configuration required:
1. **Main model** — uses the currently configured main model for image recognition (zero extra cost)
2. **Other configured models** — auto-discovers other models with configured API keys as alternatives
3. **OpenAI** — uses `open_ai_api_key` to call gpt-4.1-mini
4. **LinkAI** — uses `linkai_api_key` to call LinkAI vision service
When `use_linkai=true`, LinkAI is promoted to the highest priority.
If the current provider fails, the tool automatically tries the next one until it succeeds or all fail.
### Supported Models
| Vendor | Vision Model | Notes |
| --- | --- | --- |
| OpenAI / Compatible | Main model | All OpenAI-compatible multimodal models |
| Qwen (DashScope) | Main model | Via MultiModalConversation API |
| Claude | Main model | Anthropic native image format |
| Gemini | Main model | inlineData format |
| Doubao | Main model | doubao-seed-2-0 series natively supported |
| Kimi (Moonshot) | Main model | kimi-k2.5 natively supported |
| ZhipuAI | glm-5v-turbo | Always uses dedicated vision model |
| MiniMax | MiniMax-Text-01 | Always uses dedicated vision model |
<Note>
ZhipuAI and MiniMax text models do not support image understanding, so their dedicated vision models are always used automatically.
</Note>
## Parameters
| Parameter | Type | Required | Description |
| --- | --- | --- | --- |
| `image` | string | Yes | Local file path or HTTP(S) image URL |
| `question` | string | Yes | Question to ask about the image |
Supported image formats: jpg, jpeg, png, gif, webp
## Custom Configuration
To specify a particular model for the vision tool, add to `config.json`:
```json
{
"tool": {
"vision": {
"model": "gpt-4o"
}
}
}
```
In most cases no configuration is needed. The tool works automatically as long as the main model supports multimodal input or any vision-capable API key is configured.
## Use Cases
- Describe image content
- Extract text from images (OCR)
- Identify objects, colors, scenes
- Analyze screenshots and scanned documents
<Note>
Images larger than 1MB are automatically compressed (max edge 1536px). All images (including remote URLs) are converted to base64 for transmission to ensure compatibility with all model backends.
</Note>

72
docs/ja/tools/vision.mdx Normal file
View File

@@ -0,0 +1,72 @@
---
title: vision - 画像分析
description: 画像コンテンツの分析（認識、説明、OCR など）
---
Vision API を使用してローカル画像や画像 URL を分析します。コンテンツの説明、テキスト抽出（OCR）、オブジェクト認識などに対応しています。
## モデル選択
Vision ツールは多段階の自動選択+自動フォールバック戦略を採用しており、手動設定なしで利用可能です:
1. **メインモデル** — 現在設定されているメインモデルで画像認識を実行(追加コストなし)
2. **その他の設定済みモデル** — API キーが設定されている他のマルチモーダルモデルを自動検出
3. **OpenAI** — `open_ai_api_key` を使用して gpt-4.1-mini を呼び出し
4. **LinkAI** — `linkai_api_key` を使用して LinkAI ビジョンサービスを呼び出し
`use_linkai=true` の場合、LinkAI が最優先になります。
現在のプロバイダーが失敗した場合、成功するかすべて失敗するまで自動的に次のプロバイダーを試行します。
### 対応モデル
| ベンダー | ビジョンモデル | 説明 |
| --- | --- | --- |
| OpenAI / 互換プロトコル | メインモデル | すべての OpenAI 互換マルチモーダルモデルに対応 |
| 通義千問 (DashScope) | メインモデル | MultiModalConversation API 経由 |
| Claude | メインモデル | Anthropic ネイティブ画像形式 |
| Gemini | メインモデル | inlineData 形式 |
| 豆包 (Doubao) | メインモデル | doubao-seed-2-0 シリーズがネイティブ対応 |
| Kimi (Moonshot) | メインモデル | kimi-k2.5 がネイティブ対応 |
| 智谱 AI | glm-5v-turbo | 常にビジョン専用モデルを使用 |
| MiniMax | MiniMax-Text-01 | 常にビジョン専用モデルを使用 |
<Note>
智谱 AI と MiniMax のテキストモデルは画像理解に対応していないため、対応するビジョン専用モデルが自動的に使用されます。
</Note>
## パラメータ
| パラメータ | 型 | 必須 | 説明 |
| --- | --- | --- | --- |
| `image` | string | はい | ローカルファイルパスまたは HTTP(S) 画像 URL |
| `question` | string | はい | 画像に対する質問 |
対応画像形式：jpg、jpeg、png、gif、webp
## カスタム設定
Vision ツールで使用するモデルを指定するには、`config.json` に以下を追加します:
```json
{
"tool": {
"vision": {
"model": "gpt-4o"
}
}
}
```
ほとんどの場合、設定は不要です。メインモデルがマルチモーダルに対応しているか、ビジョン対応の API キーが設定されていれば自動的に動作します。
## ユースケース
- 画像コンテンツの説明
- 画像からのテキスト抽出（OCR）
- オブジェクト、色、シーンの識別
- スクリーンショットやスキャン文書の分析
<Note>
1MB を超える画像は自動的に圧縮されます（最大辺 1536px）。すべての画像（リモート URL を含む）は base64 に変換して送信され、すべてのモデルバックエンドとの互換性を確保します。
</Note>

View File

@@ -5,14 +5,49 @@ description: 分析图片内容识别、描述、OCR 等)
使用 Vision API 分析本地图片或图片 URL，支持内容描述、文字提取（OCR）、物体识别等。
## 依赖
## 模型选择
需要配置至少一个 API Key通过 `env_config` 工具或工作空间 `.env` 文件配置)
Vision 工具采用多级自动选择 + 自动兜底策略,无需手动配置即可使用
| 后端 | 环境变量 | 优先级 |
1. **主模型** — 优先使用当前配置的主模型进行图像识别(需要是多模态模型)
2. **其他已配置模型** — 自动发现已配置 API Key 的其他多模态模型作为备选
如果当前 provider 调用失败,会自动尝试下一个,直到成功或全部失败。
### 支持的模型
| 厂商 | 视觉模型 | 说明 |
| --- | --- | --- |
| OpenAI | `OPENAI_API_KEY` | 优先使用 |
| LinkAI | `LINKAI_API_KEY` | 备选 |
| OpenAI / 兼容协议 | 使用主模型 | 支持所有 OpenAI 协议兼容的多模态模型 |
| 通义千问 (DashScope) | 使用主模型 | 例如 qwen3.6-plus 等 |
| Claude | 使用主模型 | Anthropic 原生图像格式 |
| Gemini | 使用主模型 | inlineData 格式 |
| 豆包 (Doubao) | 使用主模型 | doubao-seed-2-0 系列原生支持 |
| Kimi (Moonshot) | 使用主模型 | kimi-k2.5 原生支持 |
| 智谱 AI | glm-5v-turbo | 固定使用视觉专用模型 |
| MiniMax | MiniMax-Text-01 | 固定使用视觉专用模型 |
<Note>
智谱和 MiniMax 的文本模型不支持图像理解,因此始终使用对应的视觉专用模型,无需手动指定。
</Note>
> 当 `use_linkai=true` 时,默认使用 LinkAI 的多模态模型进行
## 自定义配置
如果希望指定 Vision 使用的模型,可在 `config.json` 中配置,例如:
```json
{
"tool": {
"vision": {
"model": "gpt-4o"
}
}
}
```
大多数情况下无需配置,主模型支持多模态或配置任意一个支持视觉的 API Key 即可自动工作。
## 参数
@@ -20,17 +55,18 @@ description: 分析图片内容识别、描述、OCR 等)
| --- | --- | --- | --- |
| `image` | string | 是 | 本地文件路径或 HTTP(S) 图片 URL |
| `question` | string | 是 | 对图片提出的问题 |
| `model` | string | 否 | 模型名称(默认 gpt-4.1-mini |
支持的图片格式：jpg、jpeg、png、gif、webp
## 使用场景
- 描述图片中的内容
- 提取图片中的文字（OCR）
- 识别物体、颜色、场景
- 分析截图、文档扫描
- 分析截图、文档扫描图片等
<Note>
超过 1MB 的图片会自动压缩后上传。如果未配置任何 Vision API Key该工具不会被加载
超过 1MB 的图片会自动压缩后上传，所有图片（包括远程 URL）会统一转为 base64 传输，确保兼容所有模型后端
</Note>

View File

@@ -2,12 +2,27 @@
Auto-reply chat robot abstract class
"""
from bridge.context import Context
from bridge.reply import Reply
class Bot(object):
"""
Base class for all chat-bot implementations.
Subclasses may also implement:
call_with_tools(messages, tools=None, stream=False, **kwargs)
-> dict | generator (OpenAI-compatible format)
call_vision(image_url, question, model=None, max_tokens=1000)
-> dict with keys: model, content, usage (or error/message)
These are NOT defined here to avoid shadowing concrete implementations
provided by mixin classes (e.g. OpenAICompatibleBot) in the MRO.
Use ``hasattr(bot, 'call_vision')`` to detect support at runtime.
"""
def reply(self, query, context: Context = None) -> Reply:
"""
bot auto-reply content

View File

@@ -1,7 +1,10 @@
# encoding:utf-8
import base64
import json
import re
import time
from typing import Optional
import requests
@@ -224,6 +227,79 @@ class ClaudeAPIBot(Bot, OpenAIImage):
return 64000
return 8192
@staticmethod
def _parse_data_url(data_url: str):
    """Split a base64 ``data:`` URL into its media type and payload.

    Returns ``(media_type, base64_data)`` when *data_url* has the form
    ``data:<mime>;base64,<data>``; otherwise returns ``(None, None)``.
    """
    # DOTALL lets the payload group span embedded newlines.
    matched = re.match(r"^data:([^;]+);base64,(.+)$", data_url, re.DOTALL)
    if matched is None:
        return None, None
    media_type, payload = matched.groups()
    return media_type, payload
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Analyze an image using Claude Messages API (native image blocks).

    Args:
        image_url: Either a ``data:<mime>;base64,<data>`` URL or any other
            URL string; the latter is forwarded as a ``url`` image source.
        question: Text prompt sent alongside the image.
        model: Model name to use; when falsy, the configured ``model`` value
            is passed through ``self._model_mapping``.
        max_tokens: Value sent as ``max_tokens`` in the request body.

    Returns:
        On success, a dict with ``model``, ``content`` (all text blocks
        concatenated) and ``usage`` (prompt/completion/total token counts).
        On any failure, ``{"error": True, "message": <str>}``; exceptions
        are logged and converted rather than raised.
    """
    try:
        actual_model = model or self._model_mapping(conf().get("model"))

        # Build Claude-native image content block
        if image_url.startswith("data:"):
            media_type, b64_data = self._parse_data_url(image_url)
            if not b64_data:
                return {"error": True, "message": "Invalid base64 data URL"}
            image_block = {
                "type": "image",
                "source": {"type": "base64",
                           "media_type": media_type or "image/jpeg",
                           "data": b64_data},
            }
        else:
            image_block = {
                "type": "image",
                "source": {"type": "url", "url": image_url},
            }

        data = {
            "model": actual_model,
            "max_tokens": max_tokens,
            "messages": [{
                "role": "user",
                "content": [
                    image_block,
                    {"type": "text", "text": question},
                ],
            }],
        }
        headers = {
            "x-api-key": self.api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
        }
        proxies = {"http": self.proxy, "https": self.proxy} if self.proxy else None

        # requests has no default timeout; without one a stalled connection
        # would block the caller indefinitely. 60s matches the other
        # providers' call_vision implementations.
        resp = requests.post(f"{self.api_base}/messages",
                             headers=headers, json=data, proxies=proxies,
                             timeout=60)
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        body = resp.json()
        # Only "text" content blocks contribute to the answer.
        text_parts = [b.get("text", "") for b in body.get("content", [])
                      if b.get("type") == "text"]
        usage = body.get("usage", {})
        return {
            "model": actual_model,
            "content": "".join(text_parts),
            "usage": {
                "prompt_tokens": usage.get("input_tokens", 0),
                "completion_tokens": usage.get("output_tokens", 0),
                # The response is read for input/output counts only, so the
                # total is derived here.
                "total_tokens": usage.get("input_tokens", 0) + usage.get("output_tokens", 0),
            },
        }
    except Exception as e:
        logger.error(f"[CLAUDE] call_vision error: {e}")
        return {"error": True, "message": str(e)}
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
"""
Call Claude API with tool support for agent integration

View File

@@ -1,6 +1,8 @@
# encoding:utf-8
import json
from typing import Optional
from models.bot import Bot
from models.session_manager import SessionManager
from bridge.context import ContextType
@@ -153,6 +155,56 @@ class DashscopeBot(Bot):
else:
return result
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Ask a question about an image via DashScope MultiModalConversation.

    Uses *model* when given, otherwise ``qwen-vl-max``. Returns a dict with
    ``model``, ``content`` and ``usage`` on success, or
    ``{"error": True, "message": ...}`` on a non-OK status or any exception.
    """
    try:
        dashscope.api_key = self.api_key
        vision_model = model or "qwen-vl-max"

        # DashScope multimodal parts are keyed by kind: "image" / "text".
        user_turn = {
            "role": "user",
            "content": [{"image": image_url}, {"text": question}],
        }
        response = MultiModalConversation.call(
            model=vision_model,
            messages=[user_turn],
            max_tokens=max_tokens,
        )
        if response.status_code != HTTPStatus.OK:
            failure = f"{response.code} - {response.message}"
            return {"error": True, "message": failure}

        resp_dict = self._response_to_dict(response)
        first_choice = resp_dict["output"]["choices"][0]
        answer = first_choice.get("message", {}).get("content", "")
        if isinstance(answer, list):
            # Content may arrive as a list of parts; keep only the text of
            # dict-shaped parts.
            texts = [part.get("text", "") for part in answer if isinstance(part, dict)]
            answer = "".join(texts)

        usage = resp_dict.get("usage", {})
        token_usage = {
            "prompt_tokens": usage.get("input_tokens", 0),
            "completion_tokens": usage.get("output_tokens", 0),
            "total_tokens": usage.get("total_tokens", 0),
        }
        return {"model": vision_model, "content": answer, "usage": token_usage}
    except Exception as e:
        logger.error(f"[DASHSCOPE] call_vision error: {e}")
        return {"error": True, "message": str(e)}
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
"""
Call DashScope API with tool support for agent integration

View File

@@ -2,6 +2,7 @@
import json
import time
from typing import Optional
import requests
from models.bot import Bot
@@ -147,6 +148,49 @@ class DoubaoBot(Bot):
else:
return result
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Ask a question about an image via Doubao's /chat/completions endpoint.

    Uses *model* when given, otherwise the bot's configured model (falling
    back to ``doubao-seed-2-0-pro-260215``). Returns a dict with ``model``,
    ``content`` and ``usage`` on success, or
    ``{"error": True, "message": ...}`` on HTTP/API failure or any exception.
    """
    try:
        vision_model = model or self.args.get("model", "doubao-seed-2-0-pro-260215")
        user_turn = {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }
        payload = {
            "model": vision_model,
            "max_tokens": max_tokens,
            "messages": [user_turn],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(f"{self.base_url}/chat/completions",
                             headers=headers, json=payload, timeout=60)
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        data = resp.json()
        # A 200 response can still carry an "error" object in its body.
        if "error" in data:
            api_error = data["error"]
            return {"error": True, "message": api_error.get("message", str(api_error))}

        first_choice = data.get("choices", [{}])[0]
        answer = first_choice.get("message", {}).get("content", "")
        usage = data.get("usage", {})
        token_usage = {
            field: usage.get(field, 0)
            for field in ("prompt_tokens", "completion_tokens", "total_tokens")
        }
        return {"model": vision_model, "content": answer, "usage": token_usage}
    except Exception as e:
        logger.error(f"[DOUBAO] call_vision error: {e}")
        return {"error": True, "message": str(e)}
# ==================== Agent mode support ====================
def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
@@ -434,6 +478,10 @@ class DoubaoBot(Bot):
continue
if role == "user":
has_tool_result = any(
isinstance(b, dict) and b.get("type") == "tool_result" for b in content
)
if has_tool_result:
text_parts = []
tool_results = []
@@ -453,12 +501,14 @@ class DoubaoBot(Bot):
"content": result_content
})
# Tool results first (must come right after assistant with tool_calls)
for tr in tool_results:
converted.append(tr)
if text_parts:
converted.append({"role": "user", "content": "\n".join(text_parts)})
else:
# Keep as-is for multimodal content (e.g. image_url blocks)
converted.append(msg)
elif role == "assistant":
openai_msg = {"role": "assistant"}

View File

@@ -12,6 +12,8 @@ import mimetypes
import os
import re
import time
from typing import Optional
import requests
from models.bot import Bot
from models.session_manager import SessionManager
@@ -144,7 +146,12 @@ class GoogleGeminiBot(Bot):
return "", []
pattern = r"\[图片:\s*([^\]]+)\]"
image_paths = [m.strip().strip("'\"") for m in re.findall(pattern, content) if m.strip()]
cleaned_text = re.sub(pattern, "", content)
# Replace markers with path-only hints so the model still knows the
# original file location (needed when it calls tools like vision).
def _replace_with_hint(m):
path = m.group(1).strip().strip("'\"")
return f"[attached image: {path}]"
cleaned_text = re.sub(pattern, _replace_with_hint, content)
cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text).strip()
return cleaned_text, image_paths
@@ -225,6 +232,57 @@ class GoogleGeminiBot(Bot):
logger.warning(f"[Gemini] Unsupported image URL format: {image_url[:120]}")
return None
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Ask a question about an image via the Gemini ``generateContent`` REST call.

    Model precedence: *model*, then ``self.model``, then ``gemini-2.0-flash``.
    Returns a dict with ``model``, ``content`` and ``usage`` on success, or
    ``{"error": True, "message": ...}`` when the image cannot be converted,
    the HTTP status is not 200, or any exception occurs.
    """
    try:
        model_name = model or self.model or "gemini-2.0-flash"

        image_part = self._build_inline_part_from_image_url({"url": image_url})
        if not image_part:
            return {"error": True, "message": f"Cannot process image URL: {image_url[:120]}"}

        harm_categories = (
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_HARASSMENT",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
        )
        payload = {
            "contents": [{
                "role": "user",
                "parts": [image_part, {"text": question}],
            }],
            "generationConfig": {"maxOutputTokens": max_tokens},
            # Every listed category is sent with threshold BLOCK_NONE.
            "safetySettings": [
                {"category": category, "threshold": "BLOCK_NONE"}
                for category in harm_categories
            ],
        }
        endpoint = f"{self.api_base}/v1beta/models/{model_name}:generateContent"
        headers = {"x-goog-api-key": self.api_key, "Content-Type": "application/json"}
        resp = requests.post(endpoint, headers=headers, json=payload, timeout=60)
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        body = resp.json()
        candidates = body.get("candidates", [])
        # Only the first candidate is read; no candidates yields empty content.
        parts = candidates[0].get("content", {}).get("parts", []) if candidates else []
        answer = "".join(part["text"] for part in parts if "text" in part)

        usage_meta = body.get("usageMetadata", {})
        token_usage = {
            "prompt_tokens": usage_meta.get("promptTokenCount", 0),
            "completion_tokens": usage_meta.get("candidatesTokenCount", 0),
            "total_tokens": usage_meta.get("totalTokenCount", 0),
        }
        return {"model": model_name, "content": answer, "usage": token_usage}
    except Exception as e:
        logger.error(f"[Gemini] call_vision error: {e}")
        return {"error": True, "message": str(e)}
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
"""
Call Gemini API with tool support using REST API (following official docs)

View File

@@ -2,6 +2,8 @@
import time
import json
from typing import Optional
import requests
from models.bot import Bot
@@ -20,7 +22,7 @@ class MinimaxBot(Bot):
def __init__(self):
super().__init__()
self.args = {
"model": conf().get("model") or "MiniMax-M2.1",
"model": conf().get("model") or "MiniMax-M2.7",
"temperature": conf().get("temperature", 0.3),
"top_p": conf().get("top_p", 0.95),
}
@@ -175,6 +177,51 @@ class MinimaxBot(Bot):
else:
return result
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Ask a question about an image via MiniMax's /chat/completions endpoint.

    The *model* argument is accepted for signature parity but ignored: the
    request always targets ``MiniMax-Text-01``, since other MiniMax models
    do not support vision.

    Returns a dict with ``model``, ``content`` and ``usage`` on success, or
    ``{"error": True, "message": ...}`` on HTTP/API failure or any exception.
    """
    try:
        vision_model = "MiniMax-Text-01"
        user_turn = {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }
        payload = {
            "model": vision_model,
            "max_tokens": max_tokens,
            "messages": [user_turn],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(f"{self.api_base}/chat/completions",
                             headers=headers, json=payload, timeout=60)
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        data = resp.json()
        # A 200 response can still carry an "error" object in its body.
        if "error" in data:
            api_error = data["error"]
            return {"error": True, "message": api_error.get("message", str(api_error))}

        first_choice = data.get("choices", [{}])[0]
        answer = first_choice.get("message", {}).get("content", "")
        usage = data.get("usage", {})
        token_usage = {
            field: usage.get(field, 0)
            for field in ("prompt_tokens", "completion_tokens", "total_tokens")
        }
        return {"model": vision_model, "content": answer, "usage": token_usage}
    except Exception as e:
        logger.error(f"[MINIMAX] call_vision error: {e}")
        return {"error": True, "message": str(e)}
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
"""
Call MiniMax API with tool support for agent integration
@@ -270,7 +317,10 @@ class MinimaxBot(Bot):
if role == "user":
# Handle user message
if isinstance(content, list):
# Extract text from content blocks
has_tool_result = any(
isinstance(b, dict) and b.get("type") == "tool_result" for b in content
)
if has_tool_result:
text_parts = []
tool_results = []
@@ -279,7 +329,6 @@ class MinimaxBot(Bot):
if block.get("type") == "text":
text_parts.append(block.get("text", ""))
elif block.get("type") == "tool_result":
# Tool result should be a separate message with role="tool"
tool_call_id = block.get("tool_use_id") or ""
if not tool_call_id:
logger.warning(f"[MINIMAX] tool_result missing tool_use_id")
@@ -298,9 +347,11 @@ class MinimaxBot(Bot):
"content": "\n".join(text_parts)
})
# Add all tool results (not just the last one)
for tool_result in tool_results:
converted.append(tool_result)
else:
# Keep as-is for multimodal content (e.g. image_url blocks)
converted.append(msg)
else:
# Simple text content
converted.append({

View File

@@ -2,6 +2,7 @@
import json
import time
from typing import Optional
import requests
from models.bot import Bot
@@ -147,6 +148,49 @@ class MoonshotBot(Bot):
else:
return result
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Ask a question about an image via Moonshot's /chat/completions endpoint.

    Uses *model* when given, otherwise the bot's configured model (falling
    back to ``kimi-k2.5``). Returns a dict with ``model``, ``content`` and
    ``usage`` on success, or ``{"error": True, "message": ...}`` on HTTP/API
    failure or any exception.
    """
    try:
        vision_model = model or self.args.get("model", "kimi-k2.5")
        user_turn = {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }
        payload = {
            "model": vision_model,
            "max_tokens": max_tokens,
            "messages": [user_turn],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(f"{self.base_url}/chat/completions",
                             headers=headers, json=payload, timeout=60)
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        data = resp.json()
        # A 200 response can still carry an "error" object in its body.
        if "error" in data:
            api_error = data["error"]
            return {"error": True, "message": api_error.get("message", str(api_error))}

        first_choice = data.get("choices", [{}])[0]
        answer = first_choice.get("message", {}).get("content", "")
        usage = data.get("usage", {})
        token_usage = {
            field: usage.get(field, 0)
            for field in ("prompt_tokens", "completion_tokens", "total_tokens")
        }
        return {"model": vision_model, "content": answer, "usage": token_usage}
    except Exception as e:
        logger.error(f"[MOONSHOT] call_vision error: {e}")
        return {"error": True, "message": str(e)}
# ==================== Agent mode support ====================
def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
@@ -435,6 +479,10 @@ class MoonshotBot(Bot):
continue
if role == "user":
has_tool_result = any(
isinstance(b, dict) and b.get("type") == "tool_result" for b in content
)
if has_tool_result:
text_parts = []
tool_results = []
@@ -454,12 +502,14 @@ class MoonshotBot(Bot):
"content": result_content
})
# Tool results first (must come right after assistant with tool_calls)
for tr in tool_results:
converted.append(tr)
if text_parts:
converted.append({"role": "user", "content": "\n".join(text_parts)})
else:
# Keep as-is for multimodal content (e.g. image_url blocks)
converted.append(msg)
elif role == "assistant":
openai_msg = {"role": "assistant"}

View File

@@ -9,6 +9,8 @@ This includes: OpenAI, LinkAI, Azure OpenAI, and many third-party providers.
import json
import openai
import requests
from typing import Optional
from common.log import logger
from agent.protocol.message_utils import drop_orphaned_tool_results_openai
@@ -306,3 +308,51 @@ class OpenAICompatibleBot:
openai_messages.append(msg)
return drop_orphaned_tool_results_openai(openai_messages)
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Ask a question about an image via an OpenAI-compatible /chat/completions endpoint.

    Connection settings come from ``self.get_api_config()`` (keys read here:
    ``model``, ``api_key``, ``api_base``). *model* overrides the configured
    model; ``gpt-4o`` and ``https://api.openai.com/v1`` are the fallbacks.

    Returns a dict with ``model``, ``content`` and ``usage`` on success, or
    ``{"error": True, "message": ...}`` on a non-200 status or any exception.
    """
    # NOTE(review): max_tokens is accepted but never added to the request
    # payload below — confirm whether that omission is deliberate.
    try:
        api_config = self.get_api_config()
        vision_model = model or api_config.get("model", "gpt-4o")
        api_key = api_config.get("api_key", "")
        configured_base = api_config.get("api_base") or "https://api.openai.com/v1"
        api_base = configured_base.rstrip("/")

        user_turn = {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }
        payload = {"model": vision_model, "messages": [user_turn]}
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(
            f"{api_base}/chat/completions",
            headers=headers, json=payload, timeout=60,
        )
        if resp.status_code != 200:
            # Truncate the body so logs and error messages stay bounded.
            body = resp.text[:500]
            logger.error(f"[{self.__class__.__name__}] call_vision HTTP {resp.status_code}: {body}")
            return {"error": True, "message": f"HTTP {resp.status_code}: {body}"}

        data = resp.json()
        first_choice = data.get("choices", [{}])[0]
        answer = first_choice.get("message", {}).get("content", "")
        usage = data.get("usage", {})
        token_usage = {
            field: usage.get(field, 0)
            for field in ("prompt_tokens", "completion_tokens", "total_tokens")
        }
        return {"model": vision_model, "content": answer, "usage": token_usage}
    except Exception as e:
        logger.error(f"[{self.__class__.__name__}] call_vision error: {e}")
        return {"error": True, "message": str(e)}

View File

@@ -2,6 +2,7 @@
import time
import json
from typing import Optional
from models.bot import Bot
from models.zhipuai.zhipu_ai_session import ZhipuAISession
@@ -149,6 +150,40 @@ class ZHIPUAIBot(Bot, ZhipuAIImage):
else:
return result
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Ask a question about an image through the ZhipuAI SDK client.

    The *model* argument is accepted for signature parity but ignored: the
    request always targets ``glm-5v-turbo``, since the text models
    (glm-5-turbo etc.) do not support vision.

    Returns a dict with ``model``, ``content`` and ``usage`` on success, or
    ``{"error": True, "message": ...}`` if anything raises.
    """
    try:
        vision_model = "glm-5v-turbo"
        user_turn = {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }
        response = self.client.chat.completions.create(
            model=vision_model,
            max_tokens=max_tokens,
            messages=[user_turn],
        )
        # A None message content is normalised to an empty string.
        answer = response.choices[0].message.content or ""
        usage = response.usage
        # getattr with a default reports 0 for any missing counter.
        token_usage = {
            field: getattr(usage, field, 0)
            for field in ("prompt_tokens", "completion_tokens", "total_tokens")
        }
        return {"model": vision_model, "content": answer, "usage": token_usage}
    except Exception as e:
        logger.error(f"[ZHIPU_AI] call_vision error: {e}")
        return {"error": True, "message": str(e)}
def call_with_tools(self, messages, tools=None, stream=False, **kwargs):
"""
Call ZhipuAI API with tool support for agent integration

View File

@@ -0,0 +1,184 @@
# encoding:utf-8
"""
Unit tests for MiniMax provider additions:
- MiniMax-M2.7-highspeed constant in const.py
- Default model update in MinimaxBot
- MinimaxVoice TTS provider
"""
import sys
import os
import json
import unittest
from unittest.mock import MagicMock, patch, PropertyMock
# Add project root to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
class TestMinimaxConst(unittest.TestCase):
    """Checks on the MiniMax entries exposed by ``common.const``."""

    def test_m2_7_highspeed_constant_defined(self):
        """The highspeed constant exists and holds the expected model name."""
        from common import const
        has_constant = hasattr(const, "MINIMAX_M2_7_HIGHSPEED")
        self.assertTrue(has_constant)
        self.assertEqual(const.MINIMAX_M2_7_HIGHSPEED, "MiniMax-M2.7-highspeed")

    def test_m2_7_constant_defined(self):
        """The base M2.7 constant holds the expected model name."""
        from common import const
        expected = "MiniMax-M2.7"
        self.assertEqual(const.MINIMAX_M2_7, expected)

    def test_m2_7_highspeed_in_model_list(self):
        """MODEL_LIST contains the highspeed model name."""
        from common import const
        model_list = const.MODEL_LIST
        self.assertIn("MiniMax-M2.7-highspeed", model_list)

    def test_m2_7_in_model_list(self):
        """MODEL_LIST contains the base M2.7 model name."""
        from common import const
        model_list = const.MODEL_LIST
        self.assertIn("MiniMax-M2.7", model_list)

    def test_minimax_provider_key_defined(self):
        """The provider key constant is the lowercase string ``minimax``."""
        from common import const
        expected = "minimax"
        self.assertEqual(const.MiniMax, expected)
class TestMinimaxBotDefaultModel(unittest.TestCase):
    """Test that MinimaxBot defaults to MiniMax-M2.7."""

    def test_default_model_is_m2_7(self):
        """With an empty config, the model falls back to MiniMax-M2.7."""
        # Patch conf() to return an empty config: every get() yields its default.
        mock_conf = MagicMock()
        mock_conf.get = MagicMock(side_effect=lambda key, default=None: default)

        with patch("models.minimax.minimax_bot.conf", return_value=mock_conf):
            with patch("models.minimax.minimax_bot.SessionManager"):
                from models.minimax import minimax_bot
                # Reload to pick up patches
                import importlib
                importlib.reload(minimax_bot)
                with patch("models.minimax.minimax_bot.conf", return_value=mock_conf):
                    # __new__ bypasses MinimaxBot.__init__.
                    # NOTE(review): args is assembled here rather than by the
                    # bot, so this asserts the fallback expression, not
                    # MinimaxBot.__init__ itself — consider constructing the
                    # bot for real once its dependencies can be mocked.
                    bot = minimax_bot.MinimaxBot.__new__(minimax_bot.MinimaxBot)
                    bot.args = {
                        "model": mock_conf.get("model") or "MiniMax-M2.7",
                    }
                    self.assertEqual(bot.args["model"], "MiniMax-M2.7")

    def test_default_model_string(self):
        """Verify the fallback string literal in minimax_bot.py is MiniMax-M2.7."""
        bot_path = os.path.join(os.path.dirname(__file__), "..", "models", "minimax", "minimax_bot.py")
        # The source file declares utf-8; read it with that encoding explicitly
        # so the test does not depend on the platform's default locale.
        with open(bot_path, encoding="utf-8") as f:
            source = f.read()
        # Verify MiniMax-M2.7 is in the source (not M2.1)
        self.assertIn("MiniMax-M2.7", source)
        self.assertNotIn('"MiniMax-M2.1"', source)
class TestMinimaxVoice(unittest.TestCase):
    """Test MinimaxVoice TTS provider."""

    @staticmethod
    def _build_conf(api_key="test-key", api_base="https://api.minimax.io"):
        """Return a stand-in for ``conf()`` exposing only the MiniMax keys."""
        settings = {
            "minimax_api_key": api_key,
            "minimax_api_base": api_base,
        }
        mock_conf = MagicMock()
        # Unknown keys fall back to the caller's default, like a real config.
        mock_conf.get = lambda key, default=None: settings.get(key, default)
        return mock_conf

    @staticmethod
    def _build_response(lines):
        """Return a mocked HTTP response whose ``iter_lines()`` yields *lines*."""
        mock_response = MagicMock()
        mock_response.raise_for_status = MagicMock()
        mock_response.iter_lines.return_value = lines
        return mock_response

    def _make_voice(self, api_key="test-key", api_base="https://api.minimax.io/v1"):
        """Instantiate MinimaxVoice under a patched ``conf()``."""
        mock_conf = self._build_conf(api_key, api_base)
        with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf):
            from voice.minimax.minimax_voice import MinimaxVoice
            return MinimaxVoice()

    def _synthesize(self, mock_response, text):
        """Run ``textToVoice(text)`` with ``requests.post`` returning *mock_response*."""
        mock_conf = self._build_conf()
        with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf):
            with patch("voice.minimax.minimax_voice.requests.post", return_value=mock_response):
                from voice.minimax import minimax_voice
                import importlib
                importlib.reload(minimax_voice)
                # The reload re-executes the module's imports, which rebinds
                # its `conf` name, so the patch is applied again.
                with patch("voice.minimax.minimax_voice.conf", return_value=mock_conf):
                    voice = minimax_voice.MinimaxVoice()
                    return voice.textToVoice(text)

    def test_instantiation(self):
        voice = self._make_voice()
        self.assertIsNotNone(voice)

    def test_api_base_strips_v1_suffix(self):
        voice = self._make_voice(api_base="https://api.minimax.io/v1")
        self.assertEqual(voice.api_base, "https://api.minimax.io")

    def test_api_base_no_trailing_slash(self):
        voice = self._make_voice(api_base="https://api.minimax.io")
        self.assertEqual(voice.api_base, "https://api.minimax.io")

    def test_voice_to_text_not_supported(self):
        voice = self._make_voice()
        with self.assertRaises(NotImplementedError):
            voice.voiceToText("dummy.wav")

    def test_text_to_voice_success(self):
        """Test textToVoice with mocked SSE stream response."""
        os.makedirs("tmp", exist_ok=True)

        # Build fake SSE stream lines
        audio_hex = bytes([0x49, 0x44, 0x33]).hex()  # "ID3" MP3 magic bytes
        sse_line = f'data: {{"data": {{"audio": "{audio_hex}", "status": 2}}}}\n\n'
        done_line = "data: [DONE]\n\n"
        stream_lines = [
            line.encode("utf-8") for line in (sse_line + done_line).splitlines() if line
        ]

        reply = self._synthesize(self._build_response(stream_lines), "Hello, world!")

        from bridge.reply import ReplyType
        # Remove the generated audio file so test runs do not accumulate
        # artifacts under tmp/.
        if reply.type == ReplyType.VOICE and os.path.isfile(reply.content):
            self.addCleanup(os.remove, reply.content)

        self.assertEqual(reply.type, ReplyType.VOICE)
        self.assertTrue(reply.content.endswith(".mp3"))

    def test_text_to_voice_no_audio_returns_error(self):
        """Test that empty SSE stream returns an ERROR reply."""
        reply = self._synthesize(self._build_response([]), "Hello")

        from bridge.reply import ReplyType
        self.assertEqual(reply.type, ReplyType.ERROR)
class TestVoiceFactory(unittest.TestCase):
    """Checks that the voice factory knows the ``minimax`` voice type."""

    def test_minimax_voice_factory(self):
        """``create_voice("minimax")`` yields a MinimaxVoice instance."""
        # Every config lookup returns None.
        empty_conf = MagicMock()
        empty_conf.get = MagicMock(return_value=None)

        with patch("voice.minimax.minimax_voice.conf", return_value=empty_conf):
            from voice.factory import create_voice
            created = create_voice("minimax")
            from voice.minimax.minimax_voice import MinimaxVoice
            self.assertIsInstance(created, MinimaxVoice)
if __name__ == "__main__":
unittest.main()

View File

@@ -54,4 +54,8 @@ def create_voice(voice_type):
from voice.tencent.tencent_voice import TencentVoice
return TencentVoice()
elif voice_type == "minimax":
from voice.minimax.minimax_voice import MinimaxVoice
return MinimaxVoice()
raise RuntimeError

View File

View File

@@ -0,0 +1,106 @@
# encoding:utf-8
"""
MiniMax TTS voice service
"""
import datetime
import random
import requests
from bridge.reply import Reply, ReplyType
from common.log import logger
from config import conf
from voice.voice import Voice
# MiniMax TTS voice identifiers. The first entry is also the fallback voice
# used by textToVoice when `tts_voice_id` is not configured.
# NOTE(review): this list is not referenced elsewhere in the module as shown —
# presumably informational; confirm whether it is meant for validation.
MINIMAX_TTS_VOICES = [
"English_Graceful_Lady",
"English_Insightful_Speaker",
"English_radiant_girl",
"English_Persuasive_Man",
"English_Lucky_Robot",
"English_expressive_narrator",
"Chinese_Warm_Woman",
"Chinese_Gentle_Man",
]
class MinimaxVoice(Voice):
    """MiniMax text-to-speech provider.

    Reads ``minimax_api_key`` and ``minimax_api_base`` from the config at
    construction time. Only text-to-voice is implemented; voice-to-text
    raises ``NotImplementedError``.
    """

    def __init__(self):
        self.api_key = conf().get("minimax_api_key")
        self.api_base = conf().get("minimax_api_base") or "https://api.minimax.io"
        # Strip trailing /v1 if present so we can always append /v1/t2a_v2
        self.api_base = self.api_base.rstrip("/")
        if self.api_base.endswith("/v1"):
            self.api_base = self.api_base[:-3]

    def voiceToText(self, voice_file):
        """MiniMax does not provide an ASR endpoint; raise NotImplementedError."""
        raise NotImplementedError("MiniMax voice-to-text is not supported")

    def textToVoice(self, text):
        """Synthesize *text* to an mp3 file via the streaming ``/v1/t2a_v2`` endpoint.

        The model and voice come from the ``text_to_voice_model`` and
        ``tts_voice_id`` config keys, defaulting to ``speech-2.8-hd`` and
        ``English_Graceful_Lady``.

        Returns:
            ``Reply(ReplyType.VOICE, <path under tmp/>)`` on success, or a
            ``Reply(ReplyType.ERROR, ...)`` when no audio was received or any
            exception occurred (exceptions are logged, not raised).
        """
        # Function-scope imports: neither module is imported at file level.
        import json
        import os

        try:
            model = conf().get("text_to_voice_model") or "speech-2.8-hd"
            voice_id = conf().get("tts_voice_id") or "English_Graceful_Lady"

            url = f"{self.api_base}/v1/t2a_v2"
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.api_key}",
            }
            payload = {
                "model": model,
                "text": text,
                "stream": True,
                "voice_setting": {
                    "voice_id": voice_id,
                    "speed": 1,
                    "vol": 1,
                    "pitch": 0,
                },
                "audio_setting": {
                    "sample_rate": 32000,
                    "bitrate": 128000,
                    "format": "mp3",
                    "channel": 1,
                },
            }

            response = requests.post(url, headers=headers, json=payload, stream=True, timeout=60)
            try:
                response.raise_for_status()

                # Parse SSE stream and collect hex-encoded audio chunks.
                # NOTE(review): every event carrying data.audio is appended;
                # confirm against the MiniMax API whether the final event
                # repeats audio already delivered in earlier chunks.
                audio_chunks = []
                for raw in response.iter_lines():
                    if not raw:
                        continue
                    line = raw.decode("utf-8") if isinstance(raw, bytes) else raw
                    if not line.startswith("data:"):
                        continue
                    json_str = line[5:].strip()
                    if not json_str or json_str == "[DONE]":
                        continue
                    try:
                        event_data = json.loads(json_str)
                        audio_hex = event_data.get("data", {}).get("audio")
                        if audio_hex:
                            audio_chunks.append(bytes.fromhex(audio_hex))
                    except (ValueError, TypeError, AttributeError) as parse_err:
                        # Best effort: skip a malformed event, but leave a
                        # trace instead of dropping it silently.
                        logger.warning(f"[MINIMAX] skipped malformed TTS event: {parse_err}")
                        continue
            finally:
                # A streamed response holds its connection until closed.
                response.close()

            if not audio_chunks:
                logger.error("[MINIMAX] TTS returned no audio data")
                return Reply(ReplyType.ERROR, "语音合成失败,未获取到音频数据")

            audio_data = b"".join(audio_chunks)
            # Make sure the output directory exists before writing.
            os.makedirs("tmp", exist_ok=True)
            file_name = "tmp/" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + str(random.randint(0, 1000)) + ".mp3"
            with open(file_name, "wb") as f:
                f.write(audio_data)

            logger.info(f"[MINIMAX] textToVoice success, file={file_name}")
            return Reply(ReplyType.VOICE, file_name)

        except Exception as e:
            logger.error(f"[MINIMAX] textToVoice error: {e}")
            return Reply(ReplyType.ERROR, "遇到了一点小问题,请稍后再试")