feat(qianfan): scope vision support to multimodal models

This commit is contained in:
zhayujie
2026-05-06 16:11:10 +08:00
parent 63f99af1e6
commit a5790d82f6
15 changed files with 212 additions and 50 deletions

View File

@@ -609,7 +609,7 @@ API Key 创建:在 [控制台](https://aistudio.google.com/app/apikey?hl=zh-cn
} }
``` ```
- `model`: 默认推荐填写 `ernie-5.0`,也可填写 `ernie-4.5-turbo-128k``ernie-4.5-turbo-32k``ernie-x1-turbo-32k`Vision 工具可使用 `ernie-4.5-turbo-vl` - `model`: 默认推荐填写 `ernie-5.0`(多模态,可直接识图),也可填写 `ernie-x1.1``ernie-4.5-turbo-128k``ernie-4.5-turbo-32k`;当主模型为纯文本 ERNIE 时Vision 工具会自动 fallback 到 `ernie-4.5-turbo-vl`
- `qianfan_api_key`: 百度千帆 API Key通常以 `bce-v3/` 开头,可在百度智能云控制台创建 - `qianfan_api_key`: 百度千帆 API Key通常以 `bce-v3/` 开头,可在百度智能云控制台创建
- `qianfan_api_base`: 可选,默认为 `https://qianfan.baidubce.com/v2` - `qianfan_api_base`: 可选,默认为 `https://qianfan.baidubce.com/v2`

View File

@@ -53,8 +53,8 @@ _DISCOVERABLE_MODELS = [
("ark_api_key", const.DOUBAO, const.DOUBAO_SEED_2_PRO, "Doubao"), ("ark_api_key", const.DOUBAO, const.DOUBAO_SEED_2_PRO, "Doubao"),
("dashscope_api_key", const.QWEN_DASHSCOPE, const.QWEN36_PLUS, "DashScope"), ("dashscope_api_key", const.QWEN_DASHSCOPE, const.QWEN36_PLUS, "DashScope"),
("claude_api_key", const.CLAUDEAPI, const.CLAUDE_4_6_SONNET, "Claude"), ("claude_api_key", const.CLAUDEAPI, const.CLAUDE_4_6_SONNET, "Claude"),
("qianfan_api_key", const.QIANFAN, const.ERNIE_45_TURBO_VL, "Qianfan"),
("gemini_api_key", const.GEMINI, const.GEMINI_31_FLASH_LITE_PRE, "Gemini"), ("gemini_api_key", const.GEMINI, const.GEMINI_31_FLASH_LITE_PRE, "Gemini"),
("qianfan_api_key", const.QIANFAN, const.ERNIE_45_TURBO_VL, "Qianfan"),
("zhipu_ai_api_key", const.ZHIPU_AI, const.GLM_4_7, "ZhipuAI"), ("zhipu_ai_api_key", const.ZHIPU_AI, const.GLM_4_7, "ZhipuAI"),
("minimax_api_key", const.MiniMax, const.MINIMAX_M2_7, "MiniMax"), ("minimax_api_key", const.MiniMax, const.MINIMAX_M2_7, "MiniMax"),
] ]
@@ -346,15 +346,21 @@ class Vision(BaseTool):
preferred_model: Optional[str] = None) -> None: preferred_model: Optional[str] = None) -> None:
""" """
Auto-discover other models whose API key is configured. Auto-discover other models whose API key is configured.
Skip the main model's own bot_type (already covered by MainModel provider). Skip the main model's own bot_type (already covered by MainModel
Skip bot_types that already have a provider in the list (e.g. OpenAI). provider), unless the main model itself does not support vision —
in that case we still want the vendor's dedicated vision model
as a fallback. Also skip bot_types that already appear in the
provider list.
If preferred_model matches a provider's family (e.g. "doubao-*" matches If preferred_model matches a provider's family, use it instead
Doubao), use it instead of that provider's hard-coded default model. of that provider's hard-coded default model.
""" """
main_bot_type = None main_bot_type = None
main_bot_supports_vision = False
if self.model and hasattr(self.model, '_resolve_bot_type'): if self.model and hasattr(self.model, '_resolve_bot_type'):
main_bot_type = self.model._resolve_bot_type(conf().get("model", "")) main_bot_type = self.model._resolve_bot_type(conf().get("model", ""))
main_bot = getattr(self.model, "bot", None)
main_bot_supports_vision = self._main_bot_supports_vision(main_bot)
existing_names = {p.name for p in providers} existing_names = {p.name for p in providers}
preferred_provider = self._infer_provider_from_model(preferred_model) if preferred_model else None preferred_provider = self._infer_provider_from_model(preferred_model) if preferred_model else None
@@ -362,7 +368,11 @@ class Vision(BaseTool):
for config_key, bot_type, default_model, display_name in _DISCOVERABLE_MODELS: for config_key, bot_type, default_model, display_name in _DISCOVERABLE_MODELS:
if display_name in existing_names: if display_name in existing_names:
continue continue
if bot_type == main_bot_type: # Same bot_type as the main model is normally handled by the
# MainModel provider; only skip it here if the main model
# actually supports vision. Otherwise fall through and add
# the vendor's dedicated vision model as a fallback.
if bot_type == main_bot_type and main_bot_supports_vision:
continue continue
api_key = conf().get(config_key, "") api_key = conf().get(config_key, "")
if not api_key or not api_key.strip(): if not api_key or not api_key.strip():
@@ -380,34 +390,44 @@ class Vision(BaseTool):
if preferred_provider == display_name and preferred_model if preferred_provider == display_name and preferred_model
else default_model) else default_model)
providers.append(VisionProvider( provider = VisionProvider(
name=display_name, name=display_name,
api_key="", api_key="",
api_base="", api_base="",
model_override=model_for_provider, model_override=model_for_provider,
use_bot=True, use_bot=True,
fallback_bot=bot, fallback_bot=bot,
)) )
# Same vendor as the main bot is the most natural fallback when
# the main model itself does not support vision — promote it to
# the front of the list instead of relying on declaration order.
if bot_type == main_bot_type:
providers.insert(0, provider)
else:
providers.append(provider)
def _main_bot_supports_vision(self, bot) -> bool: def _main_bot_supports_vision(self, bot) -> bool:
""" """
Whether the main bot is known to natively support vision. Whether the main bot is known to natively support vision.
Having a `call_vision` method is necessary but not sufficient — some Having a `call_vision` method is necessary but not sufficient —
bots (e.g. DeepSeek) implement the method against an endpoint that some bots implement the method against an endpoint that does not
does not actually serve vision models, which causes silent failures actually serve vision models, which causes silent failures when a
when a vendor-foreign model name (e.g. doubao-*) is forwarded. vendor-foreign model name is forwarded.
We trust call_vision only when: Resolution order:
- The bot exposes a truthy `supports_vision` attribute, OR 1. If the bot explicitly declares `supports_vision`, trust it.
- The configured main model name has a known multimodal prefix This lets bots opt in or out based on their own runtime
handled by this bot's own vendor (claude-/gemini-/glm-/qwen-/ configuration (e.g. the currently selected model).
kimi-/doubao-/MiniMax-/abab*/gpt-*). 2. Otherwise, fall back to a model-name prefix heuristic: trust
call_vision when the main model looks like an OpenAI family
model or matches a known multimodal vendor prefix.
""" """
if bot is None: if bot is None:
return False return False
if getattr(bot, "supports_vision", False): if hasattr(bot, "supports_vision"):
return True return bool(getattr(bot, "supports_vision"))
main_model = (conf().get("model") or "").lower() main_model = (conf().get("model") or "").lower()
if not main_model: if not main_model:
return False return False

View File

@@ -780,7 +780,7 @@ class ConfigHandler:
const.QWEN36_PLUS, const.QWEN35_PLUS, const.QWEN3_MAX, const.QWEN36_PLUS, const.QWEN35_PLUS, const.QWEN3_MAX,
const.DOUBAO_SEED_2_PRO, const.DOUBAO_SEED_2_CODE, const.DOUBAO_SEED_2_PRO, const.DOUBAO_SEED_2_CODE,
const.KIMI_K2_6, const.KIMI_K2_5, const.KIMI_K2, const.KIMI_K2_6, const.KIMI_K2_5, const.KIMI_K2,
const.ERNIE_5, const.ERNIE_45_TURBO_128K, const.ERNIE_45_TURBO_32K, const.ERNIE_X1_TURBO_32K, const.ERNIE_5, const.ERNIE_X1_1, const.ERNIE_45_TURBO_128K, const.ERNIE_45_TURBO_32K,
] ]
# Generic placeholder hints surfaced in the web console. We deliberately # Generic placeholder hints surfaced in the web console. We deliberately
@@ -873,7 +873,7 @@ class ConfigHandler:
"api_base_key": "qianfan_api_base", "api_base_key": "qianfan_api_base",
"api_base_default": "https://qianfan.baidubce.com/v2", "api_base_default": "https://qianfan.baidubce.com/v2",
"api_base_placeholder": _PLACEHOLDER_QIANFAN, "api_base_placeholder": _PLACEHOLDER_QIANFAN,
"models": [const.ERNIE_5, const.ERNIE_45_TURBO_128K, const.ERNIE_45_TURBO_32K, const.ERNIE_X1_TURBO_32K], "models": [const.ERNIE_5, const.ERNIE_X1_1, const.ERNIE_45_TURBO_128K, const.ERNIE_45_TURBO_32K],
}), }),
("modelscope", { ("modelscope", {
"label": "ModelScope", "label": "ModelScope",

View File

@@ -88,9 +88,9 @@ DEEPSEEK_V4_PRO = "deepseek-v4-pro" # DeepSeek V4 Pro - 复杂任务更强 (思
# Baidu Qianfan / ERNIE # Baidu Qianfan / ERNIE
ERNIE_5 = "ernie-5.0" # ERNIE 5.0 - default recommendation ERNIE_5 = "ernie-5.0" # ERNIE 5.0 - default recommendation
ERNIE_X1_1 = "ernie-x1.1" # ERNIE X1.1 - reasoning-focused, multimodal
ERNIE_45_TURBO_128K = "ernie-4.5-turbo-128k" ERNIE_45_TURBO_128K = "ernie-4.5-turbo-128k"
ERNIE_45_TURBO_32K = "ernie-4.5-turbo-32k" ERNIE_45_TURBO_32K = "ernie-4.5-turbo-32k"
ERNIE_X1_TURBO_32K = "ernie-x1-turbo-32k"
ERNIE_4_TURBO_8K = "ERNIE-4.0-Turbo-8K" ERNIE_4_TURBO_8K = "ERNIE-4.0-Turbo-8K"
ERNIE_45_TURBO_VL = "ernie-4.5-turbo-vl" ERNIE_45_TURBO_VL = "ernie-4.5-turbo-vl"
ERNIE_45_TURBO_VL_32K = "ernie-4.5-turbo-vl-32k" ERNIE_45_TURBO_VL_32K = "ernie-4.5-turbo-vl-32k"
@@ -170,7 +170,7 @@ MODEL_LIST = [
DEEPSEEK_V4_FLASH, DEEPSEEK_V4_PRO, DEEPSEEK_CHAT, DEEPSEEK_REASONER, DEEPSEEK_V4_FLASH, DEEPSEEK_V4_PRO, DEEPSEEK_CHAT, DEEPSEEK_REASONER,
# Baidu Qianfan / ERNIE # Baidu Qianfan / ERNIE
QIANFAN, ERNIE_5, ERNIE_45_TURBO_128K, ERNIE_45_TURBO_32K, ERNIE_X1_TURBO_32K, ERNIE_4_TURBO_8K, QIANFAN, ERNIE_5, ERNIE_X1_1, ERNIE_45_TURBO_128K, ERNIE_45_TURBO_32K, ERNIE_4_TURBO_8K,
ERNIE_45_TURBO_VL, ERNIE_45_TURBO_VL_32K, ERNIE_45_TURBO_VL, ERNIE_45_TURBO_VL_32K,
# MiniMax # MiniMax

View File

@@ -15,7 +15,7 @@ Option 1: Native integration (recommended):
| Parameter | Description | | Parameter | Description |
| --- | --- | | --- | --- |
| `model` | Default recommendation: `ernie-5.0`; also supports `ernie-4.5-turbo-128k`, `ernie-4.5-turbo-32k`, `ernie-x1-turbo-32k` | | `model` | Default recommendation: `ernie-5.0`; also supports `ernie-x1.1`, `ernie-4.5-turbo-128k`, `ernie-4.5-turbo-32k` |
| `qianfan_api_key` | Qianfan API key, usually starting with `bce-v3/` | | `qianfan_api_key` | Qianfan API key, usually starting with `bce-v3/` |
| `qianfan_api_base` | Optional, defaults to `https://qianfan.baidubce.com/v2` | | `qianfan_api_base` | Optional, defaults to `https://qianfan.baidubce.com/v2` |
@@ -24,13 +24,18 @@ Option 1: Native integration (recommended):
| Model | Use Case | | Model | Use Case |
| --- | --- | | --- | --- |
| `ernie-5.0` | Default recommendation; latest ERNIE flagship with the strongest overall capability | | `ernie-5.0` | Default recommendation; latest ERNIE flagship with the strongest overall capability |
| `ernie-x1.1` | Deep-thinking reasoning model with lower hallucination and stronger instruction following / tool calling |
| `ernie-4.5-turbo-128k` | Long-context and general chat | | `ernie-4.5-turbo-128k` | Long-context and general chat |
| `ernie-4.5-turbo-32k` | General chat with a balanced context window and cost | | `ernie-4.5-turbo-32k` | General chat with a balanced context window and cost |
| `ernie-x1-turbo-32k` | Tasks that need stronger reasoning |
## Vision tool ## Vision tool
After `qianfan_api_key` is configured, Agent mode can auto-discover Qianfan for the Vision tool. The recommended Qianfan vision model is `ernie-4.5-turbo-vl`: Once `qianfan_api_key` is configured, Agent mode can auto-discover Qianfan for the Vision tool:
- When the main model itself is multimodal (e.g. `ernie-5.0`, `ernie-x1.1`, `ernie-4.5-turbo-vl`), images are handled directly by the main model with no extra setup.
- When the main model is text-only (e.g. `ernie-4.5-turbo-128k`), the Vision tool automatically falls back to `ernie-4.5-turbo-vl`.
To force a specific Vision model, set it explicitly in `config.json`:
```json ```json
{ {

View File

@@ -30,7 +30,7 @@ The voice and streaming building blocks come from a community contribution #2791
- **DeepSeek V4 series**: Added `deepseek-v4-pro` / `deepseek-v4-flash`, with `deepseek-v4-flash` set as the new default - **DeepSeek V4 series**: Added `deepseek-v4-pro` / `deepseek-v4-flash`, with `deepseek-v4-flash` set as the new default
- **Unified thinking-mode toggle**: DeepSeek V4, Qwen3 and other thinking-capable models now share the same `enable_thinking` switch - **Unified thinking-mode toggle**: DeepSeek V4, Qwen3 and other thinking-capable models now share the same `enable_thinking` switch
- **Baidu Qianfan / ERNIE first-class integration**: New `qianfan` provider supporting `ernie-5.0` (default recommendation), `ernie-4.5-turbo-128k`, `ernie-4.5-turbo-32k`, `ernie-x1-turbo-32k`. Dedicated `qianfan_api_key` / `qianfan_api_base` settings keep OpenAI config clean; legacy `wenxin` / `wenxin-4` paths are fully preserved. #2790 Thanks [@jimmyzhuu](https://github.com/jimmyzhuu) - **Baidu Qianfan / ERNIE first-class integration**: New `qianfan` provider supporting `ernie-5.0` (default recommendation), `ernie-x1.1`, `ernie-4.5-turbo-128k`, `ernie-4.5-turbo-32k`. Dedicated `qianfan_api_key` / `qianfan_api_base` settings keep OpenAI config clean; legacy `wenxin` / `wenxin-4` paths are fully preserved. #2790 Thanks [@jimmyzhuu](https://github.com/jimmyzhuu)
Documentation: [Baidu Qianfan / ERNIE](https://docs.cowagent.ai/en/models/qianfan) Documentation: [Baidu Qianfan / ERNIE](https://docs.cowagent.ai/en/models/qianfan)

View File

@@ -23,7 +23,7 @@ If the current provider fails, the tool automatically tries the next one until i
| Vendor | Vision Model | Notes | | Vendor | Vision Model | Notes |
| --- | --- | --- | | --- | --- | --- |
| OpenAI / Compatible | Main model | All OpenAI-compatible multimodal models | | OpenAI / Compatible | Main model | All OpenAI-compatible multimodal models |
| Baidu Qianfan | ernie-4.5-turbo-vl | Auto-discovered when `qianfan_api_key` is configured; can also be selected via `tool.vision.model` | | Baidu Qianfan | Main model | Multimodal main models (e.g. `ernie-5.0`) handle images directly; falls back to `ernie-4.5-turbo-vl` for text-only main models |
| Qwen (DashScope) | Main model | Via MultiModalConversation API | | Qwen (DashScope) | Main model | Via MultiModalConversation API |
| Claude | Main model | Anthropic native image format | | Claude | Main model | Anthropic native image format |
| Gemini | Main model | inlineData format | | Gemini | Main model | inlineData format |

View File

@@ -15,7 +15,7 @@ description: Baidu Qianfan ERNIE モデル設定
| パラメータ | 説明 | | パラメータ | 説明 |
| --- | --- | | --- | --- |
| `model` | デフォルトの推奨は `ernie-5.0`。`ernie-4.5-turbo-128k`、`ernie-4.5-turbo-32k`、`ernie-x1-turbo-32k` も利用できます | | `model` | デフォルトの推奨は `ernie-5.0`。`ernie-x1.1`、`ernie-4.5-turbo-128k`、`ernie-4.5-turbo-32k` も利用できます |
| `qianfan_api_key` | Qianfan API Key。通常は `bce-v3/` で始まります | | `qianfan_api_key` | Qianfan API Key。通常は `bce-v3/` で始まります |
| `qianfan_api_base` | 任意。デフォルトは `https://qianfan.baidubce.com/v2` | | `qianfan_api_base` | 任意。デフォルトは `https://qianfan.baidubce.com/v2` |
@@ -24,13 +24,18 @@ description: Baidu Qianfan ERNIE モデル設定
| モデル | 用途 | | モデル | 用途 |
| --- | --- | | --- | --- |
| `ernie-5.0` | デフォルト推奨。文心の最新フラッグシップモデルで、総合性能が最も強い | | `ernie-5.0` | デフォルト推奨。文心の最新フラッグシップモデルで、総合性能が最も強い |
| `ernie-x1.1` | 深層推論モデル。ハルシネーションが少なく、指示追従とツール呼び出しが強化 |
| `ernie-4.5-turbo-128k` | 長いコンテキストと一般的なチャット向け | | `ernie-4.5-turbo-128k` | 長いコンテキストと一般的なチャット向け |
| `ernie-4.5-turbo-32k` | コンテキスト長とコストのバランスが良い一般チャット向け | | `ernie-4.5-turbo-32k` | コンテキスト長とコストのバランスが良い一般チャット向け |
| `ernie-x1-turbo-32k` | より強い推論が必要なタスク向け |
## Vision ツール ## Vision ツール
`qianfan_api_key` を設定すると、Agent モードの Vision ツールは Qianfan を自動検出できます。推奨する Qianfan の視覚モデルは `ernie-4.5-turbo-vl` です: `qianfan_api_key` を設定すると、Agent モードの Vision ツールは Qianfan を自動検出します:
- 主モデルが多モーダル(`ernie-5.0`、`ernie-x1.1`、`ernie-4.5-turbo-vl` など)の場合は、追加設定なしで主モデルがそのまま画像を処理します。
- 主モデルがテキスト専用（`ernie-4.5-turbo-128k` など）の場合は、Vision ツールが自動的に `ernie-4.5-turbo-vl` にフォールバックします。
特定の Vision モデルを強制したい場合は、`config.json` で明示的に指定できます:
```json ```json
{ {

View File

@@ -30,7 +30,7 @@ description: CowAgent 2.0.8 - 飛書チャネル全面アップグレード(
- **DeepSeek V4 シリーズ**`deepseek-v4-pro` / `deepseek-v4-flash` を追加、デフォルトモデルを `deepseek-v4-flash` に切り替え - **DeepSeek V4 シリーズ**`deepseek-v4-pro` / `deepseek-v4-flash` を追加、デフォルトモデルを `deepseek-v4-flash` に切り替え
- **思考モデルスイッチの統一**DeepSeek V4、Qwen3 など思考対応モデルの切り替え動作を `enable_thinking` に統一 - **思考モデルスイッチの統一**DeepSeek V4、Qwen3 など思考対応モデルの切り替え動作を `enable_thinking` に統一
- **百度千帆 / ERNIE のファーストクラス対応**:新たな `qianfan` プロバイダーを追加。`ernie-5.0`(デフォルト推奨)、`ernie-4.5-turbo-128k`、`ernie-4.5-turbo-32k`、`ernie-x1-turbo-32k` をサポート。`qianfan_api_key` / `qianfan_api_base` の独立設定により OpenAI 設定を汚染せず、旧来の `wenxin` / `wenxin-4` パスも完全互換 #2790 Thanks [@jimmyzhuu](https://github.com/jimmyzhuu) - **百度千帆 / ERNIE のファーストクラス対応**:新たな `qianfan` プロバイダーを追加。`ernie-5.0`(デフォルト推奨)、`ernie-x1.1`、`ernie-4.5-turbo-128k`、`ernie-4.5-turbo-32k` をサポート。`qianfan_api_key` / `qianfan_api_base` の独立設定により OpenAI 設定を汚染せず、旧来の `wenxin` / `wenxin-4` パスも完全互換 #2790 Thanks [@jimmyzhuu](https://github.com/jimmyzhuu)
ドキュメント:[百度千帆 / ERNIE](https://docs.cowagent.ai/ja/models/qianfan) ドキュメント:[百度千帆 / ERNIE](https://docs.cowagent.ai/ja/models/qianfan)

View File

@@ -23,7 +23,7 @@ Vision ツールは多段階の自動選択+自動フォールバック戦略
| ベンダー | ビジョンモデル | 説明 | | ベンダー | ビジョンモデル | 説明 |
| --- | --- | --- | | --- | --- | --- |
| OpenAI / 互換プロトコル | メインモデル | すべての OpenAI 互換マルチモーダルモデルに対応 | | OpenAI / 互換プロトコル | メインモデル | すべての OpenAI 互換マルチモーダルモデルに対応 |
| Baidu Qianfan | ernie-4.5-turbo-vl | `qianfan_api_key` を設定すると自動検出され、`tool.vision.model` でも指定できます | | Baidu Qianfan | メインモデル | 多モーダルの主モデル(`ernie-5.0` など)は直接画像を処理。テキスト専用主モデルの場合は `ernie-4.5-turbo-vl` に自動フォールバック |
| 通義千問 (DashScope) | メインモデル | MultiModalConversation API 経由 | | 通義千問 (DashScope) | メインモデル | MultiModalConversation API 経由 |
| Claude | メインモデル | Anthropic ネイティブ画像形式 | | Claude | メインモデル | Anthropic ネイティブ画像形式 |
| Gemini | メインモデル | inlineData 形式 | | Gemini | メインモデル | inlineData 形式 |

View File

@@ -15,7 +15,7 @@ description: 百度千帆 ERNIE 模型配置
| 参数 | 说明 | | 参数 | 说明 |
| --- | --- | | --- | --- |
| `model` | 默认推荐使用 `ernie-5.0`;也可使用 `ernie-4.5-turbo-128k`、`ernie-4.5-turbo-32k`、`ernie-x1-turbo-32k` | | `model` | 默认推荐使用 `ernie-5.0`;也可使用 `ernie-x1.1`、`ernie-4.5-turbo-128k`、`ernie-4.5-turbo-32k` |
| `qianfan_api_key` | 千帆 API Key格式通常以 `bce-v3/` 开头 | | `qianfan_api_key` | 千帆 API Key格式通常以 `bce-v3/` 开头 |
| `qianfan_api_base` | 可选,默认为 `https://qianfan.baidubce.com/v2` | | `qianfan_api_base` | 可选,默认为 `https://qianfan.baidubce.com/v2` |
@@ -24,13 +24,18 @@ description: 百度千帆 ERNIE 模型配置
| 模型 | 适用场景 | | 模型 | 适用场景 |
| --- | --- | | --- | --- |
| `ernie-5.0` | 默认推荐,文心新一代旗舰模型,综合能力最强 | | `ernie-5.0` | 默认推荐,文心新一代旗舰模型,综合能力最强 |
| `ernie-x1.1` | 深度思考推理模型,幻觉更低、指令遵循与工具调用更强 |
| `ernie-4.5-turbo-128k` | 长上下文和通用对话 | | `ernie-4.5-turbo-128k` | 长上下文和通用对话 |
| `ernie-4.5-turbo-32k` | 通用对话,成本和上下文更均衡 | | `ernie-4.5-turbo-32k` | 通用对话,成本和上下文更均衡 |
| `ernie-x1-turbo-32k` | 需要更强推理能力的任务 |
## Vision 工具 ## Vision 工具
配置 `qianfan_api_key` 后Agent 的 Vision 工具可以自动使用千帆视觉模型。默认推荐使用 `ernie-4.5-turbo-vl` 配置 `qianfan_api_key` 后Agent 的 Vision 工具可以自动使用千帆视觉模型:
- 当主模型本身是多模态时(如 `ernie-5.0`、`ernie-x1.1`、`ernie-4.5-turbo-vl`),直接由主模型识别图像,无需额外配置
- 当主模型是纯文本时（如 `ernie-4.5-turbo-128k`），Vision 工具会自动 fallback 到 `ernie-4.5-turbo-vl`
如需手动指定 Vision 模型,可在 `config.json` 中显式配置:
```json ```json
{ {

View File

@@ -30,7 +30,7 @@ description: CowAgent 2.0.8 - 飞书渠道全面升级(语音、流式打字
- **DeepSeek V4 系列**:新增 `deepseek-v4-pro` / `deepseek-v4-flash`,并将默认模型切换为 `deepseek-v4-flash` - **DeepSeek V4 系列**:新增 `deepseek-v4-pro` / `deepseek-v4-flash`,并将默认模型切换为 `deepseek-v4-flash`
- **思考模型开关统一**DeepSeek V4、Qwen3 等思考模型的开关行为对齐到 `enable_thinking` - **思考模型开关统一**DeepSeek V4、Qwen3 等思考模型的开关行为对齐到 `enable_thinking`
- **百度千帆模型接入**:新增百度千帆厂商,支持 `ernie-5.0`、`ernie-4.5-turbo-128k` 等模型, 相关文档查看 [百度千帆](https://docs.cowagent.ai/models/qianfan)。#2790 Thanks @jimmyzhuu - **百度千帆模型接入**:新增百度千帆厂商,支持 `ernie-5.0`、`ernie-4.5-turbo-128k` 等模型,并支持图像识别工具,相关文档查看 [百度千帆](https://docs.cowagent.ai/models/qianfan)。#2790 Thanks @jimmyzhuu
- **新增有道翻译**`translate` 模块新增有道翻译支持 #2797 Thanks @Zmjjeff7 - **新增有道翻译**`translate` 模块新增有道翻译支持 #2797 Thanks @Zmjjeff7
## 🛠 OpenAI 客户端重构 ## 🛠 OpenAI 客户端重构

View File

@@ -19,12 +19,12 @@ Vision 工具采用多级自动选择 + 自动兜底策略,无需手动配置
| 厂商 | 视觉模型 | 说明 | | 厂商 | 视觉模型 | 说明 |
| --- | --- | --- | | --- | --- | --- |
| OpenAI / 兼容协议 | 使用主模型 | 支持所有 OpenAI 协议兼容的多模态模型 | | OpenAI / 兼容协议 | 使用主模型 | 支持所有 OpenAI 协议兼容的多模态模型 |
| 百度千帆 (Qianfan) | ernie-4.5-turbo-vl | 配置 `qianfan_api_key` 后自动发现,也可通过 `tool.vision.model` 指定 |
| 通义千问 (DashScope) | 使用主模型 | 例如 qwen3.6-plus 等 | | 通义千问 (DashScope) | 使用主模型 | 例如 qwen3.6-plus 等 |
| Claude | 使用主模型 | Anthropic 原生图像格式 | | Claude | 使用主模型 | Anthropic 原生图像格式 |
| Gemini | 使用主模型 | inlineData 格式 | | Gemini | 使用主模型 | inlineData 格式 |
| 豆包 (Doubao) | 使用主模型 | doubao-seed-2-0 系列原生支持 | | 豆包 (Doubao) | 使用主模型 | doubao-seed-2-0 系列原生支持 |
| Kimi (Moonshot) | 使用主模型 | kimi-k2.6、kimi-k2.5 原生支持 | | Kimi (Moonshot) | 使用主模型 | kimi-k2.6、kimi-k2.5 原生支持 |
| 百度千帆 (Qianfan) | 使用主模型 | 默认使用多模态主模型 (如 ernie-5.0),主模型不支持时兜底使用 ernie-4.5-turbo-vl |
| 智谱 AI | glm-5v-turbo | 固定使用视觉专用模型 | | 智谱 AI | glm-5v-turbo | 固定使用视觉专用模型 |
| MiniMax | MiniMax-Text-01 | 固定使用视觉专用模型 | | MiniMax | MiniMax-Text-01 | 固定使用视觉专用模型 |
@@ -42,7 +42,7 @@ Vision 工具采用多级自动选择 + 自动兜底策略,无需手动配置
{ {
"tool": { "tool": {
"vision": { "vision": {
"model": "ernie-4.5-turbo-vl" "model": "gpt-4.1"
} }
} }
} }

View File

@@ -17,9 +17,21 @@ DEFAULT_API_BASE = "https://qianfan.baidubce.com/v2"
DEFAULT_MODEL = const.ERNIE_5 DEFAULT_MODEL = const.ERNIE_5
DEFAULT_VISION_MODEL = const.ERNIE_45_TURBO_VL DEFAULT_VISION_MODEL = const.ERNIE_45_TURBO_VL
# Qianfan models that natively understand images. Other ERNIE variants
# are text-only and must not receive image payloads.
_VISION_CAPABLE_MODELS = {
const.ERNIE_5,
const.ERNIE_X1_1,
const.ERNIE_45_TURBO_VL,
const.ERNIE_45_TURBO_VL_32K,
}
class QianfanBot(Bot, OpenAICompatibleBot): class QianfanBot(Bot, OpenAICompatibleBot):
supports_vision = True @property
def supports_vision(self) -> bool:
"""Whether the configured main model is multimodal."""
return (conf().get("model") or "").lower() in _VISION_CAPABLE_MODELS
def __init__(self): def __init__(self):
super().__init__() super().__init__()

View File

@@ -18,7 +18,7 @@ class TestQianfanConstantsAndRouting(unittest.TestCase):
self.assertEqual(const.ERNIE_45_TURBO_128K, "ernie-4.5-turbo-128k") self.assertEqual(const.ERNIE_45_TURBO_128K, "ernie-4.5-turbo-128k")
self.assertEqual(const.ERNIE_45_TURBO_32K, "ernie-4.5-turbo-32k") self.assertEqual(const.ERNIE_45_TURBO_32K, "ernie-4.5-turbo-32k")
self.assertEqual(const.ERNIE_X1_TURBO_32K, "ernie-x1-turbo-32k") self.assertEqual(const.ERNIE_X1_1, "ernie-x1.1")
self.assertEqual( self.assertEqual(
const.ERNIE_45_TURBO_VL, const.ERNIE_45_TURBO_VL,
"ernie-4.5-turbo-vl", "ernie-4.5-turbo-vl",
@@ -30,7 +30,7 @@ class TestQianfanConstantsAndRouting(unittest.TestCase):
self.assertIn(const.QIANFAN, const.MODEL_LIST) self.assertIn(const.QIANFAN, const.MODEL_LIST)
self.assertIn(const.ERNIE_45_TURBO_128K, const.MODEL_LIST) self.assertIn(const.ERNIE_45_TURBO_128K, const.MODEL_LIST)
self.assertIn(const.ERNIE_45_TURBO_32K, const.MODEL_LIST) self.assertIn(const.ERNIE_45_TURBO_32K, const.MODEL_LIST)
self.assertIn(const.ERNIE_X1_TURBO_32K, const.MODEL_LIST) self.assertIn(const.ERNIE_X1_1, const.MODEL_LIST)
self.assertIn(const.ERNIE_45_TURBO_VL, const.MODEL_LIST) self.assertIn(const.ERNIE_45_TURBO_VL, const.MODEL_LIST)
self.assertIn(const.ERNIE_45_TURBO_VL_32K, const.MODEL_LIST) self.assertIn(const.ERNIE_45_TURBO_VL_32K, const.MODEL_LIST)
@@ -223,15 +223,31 @@ class TestQianfanBot(unittest.TestCase):
self.assertEqual(result["content"], "请求失败bad gateway text") self.assertEqual(result["content"], "请求失败bad gateway text")
post.assert_called_once() post.assert_called_once()
def test_qianfan_bot_supports_vision(self): def test_qianfan_bot_supports_vision_for_multimodal_models(self):
fake_conf = self._fake_conf() for model in ("ernie-5.0", "ernie-x1.1", "ernie-4.5-turbo-vl", "ernie-4.5-turbo-vl-32k"):
with patch("models.qianfan.qianfan_bot.conf", return_value=fake_conf): fake_conf = self._fake_conf({"model": model})
with patch("models.qianfan.qianfan_bot.SessionManager"): with patch("models.qianfan.qianfan_bot.conf", return_value=fake_conf):
from models.qianfan.qianfan_bot import QianfanBot with patch("models.qianfan.qianfan_bot.SessionManager"):
from models.qianfan.qianfan_bot import QianfanBot
bot = QianfanBot() bot = QianfanBot()
self.assertTrue(
bot.supports_vision,
msg=f"{model} should be marked as multimodal",
)
self.assertTrue(bot.supports_vision) def test_qianfan_bot_does_not_advertise_vision_for_text_only_models(self):
for model in ("ernie-4.5-turbo-128k", "ernie-4.5-turbo-32k"):
fake_conf = self._fake_conf({"model": model})
with patch("models.qianfan.qianfan_bot.conf", return_value=fake_conf):
with patch("models.qianfan.qianfan_bot.SessionManager"):
from models.qianfan.qianfan_bot import QianfanBot
bot = QianfanBot()
self.assertFalse(
bot.supports_vision,
msg=f"{model} should not be marked as multimodal",
)
def test_call_vision_posts_openai_compatible_multimodal_payload(self): def test_call_vision_posts_openai_compatible_multimodal_payload(self):
fake_conf = self._fake_conf() fake_conf = self._fake_conf()
@@ -435,6 +451,105 @@ class TestQianfanVisionTool(unittest.TestCase):
self.assertEqual(providers[0].name, "MainModel") self.assertEqual(providers[0].name, "MainModel")
self.assertEqual(providers[0].model_override, "ernie-4.5-turbo-vl-32k") self.assertEqual(providers[0].model_override, "ernie-4.5-turbo-vl-32k")
def test_vision_main_model_uses_ernie_5_directly(self):
    """A multimodal main model (ernie-5.0) is served by the MainModel provider.

    The main bot advertises ``supports_vision=True``, so the first resolved
    provider must be "MainModel" with the configured model name as override.
    """
    patched_conf = self._fake_conf({"model": "ernie-5.0"})
    from common import const

    # Main model wrapper that resolves to the Qianfan bot type and whose
    # bot declares native vision support.
    main_model = MagicMock()
    main_model._resolve_bot_type.return_value = const.QIANFAN
    main_model.bot = MagicMock(supports_vision=True)
    main_model.bot.call_vision = MagicMock()

    with patch("agent.tools.vision.vision.conf", return_value=patched_conf):
        from agent.tools.vision.vision import Vision

        vision_tool = Vision()
        vision_tool.model = main_model
        chain = vision_tool._resolve_providers()

    first = chain[0]
    self.assertEqual(first.name, "MainModel")
    self.assertEqual(first.model_override, "ernie-5.0")
def test_vision_falls_back_to_qianfan_vl_when_main_model_is_text_only_ernie(self):
    """A text-only ERNIE main model must not be used for images.

    With ``ernie-4.5-turbo-128k`` as the main model and a Qianfan API key
    configured, the resolved chain must contain no "MainModel" entry and
    must start with the "Qianfan" provider pinned to the dedicated vision
    model.
    """
    patched_conf = self._fake_conf({
        "model": "ernie-4.5-turbo-128k",
        "qianfan_api_key": "test-qianfan-key",
    })
    from common import const

    # The main bot reports supports_vision=False because the configured
    # model is text-only.
    text_only_bot = MagicMock(supports_vision=False)
    text_only_bot.call_vision = MagicMock()

    main_model = MagicMock()
    main_model._resolve_bot_type.return_value = const.QIANFAN
    main_model.bot = text_only_bot

    # Bot handed out by the factory for the discovered Qianfan provider.
    factory_bot = MagicMock()
    factory_bot.call_vision = MagicMock()

    with patch("agent.tools.vision.vision.conf", return_value=patched_conf), \
            patch("models.bot_factory.create_bot", return_value=factory_bot):
        from agent.tools.vision.vision import Vision

        vision_tool = Vision()
        vision_tool.model = main_model
        chain = vision_tool._resolve_providers()

    # MainModel must be absent; the Qianfan fallback must come first and
    # use the dedicated vision model.
    provider_names = [entry.name for entry in chain]
    self.assertNotIn("MainModel", provider_names)
    self.assertEqual(provider_names[0], "Qianfan")
    self.assertEqual(chain[0].model_override, const.ERNIE_45_TURBO_VL)
def test_vision_prefers_same_vendor_fallback_over_other_configured_keys(self):
    """The same-vendor fallback is ranked ahead of unrelated providers.

    The main model is text-only ERNIE and keys for Qianfan, Doubao, Claude
    and MiniMax are all configured. The Qianfan provider must be first in
    the chain, and the other providers must still be present.
    """
    patched_conf = self._fake_conf({
        "model": "ernie-4.5-turbo-128k",
        "qianfan_api_key": "test-qianfan-key",
        "ark_api_key": "test-ark-key",
        "claude_api_key": "test-claude-key",
        "minimax_api_key": "test-minimax-key",
    })
    from common import const

    # Text-only main bot on the Qianfan bot type.
    text_only_bot = MagicMock(supports_vision=False)
    text_only_bot.call_vision = MagicMock()

    main_model = MagicMock()
    main_model._resolve_bot_type.return_value = const.QIANFAN
    main_model.bot = text_only_bot

    # Every discovered provider receives this bot from the patched factory.
    factory_bot = MagicMock()
    factory_bot.call_vision = MagicMock()

    with patch("agent.tools.vision.vision.conf", return_value=patched_conf), \
            patch("models.bot_factory.create_bot", return_value=factory_bot):
        from agent.tools.vision.vision import Vision

        vision_tool = Vision()
        vision_tool.model = main_model
        chain = vision_tool._resolve_providers()

    provider_names = [entry.name for entry in chain]
    self.assertEqual(provider_names[0], "Qianfan")
    self.assertEqual(chain[0].model_override, const.ERNIE_45_TURBO_VL)
    # Other configured providers should still appear in the chain.
    for vendor in ("Doubao", "Claude", "MiniMax"):
        self.assertIn(vendor, provider_names)
class TestQianfanDocs(unittest.TestCase): class TestQianfanDocs(unittest.TestCase):
def _read(self, relative_path): def _read(self, relative_path):