Merge pull request #2800 from jimmyzhuu/feat/qianfan-vision-provider

Add Qianfan support to Vision tool
This commit is contained in:
zhayujie
2026-05-06 15:39:12 +08:00
committed by GitHub
11 changed files with 313 additions and 6 deletions

View File

@@ -604,11 +604,12 @@ API Key 创建:在 [控制台](https://aistudio.google.com/app/apikey?hl=zh-cn
```json
{
"model": "ernie-5.0",
"qianfan_api_key": ""
"qianfan_api_key": "",
"qianfan_api_base": "https://qianfan.baidubce.com/v2"
}
```
- `model`: 默认推荐填写 `ernie-5.0`,也可填写 `ernie-4.5-turbo-128k`、`ernie-4.5-turbo-32k`、`ernie-x1-turbo-32k`
- `model`: 默认推荐填写 `ernie-5.0`,也可填写 `ernie-4.5-turbo-128k`、`ernie-4.5-turbo-32k`、`ernie-x1-turbo-32k`;Vision 工具可使用 `ernie-4.5-turbo-vl`
- `qianfan_api_key`: 百度千帆 API Key,通常以 `bce-v3/` 开头,可在百度智能云控制台创建
- `qianfan_api_base`: 可选,默认为 `https://qianfan.baidubce.com/v2`

View File

@@ -53,6 +53,7 @@ _DISCOVERABLE_MODELS = [
("ark_api_key", const.DOUBAO, const.DOUBAO_SEED_2_PRO, "Doubao"),
("dashscope_api_key", const.QWEN_DASHSCOPE, const.QWEN36_PLUS, "DashScope"),
("claude_api_key", const.CLAUDEAPI, const.CLAUDE_4_6_SONNET, "Claude"),
("qianfan_api_key", const.QIANFAN, const.ERNIE_45_TURBO_VL, "Qianfan"),
("gemini_api_key", const.GEMINI, const.GEMINI_31_FLASH_LITE_PRE, "Gemini"),
("zhipu_ai_api_key", const.ZHIPU_AI, const.GLM_4_7, "ZhipuAI"),
("minimax_api_key", const.MiniMax, const.MINIMAX_M2_7, "MiniMax"),
@@ -67,6 +68,7 @@ _MODEL_PREFIX_TO_PROVIDER = [
("moonshot-", "Moonshot"),
("qwen", "DashScope"), # qwen-*, qwen3-*, qwen3.6-*, etc.
("claude-", "Claude"),
("ernie-", "Qianfan"),
("gemini-", "Gemini"),
("glm-", "ZhipuAI"),
("minimax-", "MiniMax"),
@@ -140,7 +142,7 @@ class Vision(BaseTool):
"Error: No model available for Vision.\n"
"The main model does not support vision and no other API keys are configured.\n"
"Options:\n"
" 1. Switch to a multimodal model (e.g. qwen3.6-plus, claude-sonnet-4-6, gemini-2.0-flash)\n"
" 1. Switch to a multimodal model (e.g. ernie-4.5-turbo-vl, qwen3.6-plus, claude-sonnet-4-6, gemini-2.0-flash)\n"
" 2. Configure OPENAI_API_KEY: env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n"
" 3. Configure LINKAI_API_KEY: env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")"
)

View File

@@ -92,6 +92,8 @@ ERNIE_45_TURBO_128K = "ernie-4.5-turbo-128k"
ERNIE_45_TURBO_32K = "ernie-4.5-turbo-32k"
ERNIE_X1_TURBO_32K = "ernie-x1-turbo-32k"
ERNIE_4_TURBO_8K = "ERNIE-4.0-Turbo-8K"
ERNIE_45_TURBO_VL = "ernie-4.5-turbo-vl"
ERNIE_45_TURBO_VL_32K = "ernie-4.5-turbo-vl-32k"
# Qwen (通义千问 - 阿里云 DashScope)
QWEN_TURBO = "qwen-turbo"
@@ -169,6 +171,7 @@ MODEL_LIST = [
# Baidu Qianfan / ERNIE
QIANFAN, ERNIE_5, ERNIE_45_TURBO_128K, ERNIE_45_TURBO_32K, ERNIE_X1_TURBO_32K, ERNIE_4_TURBO_8K,
ERNIE_45_TURBO_VL, ERNIE_45_TURBO_VL_32K,
# MiniMax
MiniMax, MINIMAX_M2_7, MINIMAX_M2_7_HIGHSPEED, MINIMAX_M2_5, MINIMAX_M2_1, MINIMAX_M2_1_LIGHTNING, MINIMAX_M2, MINIMAX_ABAB6_5,

View File

@@ -28,6 +28,20 @@ Option 1: Native integration (recommended):
| `ernie-4.5-turbo-32k` | General chat with a balanced context window and cost |
| `ernie-x1-turbo-32k` | Tasks that need stronger reasoning |
## Vision tool
After `qianfan_api_key` is configured, Agent mode can auto-discover Qianfan for the Vision tool. The recommended Qianfan vision model is `ernie-4.5-turbo-vl`:
```json
{
"tool": {
"vision": {
"model": "ernie-4.5-turbo-vl"
}
}
}
```
Option 2: OpenAI-compatible configuration:
```json

View File

@@ -23,6 +23,7 @@ If the current provider fails, the tool automatically tries the next one until i
| Vendor | Vision Model | Notes |
| --- | --- | --- |
| OpenAI / Compatible | Main model | All OpenAI-compatible multimodal models |
| Baidu Qianfan | ernie-4.5-turbo-vl | Auto-discovered when `qianfan_api_key` is configured; can also be selected via `tool.vision.model` |
| Qwen (DashScope) | Main model | Via MultiModalConversation API |
| Claude | Main model | Anthropic native image format |
| Gemini | Main model | inlineData format |
@@ -52,7 +53,7 @@ To specify a particular model for the vision tool, add to `config.json`:
{
"tool": {
"vision": {
"model": "gpt-4o"
"model": "ernie-4.5-turbo-vl"
}
}
}

View File

@@ -28,6 +28,20 @@ description: Baidu Qianfan ERNIE モデル設定
| `ernie-4.5-turbo-32k` | コンテキスト長とコストのバランスが良い一般チャット向け |
| `ernie-x1-turbo-32k` | より強い推論が必要なタスク向け |
## Vision ツール
`qianfan_api_key` を設定すると、Agent モードの Vision ツールは Qianfan を自動検出できます。推奨する Qianfan の視覚モデルは `ernie-4.5-turbo-vl` です:
```json
{
"tool": {
"vision": {
"model": "ernie-4.5-turbo-vl"
}
}
}
```
方法 2: OpenAI 互換接続:
```json

View File

@@ -23,6 +23,7 @@ Vision ツールは多段階の自動選択+自動フォールバック戦略
| ベンダー | ビジョンモデル | 説明 |
| --- | --- | --- |
| OpenAI / 互換プロトコル | メインモデル | すべての OpenAI 互換マルチモーダルモデルに対応 |
| Baidu Qianfan | ernie-4.5-turbo-vl | `qianfan_api_key` を設定すると自動検出され、`tool.vision.model` でも指定できます |
| 通義千問 (DashScope) | メインモデル | MultiModalConversation API 経由 |
| Claude | メインモデル | Anthropic ネイティブ画像形式 |
| Gemini | メインモデル | inlineData 形式 |
@@ -52,7 +53,7 @@ Vision ツールで使用するモデルを指定するには、`config.json`
{
"tool": {
"vision": {
"model": "gpt-4o"
"model": "ernie-4.5-turbo-vl"
}
}
}

View File

@@ -28,6 +28,20 @@ description: 百度千帆 ERNIE 模型配置
| `ernie-4.5-turbo-32k` | 通用对话,成本和上下文更均衡 |
| `ernie-x1-turbo-32k` | 需要更强推理能力的任务 |
## Vision 工具
配置 `qianfan_api_key` 后,Agent 的 Vision 工具可以自动使用千帆视觉模型。默认推荐使用 `ernie-4.5-turbo-vl`:
```json
{
"tool": {
"vision": {
"model": "ernie-4.5-turbo-vl"
}
}
}
```
方式二:OpenAI 兼容方式接入:
```json

View File

@@ -19,6 +19,7 @@ Vision 工具采用多级自动选择 + 自动兜底策略,无需手动配置
| 厂商 | 视觉模型 | 说明 |
| --- | --- | --- |
| OpenAI / 兼容协议 | 使用主模型 | 支持所有 OpenAI 协议兼容的多模态模型 |
| 百度千帆 (Qianfan) | ernie-4.5-turbo-vl | 配置 `qianfan_api_key` 后自动发现,也可通过 `tool.vision.model` 指定 |
| 通义千问 (DashScope) | 使用主模型 | 例如 qwen3.6-plus 等 |
| Claude | 使用主模型 | Anthropic 原生图像格式 |
| Gemini | 使用主模型 | inlineData 格式 |
@@ -41,7 +42,7 @@ Vision 工具采用多级自动选择 + 自动兜底策略,无需手动配置
{
"tool": {
"vision": {
"model": "gpt-4o"
"model": "ernie-4.5-turbo-vl"
}
}
}

View File

@@ -15,9 +15,12 @@ from .qianfan_session import QianfanSession
# Root URL of the Qianfan v2 API; presumably the fallback when
# `qianfan_api_base` is not configured — confirm against __init__.
DEFAULT_API_BASE = "https://qianfan.baidubce.com/v2"
# Default chat model constant; presumably used by _resolve_model — confirm.
DEFAULT_MODEL = const.ERNIE_5
# Model used by call_vision when the caller does not pass `model`.
DEFAULT_VISION_MODEL = const.ERNIE_45_TURBO_VL
class QianfanBot(Bot, OpenAICompatibleBot):
supports_vision = True
def __init__(self):
super().__init__()
model = self._resolve_model()
@@ -136,6 +139,54 @@ class QianfanBot(Bot, OpenAICompatibleBot):
return self.reply_text(session, args, retry_count + 1)
return {"completion_tokens": 0, "content": "我现在有点累了,等会再来吧"}
def call_vision(self, image_url: str, question: str,
                model: str = None, max_tokens: int = 1000) -> dict:
    """Ask a Qianfan vision model a question about a single image.

    Posts one OpenAI-style multimodal user message (a text part followed by
    an ``image_url`` part) to ``{api_base}/chat/completions``.

    :param image_url: image reference placed verbatim in the ``image_url`` part
    :param question: text prompt sent together with the image
    :param model: model name; DEFAULT_VISION_MODEL is used when falsy
    :param max_tokens: forwarded unchanged as the request's ``max_tokens``
    :return: on success a dict with ``content``, ``model`` and ``usage``
        (``prompt_tokens`` / ``completion_tokens`` / ``total_tokens``);
        on a non-200 response or any exception a dict
        ``{"error": True, "message": ...}``. Exceptions are logged, not raised.
    """
    vision_model = model or DEFAULT_VISION_MODEL
    payload = {
        "model": vision_model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
        "max_tokens": max_tokens,
    }
    try:
        response = requests.post(
            "{}/chat/completions".format(self.api_base),
            headers=self._build_headers(),
            json=payload,
            timeout=conf().get("request_timeout", 180),
        )
        if response.status_code != 200:
            # Reuse the chat error formatter; no session is involved here.
            err = self._error_result(response, None)
            return {
                "error": True,
                "message": err.get("content", "Qianfan vision request failed"),
            }
        data = response.json()
        # Use `or` fallbacks rather than .get() defaults: a key that is
        # present with an explicit JSON null would bypass the default and
        # either raise on None or leak None to the caller.
        choices = data.get("choices") or []
        message = (choices[0].get("message") or {}) if choices else {}
        content = message.get("content") or ""
        usage = data.get("usage") or {}
        return {
            "content": content,
            "model": data.get("model", vision_model),
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
    except Exception as e:
        # Boundary for the Vision tool: report failures as data so the
        # caller can decide what to do with the error.
        logger.exception(e)
        return {"error": True, "message": str(e)}
def _error_result(self, response, session, args=None, retry_count=0):
try:
body = response.json()

View File

@@ -19,10 +19,20 @@ class TestQianfanConstantsAndRouting(unittest.TestCase):
self.assertEqual(const.ERNIE_45_TURBO_128K, "ernie-4.5-turbo-128k")
self.assertEqual(const.ERNIE_45_TURBO_32K, "ernie-4.5-turbo-32k")
self.assertEqual(const.ERNIE_X1_TURBO_32K, "ernie-x1-turbo-32k")
self.assertEqual(
const.ERNIE_45_TURBO_VL,
"ernie-4.5-turbo-vl",
)
self.assertEqual(
const.ERNIE_45_TURBO_VL_32K,
"ernie-4.5-turbo-vl-32k",
)
self.assertIn(const.QIANFAN, const.MODEL_LIST)
self.assertIn(const.ERNIE_45_TURBO_128K, const.MODEL_LIST)
self.assertIn(const.ERNIE_45_TURBO_32K, const.MODEL_LIST)
self.assertIn(const.ERNIE_X1_TURBO_32K, const.MODEL_LIST)
self.assertIn(const.ERNIE_45_TURBO_VL, const.MODEL_LIST)
self.assertIn(const.ERNIE_45_TURBO_VL_32K, const.MODEL_LIST)
def test_qianfan_config_keys_are_available(self):
import config
@@ -213,6 +223,113 @@ class TestQianfanBot(unittest.TestCase):
self.assertEqual(result["content"], "请求失败bad gateway text")
post.assert_called_once()
def test_qianfan_bot_supports_vision(self):
    """A constructed QianfanBot reports ``supports_vision`` as truthy."""
    conf_stub = self._fake_conf()
    with patch("models.qianfan.qianfan_bot.conf", return_value=conf_stub), \
            patch("models.qianfan.qianfan_bot.SessionManager"):
        from models.qianfan.qianfan_bot import QianfanBot

        qianfan = QianfanBot()
        self.assertTrue(qianfan.supports_vision)
def test_call_vision_posts_openai_compatible_multimodal_payload(self):
    """call_vision posts one text+image user message and maps the reply fields."""
    image = "data:image/png;base64,AAAA"
    prompt = "这张图里有什么?"
    answer = "图中有一个红色方块。"

    api_reply = MagicMock()
    api_reply.status_code = 200
    api_reply.json.return_value = {
        "id": "chatcmpl-test",
        "model": "ernie-4.5-turbo-vl",
        "choices": [{"message": {"content": answer}}],
        "usage": {
            "prompt_tokens": 10,
            "completion_tokens": 8,
            "total_tokens": 18,
        },
    }
    conf_stub = self._fake_conf()

    with patch("models.qianfan.qianfan_bot.conf", return_value=conf_stub), \
            patch("models.qianfan.qianfan_bot.SessionManager"):
        from models.qianfan.qianfan_bot import QianfanBot

        qianfan = QianfanBot()
        with patch("models.qianfan.qianfan_bot.requests.post", return_value=api_reply) as post:
            outcome = qianfan.call_vision(image_url=image, question=prompt)

    # Response mapping.
    self.assertEqual(outcome["content"], answer)
    self.assertEqual(outcome["model"], "ernie-4.5-turbo-vl")
    self.assertEqual(outcome["usage"]["total_tokens"], 18)

    # Outgoing request.
    post.assert_called_once()
    sent = post.call_args
    endpoint = sent.args[0]
    request_kwargs = sent.kwargs
    self.assertEqual(endpoint, "https://qianfan.baidubce.com/v2/chat/completions")
    self.assertEqual(request_kwargs["headers"]["Authorization"], "Bearer test-qianfan-key")
    self.assertEqual(request_kwargs["json"]["model"], "ernie-4.5-turbo-vl")
    self.assertEqual(request_kwargs["json"]["max_tokens"], 1000)
    expected_messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": image}},
            ],
        }
    ]
    self.assertEqual(request_kwargs["json"]["messages"], expected_messages)
def test_call_vision_allows_explicit_model_override(self):
    """Explicit ``model`` and ``max_tokens`` arguments reach the request body."""
    chosen_model = "ernie-4.5-turbo-vl-32k"

    api_reply = MagicMock()
    api_reply.status_code = 200
    api_reply.json.return_value = {
        "model": chosen_model,
        "choices": [{"message": {"content": "有文字。"}}],
        "usage": {},
    }
    conf_stub = self._fake_conf()

    with patch("models.qianfan.qianfan_bot.conf", return_value=conf_stub), \
            patch("models.qianfan.qianfan_bot.SessionManager"):
        from models.qianfan.qianfan_bot import QianfanBot

        qianfan = QianfanBot()
        with patch("models.qianfan.qianfan_bot.requests.post", return_value=api_reply) as post:
            outcome = qianfan.call_vision(
                image_url="data:image/jpeg;base64,BBBB",
                question="识别文字",
                model=chosen_model,
                max_tokens=256,
            )

    sent_body = post.call_args.kwargs["json"]
    self.assertEqual(outcome["model"], chosen_model)
    self.assertEqual(sent_body["model"], chosen_model)
    self.assertEqual(sent_body["max_tokens"], 256)
def test_call_vision_returns_error_dict_for_api_error(self):
    """A non-200 response is reported as ``{"error": True, "message": ...}``."""
    api_reply = MagicMock()
    api_reply.status_code = 400
    api_reply.text = '{"error":{"message":"bad image"}}'
    api_reply.json.return_value = {"error": {"message": "bad image"}}
    conf_stub = self._fake_conf()

    with patch("models.qianfan.qianfan_bot.conf", return_value=conf_stub), \
            patch("models.qianfan.qianfan_bot.SessionManager"):
        from models.qianfan.qianfan_bot import QianfanBot

        qianfan = QianfanBot()
        with patch("models.qianfan.qianfan_bot.requests.post", return_value=api_reply):
            outcome = qianfan.call_vision(
                image_url="data:image/png;base64,AAAA",
                question="这张图里有什么?",
            )

    self.assertTrue(outcome["error"])
    self.assertEqual(outcome["message"], "请求失败bad image")
class TestQianfanSurfaces(unittest.TestCase):
def _read(self, relative_path):
@@ -243,6 +360,82 @@ class TestQianfanSurfaces(unittest.TestCase):
self.assertIn("const.QIANFAN", godcmd_source)
class TestQianfanVisionTool(unittest.TestCase):
    """Provider resolution of the Vision tool when Qianfan is involved."""

    def _fake_conf(self, values=None):
        """Return a mock config whose ``get`` reads defaults overlaid with *values*."""
        settings = {
            "model": "deepseek-v4-flash",
            "qianfan_api_key": "",
            "qianfan_api_base": "https://qianfan.baidubce.com/v2",
            "open_ai_api_key": "",
            "linkai_api_key": "",
            "use_linkai": False,
            "tool": {},
        }
        settings.update(values or {})

        def lookup(key, default=None):
            return settings.get(key, default)

        conf_stub = MagicMock()
        conf_stub.get.side_effect = lookup
        return conf_stub

    def test_vision_auto_discovers_qianfan_when_key_configured(self):
        """With only a Qianfan key set, Qianfan is the first resolved provider."""
        conf_stub = self._fake_conf({"qianfan_api_key": "test-qianfan-key"})
        bot_stub = MagicMock()
        bot_stub.call_vision = MagicMock()

        with patch("agent.tools.vision.vision.conf", return_value=conf_stub), \
                patch("models.bot_factory.create_bot", return_value=bot_stub) as create_bot:
            from agent.tools.vision.vision import Vision
            from common import const

            vision = Vision()
            vision.model = None
            providers = vision._resolve_providers()

            top = providers[0]
            self.assertEqual(top.name, "Qianfan")
            self.assertEqual(top.model_override, const.ERNIE_45_TURBO_VL)
            self.assertTrue(top.use_bot)
            create_bot.assert_called_with(const.QIANFAN)

    def test_vision_routes_ernie_model_override_to_qianfan(self):
        """An ``ernie-*`` value in ``tool.vision.model`` is routed to Qianfan."""
        conf_stub = self._fake_conf({
            "qianfan_api_key": "test-qianfan-key",
            "tool": {"vision": {"model": "ernie-4.5-turbo-vl-32k"}},
        })
        bot_stub = MagicMock()
        bot_stub.call_vision = MagicMock()

        with patch("agent.tools.vision.vision.conf", return_value=conf_stub), \
                patch("models.bot_factory.create_bot", return_value=bot_stub):
            from agent.tools.vision.vision import Vision

            vision = Vision()
            vision.model = None
            providers = vision._resolve_providers()

            top = providers[0]
            self.assertEqual(top.name, "Qianfan")
            self.assertEqual(top.model_override, "ernie-4.5-turbo-vl-32k")

    def test_vision_main_model_uses_qianfan_when_configured_model_is_ernie(self):
        """A vision-capable Qianfan main model is resolved as ``MainModel``."""
        conf_stub = self._fake_conf({"model": "ernie-4.5-turbo-vl-32k"})
        from common import const

        main_model = MagicMock()
        main_model._resolve_bot_type.return_value = const.QIANFAN
        main_model.bot = MagicMock()
        main_model.bot.supports_vision = True
        main_model.bot.call_vision = MagicMock()

        with patch("agent.tools.vision.vision.conf", return_value=conf_stub):
            from agent.tools.vision.vision import Vision

            vision = Vision()
            vision.model = main_model
            providers = vision._resolve_providers()

            top = providers[0]
            self.assertEqual(top.name, "MainModel")
            self.assertEqual(top.model_override, "ernie-4.5-turbo-vl-32k")
class TestQianfanDocs(unittest.TestCase):
def _read(self, relative_path):
root = os.path.join(os.path.dirname(__file__), "..")
@@ -259,6 +452,7 @@ class TestQianfanDocs(unittest.TestCase):
self.assertIn("qianfan_api_key", text)
self.assertIn("https://qianfan.baidubce.com/v2", text)
self.assertIn("ernie-4.5-turbo-128k", text)
self.assertIn("ernie-4.5-turbo-vl", text)
def test_model_indexes_link_qianfan(self):
for path in (
@@ -276,6 +470,17 @@ class TestQianfanDocs(unittest.TestCase):
self.assertIn('"qianfan_api_key": ""', text)
self.assertIn('"qianfan_api_base": "https://qianfan.baidubce.com/v2"', text)
def test_vision_docs_document_qianfan_provider(self):
    """Each localized Vision doc names the Qianfan provider and its vision model."""
    doc_labels = (
        ("docs/tools/vision.mdx", "百度千帆"),
        ("docs/en/tools/vision.mdx", "Baidu Qianfan"),
        ("docs/ja/tools/vision.mdx", "Baidu Qianfan"),
    )
    for doc_path, provider_label in doc_labels:
        contents = self._read(doc_path)
        self.assertIn(provider_label, contents)
        self.assertIn("ernie-4.5-turbo-vl", contents)
# Run the unittest runner when this module is executed as a script.
if __name__ == "__main__":
    unittest.main()