diff --git a/README.md b/README.md
index 2a72f513..8b9e044e 100644
--- a/README.md
+++ b/README.md
@@ -104,6 +104,7 @@ CowAgent supports all mainstream LLM providers. **Chat, vision, image generation
| [Kimi](https://docs.cowagent.ai/en/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | |
| [MiniMax](https://docs.cowagent.ai/en/models/minimax) | MiniMax-M2.7 | ✅ | ✅ | ✅ | | ✅ | |
| [ERNIE](https://docs.cowagent.ai/en/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | |
+| [MiMo](https://docs.cowagent.ai/en/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | |
| [LinkAI](https://docs.cowagent.ai/en/models/linkai) | One key for 100+ models | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Custom](https://docs.cowagent.ai/en/models/custom) | Local models / third-party proxy | ✅ | | | | | |
diff --git a/agent/tools/vision/vision.py b/agent/tools/vision/vision.py
index 56a2ecfe..498f3cd8 100644
--- a/agent/tools/vision/vision.py
+++ b/agent/tools/vision/vision.py
@@ -57,6 +57,7 @@ _DISCOVERABLE_MODELS = [
("qianfan_api_key", const.QIANFAN, const.ERNIE_45_TURBO_VL, "Qianfan"),
("zhipu_ai_api_key", const.ZHIPU_AI, const.GLM_4_7, "ZhipuAI"),
("minimax_api_key", const.MiniMax, const.MINIMAX_M2_7, "MiniMax"),
+ ("mimo_api_key", const.MIMO, const.MIMO_V2_5_PRO, "MiMo"),
]
# Model name prefix → discoverable provider display_name.
@@ -73,6 +74,7 @@ _MODEL_PREFIX_TO_PROVIDER = [
("glm-", "ZhipuAI"),
("minimax-", "MiniMax"),
("abab", "MiniMax"),
+ ("mimo-", "MiMo"),
]
# Model prefixes that natively belong to OpenAI / LinkAI (raw HTTP providers).
@@ -92,6 +94,7 @@ _PROVIDER_ID_TO_DISPLAY = {
"qianfan": "Qianfan",
"zhipu": "ZhipuAI",
"minimax": "MiniMax",
+ "mimo": "MiMo",
}
diff --git a/bridge/bridge.py b/bridge/bridge.py
index c0cb62e4..6eeb0887 100644
--- a/bridge/bridge.py
+++ b/bridge/bridge.py
@@ -63,6 +63,10 @@ class Bridge(object):
if model_type and model_type.startswith("deepseek"):
self.btype["chat"] = const.DEEPSEEK
+ # 小米 MiMo 系列模型,全部以 mimo- 开头
+ if model_type and model_type.startswith("mimo-"):
+ self.btype["chat"] = const.MIMO
+
if model_type and isinstance(model_type, str):
lowered_model_type = model_type.lower()
if lowered_model_type == const.QIANFAN or lowered_model_type.startswith("ernie"):
diff --git a/channel/web/web_channel.py b/channel/web/web_channel.py
index ab1d6915..af4e241e 100644
--- a/channel/web/web_channel.py
+++ b/channel/web/web_channel.py
@@ -1387,6 +1387,7 @@ class ConfigHandler:
const.DOUBAO_SEED_2_PRO, const.DOUBAO_SEED_2_CODE,
const.KIMI_K2_6, const.KIMI_K2_5, const.KIMI_K2,
const.ERNIE_5_1, const.ERNIE_5, const.ERNIE_X1_1, const.ERNIE_45_TURBO_128K, const.ERNIE_45_TURBO_32K,
+ const.MIMO_V2_5_PRO, const.MIMO_V2_5,
]
# Generic placeholder hints surfaced in the web console. We deliberately
@@ -1481,6 +1482,14 @@ class ConfigHandler:
"api_base_placeholder": _PLACEHOLDER_QIANFAN,
"models": [const.ERNIE_5_1, const.ERNIE_5, const.ERNIE_X1_1, const.ERNIE_45_TURBO_128K, const.ERNIE_45_TURBO_32K],
}),
+ ("mimo", {
+ "label": {"zh": "小米 MiMo", "en": "MiMo"},
+ "api_key_field": "mimo_api_key",
+ "api_base_key": "mimo_api_base",
+ "api_base_default": "https://api.xiaomimimo.com/v1",
+ "api_base_placeholder": _PLACEHOLDER_V1,
+ "models": [const.MIMO_V2_5_PRO, const.MIMO_V2_5],
+ }),
("linkai", {
"label": "LinkAI",
"api_key_field": "linkai_api_key",
@@ -1502,10 +1511,10 @@ class ConfigHandler:
EDITABLE_KEYS = {
"model", "bot_type", "use_linkai",
"open_ai_api_base", "deepseek_api_base", "qianfan_api_base", "claude_api_base", "gemini_api_base",
- "zhipu_ai_api_base", "moonshot_base_url", "ark_base_url", "custom_api_base",
+ "zhipu_ai_api_base", "moonshot_base_url", "ark_base_url", "custom_api_base", "mimo_api_base",
"open_ai_api_key", "deepseek_api_key", "qianfan_api_key", "claude_api_key", "gemini_api_key",
"zhipu_ai_api_key", "dashscope_api_key", "moonshot_api_key",
- "ark_api_key", "minimax_api_key", "linkai_api_key", "custom_api_key",
+ "ark_api_key", "minimax_api_key", "linkai_api_key", "custom_api_key", "mimo_api_key",
"agent_max_context_tokens", "agent_max_context_turns", "agent_max_steps",
"enable_thinking", "web_password",
}
@@ -1646,7 +1655,7 @@ class ModelsHandler:
# Capability -> provider ids drawn from ConfigHandler.PROVIDER_MODELS.
_ASR_PROVIDERS = ["openai", "dashscope", "zhipu", "linkai"]
# Web-console white-list. Other vendors stay usable via direct config.
- _TTS_PROVIDERS = ["openai", "minimax", "dashscope", "linkai"]
+ _TTS_PROVIDERS = ["openai", "minimax", "dashscope", "mimo", "linkai"]
# TTS engine catalog (speech models, not voice timbres). Entries are
# either a bare code or {value, hint?} when a friendly label helps.
@@ -1661,6 +1670,10 @@ class ModelsHandler:
"dashscope": [
{"value": "qwen3-tts-flash", "hint": "覆盖普通话、方言与主流外语"},
],
+ # 小米 MiMo TTS 系列,通过 chat completions 接口合成
+ "mimo": [
+ {"value": "mimo-v2.5-tts", "hint": "预置音色 · 支持唱歌模式"},
+ ],
# Aggregating gateway: a single endpoint multiplexes several
# underlying TTS engines, selected via the `model` field.
# Each engine exposes its own voice catalog (see _TTS_PROVIDER_VOICES).
@@ -1780,6 +1793,18 @@ class ModelsHandler:
{"value": "Marcus", "hint": "陕西话 · 秦川"},
{"value": "Roy", "hint": "闽南语 · 阿杰"},
],
+ # 小米 MiMo 预置音色列表(mimo-v2.5-tts),文档:
+ # https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5
+ "mimo": [
+ {"value": "冰糖", "hint": "中文 · 女声 · 冰糖"},
+ {"value": "茉莉", "hint": "中文 · 女声 · 茉莉"},
+ {"value": "苏打", "hint": "中文 · 男声 · 苏打"},
+ {"value": "白桦", "hint": "中文 · 男声 · 白桦"},
+ {"value": "Mia", "hint": "英文 · 女声 · Mia"},
+ {"value": "Chloe", "hint": "英文 · 女声 · Chloe"},
+ {"value": "Milo", "hint": "英文 · 男声 · Milo"},
+ {"value": "Dean", "hint": "英文 · 男声 · Dean"},
+ ],
# Aggregating gateway: voices are scoped per engine model. The
# frontend picks the correct list based on the selected model so
# users don't see incompatible timbres for the active engine.
@@ -1916,6 +1941,8 @@ class ModelsHandler:
# (see models/minimax/minimax_bot.py::call_vision); the M2.x chat
# family is text-only.
"minimax": [const.MINIMAX_TEXT_01],
+ # MiMo 原生全模态模型:v2.5-pro / v2.5 支持图像/音频/视频输入
+ "mimo": [const.MIMO_V2_5_PRO, const.MIMO_V2_5],
# LinkAI proxies the underlying vendor; surface a curated set of
# multimodal models. Order: gpt-4.1-mini → gpt-5.4-mini as the
# cross-vendor baselines, then each vendor's recommended default.
@@ -2045,6 +2072,7 @@ class ModelsHandler:
("qianfan", "qianfan_api_key", const.ERNIE_45_TURBO_VL),
("zhipu", "zhipu_ai_api_key", const.GLM_5V_TURBO),
("minimax", "minimax_api_key", const.MINIMAX_TEXT_01),
+ ("mimo", "mimo_api_key", const.MIMO_V2_5_PRO),
]
@classmethod
diff --git a/common/const.py b/common/const.py
index 9cfcd63c..7addd6af 100644
--- a/common/const.py
+++ b/common/const.py
@@ -15,6 +15,7 @@ ZHIPU_AI = "zhipu"
MOONSHOT = "moonshot"
MiniMax = "minimax"
DEEPSEEK = "deepseek"
+MIMO = "mimo" # 小米 MiMo 大模型
CUSTOM = "custom" # custom OpenAI-compatible API, bot_type won't auto-switch on model change
MODELSCOPE = "modelscope"
@@ -140,6 +141,13 @@ KIMI_K2 = "kimi-k2"
KIMI_K2_5 = "kimi-k2.5"
KIMI_K2_6 = "kimi-k2.6" # Kimi K2.6 - Agent recommended model (default)
+# 小米 MiMo
+MIMO_V2_5_PRO = "mimo-v2.5-pro" # MiMo V2.5 Pro - 旗舰,长上下文(默认推荐)
+MIMO_V2_5 = "mimo-v2.5" # MiMo V2.5 - 多模态(文/图/音/视频)
+MIMO_V2_PRO = "mimo-v2-pro" # MiMo V2 Pro
+MIMO_V2_OMNI = "mimo-v2-omni" # MiMo V2 Omni - 多模态
+MIMO_V2_FLASH = "mimo-v2-flash" # MiMo V2 Flash - 极速版
+
# Doubao (Volcengine Ark)
DOUBAO = "doubao"
DOUBAO_SEED_2_CODE = "doubao-seed-2-0-code-preview-260215"
@@ -182,6 +190,9 @@ MODEL_LIST = [
# MiniMax
MiniMax, MINIMAX_M2_7, MINIMAX_M2_7_HIGHSPEED, MINIMAX_M2_5, MINIMAX_M2_1, MINIMAX_M2_1_LIGHTNING, MINIMAX_M2, MINIMAX_ABAB6_5,
+ # 小米 MiMo
+ MIMO, MIMO_V2_5_PRO, MIMO_V2_5, MIMO_V2_PRO, MIMO_V2_OMNI, MIMO_V2_FLASH,
+
# Claude
CLAUDE3, CLAUDE_4_6_SONNET, CLAUDE_4_7_OPUS, CLAUDE_4_6_OPUS, CLAUDE_4_OPUS, CLAUDE_4_5_SONNET, CLAUDE_4_SONNET, CLAUDE_3_OPUS, CLAUDE_3_OPUS_0229,
CLAUDE_35_SONNET, CLAUDE_35_SONNET_1022, CLAUDE_35_SONNET_0620, CLAUDE_3_SONNET, CLAUDE_3_HAIKU,
diff --git a/config.py b/config.py
index 6a3a00df..1d44dcc5 100644
--- a/config.py
+++ b/config.py
@@ -209,6 +209,9 @@ available_setting = {
"Minimax_base_url": "",
"deepseek_api_key": "",
"deepseek_api_base": "https://api.deepseek.com/v1",
+ # 小米 MiMo 大模型
+ "mimo_api_key": "",
+ "mimo_api_base": "https://api.xiaomimimo.com/v1",
"web_host": "", # Web console bind address; empty means auto
"web_port": 9899,
"web_password": "", # Web console password; empty means no authentication required
@@ -401,6 +404,8 @@ def load_config():
"minimax_api_base": "MINIMAX_API_BASE",
"deepseek_api_key": "DEEPSEEK_API_KEY",
"deepseek_api_base": "DEEPSEEK_API_BASE",
+ "mimo_api_key": "MIMO_API_KEY",
+ "mimo_api_base": "MIMO_API_BASE",
"qianfan_api_key": "QIANFAN_API_KEY",
"qianfan_api_base": "QIANFAN_API_BASE",
"zhipu_ai_api_key": "ZHIPU_AI_API_KEY",
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 00000000..f406cc2a
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,30 @@
+# Documentation
+
+This directory contains the Mintlify documentation site for the project.
+
+## Prerequisites
+
+- Node.js v20.17.0 or higher (LTS recommended)
+
+## Install the CLI (one-time, global)
+
+```bash
+npm i -g mint
+```
+
+## Run the docs locally
+
+From this `docs/` directory:
+
+```bash
+mint dev
+```
+
+Then open http://localhost:3000 (or the port Mint reports if 3000 is in use).
+
+> The first run downloads the Mint preview framework (~90 MB) into `~/.mintlify/`.
+> Subsequent runs start instantly from the local cache.
+
+## More
+
+- Mintlify docs: https://www.mintlify.com/docs
diff --git a/docs/docs.json b/docs/docs.json
index 00a5be67..e2826887 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -88,6 +88,7 @@
"models/doubao",
"models/kimi",
"models/qianfan",
+ "models/mimo",
"models/linkai",
"models/coding-plan",
"models/custom"
@@ -290,6 +291,7 @@
"en/models/doubao",
"en/models/kimi",
"en/models/qianfan",
+ "en/models/mimo",
"en/models/linkai",
"en/models/coding-plan",
"en/models/custom"
@@ -492,6 +494,7 @@
"ja/models/doubao",
"ja/models/kimi",
"ja/models/qianfan",
+ "ja/models/mimo",
"ja/models/linkai",
"ja/models/coding-plan",
"ja/models/custom"
diff --git a/docs/en/models/index.mdx b/docs/en/models/index.mdx
index 9c7afe44..cbe74d41 100644
--- a/docs/en/models/index.mdx
+++ b/docs/en/models/index.mdx
@@ -21,6 +21,7 @@ A snapshot of each vendor's capabilities. "Text" refers to the main chat model;
| [Doubao](/en/models/doubao) | doubao-seed-2.0 series | ✅ | ✅ | ✅ | | | ✅ |
| [Kimi](/en/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | |
| [ERNIE](/en/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | |
+| [MiMo](/en/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | |
| [LinkAI](/en/models/linkai) | 100+ models from multiple vendors | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Custom](/en/models/custom) | Local models / third-party proxies | ✅ | | | | | |
diff --git a/docs/en/models/mimo.mdx b/docs/en/models/mimo.mdx
new file mode 100644
index 00000000..6f808b8e
--- /dev/null
+++ b/docs/en/models/mimo.mdx
@@ -0,0 +1,136 @@
+---
+title: MiMo
+description: Xiaomi MiMo model configuration (Text Chat + Image Understanding + Text-to-Speech)
+---
+
+Xiaomi MiMo is a native omni-modal large model. A single `mimo_api_key` enables text chat, image understanding, and text-to-speech all at once.
+
+
+ All capabilities below can be configured in one place via the "Model Management" page in the Web Console — no need to manually edit the configuration file.
+
+
+## Text Chat
+
+```json
+{
+ "model": "mimo-v2.5-pro",
+ "mimo_api_key": "YOUR_API_KEY",
+ "mimo_api_base": "https://api.xiaomimimo.com/v1"
+}
+```
+
+| Parameter | Description |
+| --- | --- |
+| `model` | Default recommendation: `mimo-v2.5-pro`; `mimo-v2.5` is also supported |
+| `mimo_api_key` | Create one in the [MiMo Open Platform](https://platform.xiaomimimo.com/console/api-keys) |
+| `mimo_api_base` | Optional, defaults to `https://api.xiaomimimo.com/v1` |
+
+### Model Selection
+
+| Model | Use Case |
+| --- | --- |
+| `mimo-v2.5-pro` | Flagship: native omni-modal + Agent capability, up to 1M tokens context |
+| `mimo-v2.5` | General-purpose, native omni-modal (text / image / video / audio) |
+
+## Thinking Mode
+
+The MiMo V2.5 series enables "thinking mode" by default: the model emits `reasoning_content` (chain-of-thought) before the final answer, improving performance on complex tasks.
+
+Use the global `enable_thinking` flag to toggle visibility (also switchable from the Web Console settings):
+
+```json
+{
+ "enable_thinking": true
+}
+```
+
+## Image Understanding
+
+Once `mimo_api_key` is configured, the Agent's Vision tool can automatically use MiMo's vision models:
+
+- When the main model itself is multimodal (`mimo-v2.5-pro` / `mimo-v2.5`), images are handled directly by the main model with no extra setup.
+- When the main model belongs to another vendor, the Vision tool automatically falls back to `mimo-v2.5-pro`, following its provider fallback order.
+
+To force a specific Vision model, set it explicitly in the configuration:
+
+```json
+{
+ "tools": {
+ "vision": {
+ "provider": "mimo",
+ "model": "mimo-v2.5-pro"
+ }
+ }
+}
+```
+
+## Text-to-Speech (TTS)
+
+```json
+{
+ "text_to_voice": "mimo",
+ "text_to_voice_model": "mimo-v2.5-tts",
+ "tts_voice_id": "冰糖"
+}
+```
+
+| Parameter | Description |
+| --- | --- |
+| `text_to_voice_model` | Currently only `mimo-v2.5-tts` (preset voices + singing mode) |
+| `tts_voice_id` | Preset voice name (Chinese voice IDs use the Chinese name directly) |
+
+### Preset Voices
+
+| Voice ID | Description |
+| --- | --- |
+| `Mia` | English · Female |
+| `Chloe` | English · Female |
+| `Milo` | English · Male |
+| `Dean` | English · Male |
+| `冰糖` | Chinese · Female (default) |
+| `茉莉` | Chinese · Female |
+| `苏打` | Chinese · Male |
+| `白桦` | Chinese · Male |
+
+
+You can also pick a voice visually from the Web Console under "Model Management → Text-to-Speech".
+
+### Style Control
+
+MiMo TTS supports embedding **audio tags** in the synthesis text to control emotion, tone, dialect, persona, and even singing. Tags must appear in the **text that will be synthesized to speech (i.e. the Agent's reply)**, with the overall style tag placed at the very beginning:
+
+```
+(style)content-to-synthesize
+```
+
+Half-width `()`, full-width `()`, and `[]` brackets are all accepted. Both Chinese and English style descriptors work — pick whichever language expresses the timbre most precisely. Common examples:
+
+| Category | Example tags |
+| --- | --- |
+| Basic emotions | `happy` `sad` `angry` `fear` `surprised` `excited` `aggrieved` `calm` `indifferent` |
+| Compound emotions | `wistful` `relieved` `helpless` `guilty` `at ease` `uneasy` `touched` |
+| Overall tone | `gentle` `aloof` `lively` `serious` `languid` `playful` `deep` `sharp` `cutting` |
+| Voice character | `magnetic` `mellow` `bright` `ethereal` `childlike` `aged` `sweet` `husky` |
+| Persona | `squeaky` `mature lady` `young boy` `uncle` `Taiwanese accent` |
+| Dialect | `Northeastern` `Sichuan` `Henan` `Cantonese` |
+| Role-play | `Sun Wukong` `Lin Daiyu` |
+| Singing | `sing` / `singing` |
+
+Examples:
+
+- `(magnetic)The night is deep, and the city is still breathing.`
+- `(gentle)Take a breath. You've got this.`
+- `(serious)This is the final warning before the system reboots.`
+- `(singing)Oh, when the saints go marching in…`
+
+You can also insert fine-grained audio tags at any position in the text to control breathing, laughter, pauses, etc. For example:
+
+```
+(nervous, deep breath) Phew… stay calm, stay calm. (faster pace) I've rehearsed this intro fifty times, it'll be fine.
+```
+
+See the [MiMo speech synthesis documentation](https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5) for the full tag list.
+
+
+ When CowAgent calls TTS, the Agent's reply text (including any `(...)` tags) is forwarded directly to MiMo for synthesis. Tell the model in its persona / system prompt to "prefix replies with a `(style)` tag to control the tone", and IM channels (WeChat / Feishu / DingTalk / WeCom) will play voice replies with the corresponding emotion, dialect, or even singing.
+
diff --git a/docs/ja/README.md b/docs/ja/README.md
index b68a82d0..df71ec74 100644
--- a/docs/ja/README.md
+++ b/docs/ja/README.md
@@ -104,6 +104,7 @@ CowAgent は主要な LLM プロバイダーすべてに対応しています。
| [Kimi](https://docs.cowagent.ai/ja/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | |
| [MiniMax](https://docs.cowagent.ai/ja/models/minimax) | MiniMax-M2.7 | ✅ | ✅ | ✅ | | ✅ | |
| [ERNIE](https://docs.cowagent.ai/ja/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | |
+| [MiMo](https://docs.cowagent.ai/ja/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | |
| [LinkAI](https://docs.cowagent.ai/ja/models/linkai) | 1 つの Key で 100+ モデルに接続 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [カスタム](https://docs.cowagent.ai/ja/models/custom) | ローカルモデル / サードパーティプロキシ | ✅ | | | | | |
diff --git a/docs/ja/models/mimo.mdx b/docs/ja/models/mimo.mdx
new file mode 100644
index 00000000..c677810f
--- /dev/null
+++ b/docs/ja/models/mimo.mdx
@@ -0,0 +1,135 @@
+---
+title: Xiaomi MiMo
+description: Xiaomi MiMo モデル設定(テキスト対話 + 画像理解 + 音声合成)
+---
+
+Xiaomi MiMo はネイティブ全モーダル大規模言語モデルです。1 つの `mimo_api_key` でテキスト対話、画像理解、音声合成を同時に有効化できます。
+
+
+ Web コンソールの「モデル管理」ページから、以下のすべての機能をワンストップで設定でき、設定ファイルを手動で編集する必要はありません。
+
+
+## テキスト対話
+
+```json
+{
+ "model": "mimo-v2.5-pro",
+ "mimo_api_key": "YOUR_API_KEY",
+ "mimo_api_base": "https://api.xiaomimimo.com/v1"
+}
+```
+
+| パラメータ | 説明 |
+| --- | --- |
+| `model` | 推奨は `mimo-v2.5-pro`。`mimo-v2.5` も使用可能 |
+| `mimo_api_key` | [MiMo Open Platform](https://platform.xiaomimimo.com/console/api-keys) で作成 |
+| `mimo_api_base` | 任意。デフォルトは `https://api.xiaomimimo.com/v1` |
+
+### モデル選択
+
+| モデル | ユースケース |
+| --- | --- |
+| `mimo-v2.5-pro` | フラッグシップ。ネイティブ全モーダル + Agent 能力、最大 100 万トークンのコンテキスト |
+| `mimo-v2.5` | 汎用版。ネイティブ全モーダル(テキスト / 画像 / 動画 / 音声) |
+
+## 思考モード
+
+MiMo V2.5 シリーズはデフォルトで「思考モード」が有効です。最終回答の前に `reasoning_content`(思考過程)を出力することで、複雑なタスクのパフォーマンスを高めます。
+
+表示の有無はグローバル設定 `enable_thinking` で切り替え可能です(Web コンソールの設定ページからも変更できます):
+
+```json
+{
+ "enable_thinking": true
+}
+```
+
+## 画像理解
+
+`mimo_api_key` を設定すると、Agent の Vision ツールは自動的に MiMo のビジョンモデルを利用します:
+
+- メインモデル自体がマルチモーダル(`mimo-v2.5-pro` / `mimo-v2.5`)の場合は、画像はメインモデルが直接処理し、追加設定は不要です。
+- メインモデルが他社製の場合、Vision ツールは順序に従い `mimo-v2.5-pro` にフォールバックします。
+
+特定の Vision モデルを強制したい場合は、設定ファイルで明示的に指定してください:
+
+```json
+{
+ "tools": {
+ "vision": {
+ "provider": "mimo",
+ "model": "mimo-v2.5-pro"
+ }
+ }
+}
+```
+
+## 音声合成
+
+```json
+{
+ "text_to_voice": "mimo",
+ "text_to_voice_model": "mimo-v2.5-tts",
+ "tts_voice_id": "冰糖"
+}
+```
+
+| パラメータ | 説明 |
+| --- | --- |
+| `text_to_voice_model` | 現在は `mimo-v2.5-tts` のみ対応(プリセット音色 + 歌唱モード) |
+| `tts_voice_id` | プリセット音色名(中国語の音色は中国語名がそのまま ID) |
+
+### プリセット音色
+
+| 音色 ID | 説明 |
+| --- | --- |
+| `冰糖` | 中国語 · 女声(デフォルト) |
+| `茉莉` | 中国語 · 女声 |
+| `苏打` | 中国語 · 男声 |
+| `白桦` | 中国語 · 男声 |
+| `Mia` | 英語 · 女声 |
+| `Chloe` | 英語 · 女声 |
+| `Milo` | 英語 · 男声 |
+| `Dean` | 英語 · 男声 |
+
+Web コンソールの「モデル管理 → 音声合成」のドロップダウンから視覚的に選択することもできます。
+
+### スタイル制御
+
+MiMo TTS は合成テキスト内に **音声タグ** を埋め込むことで、感情、語調、方言、キャラクター、さらには歌唱まで制御できます。タグは **最終的に音声合成されるテキスト(つまり Agent の返信内容)** に含める必要があり、全体スタイルのタグは先頭に置きます:
+
+```
+(スタイル)合成するテキスト
+```
+
+半角 `()`、全角 `()`、`[]` の 3 種類の括弧に対応。スタイル記述は中国語・英語のどちらでも OK で、最も的確に表現できる言語を選んでください。代表的なスタイル例:
+
+| 種類 | サンプルタグ |
+| --- | --- |
+| 基本感情 | `happy` `sad` `angry` `fear` `surprised` `excited` `aggrieved` `calm` `indifferent` |
+| 複合感情 | `wistful` `relieved` `helpless` `guilty` `at ease` `uneasy` `touched` |
+| 全体トーン | `gentle` `aloof` `lively` `serious` `languid` `playful` `deep` `sharp` `cutting` |
+| 声質 | `magnetic` `mellow` `bright` `ethereal` `childlike` `aged` `sweet` `husky` |
+| キャラクター調 | `squeaky` `mature lady` `young boy` `uncle` `Taiwanese accent` |
+| 方言 | `Northeastern` `Sichuan` `Henan` `Cantonese` |
+| ロールプレイ | `Sun Wukong` `Lin Daiyu` |
+| 歌唱 | `sing` / `singing` |
+
+例:
+
+- `(magnetic)夜が深まり、街はまだ呼吸している。`
+- `(gentle)深呼吸して。きっと大丈夫。`
+- `(serious)これがシステム再起動前の最後の警告です。`
+- `(singing)Twinkle, twinkle, little star, how I wonder what you are…`
+
+テキストの任意の位置に細かい音声タグを挿入して、呼吸、笑い声、間などを制御することもできます。例:
+
+```
+(nervous, deep breath) ふぅ……落ち着いて、落ち着いて。(faster pace) 自己紹介は五十回練習したから大丈夫。
+```
+
+タグの完全な一覧は [MiMo 音声合成ドキュメント](https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5) を参照してください。
+
+
+ CowAgent は TTS 呼び出し時、Agent の返信原文(`(...)` タグを含む)をそのまま MiMo に送信します。ペルソナ / システムプロンプトで「返信の冒頭に `(スタイル)` タグを付けて口調を指定する」よう指示すれば、IM チャネル(WeChat / Feishu / DingTalk / WeCom)の音声返信に感情・方言・歌唱などの効果を付与できます。
+
diff --git a/docs/models/index.mdx b/docs/models/index.mdx
index 5a7df20a..114c58e0 100644
--- a/docs/models/index.mdx
+++ b/docs/models/index.mdx
@@ -22,6 +22,7 @@ CowAgent 支持国内外主流厂商的大语言模型,模型接口实现在
| [豆包 Doubao](/models/doubao) | doubao-seed-2.0 系列 | ✅ | ✅ | ✅ | | | ✅ |
| [Kimi](/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | |
| [百度千帆](/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | |
+| [小米 MiMo](/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | |
| [LinkAI](/models/linkai) | 多厂商 100+ 模型统一接入 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [自定义](/models/custom) |本地模型 / 三方代理 | ✅ | | | | | |
diff --git a/docs/models/mimo.mdx b/docs/models/mimo.mdx
new file mode 100644
index 00000000..ea445df9
--- /dev/null
+++ b/docs/models/mimo.mdx
@@ -0,0 +1,135 @@
+---
+title: 小米 MiMo
+description: 小米 MiMo 模型配置(文本对话 + 图像理解 + 语音合成)
+---
+
+小米 MiMo 是原生全模态大模型,单 `mimo_api_key` 即可同时启用文本对话、图像理解与语音合成。
+
+
+ 通过 Web 控制台的「模型管理」页面可一站式配置以下全部能力,无需手动改配置文件。
+
+
+## 文本对话
+
+```json
+{
+ "model": "mimo-v2.5-pro",
+ "mimo_api_key": "YOUR_API_KEY",
+ "mimo_api_base": "https://api.xiaomimimo.com/v1"
+}
+```
+
+| 参数 | 说明 |
+| --- | --- |
+| `model` | 默认推荐 `mimo-v2.5-pro`,也可使用 `mimo-v2.5` |
+| `mimo_api_key` | 在 [MiMo 开放平台](https://platform.xiaomimimo.com/console/api-keys) 创建 |
+| `mimo_api_base` | 可选,默认为 `https://api.xiaomimimo.com/v1` |
+
+### 模型选择
+
+| 模型 | 适用场景 |
+| --- | --- |
+| `mimo-v2.5-pro` | 旗舰,原生全模态 + Agent 能力,最高 100 万 tokens 上下文 |
+| `mimo-v2.5` | 综合版,原生全模态(文本 / 图像 / 视频 / 音频) |
+
+## 思考模式
+
+MiMo V2.5 系列默认开启「思考模式」:模型在输出最终回答前会先输出 `reasoning_content`(思维链),提升复杂任务表现。
+
+通过全局配置 `enable_thinking` 控制是否展示(也可在 Web 控制台 - 配置页面切换):
+
+```json
+{
+ "enable_thinking": true
+}
+```
+
+## 图像理解
+
+配置 `mimo_api_key` 后,Agent 的 Vision 工具可以自动使用 MiMo 视觉模型:
+
+- 当主模型本身是多模态时(`mimo-v2.5-pro` / `mimo-v2.5`),直接由主模型识别图像,无需额外配置
+- 当主模型是其他厂商时,Vision 工具会根据顺序自动 fallback 到 `mimo-v2.5-pro`
+
+如需手动指定 Vision 模型,可在配置文件中显式配置:
+
+```json
+{
+ "tools": {
+ "vision": {
+ "provider": "mimo",
+ "model": "mimo-v2.5-pro"
+ }
+ }
+}
+```
+
+## 语音合成
+
+```json
+{
+ "text_to_voice": "mimo",
+ "text_to_voice_model": "mimo-v2.5-tts",
+ "tts_voice_id": "冰糖"
+}
+```
+
+| 参数 | 说明 |
+| --- | --- |
+| `text_to_voice_model` | 当前仅支持 `mimo-v2.5-tts`(预置音色 + 唱歌模式) |
+| `tts_voice_id` | 预置音色名(中文音色直接使用中文名作为 ID) |
+
+### 预置音色
+
+| 音色 ID | 说明 |
+| --- | --- |
+| `冰糖` | 中文 · 女声(默认) |
+| `茉莉` | 中文 · 女声 |
+| `苏打` | 中文 · 男声 |
+| `白桦` | 中文 · 男声 |
+| `Mia` | 英文 · 女声 |
+| `Chloe` | 英文 · 女声 |
+| `Milo` | 英文 · 男声 |
+| `Dean` | 英文 · 男声 |
+
+也可在 Web 控制台的「模型管理 → 语音合成」下拉框中可视化选择。
+
+### 风格控制
+
+MiMo TTS 支持在合成文本中嵌入 **音频标签** 来控制情绪、语调、方言、角色甚至唱歌。标签需出现在 **最终被合成为语音的文本(即 Agent 回复内容)** 中,整体风格标签写在开头:
+
+```
+(风格)待合成内容
+```
+
+支持半角 `()`、全角 `()` 或 `[]` 三种括号。常见风格示例:
+
+| 类型 | 示例标签 |
+| --- | --- |
+| 基础情绪 | `开心` `悲伤` `愤怒` `恐惧` `惊讶` `兴奋` `委屈` `平静` `冷漠` |
+| 复合情绪 | `怅然` `欣慰` `无奈` `愧疚` `释然` `忐忑` `动情` |
+| 整体语调 | `温柔` `高冷` `活泼` `严肃` `慵懒` `俏皮` `深沉` `干练` `凌厉` |
+| 音色定位 | `磁性` `醇厚` `清亮` `空灵` `稚嫩` `苍老` `甜美` `沙哑` |
+| 人设腔调 | `夹子音` `御姐音` `正太音` `大叔音` `台湾腔` |
+| 方言 | `东北话` `四川话` `河南话` `粤语` |
+| 角色扮演 | `孙悟空` `林黛玉` |
+| 唱歌 | `唱歌`(等价于 `sing` / `singing`) |
+
+示例:
+
+- (磁性)夜已经深了,城市还在呼吸。
+- (东北话)哎呀妈呀,这天儿也忒冷了吧!
+- (粤语)呢个真係好正啊!
+- (唱歌)原谅我这一生不羁放纵爱自由…
+
+也可以在文本任意位置插入细粒度音频标签来控制呼吸、笑声、停顿等,例如:
+
+```
+(紧张,深呼吸)呼……冷静,冷静。(语速加快)自我介绍我背了五十遍了,应该没问题。
+```
+
+完整标签列表参见 [MiMo 语音合成文档](https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5)。
+
+
+ CowAgent 在调用 TTS 时会将 Agent 的回复原文(含 `(...)` 标签)直接送入 MiMo 合成。你可以在人设 / 系统提示词里要求模型「在回复开头用 `(风格)` 标签控制语气」,即可让 IM 渠道(微信 / 飞书 / 钉钉 / 企微)的语音回复带上情绪、方言、唱歌等效果。
+
diff --git a/docs/zh/README.md b/docs/zh/README.md
index db54626e..095e9194 100644
--- a/docs/zh/README.md
+++ b/docs/zh/README.md
@@ -104,6 +104,7 @@ CowAgent 支持国内外主流厂商的大语言模型。**文本对话、图像
| [豆包 Doubao](https://docs.cowagent.ai/models/doubao) | doubao-seed-2.0 系列 | ✅ | ✅ | ✅ | | | ✅ |
| [Kimi](https://docs.cowagent.ai/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | |
| [百度ERNIE](https://docs.cowagent.ai/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | |
+| [小米 MiMo](https://docs.cowagent.ai/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | |
| [LinkAI](https://docs.cowagent.ai/models/linkai) | 一个 Key 接入 100+ 模型 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [自定义](https://docs.cowagent.ai/models/custom) | 本地模型 / 三方代理 | ✅ | | | | | |
diff --git a/models/bot_factory.py b/models/bot_factory.py
index 824aed04..5d07a236 100644
--- a/models/bot_factory.py
+++ b/models/bot_factory.py
@@ -25,6 +25,10 @@ def create_bot(bot_type):
from models.qianfan.qianfan_bot import QianfanBot
return QianfanBot()
+ elif bot_type == const.MIMO:
+ from models.mimo.mimo_bot import MimoBot
+ return MimoBot()
+
elif bot_type in (const.OPENAI, const.CHATGPT, const.CUSTOM): # OpenAI-compatible API
from models.chatgpt.chat_gpt_bot import ChatGPTBot
return ChatGPTBot()
diff --git a/models/mimo/__init__.py b/models/mimo/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/models/mimo/mimo_bot.py b/models/mimo/mimo_bot.py
new file mode 100644
index 00000000..a815e9f0
--- /dev/null
+++ b/models/mimo/mimo_bot.py
@@ -0,0 +1,668 @@
+# encoding:utf-8
+
+"""
+小米 MiMo Bot —— OpenAI 兼容协议,使用独立 API key / base 配置。
+
+支持模型:
+- mimo-v2.5-pro (旗舰,长上下文,默认开启思考)
+- mimo-v2.5 (多模态:文/图/音/视频,默认开启思考)
+- mimo-v2-pro (V2 Pro,默认开启思考)
+- mimo-v2-omni (V2 多模态,默认开启思考)
+- mimo-v2-flash (V2 极速版,默认关闭思考)
+
+思考模式说明:
+- 开关参数:``{"thinking": {"type": "enabled" | "disabled"}}``
+- mimo-v2.5-pro / mimo-v2.5 在思考模式下 ``temperature`` 会被强制为 1.0,
+ 本地直接剥离 ``temperature`` / ``top_p`` 等参数避免歧义。
+- 多轮工具调用过程中,若历史包含 tool_calls,所有后续 assistant 消息必须回传
+ ``reasoning_content``,否则 API 返回 400 错误。
+- 文档:https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/passing-back-reasoning_content
+"""
+
+import json
+import time
+from typing import Optional
+
+import requests
+
+from bridge.context import ContextType
+from bridge.reply import Reply, ReplyType
+from common import const
+from common.log import logger
+from config import conf, load_config
+from models.bot import Bot
+from models.openai_compatible_bot import OpenAICompatibleBot
+from models.session_manager import SessionManager
+from .mimo_session import MimoSession
+
+DEFAULT_API_BASE = "https://api.xiaomimimo.com/v1"
+DEFAULT_MODEL = const.MIMO_V2_5_PRO
+
+# Models that accept multimodal input (image / audio / video)
+MULTIMODAL_MODELS = {const.MIMO_V2_5_PRO, const.MIMO_V2_5, const.MIMO_V2_OMNI}
+
+
+class MimoBot(Bot, OpenAICompatibleBot):
+    def __init__(self):
+        """Create the session store and the default request args from config."""
+        super().__init__()
+        # Resolve the model name once so sessions and requests use the same value.
+        conf_model = conf().get("model") or DEFAULT_MODEL
+        self.sessions = SessionManager(MimoSession, model=conf_model)
+        # Default request arguments; reply() copies them for each call.
+        self.args = {
+            "model": conf_model,
+            "temperature": conf().get("temperature", 1.0),
+            "top_p": conf().get("top_p", 0.95),
+        }
+
+ # ---------- config helpers ----------
+
+ @property
+ def api_key(self):
+ return conf().get("mimo_api_key")  # read from conf() on every access; nothing is cached here
+
+    @property
+    def api_base(self):
+        """Configured MiMo base URL (or DEFAULT_API_BASE) with trailing slashes removed."""
+        return (conf().get("mimo_api_base") or DEFAULT_API_BASE).rstrip("/")
+
+    def get_api_config(self):
+        """OpenAICompatibleBot hook: key, base URL, model and sampling defaults for call_with_tools()."""
+        return {
+            "api_key": self.api_key,
+            "api_base": self.api_base,
+            "model": conf().get("model") or DEFAULT_MODEL,  # "or" also covers an empty/None value, as in __init__
+            "default_temperature": conf().get("temperature", 1.0),
+            "default_top_p": conf().get("top_p", 0.95),
+        }
+
+    @property
+    def supports_vision(self) -> bool:
+        """Let the vision tool use the main bot when the configured model is multimodal."""
+        configured = conf().get("model") or ""
+        return configured.lower() in MULTIMODAL_MODELS
+
+    @staticmethod
+    def _model_supports_thinking(model_name: str) -> bool:
+        """Every MiMo-series model (name prefixed "mimo-") accepts the thinking switch."""
+        # An empty or missing name never counts as a MiMo model.
+        # The prefix check is case-insensitive.
+        return bool(model_name) and model_name.lower().startswith("mimo-")
+
+    @staticmethod
+    def _thinking_default_enabled(model_name: str) -> bool:
+        """Default thinking state per model: off for mimo-v2-flash, on for the rest."""
+        # An empty or missing name is reported as "off".
+        # The comparison is case-insensitive.
+        return bool(model_name) and model_name.lower() != const.MIMO_V2_FLASH
+
+    def _build_headers(self) -> dict:
+        """Return JSON content-type plus bearer-token Authorization headers."""
+        headers = {"Content-Type": "application/json"}
+        headers["Authorization"] = f"Bearer {self.api_key}"
+        return headers
+
+ # ---------- simple chat (non-agent mode) ----------
+
+    def reply(self, query, context=None):
+        """Answer one plain-chat turn: run a maintenance command or query the model."""
+        if context.type != ContextType.TEXT:
+            # Only text contexts are handled on this path.
+            return Reply(ReplyType.ERROR, "Bot不支持处理{}类型的消息".format(context.type))
+
+        logger.info("[MIMO] query={}".format(query))
+        session_id = context["session_id"]
+
+        # Maintenance commands are answered locally, without calling the API.
+        command_reply = None
+        if query in conf().get("clear_memory_commands", ["#清除记忆"]):
+            self.sessions.clear_session(session_id)
+            command_reply = Reply(ReplyType.INFO, "记忆已清除")
+        elif query == "#清除所有":
+            self.sessions.clear_all_session()
+            command_reply = Reply(ReplyType.INFO, "所有人记忆已清除")
+        elif query == "#更新配置":
+            load_config()
+            command_reply = Reply(ReplyType.INFO, "配置已更新")
+        if command_reply:
+            return command_reply
+
+        session = self.sessions.session_query(query, session_id)
+        logger.debug("[MIMO] session query={}".format(session.messages))
+
+        result = self.reply_text(session, args=self.args.copy())
+        content, used = result["content"], result["completion_tokens"]
+        logger.debug(
+            "[MIMO] new_query={}, session_id={}, reply_cont={}, completion_tokens={}".format(
+                session.messages, session_id, content, used,
+            )
+        )
+        if used > 0:
+            # A real completion: record it in the session before replying.
+            self.sessions.session_reply(
+                content, session_id, result["total_tokens"],
+            )
+            return Reply(ReplyType.TEXT, content)
+        if used == 0 and len(content) > 0:
+            return Reply(ReplyType.ERROR, content)
+        failure = Reply(ReplyType.ERROR, content)
+        logger.debug("[MIMO] reply {} used 0 tokens.".format(result))
+        return failure
+
+    def reply_text(self, session, args=None, retry_count: int = 0) -> dict:
+        """POST the session to /chat/completions; return content plus token counts, or an error dict."""
+        try:
+            headers = self._build_headers()
+            body = dict(args) if args else dict(self.args)
+            body["messages"] = session.messages
+            model_name = str(body.get("model", ""))
+            # mimo-v2.5-pro / mimo-v2.5 do not support custom temperature/top_p in thinking mode,
+            # so for simplicity these params are dropped for every model whose thinking defaults to on.
+            if self._model_supports_thinking(model_name) and self._thinking_default_enabled(model_name):
+                for k in ("temperature", "top_p", "presence_penalty", "frequency_penalty"):
+                    body.pop(k, None)
+
+            res = requests.post(
+                f"{self.api_base}/chat/completions",
+                headers=headers,
+                json=body,
+                timeout=180,
+            )
+            if res.status_code == 200:
+                response = res.json()
+                return {
+                    "total_tokens": response["usage"]["total_tokens"],
+                    "completion_tokens": response["usage"]["completion_tokens"],
+                    "content": response["choices"][0]["message"]["content"],
+                }
+            try:
+                error = res.json().get("error")
+            except Exception:
+                error = {"message": res.text[:300]}
+            if not isinstance(error, dict):
+                # A string/null "error" must not crash the log call and trigger the generic retry path.
+                error = {"message": error}
+            logger.error(
+                f"[MIMO] chat failed, status_code={res.status_code}, "
+                f"msg={error.get('message')}, type={error.get('type')}"
+            )
+            result = {"completion_tokens": 0, "content": "提问太快啦,请休息一下再问我吧"}
+            need_retry = False
+            if res.status_code >= 500:
+                need_retry = retry_count < 2
+            elif res.status_code == 401:
+                result["content"] = "授权失败,请检查API Key是否正确"
+            elif res.status_code == 429:
+                result["content"] = "请求过于频繁,请稍后再试"
+                need_retry = retry_count < 2
+            if need_retry:
+                time.sleep(3)
+                return self.reply_text(session, args, retry_count + 1)
+            return result
+        except Exception as e:
+            logger.exception(e)
+            if retry_count < 2:
+                return self.reply_text(session, args, retry_count + 1)
+            return {"completion_tokens": 0, "content": "我现在有点累了,等会再来吧"}
+
+ # ==================== Agent mode support ====================
+
+ def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
+     """
+     Call the MiMo chat API with tool-calling support (for agent integration).
+
+     Converts Claude-format messages/tools to OpenAI format, installs
+     kwargs["system"] as the leading system message, forwards the thinking
+     switch, and drops sampling params while thinking is enabled. Returns the
+     generator of the stream or sync handler; if preparation raises, returns
+     a generator yielding one {"error": True, "message", "status_code"} dict.
+     """
+     try:
+         converted_messages = self._convert_messages_to_openai_format(messages)
+
+         system_prompt = kwargs.pop("system", None)
+         if system_prompt:
+             if not converted_messages or converted_messages[0].get("role") != "system":
+                 converted_messages.insert(0, {"role": "system", "content": system_prompt})
+             else:
+                 converted_messages[0] = {"role": "system", "content": system_prompt}
+
+         converted_tools = None
+         if tools:
+             converted_tools = self._convert_tools_to_openai_format(tools)
+
+         model = kwargs.pop("model", None) or self.args["model"]
+         max_tokens = kwargs.pop("max_tokens", None)
+
+         request_body = {
+             "model": model,
+             "messages": converted_messages,
+             "stream": stream,
+         }
+         if max_tokens is not None:
+             # MiMo names this max_completion_tokens (visible output + reasoning tokens)
+             request_body["max_completion_tokens"] = max_tokens
+
+         if converted_tools:
+             request_body["tools"] = converted_tools
+             request_body["tool_choice"] = kwargs.pop("tool_choice", "auto")
+
+         # Thinking mode: follow each model's default unless the caller overrides it
+         thinking_param = kwargs.pop("thinking", None)
+         thinking_active = False
+
+         if self._model_supports_thinking(model):
+             if thinking_param is None:
+                 default_on = self._thinking_default_enabled(model)
+                 thinking_param = {"type": "enabled" if default_on else "disabled"}
+             request_body["thinking"] = thinking_param
+             thinking_active = thinking_param.get("type") == "enabled"
+
+         # While thinking, v2.5-pro / v2.5 take no custom temperature: strip all sampling params
+         if thinking_active:
+             for k in ("temperature", "top_p", "presence_penalty", "frequency_penalty"):
+                 request_body.pop(k, None)
+                 kwargs.pop(k, None)
+         else:
+             temperature = kwargs.pop("temperature", None)
+             if temperature is not None:
+                 request_body["temperature"] = temperature
+             top_p = kwargs.pop("top_p", None)
+             if top_p is not None:
+                 request_body["top_p"] = top_p
+
+         logger.debug(
+             f"[MIMO] API call: model={model}, "
+             f"tools={len(converted_tools) if converted_tools else 0}, "
+             f"stream={stream}, thinking={thinking_active}"
+         )
+
+         if stream:
+             return self._handle_stream_response(request_body)
+         else:
+             return self._handle_sync_response(request_body)
+
+     except Exception as e:
+         logger.error(f"[MIMO] call_with_tools error: {e}")
+         import traceback
+         logger.error(traceback.format_exc())
+         # Bind the text now: `e` is unbound once this except clause exits, before the generator runs.
+         def error_generator(message=str(e)):
+             yield {"error": True, "message": message, "status_code": 500}
+         return error_generator()
+
+ # -------------------- streaming --------------------
+
+ def _handle_stream_response(self, request_body: dict):
+ """Turn SSE stream chunks into OpenAI-style delta dicts (reasoning_content included); errors are yielded as {"error": True, ...}."""
+ try:
+ headers = self._build_headers()
+ url = f"{self.api_base}/chat/completions"
+ response = requests.post(url, headers=headers, json=request_body, stream=True, timeout=180)
+
+ if response.status_code != 200:
+ error_msg = response.text
+ logger.error(f"[MIMO] API error: status={response.status_code}, msg={error_msg}")
+ yield {"error": True, "message": error_msg, "status_code": response.status_code}
+ return
+
+ current_tool_calls = {}  # id/name/arguments gathered per tool-call index; not yielded anywhere below
+ finish_reason = None
+
+ for line in response.iter_lines():
+ if not line:
+ continue
+
+ line = line.decode("utf-8")
+ if line.startswith("data: "):
+ data_str = line[6:]
+ elif line.startswith("data:"):
+ data_str = line[5:]
+ else:  # anything that is not a data line is ignored
+ continue
+ if data_str.strip() == "[DONE]":
+ break
+
+ try:
+ chunk = json.loads(data_str)
+ except json.JSONDecodeError as e:
+ logger.warning(f"[MIMO] JSON decode error: {e}, data: {data_str[:200]}")
+ continue
+
+ if chunk.get("error"):
+ error_data = chunk["error"]
+ error_msg = error_data.get("message", "Unknown error") if isinstance(error_data, dict) else str(error_data)
+ logger.error(f"[MIMO] stream error: {error_msg}")
+ yield {"error": True, "message": error_msg, "status_code": 500}
+ return
+
+ if not chunk.get("choices"):
+ continue
+ choice = chunk["choices"][0]
+ delta = choice.get("delta", {})
+
+ if choice.get("finish_reason"):
+ finish_reason = choice["finish_reason"]
+
+ # Reasoning content (thinking mode): forwarded downstream as its own delta
+ if delta.get("reasoning_content"):
+ yield {
+ "choices": [{
+ "index": 0,
+ "delta": {
+ "role": "assistant",
+ "reasoning_content": delta["reasoning_content"],
+ },
+ "finish_reason": None,
+ }]
+ }
+
+ if delta.get("content"):
+ yield {
+ "choices": [{
+ "index": 0,
+ "delta": {
+ "role": "assistant",
+ "content": delta["content"],
+ },
+ }]
+ }
+
+ if "tool_calls" in delta and delta["tool_calls"]:
+ for tool_call_chunk in delta["tool_calls"]:
+ index = tool_call_chunk.get("index", 0)
+ if index not in current_tool_calls:
+ current_tool_calls[index] = {
+ "id": tool_call_chunk.get("id", ""),
+ "name": tool_call_chunk.get("function", {}).get("name", ""),
+ "arguments": "",
+ }
+ if "function" in tool_call_chunk and "arguments" in tool_call_chunk["function"]:
+ current_tool_calls[index]["arguments"] += tool_call_chunk["function"]["arguments"]
+
+ yield {
+ "choices": [{
+ "index": 0,
+ "delta": {"tool_calls": [tool_call_chunk]},
+ }]
+ }
+ # Final chunk: empty delta plus the last finish_reason captured above (None if none was seen)
+ yield {
+ "choices": [{
+ "index": 0,
+ "delta": {},
+ "finish_reason": finish_reason,
+ }]
+ }
+
+ except requests.exceptions.Timeout:
+ logger.error("[MIMO] Request timeout")
+ yield {"error": True, "message": "Request timeout", "status_code": 500}
+ except Exception as e:
+ logger.error(f"[MIMO] stream response error: {e}")
+ import traceback
+ logger.error(traceback.format_exc())
+ yield {"error": True, "message": str(e), "status_code": 500}
+
+ # -------------------- sync --------------------
+
+ def _handle_sync_response(self, request_body: dict):
+ """Non-streaming call; yields a single Claude-format dict so it is consumed the same way as the streaming path."""
+ try:
+ headers = self._build_headers()
+ request_body.pop("stream", None)  # mutates the caller's dict: "stream" is removed before posting
+ url = f"{self.api_base}/chat/completions"
+ response = requests.post(url, headers=headers, json=request_body, timeout=180)
+
+ if response.status_code != 200:
+ error_msg = response.text
+ logger.error(f"[MIMO] API error: status={response.status_code}, msg={error_msg}")
+ yield {"error": True, "message": error_msg, "status_code": response.status_code}
+ return
+
+ result = response.json()
+ message = result["choices"][0]["message"]
+ finish_reason = result["choices"][0]["finish_reason"]
+
+ response_data = {"role": "assistant", "content": []}
+
+ # Wrap reasoning content as a thinking block so the agent layer can persist it and send it back on tool calls
+ if message.get("reasoning_content"):
+ response_data["content"].append({
+ "type": "thinking",
+ "thinking": message["reasoning_content"],
+ })
+
+ if message.get("content"):
+ response_data["content"].append({
+ "type": "text",
+ "text": message["content"],
+ })
+
+ if message.get("tool_calls"):
+ for tool_call in message["tool_calls"]:
+ try:
+ tool_input = json.loads(tool_call["function"]["arguments"])
+ except (json.JSONDecodeError, TypeError):  # unparseable arguments fall back to an empty input dict
+ tool_input = {}
+ response_data["content"].append({
+ "type": "tool_use",
+ "id": tool_call["id"],
+ "name": tool_call["function"]["name"],
+ "input": tool_input,
+ })
+ # Map finish_reason to stop_reason: tool_calls -> tool_use, stop -> end_turn, anything else unchanged
+ if finish_reason == "tool_calls":
+ response_data["stop_reason"] = "tool_use"
+ elif finish_reason == "stop":
+ response_data["stop_reason"] = "end_turn"
+ else:
+ response_data["stop_reason"] = finish_reason
+
+ yield response_data
+
+ except requests.exceptions.Timeout:
+ logger.error("[MIMO] Request timeout")
+ yield {"error": True, "message": "Request timeout", "status_code": 500}
+ except Exception as e:
+ logger.error(f"[MIMO] sync response error: {e}")
+ import traceback
+ logger.error(traceback.format_exc())
+ yield {"error": True, "message": str(e), "status_code": 500}
+
+ # -------------------- format conversion --------------------
+
+ def _convert_messages_to_openai_format(self, messages):
+ """
+ Convert Claude-format messages (content blocks) to OpenAI format.
+
+ Key constraint (original author's note): in MiMo thinking mode, once the
+ history holds an assistant turn with tool_calls, assistant messages (tool-call
+ turns included) must send reasoning_content back or the API returns 400. With
+ no local trace an empty string is filled in; the field only has to be present.
+ """
+ if not messages:
+ return []
+ # Pass 1: does any assistant message carry tool calls (a "tool_calls" key or a tool_use block)?
+ has_tool_call_history = False
+ for msg in messages:
+ if msg.get("role") != "assistant":
+ continue
+ if msg.get("tool_calls"):
+ has_tool_call_history = True
+ break
+ content = msg.get("content")
+ if isinstance(content, list) and any(
+ isinstance(b, dict) and b.get("type") == "tool_use" for b in content
+ ):
+ has_tool_call_history = True
+ break
+
+ converted = []
+ # Pass 2: convert message by message.
+ for msg in messages:
+ role = msg.get("role")
+ content = msg.get("content")
+
+ if not isinstance(content, list):  # plain content: forwarded as-is, or copied with reasoning_content="" when required
+ if (
+ role == "assistant"
+ and isinstance(msg, dict)
+ and has_tool_call_history
+ and "reasoning_content" not in msg
+ ):
+ patched = dict(msg)
+ patched["reasoning_content"] = ""
+ converted.append(patched)
+ else:
+ converted.append(msg)
+ continue
+
+ if role == "user":
+ has_tool_result = any(
+ isinstance(b, dict) and b.get("type") == "tool_result" for b in content
+ )
+ if has_tool_result:
+ text_parts = []
+ tool_results = []
+
+ for block in content:
+ if not isinstance(block, dict):
+ continue
+ if block.get("type") == "text":
+ text_parts.append(block.get("text", ""))
+ elif block.get("type") == "tool_result":
+ tool_call_id = block.get("tool_use_id") or ""
+ result_content = block.get("content", "")
+ if not isinstance(result_content, str):
+ result_content = json.dumps(result_content, ensure_ascii=False)
+ tool_results.append({
+ "role": "tool",
+ "tool_call_id": tool_call_id,
+ "content": result_content,
+ })
+
+ converted.extend(tool_results)  # tool messages are emitted before any accompanying user text
+
+ if text_parts:
+ converted.append({"role": "user", "content": "\n".join(text_parts)})
+ else:
+ # Multimodal content is kept as-is (image_url / input_audio / video_url blocks, etc.)
+ converted.append(msg)
+
+ elif role == "assistant":
+ openai_msg = {"role": "assistant"}
+ text_parts = []
+ tool_calls = []
+ reasoning_parts = []
+
+ for block in content:
+ if not isinstance(block, dict):
+ continue
+ btype = block.get("type")
+ if btype == "text":
+ text_parts.append(block.get("text", ""))
+ elif btype == "tool_use":
+ tool_calls.append({
+ "id": block.get("id"),
+ "type": "function",
+ "function": {
+ "name": block.get("name"),
+ "arguments": json.dumps(block.get("input", {})),
+ },
+ })
+ elif btype == "thinking":
+ reasoning_parts.append(block.get("thinking", ""))
+
+ if text_parts:
+ openai_msg["content"] = "\n".join(text_parts)
+ elif not tool_calls:
+ openai_msg["content"] = ""
+
+ if tool_calls:
+ openai_msg["tool_calls"] = tool_calls
+ if not text_parts:
+ openai_msg["content"] = None
+
+ if reasoning_parts:
+ openai_msg["reasoning_content"] = "\n".join(reasoning_parts)
+ elif has_tool_call_history:
+ openai_msg["reasoning_content"] = ""
+
+ converted.append(openai_msg)
+ else:  # other roles with list content are forwarded untouched
+ converted.append(msg)
+
+ return converted
+
+ def _convert_tools_to_openai_format(self, tools):
+     """Convert Claude-format tool definitions into OpenAI function tools.
+
+     Entries whose "type" is "function" pass through as the same object; others
+     are wrapped from name / description / input_schema. Falsy input gives None.
+     """
+     if not tools:
+         return None
+     return [
+         spec if "type" in spec and spec["type"] == "function" else {
+             "type": "function",
+             "function": {
+                 "name": spec.get("name"),
+                 "description": spec.get("description"),
+                 "parameters": spec.get("input_schema", {}),
+             },
+         }
+         for spec in tools
+     ]
+
+ # -------------------- vision --------------------
+
+ def call_vision(self, image_url: str, question: str,
+ model: Optional[str] = None,
+ max_tokens: int = 1000) -> dict:
+ """Image understanding through MiMo's OpenAI-compatible /chat/completions endpoint; returns a result or error dict."""
+ try:
+ # No model passed and the configured one is not in MULTIMODAL_MODELS (original note cites mimo-v2-flash): use const.MIMO_V2_5_PRO
+ vision_model = model
+ if not vision_model:
+ cur = self.args.get("model") or DEFAULT_MODEL
+ vision_model = cur if cur in MULTIMODAL_MODELS else const.MIMO_V2_5_PRO
+
+ payload = {
+ "model": vision_model,
+ "max_completion_tokens": max_tokens,
+ "messages": [{
+ "role": "user",
+ "content": [
+ {"type": "text", "text": question},
+ {"type": "image_url", "image_url": {"url": image_url}},
+ ],
+ }],
+ }
+ headers = self._build_headers()
+ resp = requests.post(
+ f"{self.api_base}/chat/completions",
+ headers=headers, json=payload, timeout=60,
+ )
+ if resp.status_code != 200:
+ return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}
+ data = resp.json()
+ if "error" in data:
+ return {"error": True, "message": data["error"].get("message", str(data["error"]))}
+ choice = data.get("choices", [{}])[0].get("message", {})
+ # Per the original note, some models put the multimodal answer in reasoning_content instead of content
+ content = choice.get("content") or choice.get("reasoning_content") or ""
+ usage = data.get("usage", {})
+ return {
+ "model": vision_model,
+ "content": content,
+ "usage": {
+ "prompt_tokens": usage.get("prompt_tokens", 0),
+ "completion_tokens": usage.get("completion_tokens", 0),
+ "total_tokens": usage.get("total_tokens", 0),
+ },
+ }
+ except Exception as e:  # any failure is reported as an error dict rather than raised
+ logger.error(f"[MIMO] call_vision error: {e}")
+ return {"error": True, "message": str(e)}
diff --git a/models/mimo/mimo_session.py b/models/mimo/mimo_session.py
new file mode 100644
index 00000000..76483f11
--- /dev/null
+++ b/models/mimo/mimo_session.py
@@ -0,0 +1,57 @@
+from common.log import logger
+from models.session_manager import Session
+
+
+class MimoSession(Session):  # MiMo chat session; history size is measured through calc_tokens()
+    def __init__(self, session_id, system_prompt=None, model="mimo-v2.5-pro"):
+        super().__init__(session_id, system_prompt)
+        self.model = model
+        self.reset()
+
+    def discard_exceeding(self, max_tokens, cur_tokens=None):  # trim history to max_tokens; returns the resulting count
+        precise = True
+        try:
+            cur_tokens = self.calc_tokens()
+        except Exception as e:
+            precise = False  # counting failed: fall back to the caller-supplied cur_tokens
+            if cur_tokens is None:
+                raise e
+            logger.debug("Exception when counting tokens precisely for query: {}".format(e))
+        while cur_tokens > max_tokens:
+            if len(self.messages) > 2:
+                self.messages.pop(1)  # drop the message at index 1; messages[0] is always kept
+            elif len(self.messages) == 2 and self.messages[1]["role"] == "assistant":
+                self.messages.pop(1)
+                if precise:
+                    cur_tokens = self.calc_tokens()
+                else:
+                    cur_tokens = cur_tokens - max_tokens  # no recount available: coarse adjustment
+                break
+            elif len(self.messages) == 2 and self.messages[1]["role"] == "user":
+                logger.warning("user message exceed max_tokens. total_tokens={}".format(cur_tokens))  # warn() is a deprecated alias
+                break
+            else:
+                logger.debug("max_tokens={}, total_tokens={}, len(messages)={}".format(
+                    max_tokens, cur_tokens, len(self.messages)))
+                break
+            if precise:
+                cur_tokens = self.calc_tokens()
+            else:
+                cur_tokens = cur_tokens - max_tokens
+        return cur_tokens
+
+    def calc_tokens(self):  # delegates to the module-level estimator
+        return num_tokens_from_messages(self.messages, self.model)
+
+
+def num_tokens_from_messages(messages, model):
+    """Estimate tokens as the total character count of message text; ``model`` is unused."""
+    def _chars(content):
+        # A str counts directly; a list contributes the "text" of each dict block.
+        if isinstance(content, str):
+            return len(content)
+        if isinstance(content, list):
+            return sum(len(part.get("text", "")) for part in content if isinstance(part, dict))
+        return 0
+
+    return sum(_chars(msg.get("content", "")) for msg in messages)
diff --git a/voice/factory.py b/voice/factory.py
index 3be60bbf..2bc356f4 100644
--- a/voice/factory.py
+++ b/voice/factory.py
@@ -66,4 +66,8 @@ def create_voice(voice_type):
from voice.zhipuai.zhipuai_voice import ZhipuAIVoice
return ZhipuAIVoice()
+ elif voice_type == "mimo":
+ from voice.mimo.mimo_voice import MimoVoice
+
+ return MimoVoice()
raise RuntimeError
diff --git a/voice/mimo/__init__.py b/voice/mimo/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/voice/mimo/mimo_voice.py b/voice/mimo/mimo_voice.py
new file mode 100644
index 00000000..2ae885f8
--- /dev/null
+++ b/voice/mimo/mimo_voice.py
@@ -0,0 +1,109 @@
+# encoding:utf-8
+"""
+小米 MiMo TTS - 基于 mimo-v2.5-tts 模型的语音合成。
+
+通过 /chat/completions 接口实现:assistant 消息内容为待合成文本,
+audio 字段指定预置音色(如 冰糖/茉莉/苏打/Mia/Chloe 等),返回 base64
+编码的音频字节。
+
+文档:https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5
+注意:MiMo 不提供 ASR 端点,因此 voiceToText 不实现。
+"""
+import base64
+import datetime
+import os
+import random
+
+import requests
+
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from config import conf
+from voice.voice import Voice
+
+DEFAULT_API_BASE = "https://api.xiaomimimo.com/v1"
+DEFAULT_TTS_MODEL = "mimo-v2.5-tts"
+DEFAULT_TTS_VOICE = "冰糖" # 默认音色:中国集群事实默认值
+REQUEST_TIMEOUT = (5, 120)
+
+
+class MimoVoice(Voice):  # text-to-speech through MiMo /chat/completions; speech-to-text is not supported
+ def __init__(self):
+ pass
+
+ def voiceToText(self, voice_file: str):
+ # MiMo has no standalone ASR endpoint; use another provider instead (e.g. openai/zhipu/dashscope)
+ logger.warning("[MimoVoice] voiceToText is not supported by MiMo API")
+ return Reply(ReplyType.ERROR, "MiMo 暂不支持语音识别,请配置其他 voice_to_text provider")
+
+ def textToVoice(self, text: str):  # returns Reply(VOICE, <wav path>) on success, Reply(ERROR, ...) otherwise
+ try:
+ api_key = conf().get("mimo_api_key", "")
+ if not api_key:
+ logger.error("[MimoVoice] mimo_api_key is not configured")
+ return Reply(ReplyType.ERROR, "未配置 MiMo API key")
+
+ api_base = (conf().get("mimo_api_base") or DEFAULT_API_BASE).rstrip("/")
+ model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
+ voice_id = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE
+
+ # The text to synthesize must go in the assistant message; a user message may optionally carry style instructions
+ payload = {
+ "model": model,
+ "messages": [
+ {"role": "assistant", "content": text},
+ ],
+ "audio": {
+ "format": "wav",
+ "voice": voice_id,
+ },
+ }
+ headers = {
+ "Authorization": f"Bearer {api_key}",
+ "Content-Type": "application/json",
+ }
+ url = f"{api_base}/chat/completions"
+ response = requests.post(url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT)
+
+ if response.status_code != 200:
+ logger.error(
+ f"[MimoVoice] textToVoice failed: status={response.status_code} "
+ f"body={response.text[:500]} model={model} voice={voice_id}"
+ )
+ return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
+
+ data = response.json()
+ if "error" in data:
+ err = data["error"]
+ msg = err.get("message", str(err)) if isinstance(err, dict) else str(err)
+ logger.error(f"[MimoVoice] textToVoice api error: {msg}")
+ return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
+
+ message = (data.get("choices") or [{}])[0].get("message", {}) or {}
+ audio_obj = message.get("audio") or {}
+ audio_b64 = audio_obj.get("data")
+ if not audio_b64:
+ logger.error(f"[MimoVoice] textToVoice empty audio in response: {data}")
+ return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
+
+ try:
+ audio_bytes = base64.b64decode(audio_b64)
+ except Exception as e:
+ logger.error(f"[MimoVoice] base64 decode failed: {e}")
+ return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")
+ # Output path: tmp/<timestamp to the second><random int 0..1000>.wav, relative to the working directory
+ file_name = (
+ "tmp/" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+ + str(random.randint(0, 1000)) + ".wav"
+ )
+ os.makedirs(os.path.dirname(file_name), exist_ok=True)
+ with open(file_name, "wb") as f:
+ f.write(audio_bytes)
+ logger.info(
+ f"[MimoVoice] textToVoice model={model} voice={voice_id} "
+ f"file={file_name} bytes={len(audio_bytes)}"
+ )
+ return Reply(ReplyType.VOICE, file_name)
+ except Exception as e:  # any failure (network, JSON, file I/O) becomes a generic error reply
+ logger.exception(f"[MimoVoice] textToVoice exception: {e}")
+ return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")