feat(models): support xiaomi mimo

2026-07-17 11:07:11 +08:00 · 2026-05-28 10:49:52 +08:00
parent 83cd6ad158
commit bccce2d7cb
22 changed files with 1340 additions and 3 deletions
--- a/README.md
+++ b/README.md
@@ -104,6 +104,7 @@ CowAgent supports all mainstream LLM providers. **Chat, vision, image generation
 | [Kimi](https://docs.cowagent.ai/en/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | |
 | [MiniMax](https://docs.cowagent.ai/en/models/minimax) | MiniMax-M2.7 | ✅ | ✅ | ✅ | | ✅ | |
 | [ERNIE](https://docs.cowagent.ai/en/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | |
 | [MiMo](https://docs.cowagent.ai/en/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | |
 | [LinkAI](https://docs.cowagent.ai/en/models/linkai) | One key for 100+ models | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | [Custom](https://docs.cowagent.ai/en/models/custom) | Local models / third-party proxy | ✅ | | | | | |
--- a/agent/tools/vision/vision.py
+++ b/agent/tools/vision/vision.py
@@ -57,6 +57,7 @@ _DISCOVERABLE_MODELS = [
    ("qianfan_api_key", const.QIANFAN, const.ERNIE_45_TURBO_VL, "Qianfan"),
    ("zhipu_ai_api_key", const.ZHIPU_AI, const.GLM_4_7, "ZhipuAI"),
    ("minimax_api_key", const.MiniMax, const.MINIMAX_M2_7, "MiniMax"),
    ("mimo_api_key", const.MIMO, const.MIMO_V2_5_PRO, "MiMo"),
 ]
 # Model name prefix → discoverable provider display_name.
@@ -73,6 +74,7 @@ _MODEL_PREFIX_TO_PROVIDER = [
    ("glm-", "ZhipuAI"),
    ("minimax-", "MiniMax"),
    ("abab", "MiniMax"),
    ("mimo-", "MiMo"),
 ]
 # Model prefixes that natively belong to OpenAI / LinkAI (raw HTTP providers).
@@ -92,6 +94,7 @@ _PROVIDER_ID_TO_DISPLAY = {
    "qianfan": "Qianfan",
    "zhipu": "ZhipuAI",
    "minimax": "MiniMax",
    "mimo": "MiMo",
 }
--- a/bridge/bridge.py
+++ b/bridge/bridge.py
@@ -63,6 +63,10 @@ class Bridge(object):
            if model_type and model_type.startswith("deepseek"):
                self.btype["chat"] = const.DEEPSEEK
            # 小米 MiMo 系列模型，全部以 mimo- 开头
            if model_type and model_type.startswith("mimo-"):
                self.btype["chat"] = const.MIMO
            if model_type and isinstance(model_type, str):
                lowered_model_type = model_type.lower()
                if lowered_model_type == const.QIANFAN or lowered_model_type.startswith("ernie"):
--- a/channel/web/web_channel.py
+++ b/channel/web/web_channel.py
@@ -1387,6 +1387,7 @@ class ConfigHandler:
        const.DOUBAO_SEED_2_PRO, const.DOUBAO_SEED_2_CODE,
        const.KIMI_K2_6, const.KIMI_K2_5, const.KIMI_K2,
        const.ERNIE_5_1, const.ERNIE_5, const.ERNIE_X1_1, const.ERNIE_45_TURBO_128K, const.ERNIE_45_TURBO_32K,
        const.MIMO_V2_5_PRO, const.MIMO_V2_5,
    ]
    # Generic placeholder hints surfaced in the web console. We deliberately
@@ -1481,6 +1482,14 @@ class ConfigHandler:
            "api_base_placeholder": _PLACEHOLDER_QIANFAN,
            "models": [const.ERNIE_5_1, const.ERNIE_5, const.ERNIE_X1_1, const.ERNIE_45_TURBO_128K, const.ERNIE_45_TURBO_32K],
        }),
        ("mimo", {
            "label": {"zh": "小米 MiMo", "en": "MiMo"},
            "api_key_field": "mimo_api_key",
            "api_base_key": "mimo_api_base",
            "api_base_default": "https://api.xiaomimimo.com/v1",
            "api_base_placeholder": _PLACEHOLDER_V1,
            "models": [const.MIMO_V2_5_PRO, const.MIMO_V2_5],
        }),
        ("linkai", {
            "label": "LinkAI",
            "api_key_field": "linkai_api_key",
@@ -1502,10 +1511,10 @@ class ConfigHandler:
    EDITABLE_KEYS = {
        "model", "bot_type", "use_linkai",
        "open_ai_api_base", "deepseek_api_base", "qianfan_api_base", "claude_api_base", "gemini_api_base",
-        "zhipu_ai_api_base", "moonshot_base_url", "ark_base_url", "custom_api_base",
+        "zhipu_ai_api_base", "moonshot_base_url", "ark_base_url", "custom_api_base", "mimo_api_base",
        "open_ai_api_key", "deepseek_api_key", "qianfan_api_key", "claude_api_key", "gemini_api_key",
        "zhipu_ai_api_key", "dashscope_api_key", "moonshot_api_key",
-        "ark_api_key", "minimax_api_key", "linkai_api_key", "custom_api_key",
+        "ark_api_key", "minimax_api_key", "linkai_api_key", "custom_api_key", "mimo_api_key",
        "agent_max_context_tokens", "agent_max_context_turns", "agent_max_steps",
        "enable_thinking", "web_password",
    }
@@ -1646,7 +1655,7 @@ class ModelsHandler:
    # Capability -> provider ids drawn from ConfigHandler.PROVIDER_MODELS.
    _ASR_PROVIDERS = ["openai", "dashscope", "zhipu", "linkai"]
    # Web-console white-list. Other vendors stay usable via direct config.
-    _TTS_PROVIDERS = ["openai", "minimax", "dashscope", "linkai"]
+    _TTS_PROVIDERS = ["openai", "minimax", "dashscope", "mimo", "linkai"]
    # TTS engine catalog (speech models, not voice timbres). Entries are
    # either a bare code or {value, hint?} when a friendly label helps.
@@ -1661,6 +1670,10 @@ class ModelsHandler:
        "dashscope": [
            {"value": "qwen3-tts-flash", "hint": "覆盖普通话、方言与主流外语"},
        ],
        # 小米 MiMo TTS 系列，通过 chat completions 接口合成
        "mimo": [
            {"value": "mimo-v2.5-tts", "hint": "预置音色 · 支持唱歌模式"},
        ],
        # Aggregating gateway: a single endpoint multiplexes several
        # underlying TTS engines, selected via the `model` field.
        # Each engine exposes its own voice catalog (see _TTS_PROVIDER_VOICES).
@@ -1780,6 +1793,18 @@ class ModelsHandler:
            {"value": "Marcus",   "hint": "陕西话 · 秦川"},
            {"value": "Roy",      "hint": "闽南语 · 阿杰"},
        ],
        # 小米 MiMo 预置音色列表（mimo-v2.5-tts），文档：
        # https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5
        "mimo": [
            {"value": "冰糖",   "hint": "中文 · 女声 · 冰糖"},
            {"value": "茉莉",   "hint": "中文 · 女声 · 茉莉"},
            {"value": "苏打",   "hint": "中文 · 男声 · 苏打"},
            {"value": "白桦",   "hint": "中文 · 男声 · 白桦"},
            {"value": "Mia",   "hint": "英文 · 女声 · Mia"},
            {"value": "Chloe", "hint": "英文 · 女声 · Chloe"},
            {"value": "Milo",  "hint": "英文 · 男声 · Milo"},
            {"value": "Dean",  "hint": "英文 · 男声 · Dean"},
        ],
        # Aggregating gateway: voices are scoped per engine model. The
        # frontend picks the correct list based on the selected model so
        # users don't see incompatible timbres for the active engine.
@@ -1916,6 +1941,8 @@ class ModelsHandler:
        # (see models/minimax/minimax_bot.py::call_vision); the M2.x chat
        # family is text-only.
        "minimax":   [const.MINIMAX_TEXT_01],
        # MiMo 原生全模态模型：v2.5-pro / v2.5 支持图像/音频/视频输入
        "mimo":      [const.MIMO_V2_5_PRO, const.MIMO_V2_5],
        # LinkAI proxies the underlying vendor; surface a curated set of
        # multimodal models. Order: gpt-4.1-mini → gpt-5.4-mini as the
        # cross-vendor baselines, then each vendor's recommended default.
@@ -2045,6 +2072,7 @@ class ModelsHandler:
        ("qianfan",   "qianfan_api_key",   const.ERNIE_45_TURBO_VL),
        ("zhipu",     "zhipu_ai_api_key",  const.GLM_5V_TURBO),
        ("minimax",   "minimax_api_key",   const.MINIMAX_TEXT_01),
        ("mimo",      "mimo_api_key",      const.MIMO_V2_5_PRO),
    ]
    @classmethod
--- a/common/const.py
+++ b/common/const.py
@@ -15,6 +15,7 @@ ZHIPU_AI = "zhipu"
 MOONSHOT = "moonshot"
 MiniMax = "minimax"
 DEEPSEEK = "deepseek"
 MIMO = "mimo"  # 小米 MiMo 大模型
 CUSTOM = "custom"  # custom OpenAI-compatible API, bot_type won't auto-switch on model change
 MODELSCOPE = "modelscope"
@@ -140,6 +141,13 @@ KIMI_K2 = "kimi-k2"
 KIMI_K2_5 = "kimi-k2.5"
 KIMI_K2_6 = "kimi-k2.6"  # Kimi K2.6 - Agent recommended model (default)
 # 小米 MiMo
 MIMO_V2_5_PRO = "mimo-v2.5-pro"      # MiMo V2.5 Pro - 旗舰，长上下文（默认推荐）
 MIMO_V2_5 = "mimo-v2.5"              # MiMo V2.5 - 多模态（文/图/音/视频）
 MIMO_V2_PRO = "mimo-v2-pro"          # MiMo V2 Pro
 MIMO_V2_OMNI = "mimo-v2-omni"        # MiMo V2 Omni - 多模态
 MIMO_V2_FLASH = "mimo-v2-flash"      # MiMo V2 Flash - 极速版
 # Doubao (Volcengine Ark)
 DOUBAO = "doubao"
 DOUBAO_SEED_2_CODE = "doubao-seed-2-0-code-preview-260215"
@@ -182,6 +190,9 @@ MODEL_LIST = [
              # MiniMax
              MiniMax, MINIMAX_M2_7, MINIMAX_M2_7_HIGHSPEED, MINIMAX_M2_5, MINIMAX_M2_1, MINIMAX_M2_1_LIGHTNING, MINIMAX_M2, MINIMAX_ABAB6_5,
              # 小米 MiMo
              MIMO, MIMO_V2_5_PRO, MIMO_V2_5, MIMO_V2_PRO, MIMO_V2_OMNI, MIMO_V2_FLASH,
              # Claude
              CLAUDE3, CLAUDE_4_6_SONNET, CLAUDE_4_7_OPUS, CLAUDE_4_6_OPUS, CLAUDE_4_OPUS, CLAUDE_4_5_SONNET, CLAUDE_4_SONNET, CLAUDE_3_OPUS, CLAUDE_3_OPUS_0229,
              CLAUDE_35_SONNET, CLAUDE_35_SONNET_1022, CLAUDE_35_SONNET_0620, CLAUDE_3_SONNET, CLAUDE_3_HAIKU,
--- a/config.py
+++ b/config.py
@@ -209,6 +209,9 @@ available_setting = {
    "Minimax_base_url": "",
    "deepseek_api_key": "",
    "deepseek_api_base": "https://api.deepseek.com/v1",
    # 小米 MiMo 大模型
    "mimo_api_key": "",
    "mimo_api_base": "https://api.xiaomimimo.com/v1",
    "web_host": "",  # Web console bind address; empty means auto
    "web_port": 9899,
    "web_password": "",  # Web console password; empty means no authentication required
@@ -401,6 +404,8 @@ def load_config():
        "minimax_api_base": "MINIMAX_API_BASE",
        "deepseek_api_key": "DEEPSEEK_API_KEY",
        "deepseek_api_base": "DEEPSEEK_API_BASE",
        "mimo_api_key": "MIMO_API_KEY",
        "mimo_api_base": "MIMO_API_BASE",
        "qianfan_api_key": "QIANFAN_API_KEY",
        "qianfan_api_base": "QIANFAN_API_BASE",
        "zhipu_ai_api_key": "ZHIPU_AI_API_KEY",
--- a/docs/README.md
+++ b/docs/README.md
@@ -0,0 +1,30 @@
 # Documentation
 This directory contains the Mintlify documentation site for the project.
 ## Prerequisites
 - Node.js v20.17.0 or higher (LTS recommended)
 ## Install the CLI (one-time, global)
 ```bash
 npm i -g mint
 ```
 ## Run the docs locally
 From this `docs/` directory:
 ```bash
 mint dev
 ```
 Then open http://localhost:3000 (or the port Mint reports if 3000 is in use).
 > The first run downloads the Mint preview framework (~90 MB) into `~/.mintlify/`.
 > Subsequent runs start instantly from the local cache.
 ## More
 - Mintlify docs: https://www.mintlify.com/docs
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -88,6 +88,7 @@
                  "models/doubao",
                  "models/kimi",
                  "models/qianfan",
                  "models/mimo",
                  "models/linkai",
                  "models/coding-plan",
                  "models/custom"
@@ -290,6 +291,7 @@
                  "en/models/doubao",
                  "en/models/kimi",
                  "en/models/qianfan",
                  "en/models/mimo",
                  "en/models/linkai",
                  "en/models/coding-plan",
                  "en/models/custom"
@@ -492,6 +494,7 @@
                  "ja/models/doubao",
                  "ja/models/kimi",
                  "ja/models/qianfan",
                  "ja/models/mimo",
                  "ja/models/linkai",
                  "ja/models/coding-plan",
                  "ja/models/custom"
--- a/docs/en/models/index.mdx
+++ b/docs/en/models/index.mdx
@@ -21,6 +21,7 @@ A snapshot of each vendor's capabilities. "Text" refers to the main chat model;
 | [Doubao](/en/models/doubao) | doubao-seed-2.0 series | ✅ | ✅ | ✅ | | | ✅ |
 | [Kimi](/en/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | |
 | [ERNIE](/en/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | |
 | [MiMo](/en/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | |
 | [LinkAI](/en/models/linkai) | 100+ models from multiple vendors | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | [Custom](/en/models/custom) | Local models / third-party proxies | ✅ | | | | | |
--- a/docs/en/models/mimo.mdx
+++ b/docs/en/models/mimo.mdx
@@ -0,0 +1,136 @@
 ---
 title: MiMo
 description: Xiaomi MiMo model configuration (Text Chat + Image Understanding + Text-to-Speech)
 ---
 Xiaomi MiMo is a native omni-modal large model. A single `mimo_api_key` enables text chat, image understanding, and text-to-speech all at once.
 <Tip>
  All capabilities below can be configured in one place via the "Model Management" page in the Web Console — no need to manually edit the configuration file.
 </Tip>
 ## Text Chat
 ```json
 {
  "model": "mimo-v2.5-pro",
  "mimo_api_key": "YOUR_API_KEY",
  "mimo_api_base": "https://api.xiaomimimo.com/v1"
 }
 ```
 | Parameter | Description |
 | --- | --- |
 | `model` | Default recommendation: `mimo-v2.5-pro`; `mimo-v2.5` is also supported |
 | `mimo_api_key` | Create one in the [MiMo Open Platform](https://platform.xiaomimimo.com/console/api-keys) |
 | `mimo_api_base` | Optional, defaults to `https://api.xiaomimimo.com/v1` |
 ### Model Selection
 | Model | Use Case |
 | --- | --- |
 | `mimo-v2.5-pro` | Flagship: native omni-modal + Agent capability, up to 1M tokens context |
 | `mimo-v2.5` | General-purpose, native omni-modal (text / image / video / audio) |
 ## Thinking Mode
 The MiMo V2.5 series enables "thinking mode" by default: the model emits `reasoning_content` (chain-of-thought) before the final answer, improving performance on complex tasks.
 Use the global `enable_thinking` flag to toggle visibility (also switchable from the Web Console settings):
 ```json
 {
  "enable_thinking": true
 }
 ```
 ## Image Understanding
 Once `mimo_api_key` is configured, the Agent's Vision tool can automatically use MiMo's vision models:
 - When the main model itself is multimodal (`mimo-v2.5-pro` / `mimo-v2.5`), images are handled directly by the main model with no extra setup.
 - When the main model belongs to another vendor, the Vision tool automatically falls back to `mimo-v2.5-pro`, following the provider fallback order.
 To force a specific Vision model, set it explicitly in the configuration:
 ```json
 {
  "tools": {
    "vision": {
      "provider": "mimo",
      "model": "mimo-v2.5-pro"
    }
  }
 }
 ```
 ## Text-to-Speech (TTS)
 ```json
 {
  "text_to_voice": "mimo",
  "text_to_voice_model": "mimo-v2.5-tts",
  "tts_voice_id": "冰糖"
 }
 ```
 | Parameter | Description |
 | --- | --- |
 | `text_to_voice_model` | Currently only `mimo-v2.5-tts` (preset voices + singing mode) |
 | `tts_voice_id` | Preset voice name (Chinese voice IDs use the Chinese name directly) |
 ### Preset Voices
 | Voice ID | Description |
 | --- | --- |
 | `Mia` | English · Female |
 | `Chloe` | English · Female |
 | `Milo` | English · Male |
 | `Dean` | English · Male |
 | `冰糖` | Chinese · Female (default) |
 | `茉莉` | Chinese · Female |
 | `苏打` | Chinese · Male |
 | `白桦` | Chinese · Male |
 You can also pick a voice visually from the Web Console under "Model Management → Text-to-Speech".
 ### Style Control
 MiMo TTS supports embedding **audio tags** in the synthesis text to control emotion, tone, dialect, persona, and even singing. Tags must appear in the **text that will be synthesized to speech (i.e. the Agent's reply)**, with the overall style tag placed at the very beginning:
 ```
 (style)content-to-synthesize
 ```
 Half-width `()`, full-width `（）`, and `[]` brackets are all accepted. Both Chinese and English style descriptors work — pick whichever language expresses the timbre most precisely. Common examples:
 | Category | Example tags |
 | --- | --- |
 | Basic emotions | `happy` `sad` `angry` `fear` `surprised` `excited` `aggrieved` `calm` `indifferent` |
 | Compound emotions | `wistful` `relieved` `helpless` `guilty` `at ease` `uneasy` `touched` |
 | Overall tone | `gentle` `aloof` `lively` `serious` `languid` `playful` `deep` `sharp` `cutting` |
 | Voice character | `magnetic` `mellow` `bright` `ethereal` `childlike` `aged` `sweet` `husky` |
 | Persona | `squeaky` `mature lady` `young boy` `uncle` `Taiwanese accent` |
 | Dialect | `Northeastern` `Sichuan` `Henan` `Cantonese` |
 | Role-play | `Sun Wukong` `Lin Daiyu` |
 | Singing | `sing` / `singing` |
 Examples:
 - `(magnetic)The night is deep, and the city is still breathing.`
 - `(gentle)Take a breath. You've got this.`
 - `(serious)This is the final warning before the system reboots.`
 - `(singing)Oh, when the saints go marching in…`
 You can also insert fine-grained audio tags at any position in the text to control breathing, laughter, pauses, etc. For example:
 ```
 (nervous, deep breath) Phew… stay calm, stay calm. (faster pace) I've rehearsed this intro fifty times, it'll be fine.
 ```
 See the [MiMo speech synthesis documentation](https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5) for the full tag list.
 <Tip>
  When CowAgent calls TTS, the Agent's reply text (including any `(...)` tags) is forwarded directly to MiMo for synthesis. Tell the model in its persona / system prompt to "prefix replies with a `(style)` tag to control the tone", and IM channels (WeChat / Feishu / DingTalk / WeCom) will play voice replies with the corresponding emotion, dialect, or even singing.
 </Tip>
--- a/docs/ja/README.md
+++ b/docs/ja/README.md
@@ -104,6 +104,7 @@ CowAgent は主要な LLM プロバイダーすべてに対応しています。
 | [Kimi](https://docs.cowagent.ai/ja/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | |
 | [MiniMax](https://docs.cowagent.ai/ja/models/minimax) | MiniMax-M2.7 | ✅ | ✅ | ✅ | | ✅ | |
 | [ERNIE](https://docs.cowagent.ai/ja/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | |
 | [MiMo](https://docs.cowagent.ai/ja/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | |
 | [LinkAI](https://docs.cowagent.ai/ja/models/linkai) | 1 つの Key で 100+ モデルに接続 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | [カスタム](https://docs.cowagent.ai/ja/models/custom) | ローカルモデル / サードパーティプロキシ | ✅ | | | | | |
--- a/docs/ja/models/mimo.mdx
+++ b/docs/ja/models/mimo.mdx
@@ -0,0 +1,135 @@
 ---
 title: Xiaomi MiMo
 description: Xiaomi MiMo モデル設定（テキスト対話 + 画像理解 + 音声合成）
 ---
 Xiaomi MiMo はネイティブ全モーダル大規模言語モデルです。1 つの `mimo_api_key` でテキスト対話、画像理解、音声合成を同時に有効化できます。
 <Tip>
  Web コンソールの「モデル管理」ページから、以下のすべての機能をワンストップで設定でき、設定ファイルを手動で編集する必要はありません。
 </Tip>
 ## テキスト対話
 ```json
 {
  "model": "mimo-v2.5-pro",
  "mimo_api_key": "YOUR_API_KEY",
  "mimo_api_base": "https://api.xiaomimimo.com/v1"
 }
 ```
 | パラメータ | 説明 |
 | --- | --- |
 | `model` | 推奨は `mimo-v2.5-pro`。`mimo-v2.5` も使用可能 |
 | `mimo_api_key` | [MiMo Open Platform](https://platform.xiaomimimo.com/console/api-keys) で作成 |
 | `mimo_api_base` | 任意。デフォルトは `https://api.xiaomimimo.com/v1` |
 ### モデル選択
 | モデル | ユースケース |
 | --- | --- |
 | `mimo-v2.5-pro` | フラッグシップ。ネイティブ全モーダル + Agent 能力、最大 100 万トークンのコンテキスト |
 | `mimo-v2.5` | 汎用版。ネイティブ全モーダル（テキスト / 画像 / 動画 / 音声） |
 ## 思考モード
 MiMo V2.5 シリーズはデフォルトで「思考モード」が有効です。最終回答の前に `reasoning_content`（思考過程）を出力することで、複雑なタスクのパフォーマンスを高めます。
 表示の有無はグローバル設定 `enable_thinking` で切り替え可能です（Web コンソールの設定ページからも変更できます）：
 ```json
 {
  "enable_thinking": true
 }
 ```
 ## 画像理解
 `mimo_api_key` を設定すると、Agent の Vision ツールは自動的に MiMo のビジョンモデルを利用します：
 - メインモデル自体がマルチモーダル（`mimo-v2.5-pro` / `mimo-v2.5`）の場合は、画像はメインモデルが直接処理し、追加設定は不要です。
 - メインモデルが他社製の場合、Vision ツールは順序に従い `mimo-v2.5-pro` にフォールバックします。
 特定の Vision モデルを強制したい場合は、設定ファイルで明示的に指定してください：
 ```json
 {
  "tools": {
    "vision": {
      "provider": "mimo",
      "model": "mimo-v2.5-pro"
    }
  }
 }
 ```
 ## 音声合成
 ```json
 {
  "text_to_voice": "mimo",
  "text_to_voice_model": "mimo-v2.5-tts",
  "tts_voice_id": "冰糖"
 }
 ```
 | パラメータ | 説明 |
 | --- | --- |
 | `text_to_voice_model` | 現在は `mimo-v2.5-tts` のみ対応（プリセット音色 + 歌唱モード） |
 | `tts_voice_id` | プリセット音色名（中国語の音色は中国語名がそのまま ID） |
 ### プリセット音色
 | 音色 ID | 説明 |
 | --- | --- |
 | `冰糖` | 中国語 · 女声（デフォルト） |
 | `茉莉` | 中国語 · 女声 |
 | `苏打` | 中国語 · 男声 |
 | `白桦` | 中国語 · 男声 |
 | `Mia` | 英語 · 女声 |
 | `Chloe` | 英語 · 女声 |
 | `Milo` | 英語 · 男声 |
 | `Dean` | 英語 · 男声 |
 Web コンソールの「モデル管理 → 音声合成」のドロップダウンから視覚的に選択することもできます。
 ### スタイル制御
 MiMo TTS は合成テキスト内に **音声タグ** を埋め込むことで、感情、語調、方言、キャラクター、さらには歌唱まで制御できます。タグは **最終的に音声合成されるテキスト（つまり Agent の返信内容）** に含める必要があり、全体スタイルのタグは先頭に置きます：
 ```
 (スタイル)合成するテキスト
 ```
 半角 `()`、全角 `（）`、`[]` の 3 種類の括弧に対応。スタイル記述は中国語・英語のどちらでも OK で、最も的確に表現できる言語を選んでください。代表的なスタイル例：
 | 種類 | サンプルタグ |
 | --- | --- |
 | 基本感情 | `happy` `sad` `angry` `fear` `surprised` `excited` `aggrieved` `calm` `indifferent` |
 | 複合感情 | `wistful` `relieved` `helpless` `guilty` `at ease` `uneasy` `touched` |
 | 全体トーン | `gentle` `aloof` `lively` `serious` `languid` `playful` `deep` `sharp` `cutting` |
 | 声質 | `magnetic` `mellow` `bright` `ethereal` `childlike` `aged` `sweet` `husky` |
 | キャラクター調 | `squeaky` `mature lady` `young boy` `uncle` `Taiwanese accent` |
 | 方言 | `Northeastern` `Sichuan` `Henan` `Cantonese` |
 | ロールプレイ | `Sun Wukong` `Lin Daiyu` |
 | 歌唱 | `sing` / `singing` |
 例：
 - `(magnetic)夜が深まり、街はまだ呼吸している。`
 - `(gentle)深呼吸して。きっと大丈夫。`
 - `(serious)これがシステム再起動前の最後の警告です。`
 - `(singing)Twinkle, twinkle, little star, how I wonder what you are…`
 テキストの任意の位置に細かい音声タグを挿入して、呼吸、笑い声、間などを制御することもできます。例：
 ```
 (nervous, deep breath) ふぅ……落ち着いて、落ち着いて。(faster pace) 自己紹介は五十回練習したから大丈夫。
 ```
 タグの完全な一覧は [MiMo 音声合成ドキュメント](https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5) を参照してください。
 <Tip>
  CowAgent は TTS 呼び出し時、Agent の返信原文（`(...)` タグを含む）をそのまま MiMo に送信します。ペルソナ / システムプロンプトで「返信の冒頭に `(スタイル)` タグを付けて口調を指定する」よう指示すれば、IM チャネル（WeChat / Feishu / DingTalk / WeCom）の音声返信に感情・方言・歌唱などの効果を付与できます。
 </Tip>
--- a/docs/models/index.mdx
+++ b/docs/models/index.mdx
@@ -22,6 +22,7 @@ CowAgent 支持国内外主流厂商的大语言模型，模型接口实现在
 | [豆包 Doubao](/models/doubao) | doubao-seed-2.0 系列 | ✅ | ✅ | ✅ | | | ✅ |
 | [Kimi](/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | |
 | [百度千帆](/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | |
 | [小米 MiMo](/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | |
 | [LinkAI](/models/linkai) | 多厂商 100+ 模型统一接入 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | [自定义](/models/custom) |本地模型 / 三方代理 | ✅ | | | | | |
--- a/docs/models/mimo.mdx
+++ b/docs/models/mimo.mdx
@@ -0,0 +1,135 @@
 ---
 title: 小米 MiMo
 description: 小米 MiMo 模型配置（文本对话 + 图像理解 + 语音合成）
 ---
 小米 MiMo 是原生全模态大模型，单 `mimo_api_key` 即可同时启用文本对话、图像理解与语音合成。
 <Tip>
  通过 Web 控制台的「模型管理」页面可一站式配置以下全部能力，无需手动改配置文件。
 </Tip>
 ## 文本对话
 ```json
 {
  "model": "mimo-v2.5-pro",
  "mimo_api_key": "YOUR_API_KEY",
  "mimo_api_base": "https://api.xiaomimimo.com/v1"
 }
 ```
 | 参数 | 说明 |
 | --- | --- |
 | `model` | 默认推荐 `mimo-v2.5-pro`，也可使用 `mimo-v2.5` |
 | `mimo_api_key` | 在 [MiMo 开放平台](https://platform.xiaomimimo.com/console/api-keys) 创建 |
 | `mimo_api_base` | 可选，默认为 `https://api.xiaomimimo.com/v1` |
 ### 模型选择
 | 模型 | 适用场景 |
 | --- | --- |
 | `mimo-v2.5-pro` | 旗舰，原生全模态 + Agent 能力，最高 100 万 tokens 上下文 |
 | `mimo-v2.5` | 综合版，原生全模态（文本 / 图像 / 视频 / 音频） |
 ## 思考模式
 MiMo V2.5 系列默认开启「思考模式」：模型在输出最终回答前会先输出 `reasoning_content`（思维链），提升复杂任务表现。
 通过全局配置 `enable_thinking` 控制是否展示（也可在 Web 控制台 - 配置页面切换）：
 ```json
 {
  "enable_thinking": true
 }
 ```
 ## 图像理解
 配置 `mimo_api_key` 后，Agent 的 Vision 工具可以自动使用 MiMo 视觉模型：
 - 当主模型本身是多模态时（`mimo-v2.5-pro` / `mimo-v2.5`），直接由主模型识别图像，无需额外配置
 - 当主模型是其他厂商时，Vision 工具会根据顺序自动 fallback 到 `mimo-v2.5-pro`
 如需手动指定 Vision 模型，可在配置文件中显式配置：
 ```json
 {
  "tools": {
    "vision": {
      "provider": "mimo",
      "model": "mimo-v2.5-pro"
    }
  }
 }
 ```
 ## 语音合成
 ```json
 {
  "text_to_voice": "mimo",
  "text_to_voice_model": "mimo-v2.5-tts",
  "tts_voice_id": "冰糖"
 }
 ```
 | 参数 | 说明 |
 | --- | --- |
 | `text_to_voice_model` | 当前仅支持 `mimo-v2.5-tts`（预置音色 + 唱歌模式） |
 | `tts_voice_id` | 预置音色名（中文音色直接使用中文名作为 ID） |
 ### 预置音色
 | 音色 ID | 说明 |
 | --- | --- |
 | `冰糖` | 中文 · 女声（默认） |
 | `茉莉` | 中文 · 女声 |
 | `苏打` | 中文 · 男声 |
 | `白桦` | 中文 · 男声 |
 | `Mia` | 英文 · 女声 |
 | `Chloe` | 英文 · 女声 |
 | `Milo` | 英文 · 男声 |
 | `Dean` | 英文 · 男声 |
 也可在 Web 控制台的「模型管理 → 语音合成」下拉框中可视化选择。
 ### 风格控制
 MiMo TTS 支持在合成文本中嵌入 **音频标签** 来控制情绪、语调、方言、角色甚至唱歌。标签需出现在 **最终被合成为语音的文本（即 Agent 回复内容）** 中，整体风格标签写在开头：
 ```
 (风格)待合成内容
 ```
 支持半角 `()`、全角 `（）` 或 `[]` 三种括号。常见风格示例：
 | 类型 | 示例标签 |
 | --- | --- |
 | 基础情绪 | `开心` `悲伤` `愤怒` `恐惧` `惊讶` `兴奋` `委屈` `平静` `冷漠` |
 | 复合情绪 | `怅然` `欣慰` `无奈` `愧疚` `释然` `忐忑` `动情` |
 | 整体语调 | `温柔` `高冷` `活泼` `严肃` `慵懒` `俏皮` `深沉` `干练` `凌厉` |
 | 音色定位 | `磁性` `醇厚` `清亮` `空灵` `稚嫩` `苍老` `甜美` `沙哑` |
 | 人设腔调 | `夹子音` `御姐音` `正太音` `大叔音` `台湾腔` |
 | 方言 | `东北话` `四川话` `河南话` `粤语` |
 | 角色扮演 | `孙悟空` `林黛玉` |
 | 唱歌 | `唱歌`（等价于 `sing` / `singing`） |
 示例：
 - (磁性)夜已经深了，城市还在呼吸。
 - (东北话)哎呀妈呀，这天儿也忒冷了吧！
 - (粤语)呢个真係好正啊！
 - (唱歌)原谅我这一生不羁放纵爱自由…
 也可以在文本任意位置插入细粒度音频标签来控制呼吸、笑声、停顿等，例如：
 ```
 （紧张，深呼吸）呼……冷静，冷静。（语速加快）自我介绍我背了五十遍了，应该没问题。
 ```
 完整标签列表参见 [MiMo 语音合成文档](https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5)。
 <Tip>
  CowAgent 在调用 TTS 时会将 Agent 的回复原文（含 `(...)` 标签）直接送入 MiMo 合成。你可以在人设 / 系统提示词里要求模型「在回复开头用 `(风格)` 标签控制语气」，即可让 IM 渠道（微信 / 飞书 / 钉钉 / 企微）的语音回复带上情绪、方言、唱歌等效果。
 </Tip>
--- a/docs/zh/README.md
+++ b/docs/zh/README.md
@@ -104,6 +104,7 @@ CowAgent 支持国内外主流厂商的大语言模型。**文本对话、图像
 | [豆包 Doubao](https://docs.cowagent.ai/models/doubao) | doubao-seed-2.0 系列 | ✅ | ✅ | ✅ | | | ✅ |
 | [Kimi](https://docs.cowagent.ai/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | |
 | [百度ERNIE](https://docs.cowagent.ai/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | |
 | [小米 MiMo](https://docs.cowagent.ai/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | |
 | [LinkAI](https://docs.cowagent.ai/models/linkai) | 一个 Key 接入 100+ 模型 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | [自定义](https://docs.cowagent.ai/models/custom) | 本地模型 / 三方代理 | ✅ | | | | | |
--- a/models/bot_factory.py
+++ b/models/bot_factory.py
@@ -25,6 +25,10 @@ def create_bot(bot_type):
        from models.qianfan.qianfan_bot import QianfanBot
        return QianfanBot()
    elif bot_type == const.MIMO:
        from models.mimo.mimo_bot import MimoBot
        return MimoBot()
    elif bot_type in (const.OPENAI, const.CHATGPT, const.CUSTOM):  # OpenAI-compatible API
        from models.chatgpt.chat_gpt_bot import ChatGPTBot
        return ChatGPTBot()
--- a/models/mimo/__init__.py
+++ b/models/mimo/__init__.py
--- a/models/mimo/mimo_bot.py
+++ b/models/mimo/mimo_bot.py
@@ -0,0 +1,668 @@
 # encoding:utf-8
 """
 小米 MiMo Bot —— OpenAI 兼容协议，使用独立 API key / base 配置。
 支持模型：
 - mimo-v2.5-pro     (旗舰，长上下文，默认开启思考)
 - mimo-v2.5         (多模态：文/图/音/视频，默认开启思考)
 - mimo-v2-pro       (V2 Pro，默认开启思考)
 - mimo-v2-omni      (V2 多模态，默认开启思考)
 - mimo-v2-flash     (V2 极速版，默认关闭思考)
 思考模式说明：
 - 开关参数：``{"thinking": {"type": "enabled" | "disabled"}}``
 - mimo-v2.5-pro / mimo-v2.5 在思考模式下 ``temperature`` 会被强制为 1.0，
  本地直接剥离 ``temperature`` / ``top_p`` 等参数避免歧义。
 - 多轮工具调用过程中，若历史包含 tool_calls，所有后续 assistant 消息必须回传
  ``reasoning_content``，否则 API 返回 400 错误。
 - 文档：https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/passing-back-reasoning_content
 """
 import json
 import time
 from typing import Optional
 import requests
 from bridge.context import ContextType
 from bridge.reply import Reply, ReplyType
 from common import const
 from common.log import logger
 from config import conf, load_config
 from models.bot import Bot
 from models.openai_compatible_bot import OpenAICompatibleBot
 from models.session_manager import SessionManager
 from .mimo_session import MimoSession
 # Endpoint used when the `mimo_api_base` config entry is empty.
 DEFAULT_API_BASE = "https://api.xiaomimimo.com/v1"
 # Model used when the `model` config entry is empty.
 DEFAULT_MODEL = const.MIMO_V2_5_PRO
 # Models that accept multimodal input (image / audio / video).
 MULTIMODAL_MODELS = {const.MIMO_V2_5_PRO, const.MIMO_V2_5, const.MIMO_V2_OMNI}
 class MimoBot(Bot, OpenAICompatibleBot):
    def __init__(self):
        """Set up the session store and the default request arguments.

        The configured ``model`` (or ``DEFAULT_MODEL`` when it is empty) is
        used both for the session manager and for the request arguments.
        """
        super().__init__()
        chat_model = conf().get("model") or DEFAULT_MODEL
        self.sessions = SessionManager(MimoSession, model=chat_model)
        # Default payload fields for simple (non-agent) chat requests.
        self.args = {
            "model": chat_model,
            "temperature": conf().get("temperature", 1.0),
            "top_p": conf().get("top_p", 0.95),
        }
    # ---------- config helpers ----------
    @property
    def api_key(self):
        """Value of the ``mimo_api_key`` config entry, read on every access."""
        configured_key = conf().get("mimo_api_key")
        return configured_key
    @property
    def api_base(self):
        """Base URL of the MiMo API with any trailing slashes removed.

        Uses the ``mimo_api_base`` config entry, or ``DEFAULT_API_BASE`` when
        that entry is empty.
        """
        configured = conf().get("mimo_api_base")
        base = configured if configured else DEFAULT_API_BASE
        return base.rstrip("/")
    def get_api_config(self):
        """Return the API settings used by ``call_with_tools()``.

        This is the OpenAICompatibleBot hook. The returned dict carries the
        API key, base URL, model name and the default sampling parameters.

        Returns:
            dict with keys ``api_key``, ``api_base``, ``model``,
            ``default_temperature`` and ``default_top_p``.
        """
        return {
            "api_key": self.api_key,
            "api_base": self.api_base,
            # Use `or` rather than a .get() default so that a present-but-empty
            # "model" entry also falls back to DEFAULT_MODEL, matching __init__.
            "model": conf().get("model") or DEFAULT_MODEL,
            "default_temperature": conf().get("temperature", 1.0),
            "default_top_p": conf().get("top_p", 0.95),
        }
    @property
    def supports_vision(self) -> bool:
        """Whether the configured main model is one of ``MULTIMODAL_MODELS``.

        When true, the vision tool is allowed to go through the main bot.
        The comparison is done on the lower-cased model name.
        """
        configured = conf().get("model")
        normalized = configured.lower() if configured else ""
        return normalized in MULTIMODAL_MODELS
    @staticmethod
    def _model_supports_thinking(model_name: str) -> bool:
        """全部 mimo 系列模型都支持 thinking 开关。"""
        if not model_name:
            return False
        return model_name.lower().startswith("mimo-")
    @staticmethod
    def _thinking_default_enabled(model_name: str) -> bool:
        """各模型的思考模式默认值。mimo-v2-flash 默认关闭，其他默认开启。"""
        if not model_name:
            return False
        return model_name.lower() != const.MIMO_V2_FLASH
    def _build_headers(self) -> dict:
        """Build the HTTP headers for a JSON request with bearer authentication."""
        bearer = f"Bearer {self.api_key}"
        return {"Content-Type": "application/json", "Authorization": bearer}
    # ---------- simple chat (non-agent mode) ----------
    def reply(self, query, context=None):
        """Answer a text query in simple (non-agent) chat mode.

        Built-in commands (clear session memory, clear all sessions, reload
        config) are handled locally. Any other text is appended to the
        session and sent to the model via ``reply_text``.

        Args:
            query: The user's message text.
            context: Message context; its ``type`` must be ``ContextType.TEXT``
                and it must provide ``"session_id"``.

        Returns:
            A ``Reply`` of type INFO (command handled), TEXT (model answer) or
            ERROR (unsupported context type or failed request).
        """
        # Guard clause: this bot only handles plain-text contexts.
        is_text = context.type == ContextType.TEXT
        if not is_text:
            return Reply(ReplyType.ERROR, "Bot不支持处理{}类型的消息".format(context.type))

        logger.info(f"[MIMO] query={query}")
        sid = context["session_id"]

        # Built-in maintenance commands short-circuit the model call.
        command_reply = None
        memory_reset_cmds = conf().get("clear_memory_commands", ["#清除记忆"])
        if query in memory_reset_cmds:
            self.sessions.clear_session(sid)
            command_reply = Reply(ReplyType.INFO, "记忆已清除")
        elif query == "#清除所有":
            self.sessions.clear_all_session()
            command_reply = Reply(ReplyType.INFO, "所有人记忆已清除")
        elif query == "#更新配置":
            load_config()
            command_reply = Reply(ReplyType.INFO, "配置已更新")
        if command_reply:
            return command_reply

        session = self.sessions.session_query(query, sid)
        logger.debug(f"[MIMO] session query={session.messages}")

        # Pass a copy so the per-request dict never aliases self.args.
        outcome = self.reply_text(session, args=self.args.copy())
        text = outcome["content"]
        completion = outcome["completion_tokens"]
        logger.debug(
            f"[MIMO] new_query={session.messages}, session_id={sid}, "
            f"reply_cont={text}, completion_tokens={completion}"
        )

        if completion > 0:
            # Successful completion: record it in the session history.
            self.sessions.session_reply(text, sid, outcome["total_tokens"])
            return Reply(ReplyType.TEXT, text)
        if completion == 0 and len(text) > 0:
            # reply_text reports failures as zero tokens plus a message.
            return Reply(ReplyType.ERROR, text)
        failure = Reply(ReplyType.ERROR, text)
        logger.debug("[MIMO] reply {} used 0 tokens.".format(outcome))
        return failure
    def reply_text(self, session, args=None, retry_count: int = 0) -> dict:
        """Send the session history to the MiMo chat-completions endpoint.

        Args:
            session: Session object whose ``messages`` are sent as the
                request's ``messages`` field.
            args: Optional request parameters (model, sampling settings).
                Falls back to ``self.args`` when omitted; it is copied, never
                mutated.
            retry_count: Number of attempts already made. At most two retries
                are performed (5xx, 429, or an exception).

        Returns:
            On success a dict with ``total_tokens``, ``completion_tokens`` and
            ``content`` (always a string). On failure a dict with
            ``completion_tokens`` set to 0 and ``content`` holding a
            user-facing error message.
        """
        try:
            headers = self._build_headers()
            body = dict(args) if args else dict(self.args)
            body["messages"] = session.messages
            model_name = str(body.get("model", ""))
            # In thinking mode mimo-v2.5-pro / mimo-v2.5 do not accept a custom
            # temperature/top_p. For simplicity, every model whose thinking
            # mode is on by default is sent without these sampling parameters.
            if self._model_supports_thinking(model_name) and self._thinking_default_enabled(model_name):
                for k in ("temperature", "top_p", "presence_penalty", "frequency_penalty"):
                    body.pop(k, None)
            res = requests.post(
                f"{self.api_base}/chat/completions",
                headers=headers,
                json=body,
                timeout=180,
            )
            if res.status_code == 200:
                response = res.json()
                usage = response["usage"]
                message = response["choices"][0]["message"]
                return {
                    "total_tokens": usage["total_tokens"],
                    "completion_tokens": usage["completion_tokens"],
                    # Normalise a null content to "" so callers (reply() calls
                    # len() on it) always receive a string.
                    "content": message["content"] or "",
                }

            # Non-200: extract the error details defensively. A body that is
            # not JSON, or whose "error" is not an object, must not raise here,
            # otherwise the broad handler below would retry and the specific
            # 401/429 message would be lost.
            try:
                payload = res.json()
                error = payload.get("error") if isinstance(payload, dict) else None
            except ValueError:
                error = None
            if not isinstance(error, dict):
                error = {"message": str(error) if error else res.text[:300]}
            logger.error(
                f"[MIMO] chat failed, status_code={res.status_code}, "
                f"msg={error.get('message')}, type={error.get('type')}"
            )
            result = {"completion_tokens": 0, "content": "提问太快啦，请休息一下再问我吧"}
            need_retry = False
            if res.status_code >= 500:
                need_retry = retry_count < 2
            elif res.status_code == 401:
                result["content"] = "授权失败，请检查API Key是否正确"
            elif res.status_code == 429:
                result["content"] = "请求过于频繁，请稍后再试"
                need_retry = retry_count < 2
            if need_retry:
                time.sleep(3)
                return self.reply_text(session, args, retry_count + 1)
            return result
        except Exception as e:
            logger.exception(e)
            if retry_count < 2:
                # Back off before retrying, with the same delay as the
                # HTTP-error path, instead of retrying immediately.
                time.sleep(3)
                return self.reply_text(session, args, retry_count + 1)
            return {"completion_tokens": 0, "content": "我现在有点累了，等会再来吧"}
    # ==================== Agent mode support ====================
    def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
        """
        Call the MiMo chat API with tool-calling support (used by the agent integration).

        Steps performed:
        - Convert Claude-style messages to OpenAI format (reasoning_content is passed back)
        - Inject, or replace, the system prompt supplied via the ``system`` kwarg
        - Forward ``max_tokens`` as ``max_completion_tokens``
        - Attach converted tools and ``tool_choice`` (default ``"auto"``)
        - Resolve the thinking switch: caller-supplied ``thinking`` wins, otherwise
          the model's default is used
        - Drop sampling parameters when thinking is active; otherwise forward
          ``temperature`` / ``top_p`` when given

        Returns:
            The generator produced by ``_handle_stream_response`` when ``stream``
            is true, otherwise the one from ``_handle_sync_response``. If request
            preparation raises, a generator yielding a single error dict
            (``error``, ``message``, ``status_code`` 500) is returned instead.
        """
        try:
            converted_messages = self._convert_messages_to_openai_format(messages)
            system_prompt = kwargs.pop("system", None)
            if system_prompt:
                if not converted_messages or converted_messages[0].get("role") != "system":
                    converted_messages.insert(0, {"role": "system", "content": system_prompt})
                else:
                    converted_messages[0] = {"role": "system", "content": system_prompt}
            converted_tools = None
            if tools:
                converted_tools = self._convert_tools_to_openai_format(tools)
            model = kwargs.pop("model", None) or self.args["model"]
            max_tokens = kwargs.pop("max_tokens", None)
            request_body = {
                "model": model,
                "messages": converted_messages,
                "stream": stream,
            }
            if max_tokens is not None:
                # MiMo names this max_completion_tokens (visible output + reasoning tokens)
                request_body["max_completion_tokens"] = max_tokens
            if converted_tools:
                request_body["tools"] = converted_tools
                request_body["tool_choice"] = kwargs.pop("tool_choice", "auto")
            # Thinking mode: follow each model's default unless the caller overrides it
            thinking_param = kwargs.pop("thinking", None)
            thinking_active = False
            if self._model_supports_thinking(model):
                if thinking_param is None:
                    default_on = self._thinking_default_enabled(model)
                    thinking_param = {"type": "enabled" if default_on else "disabled"}
                request_body["thinking"] = thinking_param
                thinking_active = thinking_param.get("type") == "enabled"
            # In thinking mode v2.5-pro / v2.5 do not support a custom temperature;
            # strip all sampling parameters rather than have them silently ignored
            if thinking_active:
                for k in ("temperature", "top_p", "presence_penalty", "frequency_penalty"):
                    request_body.pop(k, None)
                    kwargs.pop(k, None)
            else:
                temperature = kwargs.pop("temperature", None)
                if temperature is not None:
                    request_body["temperature"] = temperature
                top_p = kwargs.pop("top_p", None)
                if top_p is not None:
                    request_body["top_p"] = top_p
            logger.debug(
                f"[MIMO] API call: model={model}, "
                f"tools={len(converted_tools) if converted_tools else 0}, "
                f"stream={stream}, thinking={thinking_active}"
            )
            if stream:
                return self._handle_stream_response(request_body)
            else:
                return self._handle_sync_response(request_body)
        except Exception as e:
            logger.error(f"[MIMO] call_with_tools error: {e}")
            import traceback
            logger.error(traceback.format_exc())
            # Python unbinds ``e`` as soon as the except block exits, and the
            # generator body only runs when the caller iterates it. Capture the
            # message now; closing over ``e`` directly raises NameError later.
            error_message = str(e)

            def error_generator():
                yield {"error": True, "message": error_message, "status_code": 500}

            return error_generator()
    # -------------------- streaming --------------------
    def _handle_stream_response(self, request_body: dict):
        """Stream a chat completion and re-emit it as OpenAI-style delta chunks.

        POSTs ``request_body`` to ``{api_base}/chat/completions`` with
        ``stream=True`` and parses the SSE ``data:`` lines.

        Yields:
            - one chunk per non-empty ``reasoning_content`` delta,
            - one chunk per non-empty ``content`` delta,
            - one chunk per tool-call fragment (forwarded unchanged),
            - a final chunk with an empty delta and the last ``finish_reason`` seen.
            On a non-200 status, an in-stream ``error`` payload, a timeout or any
            other exception, a single ``{"error": True, "message", "status_code"}``
            dict is yielded and the stream ends.
        """
        response = None
        try:
            headers = self._build_headers()
            url = f"{self.api_base}/chat/completions"
            response = requests.post(url, headers=headers, json=request_body, stream=True, timeout=180)
            if response.status_code != 200:
                error_msg = response.text
                logger.error(f"[MIMO] API error: status={response.status_code}, msg={error_msg}")
                yield {"error": True, "message": error_msg, "status_code": response.status_code}
                return
            finish_reason = None
            for line in response.iter_lines():
                if not line:
                    continue
                line = line.decode("utf-8")
                # Accept both "data: {...}" and "data:{...}" framing.
                if line.startswith("data: "):
                    data_str = line[6:]
                elif line.startswith("data:"):
                    data_str = line[5:]
                else:
                    continue
                if data_str.strip() == "[DONE]":
                    break
                try:
                    chunk = json.loads(data_str)
                except json.JSONDecodeError as e:
                    logger.warning(f"[MIMO] JSON decode error: {e}, data: {data_str[:200]}")
                    continue
                if chunk.get("error"):
                    error_data = chunk["error"]
                    error_msg = error_data.get("message", "Unknown error") if isinstance(error_data, dict) else str(error_data)
                    logger.error(f"[MIMO] stream error: {error_msg}")
                    yield {"error": True, "message": error_msg, "status_code": 500}
                    return
                if not chunk.get("choices"):
                    continue
                choice = chunk["choices"][0]
                # A chunk may carry ``"delta": null``; treat it as empty.
                delta = choice.get("delta") or {}
                if choice.get("finish_reason"):
                    finish_reason = choice["finish_reason"]
                # Reasoning content (thinking mode): forwarded as its own delta
                if delta.get("reasoning_content"):
                    yield {
                        "choices": [{
                            "index": 0,
                            "delta": {
                                "role": "assistant",
                                "reasoning_content": delta["reasoning_content"],
                            },
                            "finish_reason": None,
                        }]
                    }
                if delta.get("content"):
                    yield {
                        "choices": [{
                            "index": 0,
                            "delta": {
                                "role": "assistant",
                                "content": delta["content"],
                            },
                        }]
                    }
                # Tool-call fragments are passed through untouched. The previous
                # local accumulator was never read and could raise on fragments
                # such as ``"arguments": null``, so it has been removed.
                for tool_call_chunk in delta.get("tool_calls") or []:
                    yield {
                        "choices": [{
                            "index": 0,
                            "delta": {"tool_calls": [tool_call_chunk]},
                        }]
                    }
            yield {
                "choices": [{
                    "index": 0,
                    "delta": {},
                    "finish_reason": finish_reason,
                }]
            }
        except requests.exceptions.Timeout:
            logger.error("[MIMO] Request timeout")
            yield {"error": True, "message": "Request timeout", "status_code": 500}
        except Exception as e:
            logger.error(f"[MIMO] stream response error: {e}")
            import traceback
            logger.error(traceback.format_exc())
            yield {"error": True, "message": str(e), "status_code": 500}
        finally:
            # With stream=True the connection is held until the body is fully
            # consumed or the response is closed; close it explicitly so an early
            # exit ([DONE], error, abandoned generator) does not leak it.
            if response is not None:
                response.close()
    # -------------------- sync --------------------
    def _handle_sync_response(self, request_body: dict):
        """Run a non-streaming completion and yield one Claude-style message dict.

        The ``stream`` key is removed from ``request_body`` before the request is
        sent. The yielded dict has ``role``, a ``content`` list of ``thinking`` /
        ``text`` / ``tool_use`` blocks (in that order) and a ``stop_reason``.
        Failures yield a single ``{"error": True, "message", "status_code"}`` dict.
        """
        try:
            auth_headers = self._build_headers()
            request_body.pop("stream", None)
            endpoint = f"{self.api_base}/chat/completions"
            response = requests.post(endpoint, headers=auth_headers, json=request_body, timeout=180)
            if response.status_code != 200:
                failure_text = response.text
                logger.error(f"[MIMO] API error: status={response.status_code}, msg={failure_text}")
                yield {"error": True, "message": failure_text, "status_code": response.status_code}
                return

            first_choice = response.json()["choices"][0]
            message = first_choice["message"]
            finish_reason = first_choice["finish_reason"]

            blocks = []
            # Reasoning is wrapped as a thinking block so the agent layer can
            # persist it and send it back on later tool-call turns.
            reasoning = message.get("reasoning_content")
            if reasoning:
                blocks.append({"type": "thinking", "thinking": reasoning})
            answer = message.get("content")
            if answer:
                blocks.append({"type": "text", "text": answer})
            for call in message.get("tool_calls") or []:
                try:
                    parsed_args = json.loads(call["function"]["arguments"])
                except (json.JSONDecodeError, TypeError):
                    # Unparseable or missing arguments fall back to an empty input.
                    parsed_args = {}
                blocks.append({
                    "type": "tool_use",
                    "id": call["id"],
                    "name": call["function"]["name"],
                    "input": parsed_args,
                })

            reply = {"role": "assistant", "content": blocks}
            # Map OpenAI finish reasons onto Claude stop reasons; anything else
            # is passed through unchanged.
            if finish_reason == "tool_calls":
                reply["stop_reason"] = "tool_use"
            else:
                reply["stop_reason"] = "end_turn" if finish_reason == "stop" else finish_reason
            yield reply
        except requests.exceptions.Timeout:
            logger.error("[MIMO] Request timeout")
            yield {"error": True, "message": "Request timeout", "status_code": 500}
        except Exception as e:
            logger.error(f"[MIMO] sync response error: {e}")
            import traceback
            logger.error(traceback.format_exc())
            yield {"error": True, "message": str(e), "status_code": 500}
    # -------------------- format conversion --------------------
    def _convert_messages_to_openai_format(self, messages):
        """
        将 Claude 格式（content blocks）转为 OpenAI 格式。
        关键约束：MiMo 思考模式下，一旦历史包含 tool_calls 的 assistant 轮次，
        所有后续 assistant 消息（含工具调用轮）必须回传 reasoning_content，
        否则 API 返回 400。本地无 trace 时用空字符串回填，MiMo 接受字段存在
        即可。
        """
        if not messages:
            return []
        has_tool_call_history = False
        for msg in messages:
            if msg.get("role") != "assistant":
                continue
            if msg.get("tool_calls"):
                has_tool_call_history = True
                break
            content = msg.get("content")
            if isinstance(content, list) and any(
                isinstance(b, dict) and b.get("type") == "tool_use" for b in content
            ):
                has_tool_call_history = True
                break
        converted = []
        for msg in messages:
            role = msg.get("role")
            content = msg.get("content")
            if not isinstance(content, list):
                if (
                    role == "assistant"
                    and isinstance(msg, dict)
                    and has_tool_call_history
                    and "reasoning_content" not in msg
                ):
                    patched = dict(msg)
                    patched["reasoning_content"] = ""
                    converted.append(patched)
                else:
                    converted.append(msg)
                continue
            if role == "user":
                has_tool_result = any(
                    isinstance(b, dict) and b.get("type") == "tool_result" for b in content
                )
                if has_tool_result:
                    text_parts = []
                    tool_results = []
                    for block in content:
                        if not isinstance(block, dict):
                            continue
                        if block.get("type") == "text":
                            text_parts.append(block.get("text", ""))
                        elif block.get("type") == "tool_result":
                            tool_call_id = block.get("tool_use_id") or ""
                            result_content = block.get("content", "")
                            if not isinstance(result_content, str):
                                result_content = json.dumps(result_content, ensure_ascii=False)
                            tool_results.append({
                                "role": "tool",
                                "tool_call_id": tool_call_id,
                                "content": result_content,
                            })
                    converted.extend(tool_results)
                    if text_parts:
                        converted.append({"role": "user", "content": "\n".join(text_parts)})
                else:
                    # 多模态原样保留（image_url / input_audio / video_url 等 block）
                    converted.append(msg)
            elif role == "assistant":
                openai_msg = {"role": "assistant"}
                text_parts = []
                tool_calls = []
                reasoning_parts = []
                for block in content:
                    if not isinstance(block, dict):
                        continue
                    btype = block.get("type")
                    if btype == "text":
                        text_parts.append(block.get("text", ""))
                    elif btype == "tool_use":
                        tool_calls.append({
                            "id": block.get("id"),
                            "type": "function",
                            "function": {
                                "name": block.get("name"),
                                "arguments": json.dumps(block.get("input", {})),
                            },
                        })
                    elif btype == "thinking":
                        reasoning_parts.append(block.get("thinking", ""))
                if text_parts:
                    openai_msg["content"] = "\n".join(text_parts)
                elif not tool_calls:
                    openai_msg["content"] = ""
                if tool_calls:
                    openai_msg["tool_calls"] = tool_calls
                    if not text_parts:
                        openai_msg["content"] = None
                if reasoning_parts:
                    openai_msg["reasoning_content"] = "\n".join(reasoning_parts)
                elif has_tool_call_history:
                    openai_msg["reasoning_content"] = ""
                converted.append(openai_msg)
            else:
                converted.append(msg)
        return converted
    def _convert_tools_to_openai_format(self, tools):
        """工具定义 Claude 格式 → OpenAI 格式。"""
        if not tools:
            return None
        converted = []
        for tool in tools:
            if "type" in tool and tool["type"] == "function":
                converted.append(tool)
            else:
                converted.append({
                    "type": "function",
                    "function": {
                        "name": tool.get("name"),
                        "description": tool.get("description"),
                        "parameters": tool.get("input_schema", {}),
                    },
                })
        return converted
    # -------------------- vision --------------------
    def call_vision(self, image_url: str, question: str,
                    model: Optional[str] = None,
                    max_tokens: int = 1000) -> dict:
        """Ask a question about an image via MiMo's OpenAI-compatible /chat/completions.

        Args:
            image_url: Value placed in the ``image_url.url`` field of the request.
            question: Text prompt sent alongside the image.
            model: Model to use. When falsy, the configured model is used if it is
                listed in ``MULTIMODAL_MODELS``, otherwise ``const.MIMO_V2_5_PRO``.
            max_tokens: Sent as ``max_completion_tokens``.

        Returns:
            On success a dict with ``model``, ``content`` and ``usage``
            (prompt/completion/total token counts, defaulting to 0).
            On failure ``{"error": True, "message": ...}``.
        """
        try:
            # If the main model lacks vision support (e.g. mimo-v2-flash),
            # switch to mimo-v2.5-pro automatically
            vision_model = model
            if not vision_model:
                cur = self.args.get("model") or DEFAULT_MODEL
                vision_model = cur if cur in MULTIMODAL_MODELS else const.MIMO_V2_5_PRO
            payload = {
                "model": vision_model,
                "max_completion_tokens": max_tokens,
                "messages": [{
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {"type": "image_url", "image_url": {"url": image_url}},
                    ],
                }],
            }
            headers = self._build_headers()
            resp = requests.post(
                f"{self.api_base}/chat/completions",
                headers=headers, json=payload, timeout=60,
            )
            if resp.status_code != 200:
                return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}
            data = resp.json()
            # Only a non-empty "error" value signals failure: a successful reply
            # may carry ``"error": null``, and the value may be a plain string
            # rather than an object.
            api_error = data.get("error")
            if api_error:
                if isinstance(api_error, dict):
                    detail = api_error.get("message", str(api_error))
                else:
                    detail = str(api_error)
                return {"error": True, "message": detail}
            choice = data.get("choices", [{}])[0].get("message", {})
            # Some models put the answer in reasoning_content instead of content
            # for multimodal requests
            content = choice.get("content") or choice.get("reasoning_content") or ""
            usage = data.get("usage", {})
            return {
                "model": vision_model,
                "content": content,
                "usage": {
                    "prompt_tokens": usage.get("prompt_tokens", 0),
                    "completion_tokens": usage.get("completion_tokens", 0),
                    "total_tokens": usage.get("total_tokens", 0),
                },
            }
        except Exception as e:
            logger.error(f"[MIMO] call_vision error: {e}")
            return {"error": True, "message": str(e)}
--- a/models/mimo/mimo_session.py
+++ b/models/mimo/mimo_session.py
@@ -0,0 +1,57 @@
 from common.log import logger
 from models.session_manager import Session
 class MimoSession(Session):
    """Chat session for MiMo models that trims history to a token budget."""

    def __init__(self, session_id, system_prompt=None, model="mimo-v2.5-pro"):
        """Create the session, remember the model name and reset the history."""
        super().__init__(session_id, system_prompt)
        self.model = model
        self.reset()

    def discard_exceeding(self, max_tokens, cur_tokens=None):
        """Drop old messages until the history fits within ``max_tokens``.

        Args:
            max_tokens: Upper bound for the session's token count.
            cur_tokens: Fallback count, used only when ``calc_tokens`` raises.

        Returns:
            The token count after trimming.

        Raises:
            Exception: Re-raised from ``calc_tokens`` when it fails and no
                ``cur_tokens`` fallback was supplied.
        """
        precise = True
        try:
            cur_tokens = self.calc_tokens()
        except Exception as e:
            precise = False
            if cur_tokens is None:
                raise e
            logger.debug("Exception when counting tokens precisely for query: {}".format(e))
        while cur_tokens > max_tokens:
            if len(self.messages) > 2:
                # Remove the oldest entry after index 0 (index 0 is presumably
                # the system prompt - confirm against Session.reset).
                self.messages.pop(1)
            elif len(self.messages) == 2 and self.messages[1]["role"] == "assistant":
                self.messages.pop(1)
                if precise:
                    cur_tokens = self.calc_tokens()
                else:
                    cur_tokens = cur_tokens - max_tokens
                break
            elif len(self.messages) == 2 and self.messages[1]["role"] == "user":
                # A lone user message is kept even though it is over budget.
                # Logger.warn is a deprecated alias; use warning() like the rest
                # of the MiMo code.
                logger.warning("user message exceed max_tokens. total_tokens={}".format(cur_tokens))
                break
            else:
                logger.debug("max_tokens={}, total_tokens={}, len(messages)={}".format(
                    max_tokens, cur_tokens, len(self.messages)))
                break
            if precise:
                cur_tokens = self.calc_tokens()
            else:
                # No exact count available: step the estimate down by the budget.
                cur_tokens = cur_tokens - max_tokens
        return cur_tokens

    def calc_tokens(self):
        """Return the token estimate for the current message history."""
        return num_tokens_from_messages(self.messages, self.model)
 def num_tokens_from_messages(messages, model):
    """Estimate token usage as the total character count of message text.

    String contents count by their length; list contents count the ``text``
    field of each dict block. Other content values contribute nothing.
    ``model`` is accepted but not used in the estimate.
    """
    def _text_length(content):
        if isinstance(content, str):
            return len(content)
        if isinstance(content, list):
            return sum(len(part.get("text", "")) for part in content if isinstance(part, dict))
        return 0

    return sum(_text_length(entry.get("content", "")) for entry in messages)
--- a/voice/factory.py
+++ b/voice/factory.py
@@ -66,4 +66,8 @@ def create_voice(voice_type):
        from voice.zhipuai.zhipuai_voice import ZhipuAIVoice
        return ZhipuAIVoice()
    elif voice_type == "mimo":
        from voice.mimo.mimo_voice import MimoVoice
        return MimoVoice()
    raise RuntimeError
--- a/voice/mimo/__init__.py
+++ b/voice/mimo/__init__.py
--- a/voice/mimo/mimo_voice.py
+++ b/voice/mimo/mimo_voice.py
@@ -0,0 +1,109 @@
 # encoding:utf-8
 """
 Xiaomi MiMo TTS - speech synthesis based on the mimo-v2.5-tts model.
 Implemented through the /chat/completions endpoint: the assistant message
 content is the text to synthesise, the audio field selects a preset voice
 (e.g. 冰糖 / 茉莉 / 苏打 / Mia / Chloe), and the response carries the audio
 bytes base64-encoded.
 Docs: https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5
 Note: MiMo offers no ASR endpoint, so voiceToText is not implemented.
 """
 import base64
 import datetime
 import os
 import random
 import requests
 from bridge.reply import Reply, ReplyType
 from common.log import logger
 from config import conf
 from voice.voice import Voice
 DEFAULT_API_BASE = "https://api.xiaomimimo.com/v1"
 DEFAULT_TTS_MODEL = "mimo-v2.5-tts"
 DEFAULT_TTS_VOICE = "冰糖"  # Default voice: the de-facto default on the China cluster
 REQUEST_TIMEOUT = (5, 120)  # Passed as the requests timeout: (connect, read) seconds
 class MimoVoice(Voice):
    """Voice provider that synthesises speech through MiMo's /chat/completions."""

    def __init__(self):
        """Nothing to set up; configuration is read on each call."""

    def voiceToText(self, voice_file: str):
        """Always answer with an error reply: speech recognition is unavailable."""
        # MiMo has no standalone ASR endpoint; use another provider instead
        # (e.g. openai / zhipu / dashscope)
        logger.warning("[MimoVoice] voiceToText is not supported by MiMo API")
        return Reply(ReplyType.ERROR, "MiMo 暂不支持语音识别，请配置其他 voice_to_text provider")

    def textToVoice(self, text: str):
        """Synthesise ``text`` to a WAV file under ``tmp/``.

        Returns a ``ReplyType.VOICE`` reply holding the file path on success,
        or a ``ReplyType.ERROR`` reply when the API key is missing, the request
        fails, the response has no audio, or anything raises.
        """
        def synthesis_failed():
            return Reply(ReplyType.ERROR, "语音合成失败，请稍后再试")

        try:
            api_key = conf().get("mimo_api_key", "")
            if not api_key:
                logger.error("[MimoVoice] mimo_api_key is not configured")
                return Reply(ReplyType.ERROR, "未配置 MiMo API key")

            base_url = (conf().get("mimo_api_base") or DEFAULT_API_BASE).rstrip("/")
            tts_model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
            speaker = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE

            # The text to synthesise must be the assistant message; a user
            # message could optionally carry style instructions.
            response = requests.post(
                f"{base_url}/chat/completions",
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": tts_model,
                    "messages": [{"role": "assistant", "content": text}],
                    "audio": {"format": "wav", "voice": speaker},
                },
                timeout=REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                logger.error(
                    f"[MimoVoice] textToVoice failed: status={response.status_code} "
                    f"body={response.text[:500]} model={tts_model} voice={speaker}"
                )
                return synthesis_failed()

            data = response.json()
            if "error" in data:
                api_error = data["error"]
                if isinstance(api_error, dict):
                    detail = api_error.get("message", str(api_error))
                else:
                    detail = str(api_error)
                logger.error(f"[MimoVoice] textToVoice api error: {detail}")
                return synthesis_failed()

            # Walk choices[0].message.audio.data, tolerating missing/null levels.
            first_choice = (data.get("choices") or [{}])[0]
            message = first_choice.get("message", {}) or {}
            encoded_audio = (message.get("audio") or {}).get("data")
            if not encoded_audio:
                logger.error(f"[MimoVoice] textToVoice empty audio in response: {data}")
                return synthesis_failed()

            try:
                wav_bytes = base64.b64decode(encoded_audio)
            except Exception as e:
                logger.error(f"[MimoVoice] base64 decode failed: {e}")
                return synthesis_failed()

            # File name: second-resolution timestamp plus a random suffix.
            timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            out_path = f"tmp/{timestamp}{random.randint(0, 1000)}.wav"
            os.makedirs(os.path.dirname(out_path), exist_ok=True)
            with open(out_path, "wb") as wav_file:
                wav_file.write(wav_bytes)
            logger.info(
                f"[MimoVoice] textToVoice model={tts_model} voice={speaker} "
                f"file={out_path} bytes={len(wav_bytes)}"
            )
            return Reply(ReplyType.VOICE, out_path)
        except Exception as e:
            logger.exception(f"[MimoVoice] textToVoice exception: {e}")
            return synthesis_failed()