From bccce2d7cbde47f221eac886855687c03c8819fc Mon Sep 17 00:00:00 2001 From: zhayujie Date: Thu, 28 May 2026 10:49:52 +0800 Subject: [PATCH] feat(models): support xiaomi mimo --- README.md | 1 + agent/tools/vision/vision.py | 3 + bridge/bridge.py | 4 + channel/web/web_channel.py | 34 +- common/const.py | 11 + config.py | 5 + docs/README.md | 30 ++ docs/docs.json | 3 + docs/en/models/index.mdx | 1 + docs/en/models/mimo.mdx | 136 +++++++ docs/ja/README.md | 1 + docs/ja/models/mimo.mdx | 135 +++++++ docs/models/index.mdx | 1 + docs/models/mimo.mdx | 135 +++++++ docs/zh/README.md | 1 + models/bot_factory.py | 4 + models/mimo/__init__.py | 0 models/mimo/mimo_bot.py | 668 +++++++++++++++++++++++++++++++++++ models/mimo/mimo_session.py | 57 +++ voice/factory.py | 4 + voice/mimo/__init__.py | 0 voice/mimo/mimo_voice.py | 109 ++++++ 22 files changed, 1340 insertions(+), 3 deletions(-) create mode 100644 docs/README.md create mode 100644 docs/en/models/mimo.mdx create mode 100644 docs/ja/models/mimo.mdx create mode 100644 docs/models/mimo.mdx create mode 100644 models/mimo/__init__.py create mode 100644 models/mimo/mimo_bot.py create mode 100644 models/mimo/mimo_session.py create mode 100644 voice/mimo/__init__.py create mode 100644 voice/mimo/mimo_voice.py diff --git a/README.md b/README.md index 2a72f513..8b9e044e 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,7 @@ CowAgent supports all mainstream LLM providers. 
**Chat, vision, image generation | [Kimi](https://docs.cowagent.ai/en/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | | | [MiniMax](https://docs.cowagent.ai/en/models/minimax) | MiniMax-M2.7 | ✅ | ✅ | ✅ | | ✅ | | | [ERNIE](https://docs.cowagent.ai/en/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | | +| [MiMo](https://docs.cowagent.ai/en/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | | | [LinkAI](https://docs.cowagent.ai/en/models/linkai) | One key for 100+ models | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [Custom](https://docs.cowagent.ai/en/models/custom) | Local models / third-party proxy | ✅ | | | | | | diff --git a/agent/tools/vision/vision.py b/agent/tools/vision/vision.py index 56a2ecfe..498f3cd8 100644 --- a/agent/tools/vision/vision.py +++ b/agent/tools/vision/vision.py @@ -57,6 +57,7 @@ _DISCOVERABLE_MODELS = [ ("qianfan_api_key", const.QIANFAN, const.ERNIE_45_TURBO_VL, "Qianfan"), ("zhipu_ai_api_key", const.ZHIPU_AI, const.GLM_4_7, "ZhipuAI"), ("minimax_api_key", const.MiniMax, const.MINIMAX_M2_7, "MiniMax"), + ("mimo_api_key", const.MIMO, const.MIMO_V2_5_PRO, "MiMo"), ] # Model name prefix → discoverable provider display_name. @@ -73,6 +74,7 @@ _MODEL_PREFIX_TO_PROVIDER = [ ("glm-", "ZhipuAI"), ("minimax-", "MiniMax"), ("abab", "MiniMax"), + ("mimo-", "MiMo"), ] # Model prefixes that natively belong to OpenAI / LinkAI (raw HTTP providers). 
@@ -92,6 +94,7 @@ _PROVIDER_ID_TO_DISPLAY = { "qianfan": "Qianfan", "zhipu": "ZhipuAI", "minimax": "MiniMax", + "mimo": "MiMo", } diff --git a/bridge/bridge.py b/bridge/bridge.py index c0cb62e4..6eeb0887 100644 --- a/bridge/bridge.py +++ b/bridge/bridge.py @@ -63,6 +63,10 @@ class Bridge(object): if model_type and model_type.startswith("deepseek"): self.btype["chat"] = const.DEEPSEEK + # 小米 MiMo 系列模型,全部以 mimo- 开头 + if model_type and model_type.startswith("mimo-"): + self.btype["chat"] = const.MIMO + if model_type and isinstance(model_type, str): lowered_model_type = model_type.lower() if lowered_model_type == const.QIANFAN or lowered_model_type.startswith("ernie"): diff --git a/channel/web/web_channel.py b/channel/web/web_channel.py index ab1d6915..af4e241e 100644 --- a/channel/web/web_channel.py +++ b/channel/web/web_channel.py @@ -1387,6 +1387,7 @@ class ConfigHandler: const.DOUBAO_SEED_2_PRO, const.DOUBAO_SEED_2_CODE, const.KIMI_K2_6, const.KIMI_K2_5, const.KIMI_K2, const.ERNIE_5_1, const.ERNIE_5, const.ERNIE_X1_1, const.ERNIE_45_TURBO_128K, const.ERNIE_45_TURBO_32K, + const.MIMO_V2_5_PRO, const.MIMO_V2_5, ] # Generic placeholder hints surfaced in the web console. 
We deliberately @@ -1481,6 +1482,14 @@ class ConfigHandler: "api_base_placeholder": _PLACEHOLDER_QIANFAN, "models": [const.ERNIE_5_1, const.ERNIE_5, const.ERNIE_X1_1, const.ERNIE_45_TURBO_128K, const.ERNIE_45_TURBO_32K], }), + ("mimo", { + "label": {"zh": "小米 MiMo", "en": "MiMo"}, + "api_key_field": "mimo_api_key", + "api_base_key": "mimo_api_base", + "api_base_default": "https://api.xiaomimimo.com/v1", + "api_base_placeholder": _PLACEHOLDER_V1, + "models": [const.MIMO_V2_5_PRO, const.MIMO_V2_5], + }), ("linkai", { "label": "LinkAI", "api_key_field": "linkai_api_key", @@ -1502,10 +1511,10 @@ class ConfigHandler: EDITABLE_KEYS = { "model", "bot_type", "use_linkai", "open_ai_api_base", "deepseek_api_base", "qianfan_api_base", "claude_api_base", "gemini_api_base", - "zhipu_ai_api_base", "moonshot_base_url", "ark_base_url", "custom_api_base", + "zhipu_ai_api_base", "moonshot_base_url", "ark_base_url", "custom_api_base", "mimo_api_base", "open_ai_api_key", "deepseek_api_key", "qianfan_api_key", "claude_api_key", "gemini_api_key", "zhipu_ai_api_key", "dashscope_api_key", "moonshot_api_key", - "ark_api_key", "minimax_api_key", "linkai_api_key", "custom_api_key", + "ark_api_key", "minimax_api_key", "linkai_api_key", "custom_api_key", "mimo_api_key", "agent_max_context_tokens", "agent_max_context_turns", "agent_max_steps", "enable_thinking", "web_password", } @@ -1646,7 +1655,7 @@ class ModelsHandler: # Capability -> provider ids drawn from ConfigHandler.PROVIDER_MODELS. _ASR_PROVIDERS = ["openai", "dashscope", "zhipu", "linkai"] # Web-console white-list. Other vendors stay usable via direct config. - _TTS_PROVIDERS = ["openai", "minimax", "dashscope", "linkai"] + _TTS_PROVIDERS = ["openai", "minimax", "dashscope", "mimo", "linkai"] # TTS engine catalog (speech models, not voice timbres). Entries are # either a bare code or {value, hint?} when a friendly label helps. 
@@ -1661,6 +1670,10 @@ class ModelsHandler: "dashscope": [ {"value": "qwen3-tts-flash", "hint": "覆盖普通话、方言与主流外语"}, ], + # 小米 MiMo TTS 系列,通过 chat completions 接口合成 + "mimo": [ + {"value": "mimo-v2.5-tts", "hint": "预置音色 · 支持唱歌模式"}, + ], # Aggregating gateway: a single endpoint multiplexes several # underlying TTS engines, selected via the `model` field. # Each engine exposes its own voice catalog (see _TTS_PROVIDER_VOICES). @@ -1780,6 +1793,18 @@ class ModelsHandler: {"value": "Marcus", "hint": "陕西话 · 秦川"}, {"value": "Roy", "hint": "闽南语 · 阿杰"}, ], + # 小米 MiMo 预置音色列表(mimo-v2.5-tts),文档: + # https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5 + "mimo": [ + {"value": "冰糖", "hint": "中文 · 女声 · 冰糖"}, + {"value": "茉莉", "hint": "中文 · 女声 · 茉莉"}, + {"value": "苏打", "hint": "中文 · 男声 · 苏打"}, + {"value": "白桦", "hint": "中文 · 男声 · 白桦"}, + {"value": "Mia", "hint": "英文 · 女声 · Mia"}, + {"value": "Chloe", "hint": "英文 · 女声 · Chloe"}, + {"value": "Milo", "hint": "英文 · 男声 · Milo"}, + {"value": "Dean", "hint": "英文 · 男声 · Dean"}, + ], # Aggregating gateway: voices are scoped per engine model. The # frontend picks the correct list based on the selected model so # users don't see incompatible timbres for the active engine. @@ -1916,6 +1941,8 @@ class ModelsHandler: # (see models/minimax/minimax_bot.py::call_vision); the M2.x chat # family is text-only. "minimax": [const.MINIMAX_TEXT_01], + # MiMo 原生全模态模型:v2.5-pro / v2.5 支持图像/音频/视频输入 + "mimo": [const.MIMO_V2_5_PRO, const.MIMO_V2_5], # LinkAI proxies the underlying vendor; surface a curated set of # multimodal models. Order: gpt-4.1-mini → gpt-5.4-mini as the # cross-vendor baselines, then each vendor's recommended default. 
@@ -2045,6 +2072,7 @@ class ModelsHandler: ("qianfan", "qianfan_api_key", const.ERNIE_45_TURBO_VL), ("zhipu", "zhipu_ai_api_key", const.GLM_5V_TURBO), ("minimax", "minimax_api_key", const.MINIMAX_TEXT_01), + ("mimo", "mimo_api_key", const.MIMO_V2_5_PRO), ] @classmethod diff --git a/common/const.py b/common/const.py index 9cfcd63c..7addd6af 100644 --- a/common/const.py +++ b/common/const.py @@ -15,6 +15,7 @@ ZHIPU_AI = "zhipu" MOONSHOT = "moonshot" MiniMax = "minimax" DEEPSEEK = "deepseek" +MIMO = "mimo" # 小米 MiMo 大模型 CUSTOM = "custom" # custom OpenAI-compatible API, bot_type won't auto-switch on model change MODELSCOPE = "modelscope" @@ -140,6 +141,13 @@ KIMI_K2 = "kimi-k2" KIMI_K2_5 = "kimi-k2.5" KIMI_K2_6 = "kimi-k2.6" # Kimi K2.6 - Agent recommended model (default) +# 小米 MiMo +MIMO_V2_5_PRO = "mimo-v2.5-pro" # MiMo V2.5 Pro - 旗舰,长上下文(默认推荐) +MIMO_V2_5 = "mimo-v2.5" # MiMo V2.5 - 多模态(文/图/音/视频) +MIMO_V2_PRO = "mimo-v2-pro" # MiMo V2 Pro +MIMO_V2_OMNI = "mimo-v2-omni" # MiMo V2 Omni - 多模态 +MIMO_V2_FLASH = "mimo-v2-flash" # MiMo V2 Flash - 极速版 + # Doubao (Volcengine Ark) DOUBAO = "doubao" DOUBAO_SEED_2_CODE = "doubao-seed-2-0-code-preview-260215" @@ -182,6 +190,9 @@ MODEL_LIST = [ # MiniMax MiniMax, MINIMAX_M2_7, MINIMAX_M2_7_HIGHSPEED, MINIMAX_M2_5, MINIMAX_M2_1, MINIMAX_M2_1_LIGHTNING, MINIMAX_M2, MINIMAX_ABAB6_5, + # 小米 MiMo + MIMO, MIMO_V2_5_PRO, MIMO_V2_5, MIMO_V2_PRO, MIMO_V2_OMNI, MIMO_V2_FLASH, + # Claude CLAUDE3, CLAUDE_4_6_SONNET, CLAUDE_4_7_OPUS, CLAUDE_4_6_OPUS, CLAUDE_4_OPUS, CLAUDE_4_5_SONNET, CLAUDE_4_SONNET, CLAUDE_3_OPUS, CLAUDE_3_OPUS_0229, CLAUDE_35_SONNET, CLAUDE_35_SONNET_1022, CLAUDE_35_SONNET_0620, CLAUDE_3_SONNET, CLAUDE_3_HAIKU, diff --git a/config.py b/config.py index 6a3a00df..1d44dcc5 100644 --- a/config.py +++ b/config.py @@ -209,6 +209,9 @@ available_setting = { "Minimax_base_url": "", "deepseek_api_key": "", "deepseek_api_base": "https://api.deepseek.com/v1", + # 小米 MiMo 大模型 + "mimo_api_key": "", + "mimo_api_base": 
"https://api.xiaomimimo.com/v1", "web_host": "", # Web console bind address; empty means auto "web_port": 9899, "web_password": "", # Web console password; empty means no authentication required @@ -401,6 +404,8 @@ def load_config(): "minimax_api_base": "MINIMAX_API_BASE", "deepseek_api_key": "DEEPSEEK_API_KEY", "deepseek_api_base": "DEEPSEEK_API_BASE", + "mimo_api_key": "MIMO_API_KEY", + "mimo_api_base": "MIMO_API_BASE", "qianfan_api_key": "QIANFAN_API_KEY", "qianfan_api_base": "QIANFAN_API_BASE", "zhipu_ai_api_key": "ZHIPU_AI_API_KEY", diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..f406cc2a --- /dev/null +++ b/docs/README.md @@ -0,0 +1,30 @@ +# Documentation + +This directory contains the Mintlify documentation site for the project. + +## Prerequisites + +- Node.js v20.17.0 or higher (LTS recommended) + +## Install the CLI (one-time, global) + +```bash +npm i -g mint +``` + +## Run the docs locally + +From this `docs/` directory: + +```bash +mint dev +``` + +Then open http://localhost:3000 (or the port Mint reports if 3000 is in use). + +> The first run downloads the Mint preview framework (~90 MB) into `~/.mintlify/`. +> Subsequent runs start instantly from the local cache. 
+ +## More + +- Mintlify docs: https://www.mintlify.com/docs diff --git a/docs/docs.json b/docs/docs.json index 00a5be67..e2826887 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -88,6 +88,7 @@ "models/doubao", "models/kimi", "models/qianfan", + "models/mimo", "models/linkai", "models/coding-plan", "models/custom" @@ -290,6 +291,7 @@ "en/models/doubao", "en/models/kimi", "en/models/qianfan", + "en/models/mimo", "en/models/linkai", "en/models/coding-plan", "en/models/custom" @@ -492,6 +494,7 @@ "ja/models/doubao", "ja/models/kimi", "ja/models/qianfan", + "ja/models/mimo", "ja/models/linkai", "ja/models/coding-plan", "ja/models/custom" diff --git a/docs/en/models/index.mdx b/docs/en/models/index.mdx index 9c7afe44..cbe74d41 100644 --- a/docs/en/models/index.mdx +++ b/docs/en/models/index.mdx @@ -21,6 +21,7 @@ A snapshot of each vendor's capabilities. "Text" refers to the main chat model; | [Doubao](/en/models/doubao) | doubao-seed-2.0 series | ✅ | ✅ | ✅ | | | ✅ | | [Kimi](/en/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | | | [ERNIE](/en/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | | +| [MiMo](/en/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | | | [LinkAI](/en/models/linkai) | 100+ models from multiple vendors | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [Custom](/en/models/custom) | Local models / third-party proxies | ✅ | | | | | | diff --git a/docs/en/models/mimo.mdx b/docs/en/models/mimo.mdx new file mode 100644 index 00000000..6f808b8e --- /dev/null +++ b/docs/en/models/mimo.mdx @@ -0,0 +1,136 @@ +--- +title: MiMo +description: Xiaomi MiMo model configuration (Text Chat + Image Understanding + Text-to-Speech) +--- + +Xiaomi MiMo is a native omni-modal large model. A single `mimo_api_key` enables text chat, image understanding, and text-to-speech all at once. + + + All capabilities below can be configured in one place via the "Model Management" page in the Web Console — no need to manually edit the configuration file. 
+ + +## Text Chat + +```json +{ + "model": "mimo-v2.5-pro", + "mimo_api_key": "YOUR_API_KEY", + "mimo_api_base": "https://api.xiaomimimo.com/v1" +} +``` + +| Parameter | Description | +| --- | --- | +| `model` | Default recommendation: `mimo-v2.5-pro`; `mimo-v2.5` is also supported | +| `mimo_api_key` | Create one in the [MiMo Open Platform](https://platform.xiaomimimo.com/console/api-keys) | +| `mimo_api_base` | Optional, defaults to `https://api.xiaomimimo.com/v1` | + +### Model Selection + +| Model | Use Case | +| --- | --- | +| `mimo-v2.5-pro` | Flagship: native omni-modal + Agent capability, up to 1M tokens context | +| `mimo-v2.5` | General-purpose, native omni-modal (text / image / video / audio) | + +## Thinking Mode + +The MiMo V2.5 series enables "thinking mode" by default: the model emits `reasoning_content` (chain-of-thought) before the final answer, improving performance on complex tasks. + +Use the global `enable_thinking` flag to toggle visibility (also switchable from the Web Console settings): + +```json +{ + "enable_thinking": true +} +``` + +## Image Understanding + +Once `mimo_api_key` is configured, the Agent's Vision tool can automatically use MiMo's vision models: + +- When the main model itself is multimodal (`mimo-v2.5-pro` / `mimo-v2.5`), images are handled directly by the main model with no extra setup. +- When the main model belongs to another vendor, the Vision tool falls back to `mimo-v2.5-pro` in order. 
+ +To force a specific Vision model, set it explicitly in the configuration: + +```json +{ + "tools": { + "vision": { + "provider": "mimo", + "model": "mimo-v2.5-pro" + } + } +} +``` + +## Text-to-Speech (TTS) + +```json +{ + "text_to_voice": "mimo", + "text_to_voice_model": "mimo-v2.5-tts", + "tts_voice_id": "冰糖" +} +``` + +| Parameter | Description | +| --- | --- | +| `text_to_voice_model` | Currently only `mimo-v2.5-tts` (preset voices + singing mode) | +| `tts_voice_id` | Preset voice name (Chinese voice IDs use the Chinese name directly) | + +### Preset Voices + +| Voice ID | Description | +| --- | --- | +| `Mia` | English · Female | +| `Chloe` | English · Female | +| `Milo` | English · Male | +| `Dean` | English · Male | +| `冰糖` | Chinese · Female (default) | +| `茉莉` | Chinese · Female | +| `苏打` | Chinese · Male | +| `白桦` | Chinese · Male | + + +You can also pick a voice visually from the Web Console under "Model Management → Text-to-Speech". + +### Style Control + +MiMo TTS supports embedding **audio tags** in the synthesis text to control emotion, tone, dialect, persona, and even singing. Tags must appear in the **text that will be synthesized to speech (i.e. the Agent's reply)**, with the overall style tag placed at the very beginning: + +``` +(style)content-to-synthesize +``` + +Half-width `()`, full-width `（）`, and `[]` brackets are all accepted. Both Chinese and English style descriptors work — pick whichever language expresses the timbre most precisely. 
Common examples: + +| Category | Example tags | +| --- | --- | +| Basic emotions | `happy` `sad` `angry` `fear` `surprised` `excited` `aggrieved` `calm` `indifferent` | +| Compound emotions | `wistful` `relieved` `helpless` `guilty` `at ease` `uneasy` `touched` | +| Overall tone | `gentle` `aloof` `lively` `serious` `languid` `playful` `deep` `sharp` `cutting` | +| Voice character | `magnetic` `mellow` `bright` `ethereal` `childlike` `aged` `sweet` `husky` | +| Persona | `squeaky` `mature lady` `young boy` `uncle` `Taiwanese accent` | +| Dialect | `Northeastern` `Sichuan` `Henan` `Cantonese` | +| Role-play | `Sun Wukong` `Lin Daiyu` | +| Singing | `sing` / `singing` | + +Examples: + +- `(magnetic)The night is deep, and the city is still breathing.` +- `(gentle)Take a breath. You've got this.` +- `(serious)This is the final warning before the system reboots.` +- `(singing)Oh, when the saints go marching in…` + +You can also insert fine-grained audio tags at any position in the text to control breathing, laughter, pauses, etc. For example: + +``` +(nervous, deep breath) Phew… stay calm, stay calm. (faster pace) I've rehearsed this intro fifty times, it'll be fine. +``` + +See the [MiMo speech synthesis documentation](https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5) for the full tag list. + + + When CowAgent calls TTS, the Agent's reply text (including any `(...)` tags) is forwarded directly to MiMo for synthesis. Tell the model in its persona / system prompt to "prefix replies with a `(style)` tag to control the tone", and IM channels (WeChat / Feishu / DingTalk / WeCom) will play voice replies with the corresponding emotion, dialect, or even singing. 
+ diff --git a/docs/ja/README.md b/docs/ja/README.md index b68a82d0..df71ec74 100644 --- a/docs/ja/README.md +++ b/docs/ja/README.md @@ -104,6 +104,7 @@ CowAgent は主要な LLM プロバイダーすべてに対応しています。 | [Kimi](https://docs.cowagent.ai/ja/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | | | [MiniMax](https://docs.cowagent.ai/ja/models/minimax) | MiniMax-M2.7 | ✅ | ✅ | ✅ | | ✅ | | | [ERNIE](https://docs.cowagent.ai/ja/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | | +| [MiMo](https://docs.cowagent.ai/ja/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | | | [LinkAI](https://docs.cowagent.ai/ja/models/linkai) | 1 つの Key で 100+ モデルに接続 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [カスタム](https://docs.cowagent.ai/ja/models/custom) | ローカルモデル / サードパーティプロキシ | ✅ | | | | | | diff --git a/docs/ja/models/mimo.mdx b/docs/ja/models/mimo.mdx new file mode 100644 index 00000000..c677810f --- /dev/null +++ b/docs/ja/models/mimo.mdx @@ -0,0 +1,135 @@ +--- +title: Xiaomi MiMo +description: Xiaomi MiMo モデル設定(テキスト対話 + 画像理解 + 音声合成) +--- + +Xiaomi MiMo はネイティブ全モーダル大規模言語モデルです。1 つの `mimo_api_key` でテキスト対話、画像理解、音声合成を同時に有効化できます。 + + + Web コンソールの「モデル管理」ページから、以下のすべての機能をワンストップで設定でき、設定ファイルを手動で編集する必要はありません。 + + +## テキスト対話 + +```json +{ + "model": "mimo-v2.5-pro", + "mimo_api_key": "YOUR_API_KEY", + "mimo_api_base": "https://api.xiaomimimo.com/v1" +} +``` + +| パラメータ | 説明 | +| --- | --- | +| `model` | 推奨は `mimo-v2.5-pro`。`mimo-v2.5` も使用可能 | +| `mimo_api_key` | [MiMo Open Platform](https://platform.xiaomimimo.com/console/api-keys) で作成 | +| `mimo_api_base` | 任意。デフォルトは `https://api.xiaomimimo.com/v1` | + +### モデル選択 + +| モデル | ユースケース | +| --- | --- | +| `mimo-v2.5-pro` | フラッグシップ。ネイティブ全モーダル + Agent 能力、最大 100 万トークンのコンテキスト | +| `mimo-v2.5` | 汎用版。ネイティブ全モーダル(テキスト / 画像 / 動画 / 音声) | + +## 思考モード + +MiMo V2.5 シリーズはデフォルトで「思考モード」が有効です。最終回答の前に `reasoning_content`(思考過程)を出力することで、複雑なタスクのパフォーマンスを高めます。 + +表示の有無はグローバル設定 `enable_thinking` で切り替え可能です(Web コンソールの設定ページからも変更できます): + +```json +{ + "enable_thinking": true +} +``` + +## 画像理解 + +`mimo_api_key` 
を設定すると、Agent の Vision ツールは自動的に MiMo のビジョンモデルを利用します: + +- メインモデル自体がマルチモーダル(`mimo-v2.5-pro` / `mimo-v2.5`)の場合は、画像はメインモデルが直接処理し、追加設定は不要です。 +- メインモデルが他社製の場合、Vision ツールは順序に従い `mimo-v2.5-pro` にフォールバックします。 + +特定の Vision モデルを強制したい場合は、設定ファイルで明示的に指定してください: + +```json +{ + "tools": { + "vision": { + "provider": "mimo", + "model": "mimo-v2.5-pro" + } + } +} +``` + +## 音声合成 + +```json +{ + "text_to_voice": "mimo", + "text_to_voice_model": "mimo-v2.5-tts", + "tts_voice_id": "冰糖" +} +``` + +| パラメータ | 説明 | +| --- | --- | +| `text_to_voice_model` | 現在は `mimo-v2.5-tts` のみ対応(プリセット音色 + 歌唱モード) | +| `tts_voice_id` | プリセット音色名(中国語の音色は中国語名がそのまま ID) | + +### プリセット音色 + +| 音色 ID | 説明 | +| --- | --- | +| `冰糖` | 中国語 · 女声(デフォルト) | +| `茉莉` | 中国語 · 女声 | +| `苏打` | 中国語 · 男声 | +| `白桦` | 中国語 · 男声 | +| `Mia` | 英語 · 女声 | +| `Chloe` | 英語 · 女声 | +| `Milo` | 英語 · 男声 | +| `Dean` | 英語 · 男声 | + +Web コンソールの「モデル管理 → 音声合成」のドロップダウンから視覚的に選択することもできます。 + +### スタイル制御 + +MiMo TTS は合成テキスト内に **音声タグ** を埋め込むことで、感情、語調、方言、キャラクター、さらには歌唱まで制御できます。タグは **最終的に音声合成されるテキスト(つまり Agent の返信内容)** に含める必要があり、全体スタイルのタグは先頭に置きます: + +``` +(スタイル)合成するテキスト +``` + +半角 `()`、全角 `()`、`[]` の 3 種類の括弧に対応。スタイル記述は中国語・英語のどちらでも OK で、最も的確に表現できる言語を選んでください。代表的なスタイル例: + +| 種類 | サンプルタグ | +| --- | --- | +| 基本感情 | `happy` `sad` `angry` `fear` `surprised` `excited` `aggrieved` `calm` `indifferent` | +| 複合感情 | `wistful` `relieved` `helpless` `guilty` `at ease` `uneasy` `touched` | +| 全体トーン | `gentle` `aloof` `lively` `serious` `languid` `playful` `deep` `sharp` `cutting` | +| 声質 | `magnetic` `mellow` `bright` `ethereal` `childlike` `aged` `sweet` `husky` | +| キャラクター調 | `squeaky` `mature lady` `young boy` `uncle` `Taiwanese accent` | +| 方言 | `Northeastern` `Sichuan` `Henan` `Cantonese` | +| ロールプレイ | `Sun Wukong` `Lin Daiyu` | +| 歌唱 | `sing` / `singing` | + +例: + +- `(magnetic)夜が深まり、街はまだ呼吸している。` +- `(gentle)深呼吸して。きっと大丈夫。` +- `(serious)これがシステム再起動前の最後の警告です。` +- `(singing)Twinkle, twinkle, little star, how I wonder what you are…` + 
+テキストの任意の位置に細かい音声タグを挿入して、呼吸、笑い声、間などを制御することもできます。例: + +``` +(nervous, deep breath) ふぅ……落ち着いて、落ち着いて。(faster pace) 自己紹介は五十回練習したから大丈夫。 +``` + +タグの完全な一覧は [MiMo 音声合成ドキュメント](https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5) を参照してください。 + + + CowAgent は TTS 呼び出し時、Agent の返信原文(`(...)` タグを含む)をそのまま MiMo に送信します。ペルソナ / システムプロンプトで「返信の冒頭に `(スタイル)` タグを付けて口調を指定する」よう指示すれば、IM チャネル(WeChat / Feishu / DingTalk / WeCom)の音声返信に感情・方言・歌唱などの効果を付与できます。 + diff --git a/docs/models/index.mdx b/docs/models/index.mdx index 5a7df20a..114c58e0 100644 --- a/docs/models/index.mdx +++ b/docs/models/index.mdx @@ -22,6 +22,7 @@ CowAgent 支持国内外主流厂商的大语言模型,模型接口实现在 | [豆包 Doubao](/models/doubao) | doubao-seed-2.0 系列 | ✅ | ✅ | ✅ | | | ✅ | | [Kimi](/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | | | [百度千帆](/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | | +| [小米 MiMo](/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | | | [LinkAI](/models/linkai) | 多厂商 100+ 模型统一接入 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [自定义](/models/custom) |本地模型 / 三方代理 | ✅ | | | | | | diff --git a/docs/models/mimo.mdx b/docs/models/mimo.mdx new file mode 100644 index 00000000..ea445df9 --- /dev/null +++ b/docs/models/mimo.mdx @@ -0,0 +1,135 @@ +--- +title: 小米 MiMo +description: 小米 MiMo 模型配置(文本对话 + 图像理解 + 语音合成) +--- + +小米 MiMo 是原生全模态大模型,单 `mimo_api_key` 即可同时启用文本对话、图像理解与语音合成。 + + + 通过 Web 控制台的「模型管理」页面可一站式配置以下全部能力,无需手动改配置文件。 + + +## 文本对话 + +```json +{ + "model": "mimo-v2.5-pro", + "mimo_api_key": "YOUR_API_KEY", + "mimo_api_base": "https://api.xiaomimimo.com/v1" +} +``` + +| 参数 | 说明 | +| --- | --- | +| `model` | 默认推荐 `mimo-v2.5-pro`,也可使用 `mimo-v2.5` | +| `mimo_api_key` | 在 [MiMo 开放平台](https://platform.xiaomimimo.com/console/api-keys) 创建 | +| `mimo_api_base` | 可选,默认为 `https://api.xiaomimimo.com/v1` | + +### 模型选择 + +| 模型 | 适用场景 | +| --- | --- | +| `mimo-v2.5-pro` | 旗舰,原生全模态 + Agent 能力,最高 100 万 tokens 上下文 | +| `mimo-v2.5` | 综合版,原生全模态(文本 / 图像 / 视频 / 音频) | + +## 思考模式 + +MiMo V2.5 系列默认开启「思考模式」:模型在输出最终回答前会先输出 `reasoning_content`(思维链),提升复杂任务表现。 + 
+通过全局配置 `enable_thinking` 控制是否展示(也可在 Web 控制台 - 配置页面切换): + +```json +{ + "enable_thinking": true +} +``` + +## 图像理解 + +配置 `mimo_api_key` 后,Agent 的 Vision 工具可以自动使用 MiMo 视觉模型: + +- 当主模型本身是多模态时(`mimo-v2.5-pro` / `mimo-v2.5`),直接由主模型识别图像,无需额外配置 +- 当主模型是其他厂商时,Vision 工具会根据顺序自动 fallback 到 `mimo-v2.5-pro` + +如需手动指定 Vision 模型,可在配置文件中显式配置: + +```json +{ + "tools": { + "vision": { + "provider": "mimo", + "model": "mimo-v2.5-pro" + } + } +} +``` + +## 语音合成 + +```json +{ + "text_to_voice": "mimo", + "text_to_voice_model": "mimo-v2.5-tts", + "tts_voice_id": "冰糖" +} +``` + +| 参数 | 说明 | +| --- | --- | +| `text_to_voice_model` | 当前仅支持 `mimo-v2.5-tts`(预置音色 + 唱歌模式) | +| `tts_voice_id` | 预置音色名(中文音色直接使用中文名作为 ID) | + +### 预置音色 + +| 音色 ID | 说明 | +| --- | --- | +| `冰糖` | 中文 · 女声(默认) | +| `茉莉` | 中文 · 女声 | +| `苏打` | 中文 · 男声 | +| `白桦` | 中文 · 男声 | +| `Mia` | 英文 · 女声 | +| `Chloe` | 英文 · 女声 | +| `Milo` | 英文 · 男声 | +| `Dean` | 英文 · 男声 | + +也可在 Web 控制台的「模型管理 → 语音合成」下拉框中可视化选择。 + +### 风格控制 + +MiMo TTS 支持在合成文本中嵌入 **音频标签** 来控制情绪、语调、方言、角色甚至唱歌。标签需出现在 **最终被合成为语音的文本(即 Agent 回复内容)** 中,整体风格标签写在开头: + +``` +(风格)待合成内容 +``` + +支持半角 `()`、全角 `()` 或 `[]` 三种括号。常见风格示例: + +| 类型 | 示例标签 | +| --- | --- | +| 基础情绪 | `开心` `悲伤` `愤怒` `恐惧` `惊讶` `兴奋` `委屈` `平静` `冷漠` | +| 复合情绪 | `怅然` `欣慰` `无奈` `愧疚` `释然` `忐忑` `动情` | +| 整体语调 | `温柔` `高冷` `活泼` `严肃` `慵懒` `俏皮` `深沉` `干练` `凌厉` | +| 音色定位 | `磁性` `醇厚` `清亮` `空灵` `稚嫩` `苍老` `甜美` `沙哑` | +| 人设腔调 | `夹子音` `御姐音` `正太音` `大叔音` `台湾腔` | +| 方言 | `东北话` `四川话` `河南话` `粤语` | +| 角色扮演 | `孙悟空` `林黛玉` | +| 唱歌 | `唱歌`(等价于 `sing` / `singing`) | + +示例: + +- (磁性)夜已经深了,城市还在呼吸。 +- (东北话)哎呀妈呀,这天儿也忒冷了吧! +- (粤语)呢个真係好正啊! 
+- (唱歌)原谅我这一生不羁放纵爱自由… + +也可以在文本任意位置插入细粒度音频标签来控制呼吸、笑声、停顿等,例如: + +``` +(紧张,深呼吸)呼……冷静,冷静。(语速加快)自我介绍我背了五十遍了,应该没问题。 +``` + +完整标签列表参见 [MiMo 语音合成文档](https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5)。 + + + CowAgent 在调用 TTS 时会将 Agent 的回复原文(含 `(...)` 标签)直接送入 MiMo 合成。你可以在人设 / 系统提示词里要求模型「在回复开头用 `(风格)` 标签控制语气」,即可让 IM 渠道(微信 / 飞书 / 钉钉 / 企微)的语音回复带上情绪、方言、唱歌等效果。 + diff --git a/docs/zh/README.md b/docs/zh/README.md index db54626e..095e9194 100644 --- a/docs/zh/README.md +++ b/docs/zh/README.md @@ -104,6 +104,7 @@ CowAgent 支持国内外主流厂商的大语言模型。**文本对话、图像 | [豆包 Doubao](https://docs.cowagent.ai/models/doubao) | doubao-seed-2.0 系列 | ✅ | ✅ | ✅ | | | ✅ | | [Kimi](https://docs.cowagent.ai/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | | | [百度ERNIE](https://docs.cowagent.ai/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | | +| [小米 MiMo](https://docs.cowagent.ai/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | | | [LinkAI](https://docs.cowagent.ai/models/linkai) | 一个 Key 接入 100+ 模型 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [自定义](https://docs.cowagent.ai/models/custom) | 本地模型 / 三方代理 | ✅ | | | | | | diff --git a/models/bot_factory.py b/models/bot_factory.py index 824aed04..5d07a236 100644 --- a/models/bot_factory.py +++ b/models/bot_factory.py @@ -25,6 +25,10 @@ def create_bot(bot_type): from models.qianfan.qianfan_bot import QianfanBot return QianfanBot() + elif bot_type == const.MIMO: + from models.mimo.mimo_bot import MimoBot + return MimoBot() + elif bot_type in (const.OPENAI, const.CHATGPT, const.CUSTOM): # OpenAI-compatible API from models.chatgpt.chat_gpt_bot import ChatGPTBot return ChatGPTBot() diff --git a/models/mimo/__init__.py b/models/mimo/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/models/mimo/mimo_bot.py b/models/mimo/mimo_bot.py new file mode 100644 index 00000000..a815e9f0 --- /dev/null +++ b/models/mimo/mimo_bot.py @@ -0,0 +1,668 @@ +# encoding:utf-8 + +""" +小米 MiMo Bot —— OpenAI 兼容协议,使用独立 API key / base 配置。 + +支持模型: +- mimo-v2.5-pro 
(旗舰,长上下文,默认开启思考) +- mimo-v2.5 (多模态:文/图/音/视频,默认开启思考) +- mimo-v2-pro (V2 Pro,默认开启思考) +- mimo-v2-omni (V2 多模态,默认开启思考) +- mimo-v2-flash (V2 极速版,默认关闭思考) + +思考模式说明: +- 开关参数:``{"thinking": {"type": "enabled" | "disabled"}}`` +- mimo-v2.5-pro / mimo-v2.5 在思考模式下 ``temperature`` 会被强制为 1.0, + 本地直接剥离 ``temperature`` / ``top_p`` 等参数避免歧义。 +- 多轮工具调用过程中,若历史包含 tool_calls,所有后续 assistant 消息必须回传 + ``reasoning_content``,否则 API 返回 400 错误。 +- 文档:https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/passing-back-reasoning_content +""" + +import json +import time +from typing import Optional + +import requests + +from bridge.context import ContextType +from bridge.reply import Reply, ReplyType +from common import const +from common.log import logger +from config import conf, load_config +from models.bot import Bot +from models.openai_compatible_bot import OpenAICompatibleBot +from models.session_manager import SessionManager +from .mimo_session import MimoSession + +DEFAULT_API_BASE = "https://api.xiaomimimo.com/v1" +DEFAULT_MODEL = const.MIMO_V2_5_PRO + +# 支持多模态输入(图/音/视频)的模型 +MULTIMODAL_MODELS = {const.MIMO_V2_5_PRO, const.MIMO_V2_5, const.MIMO_V2_OMNI} + + +class MimoBot(Bot, OpenAICompatibleBot): + def __init__(self): + super().__init__() + self.sessions = SessionManager( + MimoSession, + model=conf().get("model") or DEFAULT_MODEL, + ) + conf_model = conf().get("model") or DEFAULT_MODEL + self.args = { + "model": conf_model, + "temperature": conf().get("temperature", 1.0), + "top_p": conf().get("top_p", 0.95), + } + + # ---------- config helpers ---------- + + @property + def api_key(self): + return conf().get("mimo_api_key") + + @property + def api_base(self): + url = conf().get("mimo_api_base") or DEFAULT_API_BASE + return url.rstrip("/") + + def get_api_config(self): + """OpenAICompatibleBot 接口 —— 供 call_with_tools() 使用。""" + return { + "api_key": self.api_key, + "api_base": self.api_base, + "model": conf().get("model", DEFAULT_MODEL), + "default_temperature": 
conf().get("temperature", 1.0), + "default_top_p": conf().get("top_p", 0.95), + } + + @property + def supports_vision(self) -> bool: + """主模型为多模态模型时,允许 vision tool 走主 bot 通道。""" + model_name = (conf().get("model") or "").lower() + return model_name in MULTIMODAL_MODELS + + @staticmethod + def _model_supports_thinking(model_name: str) -> bool: + """全部 mimo 系列模型都支持 thinking 开关。""" + if not model_name: + return False + return model_name.lower().startswith("mimo-") + + @staticmethod + def _thinking_default_enabled(model_name: str) -> bool: + """各模型的思考模式默认值。mimo-v2-flash 默认关闭,其他默认开启。""" + if not model_name: + return False + return model_name.lower() != const.MIMO_V2_FLASH + + def _build_headers(self) -> dict: + return { + "Content-Type": "application/json", + "Authorization": f"Bearer {self.api_key}", + } + + # ---------- simple chat (non-agent mode) ---------- + + def reply(self, query, context=None): + if context.type == ContextType.TEXT: + logger.info("[MIMO] query={}".format(query)) + + session_id = context["session_id"] + reply = None + clear_memory_commands = conf().get("clear_memory_commands", ["#清除记忆"]) + if query in clear_memory_commands: + self.sessions.clear_session(session_id) + reply = Reply(ReplyType.INFO, "记忆已清除") + elif query == "#清除所有": + self.sessions.clear_all_session() + reply = Reply(ReplyType.INFO, "所有人记忆已清除") + elif query == "#更新配置": + load_config() + reply = Reply(ReplyType.INFO, "配置已更新") + if reply: + return reply + + session = self.sessions.session_query(query, session_id) + logger.debug("[MIMO] session query={}".format(session.messages)) + + new_args = self.args.copy() + reply_content = self.reply_text(session, args=new_args) + logger.debug( + "[MIMO] new_query={}, session_id={}, reply_cont={}, completion_tokens={}".format( + session.messages, session_id, + reply_content["content"], reply_content["completion_tokens"], + ) + ) + if reply_content["completion_tokens"] == 0 and len(reply_content["content"]) > 0: + reply = Reply(ReplyType.ERROR, 
reply_content["content"]) + elif reply_content["completion_tokens"] > 0: + self.sessions.session_reply( + reply_content["content"], session_id, reply_content["total_tokens"], + ) + reply = Reply(ReplyType.TEXT, reply_content["content"]) + else: + reply = Reply(ReplyType.ERROR, reply_content["content"]) + logger.debug("[MIMO] reply {} used 0 tokens.".format(reply_content)) + return reply + else: + reply = Reply(ReplyType.ERROR, "Bot不支持处理{}类型的消息".format(context.type)) + return reply + + def reply_text(self, session, args=None, retry_count: int = 0) -> dict: + try: + headers = self._build_headers() + body = dict(args) if args else dict(self.args) + body["messages"] = session.messages + + model_name = str(body.get("model", "")) + # 思考模式下 mimo-v2.5-pro / mimo-v2.5 不支持自定义 temperature/top_p, + # 简单起见,所有支持思考的模型按默认配置走,剥离这些参数。 + if self._model_supports_thinking(model_name) and self._thinking_default_enabled(model_name): + for k in ("temperature", "top_p", "presence_penalty", "frequency_penalty"): + body.pop(k, None) + + res = requests.post( + f"{self.api_base}/chat/completions", + headers=headers, + json=body, + timeout=180, + ) + if res.status_code == 200: + response = res.json() + return { + "total_tokens": response["usage"]["total_tokens"], + "completion_tokens": response["usage"]["completion_tokens"], + "content": response["choices"][0]["message"]["content"], + } + else: + try: + response = res.json() + error = response.get("error", {}) + except Exception: + error = {"message": res.text[:300]} + logger.error( + f"[MIMO] chat failed, status_code={res.status_code}, " + f"msg={error.get('message')}, type={error.get('type')}" + ) + result = {"completion_tokens": 0, "content": "提问太快啦,请休息一下再问我吧"} + need_retry = False + if res.status_code >= 500: + need_retry = retry_count < 2 + elif res.status_code == 401: + result["content"] = "授权失败,请检查API Key是否正确" + elif res.status_code == 429: + result["content"] = "请求过于频繁,请稍后再试" + need_retry = retry_count < 2 + + if need_retry: + 
def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
    """
    Call the MiMo chat-completions API with tool support (used by the agent).

    Steps performed here:
    - convert Claude-style messages to OpenAI format (reasoning_content is
      passed back in full by the converter)
    - inject or replace the system prompt
    - translate tool definitions
    - pass the thinking-mode switch through and drop sampling parameters
      while thinking is enabled

    :param messages: conversation history in Claude content-block format
    :param tools: optional tool definitions (Claude or OpenAI format)
    :param stream: when True the SSE streaming handler is used
    :param kwargs: optional ``system``, ``model``, ``max_tokens``,
        ``tool_choice``, ``thinking``, ``temperature``, ``top_p``
    :return: a generator. It yields OpenAI-style delta chunks when
        streaming, one Claude-style message dict otherwise, or a single
        ``{"error": True, ...}`` dict if building the request failed.
    """
    try:
        converted_messages = self._convert_messages_to_openai_format(messages)

        system_prompt = kwargs.pop("system", None)
        if system_prompt:
            system_message = {"role": "system", "content": system_prompt}
            if converted_messages and converted_messages[0].get("role") == "system":
                converted_messages[0] = system_message
            else:
                converted_messages.insert(0, system_message)

        converted_tools = None
        if tools:
            converted_tools = self._convert_tools_to_openai_format(tools)

        model = kwargs.pop("model", None) or self.args["model"]
        max_tokens = kwargs.pop("max_tokens", None)

        request_body = {
            "model": model,
            "messages": converted_messages,
            "stream": stream,
        }
        if max_tokens is not None:
            # MiMo names this max_completion_tokens (visible output plus
            # reasoning tokens).
            request_body["max_completion_tokens"] = max_tokens

        if converted_tools:
            request_body["tools"] = converted_tools
            request_body["tool_choice"] = kwargs.pop("tool_choice", "auto")

        # Thinking mode: follow each model's default unless the caller
        # overrides it explicitly.
        thinking_param = kwargs.pop("thinking", None)
        thinking_active = False

        if self._model_supports_thinking(model):
            if thinking_param is None:
                default_on = self._thinking_default_enabled(model)
                thinking_param = {"type": "enabled" if default_on else "disabled"}
            request_body["thinking"] = thinking_param
            thinking_active = thinking_param.get("type") == "enabled"

        if thinking_active:
            # v2.5-pro / v2.5 do not accept custom sampling parameters while
            # thinking, so they are discarded rather than silently ignored.
            # request_body never holds them at this point; only kwargs does.
            for k in ("temperature", "top_p", "presence_penalty", "frequency_penalty"):
                kwargs.pop(k, None)
        else:
            temperature = kwargs.pop("temperature", None)
            if temperature is not None:
                request_body["temperature"] = temperature
            top_p = kwargs.pop("top_p", None)
            if top_p is not None:
                request_body["top_p"] = top_p

        logger.debug(
            f"[MIMO] API call: model={model}, "
            f"tools={len(converted_tools) if converted_tools else 0}, "
            f"stream={stream}, thinking={thinking_active}"
        )

        if stream:
            return self._handle_stream_response(request_body)
        else:
            return self._handle_sync_response(request_body)

    except Exception as e:
        logger.error(f"[MIMO] call_with_tools error: {e}")
        import traceback
        logger.error(traceback.format_exc())

        # Capture the text now: Python unbinds `e` when this except clause
        # exits, and the generator body only runs later when the caller
        # iterates it. Referencing `e` inside the generator would raise
        # NameError instead of yielding the error dict.
        error_message = str(e)

        def error_generator():
            yield {"error": True, "message": error_message, "status_code": 500}
        return error_generator()
logger.warning(f"[MIMO] JSON decode error: {e}, data: {data_str[:200]}") + continue + + if chunk.get("error"): + error_data = chunk["error"] + error_msg = error_data.get("message", "Unknown error") if isinstance(error_data, dict) else str(error_data) + logger.error(f"[MIMO] stream error: {error_msg}") + yield {"error": True, "message": error_msg, "status_code": 500} + return + + if not chunk.get("choices"): + continue + choice = chunk["choices"][0] + delta = choice.get("delta", {}) + + if choice.get("finish_reason"): + finish_reason = choice["finish_reason"] + + # 推理内容(思考模式):单独 delta 透传给 agent_stream + if delta.get("reasoning_content"): + yield { + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "reasoning_content": delta["reasoning_content"], + }, + "finish_reason": None, + }] + } + + if delta.get("content"): + yield { + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "content": delta["content"], + }, + }] + } + + if "tool_calls" in delta and delta["tool_calls"]: + for tool_call_chunk in delta["tool_calls"]: + index = tool_call_chunk.get("index", 0) + if index not in current_tool_calls: + current_tool_calls[index] = { + "id": tool_call_chunk.get("id", ""), + "name": tool_call_chunk.get("function", {}).get("name", ""), + "arguments": "", + } + if "function" in tool_call_chunk and "arguments" in tool_call_chunk["function"]: + current_tool_calls[index]["arguments"] += tool_call_chunk["function"]["arguments"] + + yield { + "choices": [{ + "index": 0, + "delta": {"tool_calls": [tool_call_chunk]}, + }] + } + + yield { + "choices": [{ + "index": 0, + "delta": {}, + "finish_reason": finish_reason, + }] + } + + except requests.exceptions.Timeout: + logger.error("[MIMO] Request timeout") + yield {"error": True, "message": "Request timeout", "status_code": 500} + except Exception as e: + logger.error(f"[MIMO] stream response error: {e}") + import traceback + logger.error(traceback.format_exc()) + yield {"error": True, "message": str(e), 
def _handle_sync_response(self, request_body: dict):
    """
    Perform a non-streaming chat-completions call.

    This is a generator so that it lines up with the streaming path: it
    yields exactly one dict, either a Claude-style assistant message
    (``role``, ``content`` blocks, ``stop_reason``) or an
    ``{"error": True, ...}`` dict.

    :param request_body: request payload; its ``stream`` key is removed in
        place before sending
    """
    try:
        auth_headers = self._build_headers()
        request_body.pop("stream", None)
        endpoint = f"{self.api_base}/chat/completions"
        http_resp = requests.post(endpoint, headers=auth_headers, json=request_body, timeout=180)

        if http_resp.status_code != 200:
            error_msg = http_resp.text
            logger.error(f"[MIMO] API error: status={http_resp.status_code}, msg={error_msg}")
            yield {"error": True, "message": error_msg, "status_code": http_resp.status_code}
            return

        first_choice = http_resp.json()["choices"][0]
        message = first_choice["message"]
        finish_reason = first_choice["finish_reason"]

        blocks = []

        # Reasoning is wrapped in a thinking block so the agent layer can
        # persist it and send it back on later tool-call turns.
        reasoning = message.get("reasoning_content")
        if reasoning:
            blocks.append({"type": "thinking", "thinking": reasoning})

        answer_text = message.get("content")
        if answer_text:
            blocks.append({"type": "text", "text": answer_text})

        for call in message.get("tool_calls") or []:
            function = call["function"]
            try:
                parsed_args = json.loads(function["arguments"])
            except (json.JSONDecodeError, TypeError):
                # Unparseable arguments degrade to an empty input.
                parsed_args = {}
            blocks.append({
                "type": "tool_use",
                "id": call["id"],
                "name": function["name"],
                "input": parsed_args,
            })

        # Map OpenAI finish reasons to Claude stop reasons; anything else is
        # passed through untouched.
        if finish_reason == "tool_calls":
            stop_reason = "tool_use"
        else:
            stop_reason = "end_turn" if finish_reason == "stop" else finish_reason

        yield {"role": "assistant", "content": blocks, "stop_reason": stop_reason}

    except requests.exceptions.Timeout:
        logger.error("[MIMO] Request timeout")
        yield {"error": True, "message": "Request timeout", "status_code": 500}
    except Exception as e:
        logger.error(f"[MIMO] sync response error: {e}")
        import traceback
        logger.error(traceback.format_exc())
        yield {"error": True, "message": str(e), "status_code": 500}
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """
    Ask a MiMo model about an image via the /chat/completions endpoint.

    :param image_url: image reference placed in an ``image_url`` block
    :param question: text prompt sent alongside the image
    :param model: model to use; when omitted the configured model is used if
        it is listed in ``MULTIMODAL_MODELS``, otherwise
        ``const.MIMO_V2_5_PRO``
    :param max_tokens: sent as ``max_completion_tokens``
    :return: ``{"model", "content", "usage"}`` on success, or
        ``{"error": True, "message": ...}`` on failure. A response with no
        usable choice yields an empty ``content``.
    """
    try:
        # Fall back to a vision-capable model when the main model is not
        # multimodal (e.g. mimo-v2-flash).
        vision_model = model
        if not vision_model:
            configured = self.args.get("model") or DEFAULT_MODEL
            vision_model = (
                configured if configured in MULTIMODAL_MODELS else const.MIMO_V2_5_PRO
            )

        payload = {
            "model": vision_model,
            "max_completion_tokens": max_tokens,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }],
        }
        headers = self._build_headers()
        resp = requests.post(
            f"{self.api_base}/chat/completions",
            headers=headers, json=payload, timeout=60,
        )
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}

        data = resp.json()

        # Test the error by truthiness so a null "error" field is not treated
        # as a failure, and accept string-valued errors as well as dicts.
        api_error = data.get("error")
        if api_error:
            if isinstance(api_error, dict):
                error_text = api_error.get("message", str(api_error))
            else:
                error_text = str(api_error)
            return {"error": True, "message": error_text}

        # Tolerate a missing or empty "choices" list and a null "message"
        # rather than failing with an opaque IndexError / AttributeError.
        reply_message = (data.get("choices") or [{}])[0].get("message") or {}
        # Some models put the multimodal answer in reasoning_content instead
        # of content.
        content = reply_message.get("content") or reply_message.get("reasoning_content") or ""
        usage = data.get("usage") or {}
        return {
            "model": vision_model,
            "content": content,
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
    except Exception as e:
        logger.error(f"[MIMO] call_vision error: {e}")
        return {"error": True, "message": str(e)}
def num_tokens_from_messages(messages, model):
    """
    Estimate the size of a conversation by counting characters.

    String contents contribute their length. List contents contribute the
    length of the ``text`` field of every dict block (blocks without one
    count as zero). Any other content type is ignored.

    :param messages: list of message dicts
    :param model: accepted for interface compatibility; not used
    :return: total character count
    """
    def _content_length(content):
        if isinstance(content, str):
            return len(content)
        if isinstance(content, list):
            return sum(
                len(part.get("text", "")) for part in content if isinstance(part, dict)
            )
        return 0

    return sum(_content_length(entry.get("content", "")) for entry in messages)
def textToVoice(self, text: str):
    """
    Synthesize ``text`` to a WAV file through MiMo's /chat/completions API.

    The text is sent as an assistant message together with an ``audio``
    section naming the voice. The base64 audio in the response is decoded
    and written under ``tmp/``.

    :param text: text to synthesize
    :return: a VOICE reply holding the written file path, or an ERROR reply
        when the key is missing, the request fails, or no audio is returned
    """
    failure_text = "语音合成失败,请稍后再试"
    try:
        api_key = conf().get("mimo_api_key", "")
        if not api_key:
            logger.error("[MimoVoice] mimo_api_key is not configured")
            return Reply(ReplyType.ERROR, "未配置 MiMo API key")

        api_base = (conf().get("mimo_api_base") or DEFAULT_API_BASE).rstrip("/")
        model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
        voice_id = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE

        # The text to synthesize must be carried by an assistant message; a
        # user message could optionally be added as a style instruction.
        request_json = {
            "model": model,
            "messages": [
                {"role": "assistant", "content": text},
            ],
            "audio": {
                "format": "wav",
                "voice": voice_id,
            },
        }
        request_headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        endpoint = f"{api_base}/chat/completions"
        resp = requests.post(
            endpoint, headers=request_headers, json=request_json, timeout=REQUEST_TIMEOUT
        )

        if resp.status_code != 200:
            logger.error(
                f"[MimoVoice] textToVoice failed: status={resp.status_code} "
                f"body={resp.text[:500]} model={model} voice={voice_id}"
            )
            return Reply(ReplyType.ERROR, failure_text)

        data = resp.json()
        if "error" in data:
            err = data["error"]
            if isinstance(err, dict):
                msg = err.get("message", str(err))
            else:
                msg = str(err)
            logger.error(f"[MimoVoice] textToVoice api error: {msg}")
            return Reply(ReplyType.ERROR, failure_text)

        # Walk choices[0].message.audio.data, tolerating missing levels.
        first_choice = (data.get("choices") or [{}])[0]
        message = first_choice.get("message", {}) or {}
        audio_section = message.get("audio") or {}
        encoded_audio = audio_section.get("data")
        if not encoded_audio:
            logger.error(f"[MimoVoice] textToVoice empty audio in response: {data}")
            return Reply(ReplyType.ERROR, failure_text)

        try:
            audio_bytes = base64.b64decode(encoded_audio)
        except Exception as e:
            logger.error(f"[MimoVoice] base64 decode failed: {e}")
            return Reply(ReplyType.ERROR, failure_text)

        # File name: second-resolution timestamp plus a small random suffix.
        stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        suffix = str(random.randint(0, 1000))
        file_name = "tmp/" + stamp + suffix + ".wav"
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, "wb") as out_file:
            out_file.write(audio_bytes)
        logger.info(
            f"[MimoVoice] textToVoice model={model} voice={voice_id} "
            f"file={file_name} bytes={len(audio_bytes)}"
        )
        return Reply(ReplyType.VOICE, file_name)
    except Exception as e:
        logger.exception(f"[MimoVoice] textToVoice exception: {e}")
        return Reply(ReplyType.ERROR, failure_text)