feat(models): support xiaomi mimo

This commit is contained in:
zhayujie
2026-05-28 10:49:52 +08:00
parent 83cd6ad158
commit bccce2d7cb
22 changed files with 1340 additions and 3 deletions

View File

@@ -104,6 +104,7 @@ CowAgent supports all mainstream LLM providers. **Chat, vision, image generation
| [Kimi](https://docs.cowagent.ai/en/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | |
| [MiniMax](https://docs.cowagent.ai/en/models/minimax) | MiniMax-M2.7 | ✅ | ✅ | ✅ | | ✅ | |
| [ERNIE](https://docs.cowagent.ai/en/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | |
| [MiMo](https://docs.cowagent.ai/en/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | |
| [LinkAI](https://docs.cowagent.ai/en/models/linkai) | One key for 100+ models | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Custom](https://docs.cowagent.ai/en/models/custom) | Local models / third-party proxy | ✅ | | | | | |

View File

@@ -57,6 +57,7 @@ _DISCOVERABLE_MODELS = [
("qianfan_api_key", const.QIANFAN, const.ERNIE_45_TURBO_VL, "Qianfan"),
("zhipu_ai_api_key", const.ZHIPU_AI, const.GLM_4_7, "ZhipuAI"),
("minimax_api_key", const.MiniMax, const.MINIMAX_M2_7, "MiniMax"),
("mimo_api_key", const.MIMO, const.MIMO_V2_5_PRO, "MiMo"),
]
# Model name prefix → discoverable provider display_name.
@@ -73,6 +74,7 @@ _MODEL_PREFIX_TO_PROVIDER = [
("glm-", "ZhipuAI"),
("minimax-", "MiniMax"),
("abab", "MiniMax"),
("mimo-", "MiMo"),
]
# Model prefixes that natively belong to OpenAI / LinkAI (raw HTTP providers).
@@ -92,6 +94,7 @@ _PROVIDER_ID_TO_DISPLAY = {
"qianfan": "Qianfan",
"zhipu": "ZhipuAI",
"minimax": "MiniMax",
"mimo": "MiMo",
}

View File

@@ -63,6 +63,10 @@ class Bridge(object):
if model_type and model_type.startswith("deepseek"):
self.btype["chat"] = const.DEEPSEEK
# 小米 MiMo 系列模型,全部以 mimo- 开头
if model_type and model_type.startswith("mimo-"):
self.btype["chat"] = const.MIMO
if model_type and isinstance(model_type, str):
lowered_model_type = model_type.lower()
if lowered_model_type == const.QIANFAN or lowered_model_type.startswith("ernie"):

View File

@@ -1387,6 +1387,7 @@ class ConfigHandler:
const.DOUBAO_SEED_2_PRO, const.DOUBAO_SEED_2_CODE,
const.KIMI_K2_6, const.KIMI_K2_5, const.KIMI_K2,
const.ERNIE_5_1, const.ERNIE_5, const.ERNIE_X1_1, const.ERNIE_45_TURBO_128K, const.ERNIE_45_TURBO_32K,
const.MIMO_V2_5_PRO, const.MIMO_V2_5,
]
# Generic placeholder hints surfaced in the web console. We deliberately
@@ -1481,6 +1482,14 @@ class ConfigHandler:
"api_base_placeholder": _PLACEHOLDER_QIANFAN,
"models": [const.ERNIE_5_1, const.ERNIE_5, const.ERNIE_X1_1, const.ERNIE_45_TURBO_128K, const.ERNIE_45_TURBO_32K],
}),
("mimo", {
"label": {"zh": "小米 MiMo", "en": "MiMo"},
"api_key_field": "mimo_api_key",
"api_base_key": "mimo_api_base",
"api_base_default": "https://api.xiaomimimo.com/v1",
"api_base_placeholder": _PLACEHOLDER_V1,
"models": [const.MIMO_V2_5_PRO, const.MIMO_V2_5],
}),
("linkai", {
"label": "LinkAI",
"api_key_field": "linkai_api_key",
@@ -1502,10 +1511,10 @@ class ConfigHandler:
EDITABLE_KEYS = {
"model", "bot_type", "use_linkai",
"open_ai_api_base", "deepseek_api_base", "qianfan_api_base", "claude_api_base", "gemini_api_base",
"zhipu_ai_api_base", "moonshot_base_url", "ark_base_url", "custom_api_base",
"zhipu_ai_api_base", "moonshot_base_url", "ark_base_url", "custom_api_base", "mimo_api_base",
"open_ai_api_key", "deepseek_api_key", "qianfan_api_key", "claude_api_key", "gemini_api_key",
"zhipu_ai_api_key", "dashscope_api_key", "moonshot_api_key",
"ark_api_key", "minimax_api_key", "linkai_api_key", "custom_api_key",
"ark_api_key", "minimax_api_key", "linkai_api_key", "custom_api_key", "mimo_api_key",
"agent_max_context_tokens", "agent_max_context_turns", "agent_max_steps",
"enable_thinking", "web_password",
}
@@ -1646,7 +1655,7 @@ class ModelsHandler:
# Capability -> provider ids drawn from ConfigHandler.PROVIDER_MODELS.
_ASR_PROVIDERS = ["openai", "dashscope", "zhipu", "linkai"]
# Web-console white-list. Other vendors stay usable via direct config.
_TTS_PROVIDERS = ["openai", "minimax", "dashscope", "linkai"]
_TTS_PROVIDERS = ["openai", "minimax", "dashscope", "mimo", "linkai"]
# TTS engine catalog (speech models, not voice timbres). Entries are
# either a bare code or {value, hint?} when a friendly label helps.
@@ -1661,6 +1670,10 @@ class ModelsHandler:
"dashscope": [
{"value": "qwen3-tts-flash", "hint": "覆盖普通话、方言与主流外语"},
],
# 小米 MiMo TTS 系列,通过 chat completions 接口合成
"mimo": [
{"value": "mimo-v2.5-tts", "hint": "预置音色 · 支持唱歌模式"},
],
# Aggregating gateway: a single endpoint multiplexes several
# underlying TTS engines, selected via the `model` field.
# Each engine exposes its own voice catalog (see _TTS_PROVIDER_VOICES).
@@ -1780,6 +1793,18 @@ class ModelsHandler:
{"value": "Marcus", "hint": "陕西话 · 秦川"},
{"value": "Roy", "hint": "闽南语 · 阿杰"},
],
# 小米 MiMo 预置音色列表（mimo-v2.5-tts），文档：
# https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5
"mimo": [
{"value": "冰糖", "hint": "中文 · 女声 · 冰糖"},
{"value": "茉莉", "hint": "中文 · 女声 · 茉莉"},
{"value": "苏打", "hint": "中文 · 男声 · 苏打"},
{"value": "白桦", "hint": "中文 · 男声 · 白桦"},
{"value": "Mia", "hint": "英文 · 女声 · Mia"},
{"value": "Chloe", "hint": "英文 · 女声 · Chloe"},
{"value": "Milo", "hint": "英文 · 男声 · Milo"},
{"value": "Dean", "hint": "英文 · 男声 · Dean"},
],
# Aggregating gateway: voices are scoped per engine model. The
# frontend picks the correct list based on the selected model so
# users don't see incompatible timbres for the active engine.
@@ -1916,6 +1941,8 @@ class ModelsHandler:
# (see models/minimax/minimax_bot.py::call_vision); the M2.x chat
# family is text-only.
"minimax": [const.MINIMAX_TEXT_01],
# MiMo 原生全模态模型，v2.5-pro / v2.5 支持图像/音频/视频输入
"mimo": [const.MIMO_V2_5_PRO, const.MIMO_V2_5],
# LinkAI proxies the underlying vendor; surface a curated set of
# multimodal models. Order: gpt-4.1-mini → gpt-5.4-mini as the
# cross-vendor baselines, then each vendor's recommended default.
@@ -2045,6 +2072,7 @@ class ModelsHandler:
("qianfan", "qianfan_api_key", const.ERNIE_45_TURBO_VL),
("zhipu", "zhipu_ai_api_key", const.GLM_5V_TURBO),
("minimax", "minimax_api_key", const.MINIMAX_TEXT_01),
("mimo", "mimo_api_key", const.MIMO_V2_5_PRO),
]
@classmethod

View File

@@ -15,6 +15,7 @@ ZHIPU_AI = "zhipu"
MOONSHOT = "moonshot"
MiniMax = "minimax"
DEEPSEEK = "deepseek"
MIMO = "mimo" # 小米 MiMo 大模型
CUSTOM = "custom" # custom OpenAI-compatible API, bot_type won't auto-switch on model change
MODELSCOPE = "modelscope"
@@ -140,6 +141,13 @@ KIMI_K2 = "kimi-k2"
KIMI_K2_5 = "kimi-k2.5"
KIMI_K2_6 = "kimi-k2.6" # Kimi K2.6 - Agent recommended model (default)
# 小米 MiMo
MIMO_V2_5_PRO = "mimo-v2.5-pro" # MiMo V2.5 Pro - 旗舰,长上下文(默认推荐)
MIMO_V2_5 = "mimo-v2.5" # MiMo V2.5 - 多模态(文/图/音/视频)
MIMO_V2_PRO = "mimo-v2-pro" # MiMo V2 Pro
MIMO_V2_OMNI = "mimo-v2-omni" # MiMo V2 Omni - 多模态
MIMO_V2_FLASH = "mimo-v2-flash" # MiMo V2 Flash - 极速版
# Doubao (Volcengine Ark)
DOUBAO = "doubao"
DOUBAO_SEED_2_CODE = "doubao-seed-2-0-code-preview-260215"
@@ -182,6 +190,9 @@ MODEL_LIST = [
# MiniMax
MiniMax, MINIMAX_M2_7, MINIMAX_M2_7_HIGHSPEED, MINIMAX_M2_5, MINIMAX_M2_1, MINIMAX_M2_1_LIGHTNING, MINIMAX_M2, MINIMAX_ABAB6_5,
# 小米 MiMo
MIMO, MIMO_V2_5_PRO, MIMO_V2_5, MIMO_V2_PRO, MIMO_V2_OMNI, MIMO_V2_FLASH,
# Claude
CLAUDE3, CLAUDE_4_6_SONNET, CLAUDE_4_7_OPUS, CLAUDE_4_6_OPUS, CLAUDE_4_OPUS, CLAUDE_4_5_SONNET, CLAUDE_4_SONNET, CLAUDE_3_OPUS, CLAUDE_3_OPUS_0229,
CLAUDE_35_SONNET, CLAUDE_35_SONNET_1022, CLAUDE_35_SONNET_0620, CLAUDE_3_SONNET, CLAUDE_3_HAIKU,

View File

@@ -209,6 +209,9 @@ available_setting = {
"Minimax_base_url": "",
"deepseek_api_key": "",
"deepseek_api_base": "https://api.deepseek.com/v1",
# 小米 MiMo 大模型
"mimo_api_key": "",
"mimo_api_base": "https://api.xiaomimimo.com/v1",
"web_host": "", # Web console bind address; empty means auto
"web_port": 9899,
"web_password": "", # Web console password; empty means no authentication required
@@ -401,6 +404,8 @@ def load_config():
"minimax_api_base": "MINIMAX_API_BASE",
"deepseek_api_key": "DEEPSEEK_API_KEY",
"deepseek_api_base": "DEEPSEEK_API_BASE",
"mimo_api_key": "MIMO_API_KEY",
"mimo_api_base": "MIMO_API_BASE",
"qianfan_api_key": "QIANFAN_API_KEY",
"qianfan_api_base": "QIANFAN_API_BASE",
"zhipu_ai_api_key": "ZHIPU_AI_API_KEY",

30
docs/README.md Normal file
View File

@@ -0,0 +1,30 @@
# Documentation
This directory contains the Mintlify documentation site for the project.
## Prerequisites
- Node.js v20.17.0 or higher (LTS recommended)
## Install the CLI (one-time, global)
```bash
npm i -g mint
```
## Run the docs locally
From this `docs/` directory:
```bash
mint dev
```
Then open http://localhost:3000 (or the port Mint reports if 3000 is in use).
> The first run downloads the Mint preview framework (~90 MB) into `~/.mintlify/`.
> Subsequent runs start instantly from the local cache.
## More
- Mintlify docs: https://www.mintlify.com/docs

View File

@@ -88,6 +88,7 @@
"models/doubao",
"models/kimi",
"models/qianfan",
"models/mimo",
"models/linkai",
"models/coding-plan",
"models/custom"
@@ -290,6 +291,7 @@
"en/models/doubao",
"en/models/kimi",
"en/models/qianfan",
"en/models/mimo",
"en/models/linkai",
"en/models/coding-plan",
"en/models/custom"
@@ -492,6 +494,7 @@
"ja/models/doubao",
"ja/models/kimi",
"ja/models/qianfan",
"ja/models/mimo",
"ja/models/linkai",
"ja/models/coding-plan",
"ja/models/custom"

View File

@@ -21,6 +21,7 @@ A snapshot of each vendor's capabilities. "Text" refers to the main chat model;
| [Doubao](/en/models/doubao) | doubao-seed-2.0 series | ✅ | ✅ | ✅ | | | ✅ |
| [Kimi](/en/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | |
| [ERNIE](/en/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | |
| [MiMo](/en/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | |
| [LinkAI](/en/models/linkai) | 100+ models from multiple vendors | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Custom](/en/models/custom) | Local models / third-party proxies | ✅ | | | | | |

136
docs/en/models/mimo.mdx Normal file
View File

@@ -0,0 +1,136 @@
---
title: MiMo
description: Xiaomi MiMo model configuration (Text Chat + Image Understanding + Text-to-Speech)
---
Xiaomi MiMo is a native omni-modal large model. A single `mimo_api_key` enables text chat, image understanding, and text-to-speech all at once.
<Tip>
All capabilities below can be configured in one place via the "Model Management" page in the Web Console — no need to manually edit the configuration file.
</Tip>
## Text Chat
```json
{
"model": "mimo-v2.5-pro",
"mimo_api_key": "YOUR_API_KEY",
"mimo_api_base": "https://api.xiaomimimo.com/v1"
}
```
| Parameter | Description |
| --- | --- |
| `model` | Default recommendation: `mimo-v2.5-pro`; `mimo-v2.5` is also supported |
| `mimo_api_key` | Create one in the [MiMo Open Platform](https://platform.xiaomimimo.com/console/api-keys) |
| `mimo_api_base` | Optional, defaults to `https://api.xiaomimimo.com/v1` |
### Model Selection
| Model | Use Case |
| --- | --- |
| `mimo-v2.5-pro` | Flagship: native omni-modal + Agent capability, up to 1M tokens context |
| `mimo-v2.5` | General-purpose, native omni-modal (text / image / video / audio) |
## Thinking Mode
The MiMo V2.5 series enables "thinking mode" by default: the model emits `reasoning_content` (chain-of-thought) before the final answer, improving performance on complex tasks.
Use the global `enable_thinking` flag to toggle visibility (also switchable from the Web Console settings):
```json
{
"enable_thinking": true
}
```
## Image Understanding
Once `mimo_api_key` is configured, the Agent's Vision tool can automatically use MiMo's vision models:
- When the main model itself is multimodal (`mimo-v2.5-pro` / `mimo-v2.5`), images are handled directly by the main model with no extra setup.
- When the main model belongs to another vendor, the Vision tool automatically falls back to `mimo-v2.5-pro`, following the provider fallback order.
To force a specific Vision model, set it explicitly in the configuration:
```json
{
"tools": {
"vision": {
"provider": "mimo",
"model": "mimo-v2.5-pro"
}
}
}
```
## Text-to-Speech (TTS)
```json
{
"text_to_voice": "mimo",
"text_to_voice_model": "mimo-v2.5-tts",
"tts_voice_id": "冰糖"
}
```
| Parameter | Description |
| --- | --- |
| `text_to_voice_model` | Currently only `mimo-v2.5-tts` (preset voices + singing mode) |
| `tts_voice_id` | Preset voice name (Chinese voice IDs use the Chinese name directly) |
### Preset Voices
| Voice ID | Description |
| --- | --- |
| `Mia` | English · Female |
| `Chloe` | English · Female |
| `Milo` | English · Male |
| `Dean` | English · Male |
| `冰糖` | Chinese · Female (default) |
| `茉莉` | Chinese · Female |
| `苏打` | Chinese · Male |
| `白桦` | Chinese · Male |
You can also pick a voice visually from the Web Console under "Model Management → Text-to-Speech".
### Style Control
MiMo TTS supports embedding **audio tags** in the synthesis text to control emotion, tone, dialect, persona, and even singing. Tags must appear in the **text that will be synthesized to speech (i.e. the Agent's reply)**, with the overall style tag placed at the very beginning:
```
(style)content-to-synthesize
```
Half-width `()`, full-width `（）`, and `[]` brackets are all accepted. Both Chinese and English style descriptors work — pick whichever language expresses the timbre most precisely. Common examples:
| Category | Example tags |
| --- | --- |
| Basic emotions | `happy` `sad` `angry` `fear` `surprised` `excited` `aggrieved` `calm` `indifferent` |
| Compound emotions | `wistful` `relieved` `helpless` `guilty` `at ease` `uneasy` `touched` |
| Overall tone | `gentle` `aloof` `lively` `serious` `languid` `playful` `deep` `sharp` `cutting` |
| Voice character | `magnetic` `mellow` `bright` `ethereal` `childlike` `aged` `sweet` `husky` |
| Persona | `squeaky` `mature lady` `young boy` `uncle` `Taiwanese accent` |
| Dialect | `Northeastern` `Sichuan` `Henan` `Cantonese` |
| Role-play | `Sun Wukong` `Lin Daiyu` |
| Singing | `sing` / `singing` |
Examples:
- `(magnetic)The night is deep, and the city is still breathing.`
- `(gentle)Take a breath. You've got this.`
- `(serious)This is the final warning before the system reboots.`
- `(singing)Oh, when the saints go marching in…`
You can also insert fine-grained audio tags at any position in the text to control breathing, laughter, pauses, etc. For example:
```
(nervous, deep breath) Phew… stay calm, stay calm. (faster pace) I've rehearsed this intro fifty times, it'll be fine.
```
See the [MiMo speech synthesis documentation](https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5) for the full tag list.
<Tip>
When CowAgent calls TTS, the Agent's reply text (including any `(...)` tags) is forwarded directly to MiMo for synthesis. Tell the model in its persona / system prompt to "prefix replies with a `(style)` tag to control the tone", and IM channels (WeChat / Feishu / DingTalk / WeCom) will play voice replies with the corresponding emotion, dialect, or even singing.
</Tip>

View File

@@ -104,6 +104,7 @@ CowAgent は主要な LLM プロバイダーすべてに対応しています。
| [Kimi](https://docs.cowagent.ai/ja/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | |
| [MiniMax](https://docs.cowagent.ai/ja/models/minimax) | MiniMax-M2.7 | ✅ | ✅ | ✅ | | ✅ | |
| [ERNIE](https://docs.cowagent.ai/ja/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | |
| [MiMo](https://docs.cowagent.ai/ja/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | |
| [LinkAI](https://docs.cowagent.ai/ja/models/linkai) | 1 つの Key で 100+ モデルに接続 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [カスタム](https://docs.cowagent.ai/ja/models/custom) | ローカルモデル / サードパーティプロキシ | ✅ | | | | | |

135
docs/ja/models/mimo.mdx Normal file
View File

@@ -0,0 +1,135 @@
---
title: Xiaomi MiMo
description: Xiaomi MiMo モデル設定(テキスト対話 + 画像理解 + 音声合成)
---
Xiaomi MiMo はネイティブ全モーダル大規模言語モデルです。1 つの `mimo_api_key` でテキスト対話、画像理解、音声合成を同時に有効化できます。
<Tip>
Web コンソールの「モデル管理」ページから、以下のすべての機能をワンストップで設定でき、設定ファイルを手動で編集する必要はありません。
</Tip>
## テキスト対話
```json
{
"model": "mimo-v2.5-pro",
"mimo_api_key": "YOUR_API_KEY",
"mimo_api_base": "https://api.xiaomimimo.com/v1"
}
```
| パラメータ | 説明 |
| --- | --- |
| `model` | 推奨は `mimo-v2.5-pro`。`mimo-v2.5` も使用可能 |
| `mimo_api_key` | [MiMo Open Platform](https://platform.xiaomimimo.com/console/api-keys) で作成 |
| `mimo_api_base` | 任意。デフォルトは `https://api.xiaomimimo.com/v1` |
### モデル選択
| モデル | ユースケース |
| --- | --- |
| `mimo-v2.5-pro` | フラッグシップ。ネイティブ全モーダル + Agent 能力、最大 100 万トークンのコンテキスト |
| `mimo-v2.5` | 汎用版。ネイティブ全モーダル(テキスト / 画像 / 動画 / 音声) |
## 思考モード
MiMo V2.5 シリーズはデフォルトで「思考モード」が有効です。最終回答の前に `reasoning_content`(思考過程)を出力することで、複雑なタスクのパフォーマンスを高めます。
表示の有無はグローバル設定 `enable_thinking` で切り替え可能です（Web コンソールの設定ページからも変更できます）:
```json
{
"enable_thinking": true
}
```
## 画像理解
`mimo_api_key` を設定すると、Agent の Vision ツールは自動的に MiMo のビジョンモデルを利用します:
- メインモデル自体がマルチモーダル(`mimo-v2.5-pro` / `mimo-v2.5`)の場合は、画像はメインモデルが直接処理し、追加設定は不要です。
- メインモデルが他社製の場合、Vision ツールは順序に従い `mimo-v2.5-pro` にフォールバックします。
特定の Vision モデルを強制したい場合は、設定ファイルで明示的に指定してください:
```json
{
"tools": {
"vision": {
"provider": "mimo",
"model": "mimo-v2.5-pro"
}
}
}
```
## 音声合成
```json
{
"text_to_voice": "mimo",
"text_to_voice_model": "mimo-v2.5-tts",
"tts_voice_id": "冰糖"
}
```
| パラメータ | 説明 |
| --- | --- |
| `text_to_voice_model` | 現在は `mimo-v2.5-tts` のみ対応(プリセット音色 + 歌唱モード) |
| `tts_voice_id` | プリセット音色名（中国語の音色は中国語名がそのまま ID） |
### プリセット音色
| 音色 ID | 説明 |
| --- | --- |
| `冰糖` | 中国語 · 女声(デフォルト) |
| `茉莉` | 中国語 · 女声 |
| `苏打` | 中国語 · 男声 |
| `白桦` | 中国語 · 男声 |
| `Mia` | 英語 · 女声 |
| `Chloe` | 英語 · 女声 |
| `Milo` | 英語 · 男声 |
| `Dean` | 英語 · 男声 |
Web コンソールの「モデル管理 → 音声合成」のドロップダウンから視覚的に選択することもできます。
### スタイル制御
MiMo TTS は合成テキスト内に **音声タグ** を埋め込むことで、感情、語調、方言、キャラクター、さらには歌唱まで制御できます。タグは **最終的に音声合成されるテキスト(つまり Agent の返信内容)** に含める必要があり、全体スタイルのタグは先頭に置きます:
```
(スタイル)合成するテキスト
```
半角 `()`、全角 `（）`、`[]` の 3 種類の括弧に対応。スタイル記述は中国語・英語のどちらでも OK で、最も的確に表現できる言語を選んでください。代表的なスタイル例:
| 種類 | サンプルタグ |
| --- | --- |
| 基本感情 | `happy` `sad` `angry` `fear` `surprised` `excited` `aggrieved` `calm` `indifferent` |
| 複合感情 | `wistful` `relieved` `helpless` `guilty` `at ease` `uneasy` `touched` |
| 全体トーン | `gentle` `aloof` `lively` `serious` `languid` `playful` `deep` `sharp` `cutting` |
| 声質 | `magnetic` `mellow` `bright` `ethereal` `childlike` `aged` `sweet` `husky` |
| キャラクター調 | `squeaky` `mature lady` `young boy` `uncle` `Taiwanese accent` |
| 方言 | `Northeastern` `Sichuan` `Henan` `Cantonese` |
| ロールプレイ | `Sun Wukong` `Lin Daiyu` |
| 歌唱 | `sing` / `singing` |
例:
- `(magnetic)夜が深まり、街はまだ呼吸している。`
- `(gentle)深呼吸して。きっと大丈夫。`
- `(serious)これがシステム再起動前の最後の警告です。`
- `(singing)Twinkle, twinkle, little star, how I wonder what you are…`
テキストの任意の位置に細かい音声タグを挿入して、呼吸、笑い声、間などを制御することもできます。例:
```
(nervous, deep breath) ふぅ……落ち着いて、落ち着いて。(faster pace) 自己紹介は五十回練習したから大丈夫。
```
タグの完全な一覧は [MiMo 音声合成ドキュメント](https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5) を参照してください。
<Tip>
CowAgent は TTS 呼び出し時、Agent の返信原文（`(...)` タグを含む）をそのまま MiMo に送信します。ペルソナ / システムプロンプトで「返信の冒頭に `(スタイル)` タグを付けて口調を指定する」よう指示すれば、IM チャネル（WeChat / Feishu / DingTalk / WeCom）の音声返信に感情・方言・歌唱などの効果を付与できます。
</Tip>

View File

@@ -22,6 +22,7 @@ CowAgent 支持国内外主流厂商的大语言模型,模型接口实现在
| [豆包 Doubao](/models/doubao) | doubao-seed-2.0 系列 | ✅ | ✅ | ✅ | | | ✅ |
| [Kimi](/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | |
| [百度千帆](/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | |
| [小米 MiMo](/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | |
| [LinkAI](/models/linkai) | 多厂商 100+ 模型统一接入 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [自定义](/models/custom) |本地模型 / 三方代理 | ✅ | | | | | |

135
docs/models/mimo.mdx Normal file
View File

@@ -0,0 +1,135 @@
---
title: 小米 MiMo
description: 小米 MiMo 模型配置(文本对话 + 图像理解 + 语音合成)
---
小米 MiMo 是原生全模态大模型,单 `mimo_api_key` 即可同时启用文本对话、图像理解与语音合成。
<Tip>
通过 Web 控制台的「模型管理」页面可一站式配置以下全部能力,无需手动改配置文件。
</Tip>
## 文本对话
```json
{
"model": "mimo-v2.5-pro",
"mimo_api_key": "YOUR_API_KEY",
"mimo_api_base": "https://api.xiaomimimo.com/v1"
}
```
| 参数 | 说明 |
| --- | --- |
| `model` | 默认推荐 `mimo-v2.5-pro`,也可使用 `mimo-v2.5` |
| `mimo_api_key` | 在 [MiMo 开放平台](https://platform.xiaomimimo.com/console/api-keys) 创建 |
| `mimo_api_base` | 可选,默认为 `https://api.xiaomimimo.com/v1` |
### 模型选择
| 模型 | 适用场景 |
| --- | --- |
| `mimo-v2.5-pro` | 旗舰,原生全模态 + Agent 能力,最高 100 万 tokens 上下文 |
| `mimo-v2.5` | 综合版,原生全模态(文本 / 图像 / 视频 / 音频) |
## 思考模式
MiMo V2.5 系列默认开启「思考模式」:模型在输出最终回答前会先输出 `reasoning_content`(思维链),提升复杂任务表现。
通过全局配置 `enable_thinking` 控制是否展示(也可在 Web 控制台 - 配置页面切换):
```json
{
"enable_thinking": true
}
```
## 图像理解
配置 `mimo_api_key` 后，Agent 的 Vision 工具可以自动使用 MiMo 视觉模型:
- 当主模型本身是多模态时(`mimo-v2.5-pro` / `mimo-v2.5`),直接由主模型识别图像,无需额外配置
- 当主模型是其他厂商时，Vision 工具会根据顺序自动 fallback 到 `mimo-v2.5-pro`
如需手动指定 Vision 模型,可在配置文件中显式配置:
```json
{
"tools": {
"vision": {
"provider": "mimo",
"model": "mimo-v2.5-pro"
}
}
}
```
## 语音合成
```json
{
"text_to_voice": "mimo",
"text_to_voice_model": "mimo-v2.5-tts",
"tts_voice_id": "冰糖"
}
```
| 参数 | 说明 |
| --- | --- |
| `text_to_voice_model` | 当前仅支持 `mimo-v2.5-tts`(预置音色 + 唱歌模式) |
| `tts_voice_id` | 预置音色名（中文音色直接使用中文名作为 ID） |
### 预置音色
| 音色 ID | 说明 |
| --- | --- |
| `冰糖` | 中文 · 女声(默认) |
| `茉莉` | 中文 · 女声 |
| `苏打` | 中文 · 男声 |
| `白桦` | 中文 · 男声 |
| `Mia` | 英文 · 女声 |
| `Chloe` | 英文 · 女声 |
| `Milo` | 英文 · 男声 |
| `Dean` | 英文 · 男声 |
也可在 Web 控制台的「模型管理 → 语音合成」下拉框中可视化选择。
### 风格控制
MiMo TTS 支持在合成文本中嵌入 **音频标签** 来控制情绪、语调、方言、角色甚至唱歌。标签需出现在 **最终被合成为语音的文本(即 Agent 回复内容)** 中,整体风格标签写在开头:
```
(风格)待合成内容
```
支持半角 `()`、全角 `（）` 或 `[]` 三种括号。常见风格示例:
| 类型 | 示例标签 |
| --- | --- |
| 基础情绪 | `开心` `悲伤` `愤怒` `恐惧` `惊讶` `兴奋` `委屈` `平静` `冷漠` |
| 复合情绪 | `怅然` `欣慰` `无奈` `愧疚` `释然` `忐忑` `动情` |
| 整体语调 | `温柔` `高冷` `活泼` `严肃` `慵懒` `俏皮` `深沉` `干练` `凌厉` |
| 音色定位 | `磁性` `醇厚` `清亮` `空灵` `稚嫩` `苍老` `甜美` `沙哑` |
| 人设腔调 | `夹子音` `御姐音` `正太音` `大叔音` `台湾腔` |
| 方言 | `东北话` `四川话` `河南话` `粤语` |
| 角色扮演 | `孙悟空` `林黛玉` |
| 唱歌 | `唱歌`（等价于 `sing` / `singing`） |
示例:
- (磁性)夜已经深了,城市还在呼吸。
- (东北话)哎呀妈呀,这天儿也忒冷了吧!
- (粤语)呢个真係好正啊!
- (唱歌)原谅我这一生不羁放纵爱自由…
也可以在文本任意位置插入细粒度音频标签来控制呼吸、笑声、停顿等,例如:
```
(紧张,深呼吸)呼……冷静,冷静。(语速加快)自我介绍我背了五十遍了,应该没问题。
```
完整标签列表参见 [MiMo 语音合成文档](https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5)。
<Tip>
CowAgent 在调用 TTS 时会将 Agent 的回复原文(含 `(...)` 标签)直接送入 MiMo 合成。你可以在人设 / 系统提示词里要求模型「在回复开头用 `(风格)` 标签控制语气」,即可让 IM 渠道(微信 / 飞书 / 钉钉 / 企微)的语音回复带上情绪、方言、唱歌等效果。
</Tip>

View File

@@ -104,6 +104,7 @@ CowAgent 支持国内外主流厂商的大语言模型。**文本对话、图像
| [豆包 Doubao](https://docs.cowagent.ai/models/doubao) | doubao-seed-2.0 系列 | ✅ | ✅ | ✅ | | | ✅ |
| [Kimi](https://docs.cowagent.ai/models/kimi) | kimi-k2.6 | ✅ | ✅ | | | | |
| [百度ERNIE](https://docs.cowagent.ai/models/qianfan) | ernie-5.1 | ✅ | ✅ | | | | |
| [小米 MiMo](https://docs.cowagent.ai/models/mimo) | mimo-v2.5-pro / v2.5 | ✅ | ✅ | | | ✅ | |
| [LinkAI](https://docs.cowagent.ai/models/linkai) | 一个 Key 接入 100+ 模型 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [自定义](https://docs.cowagent.ai/models/custom) | 本地模型 / 三方代理 | ✅ | | | | | |

View File

@@ -25,6 +25,10 @@ def create_bot(bot_type):
from models.qianfan.qianfan_bot import QianfanBot
return QianfanBot()
elif bot_type == const.MIMO:
from models.mimo.mimo_bot import MimoBot
return MimoBot()
elif bot_type in (const.OPENAI, const.CHATGPT, const.CUSTOM): # OpenAI-compatible API
from models.chatgpt.chat_gpt_bot import ChatGPTBot
return ChatGPTBot()

0
models/mimo/__init__.py Normal file
View File

668
models/mimo/mimo_bot.py Normal file
View File

@@ -0,0 +1,668 @@
# encoding:utf-8
"""
小米 MiMo Bot —— OpenAI 兼容协议,使用独立 API key / base 配置。
支持模型:
- mimo-v2.5-pro (旗舰,长上下文,默认开启思考)
- mimo-v2.5 (多模态:文/图/音/视频,默认开启思考)
- mimo-v2-pro (V2 Pro，默认开启思考)
- mimo-v2-omni (V2 多模态,默认开启思考)
- mimo-v2-flash (V2 极速版,默认关闭思考)
思考模式说明:
- 开关参数:``{"thinking": {"type": "enabled" | "disabled"}}``
- mimo-v2.5-pro / mimo-v2.5 在思考模式下 ``temperature`` 会被强制为 1.0，
本地直接剥离 ``temperature`` / ``top_p`` 等参数避免歧义。
- 多轮工具调用过程中,若历史包含 tool_calls，所有后续 assistant 消息必须回传
``reasoning_content``,否则 API 返回 400 错误。
- 文档：https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/passing-back-reasoning_content
"""
import json
import time
from typing import Optional
import requests
from bridge.context import ContextType
from bridge.reply import Reply, ReplyType
from common import const
from common.log import logger
from config import conf, load_config
from models.bot import Bot
from models.openai_compatible_bot import OpenAICompatibleBot
from models.session_manager import SessionManager
from .mimo_session import MimoSession
# Endpoint used when the `mimo_api_base` setting is missing or empty.
DEFAULT_API_BASE = "https://api.xiaomimimo.com/v1"
# Chat model used when the `model` setting is missing or empty.
DEFAULT_MODEL = const.MIMO_V2_5_PRO
# Models that accept multimodal input (image / audio / video).
MULTIMODAL_MODELS = {const.MIMO_V2_5_PRO, const.MIMO_V2_5, const.MIMO_V2_OMNI}
class MimoBot(Bot, OpenAICompatibleBot):
def __init__(self):
    """Create the session store and the default request arguments.

    The model name comes from the ``model`` setting and falls back to
    ``DEFAULT_MODEL`` when that setting is missing or empty.
    """
    super().__init__()
    configured_model = conf().get("model") or DEFAULT_MODEL
    self.sessions = SessionManager(MimoSession, model=configured_model)
    # Defaults for the simple chat path; reply() copies this dict per call.
    self.args = dict(
        model=configured_model,
        temperature=conf().get("temperature", 1.0),
        top_p=conf().get("top_p", 0.95),
    )
# ---------- config helpers ----------
@property
def api_key(self):
    """Return the value of the ``mimo_api_key`` setting."""
    configured_key = conf().get("mimo_api_key")
    return configured_key
@property
def api_base(self):
    """Return the MiMo endpoint without trailing slashes.

    Uses ``DEFAULT_API_BASE`` when ``mimo_api_base`` is missing or empty.
    """
    configured_base = conf().get("mimo_api_base")
    base = configured_base if configured_base else DEFAULT_API_BASE
    return base.rstrip("/")
def get_api_config(self):
    """Return the connection settings used by ``call_with_tools()``.

    This is the OpenAICompatibleBot hook. The result holds ``api_key``,
    ``api_base``, ``model``, ``default_temperature`` and ``default_top_p``.
    """
    return {
        "api_key": self.api_key,
        "api_base": self.api_base,
        # Use `or` rather than a .get() default so an empty or None `model`
        # setting also falls back, matching the rule used in __init__.
        "model": conf().get("model") or DEFAULT_MODEL,
        "default_temperature": conf().get("temperature", 1.0),
        "default_top_p": conf().get("top_p", 0.95),
    }
@property
def supports_vision(self) -> bool:
    """Report whether the configured main model is in ``MULTIMODAL_MODELS``.

    When true, the vision tool may go through the main bot. The comparison
    lower-cases the configured name; an unset model yields False.
    """
    configured_model = conf().get("model")
    if not configured_model:
        return False
    return configured_model.lower() in MULTIMODAL_MODELS
@staticmethod
def _model_supports_thinking(model_name: str) -> bool:
    """Return True for any model whose name starts with ``mimo-``.

    The check is case-insensitive; an empty or None name returns False.
    """
    return bool(model_name) and model_name.lower().startswith("mimo-")
@staticmethod
def _thinking_default_enabled(model_name: str) -> bool:
    """Return the default thinking-mode state for a model.

    Thinking is off by default only for ``mimo-v2-flash`` (compared
    case-insensitively); an empty or None name returns False.
    """
    if model_name:
        return model_name.lower() != const.MIMO_V2_FLASH
    return False
def _build_headers(self) -> dict:
    """Build the JSON request headers, with ``api_key`` as the Bearer token."""
    headers = {"Content-Type": "application/json"}
    headers["Authorization"] = f"Bearer {self.api_key}"
    return headers
# ---------- simple chat (non-agent mode) ----------
def reply(self, query, context=None):
    """Answer a query in simple (non-agent) chat mode.

    Only ``ContextType.TEXT`` contexts are handled; any other type gets an
    ERROR reply. Three built-in commands (clear this session, clear all
    sessions, reload config) are answered with an INFO reply and never
    reach the model. Otherwise the query is appended to the session and
    sent through ``reply_text()``.
    """
    if context.type != ContextType.TEXT:
        return Reply(ReplyType.ERROR, "Bot不支持处理{}类型的消息".format(context.type))

    logger.info("[MIMO] query={}".format(query))
    session_id = context["session_id"]

    # Maintenance commands short-circuit the model call.
    if query in conf().get("clear_memory_commands", ["#清除记忆"]):
        self.sessions.clear_session(session_id)
        return Reply(ReplyType.INFO, "记忆已清除")
    if query == "#清除所有":
        self.sessions.clear_all_session()
        return Reply(ReplyType.INFO, "所有人记忆已清除")
    if query == "#更新配置":
        load_config()
        return Reply(ReplyType.INFO, "配置已更新")

    session = self.sessions.session_query(query, session_id)
    logger.debug("[MIMO] session query={}".format(session.messages))

    outcome = self.reply_text(session, args=self.args.copy())
    text = outcome["content"]
    produced_tokens = outcome["completion_tokens"]
    logger.debug(
        "[MIMO] new_query={}, session_id={}, reply_cont={}, completion_tokens={}".format(
            session.messages, session_id, text, produced_tokens,
        )
    )

    if produced_tokens > 0:
        # Only a real completion is written back into the session history.
        self.sessions.session_reply(text, session_id, outcome["total_tokens"])
        return Reply(ReplyType.TEXT, text)
    if produced_tokens == 0 and len(text) > 0:
        # reply_text() reports failures as zero tokens plus a message.
        return Reply(ReplyType.ERROR, text)

    failure = Reply(ReplyType.ERROR, text)
    logger.debug("[MIMO] reply {} used 0 tokens.".format(outcome))
    return failure
def reply_text(self, session, args=None, retry_count: int = 0) -> dict:
    """Send the session history to ``/chat/completions`` and return the answer.

    Args:
        session: Conversation whose ``messages`` list is sent as the prompt.
        args: Request arguments (model, sampling options). ``self.args`` is
            used when omitted or empty. The mapping is copied, not mutated.
        retry_count: Number of attempts already made. HTTP 429 and 5xx are
            retried after a 3-second pause, and exceptions are retried
            immediately, each until ``retry_count`` reaches 2.

    Returns:
        On success a dict with ``total_tokens``, ``completion_tokens`` and
        ``content``. On failure a dict with ``completion_tokens`` set to 0
        and ``content`` holding a user-facing error message.
    """
    try:
        body = dict(args) if args else dict(self.args)
        body["messages"] = session.messages
        model_name = str(body.get("model", ""))

        # Models that think by default are sent without sampling overrides,
        # so the request never carries values the thinking mode would not use.
        if self._model_supports_thinking(model_name) and self._thinking_default_enabled(model_name):
            for k in ("temperature", "top_p", "presence_penalty", "frequency_penalty"):
                body.pop(k, None)

        res = requests.post(
            f"{self.api_base}/chat/completions",
            headers=self._build_headers(),
            json=body,
            timeout=180,
        )

        if res.status_code == 200:
            response = res.json()
            message = response["choices"][0]["message"]
            return {
                "total_tokens": response["usage"]["total_tokens"],
                "completion_tokens": response["usage"]["completion_tokens"],
                # Coerce a null or missing content to "": reply() calls len()
                # on this value and stores it in the session history.
                "content": message.get("content") or "",
            }

        try:
            payload = res.json()
            error = payload.get("error", {})
        except Exception:
            error = {"message": res.text[:300]}
        # A string "error" field would make error.get() raise, which lands in
        # the blanket handler below and retries a non-retryable request.
        if not isinstance(error, dict):
            error = {"message": str(error)}
        logger.error(
            f"[MIMO] chat failed, status_code={res.status_code}, "
            f"msg={error.get('message')}, type={error.get('type')}"
        )

        result = {"completion_tokens": 0, "content": "提问太快啦，请休息一下再问我吧"}
        need_retry = False
        if res.status_code >= 500:
            need_retry = retry_count < 2
        elif res.status_code == 401:
            result["content"] = "授权失败请检查API Key是否正确"
        elif res.status_code == 429:
            result["content"] = "请求过于频繁，请稍后再试"
            need_retry = retry_count < 2

        if need_retry:
            time.sleep(3)
            return self.reply_text(session, args, retry_count + 1)
        return result
    except Exception as e:
        logger.exception(e)
        if retry_count < 2:
            return self.reply_text(session, args, retry_count + 1)
        return {"completion_tokens": 0, "content": "我现在有点累了，等会再来吧"}
# ==================== Agent mode support ====================
def call_with_tools(self, messages, tools=None, stream: bool = False, **kwargs):
    """Call the MiMo chat API with optional tool definitions (agent mode).

    Steps:
    - convert ``messages`` and ``tools`` to the OpenAI request format;
    - inject or replace the system prompt given as ``system``;
    - pass the thinking switch, defaulting to the per-model default;
    - dispatch to the streaming or synchronous response handler.

    Args:
        messages: Conversation history, converted by
            ``_convert_messages_to_openai_format``.
        tools: Optional tool definitions, converted by
            ``_convert_tools_to_openai_format``.
        stream: Whether to request a streaming response.
        **kwargs: Optional ``system``, ``model``, ``max_tokens``,
            ``tool_choice``, ``thinking``, ``temperature`` and ``top_p``.

    Returns:
        Whatever the selected response handler returns. If building the
        request fails, a generator yielding one
        ``{"error": True, "message": ..., "status_code": 500}`` dict.
    """
    try:
        converted_messages = self._convert_messages_to_openai_format(messages)

        system_prompt = kwargs.pop("system", None)
        if system_prompt:
            if not converted_messages or converted_messages[0].get("role") != "system":
                converted_messages.insert(0, {"role": "system", "content": system_prompt})
            else:
                converted_messages[0] = {"role": "system", "content": system_prompt}

        converted_tools = None
        if tools:
            converted_tools = self._convert_tools_to_openai_format(tools)

        model = kwargs.pop("model", None) or self.args["model"]
        max_tokens = kwargs.pop("max_tokens", None)

        request_body = {
            "model": model,
            "messages": converted_messages,
            "stream": stream,
        }
        if max_tokens is not None:
            # MiMo names this field max_completion_tokens.
            request_body["max_completion_tokens"] = max_tokens
        if converted_tools:
            request_body["tools"] = converted_tools
            request_body["tool_choice"] = kwargs.pop("tool_choice", "auto")

        # Thinking follows the per-model default unless the caller overrides it.
        thinking_param = kwargs.pop("thinking", None)
        thinking_active = False
        if self._model_supports_thinking(model):
            if thinking_param is None:
                default_on = self._thinking_default_enabled(model)
                thinking_param = {"type": "enabled" if default_on else "disabled"}
            request_body["thinking"] = thinking_param
            thinking_active = thinking_param.get("type") == "enabled"

        if thinking_active:
            # Sampling overrides are not sent in thinking mode; drain them
            # from kwargs so they are never added to the request.
            for k in ("temperature", "top_p", "presence_penalty", "frequency_penalty"):
                kwargs.pop(k, None)
        else:
            temperature = kwargs.pop("temperature", None)
            if temperature is not None:
                request_body["temperature"] = temperature
            top_p = kwargs.pop("top_p", None)
            if top_p is not None:
                request_body["top_p"] = top_p

        logger.debug(
            f"[MIMO] API call: model={model}, "
            f"tools={len(converted_tools) if converted_tools else 0}, "
            f"stream={stream}, thinking={thinking_active}"
        )

        if stream:
            return self._handle_stream_response(request_body)
        else:
            return self._handle_sync_response(request_body)
    except Exception as e:
        logger.error(f"[MIMO] call_with_tools error: {e}")
        import traceback
        logger.error(traceback.format_exc())

        # Capture the message now: `e` is unbound when the except block
        # exits, and the generator body only runs later, on iteration.
        error_message = str(e)

        def error_generator():
            yield {"error": True, "message": error_message, "status_code": 500}

        return error_generator()
# -------------------- streaming --------------------
def _handle_stream_response(self, request_body: dict):
    """Stream a chat completion and re-emit each SSE chunk as an OpenAI-style delta.

    Yields, in arrival order, one dict per fragment of reasoning text
    (``reasoning_content``), visible text (``content``) or tool call
    (``tool_calls``), followed by one closing chunk that carries the last
    ``finish_reason`` seen on the wire.

    Failures never propagate as exceptions: a non-200 status, an in-stream
    ``error`` object, a timeout or any other exception is reported as a single
    ``{"error": True, "message": ..., "status_code": ...}`` dict, after which
    the generator ends.

    :param request_body: JSON payload posted to ``{api_base}/chat/completions``.
    """
    try:
        headers = self._build_headers()
        url = f"{self.api_base}/chat/completions"
        # A streamed response keeps its connection checked out until it is
        # closed; the context manager releases it on every exit path (error
        # return, [DONE] break, or the consumer abandoning the generator).
        with requests.post(url, headers=headers, json=request_body, stream=True, timeout=180) as response:
            if response.status_code != 200:
                error_msg = response.text
                logger.error(f"[MIMO] API error: status={response.status_code}, msg={error_msg}")
                yield {"error": True, "message": error_msg, "status_code": response.status_code}
                return
            finish_reason = None
            for line in response.iter_lines():
                if not line:
                    continue
                line = line.decode("utf-8")
                # Only SSE data lines matter; both "data: x" and "data:x" are accepted.
                if not line.startswith("data:"):
                    continue
                data_str = line[len("data:"):].strip()
                if data_str == "[DONE]":
                    break
                try:
                    chunk = json.loads(data_str)
                except json.JSONDecodeError as e:
                    logger.warning(f"[MIMO] JSON decode error: {e}, data: {data_str[:200]}")
                    continue
                if chunk.get("error"):
                    error_data = chunk["error"]
                    error_msg = error_data.get("message", "Unknown error") if isinstance(error_data, dict) else str(error_data)
                    logger.error(f"[MIMO] stream error: {error_msg}")
                    yield {"error": True, "message": error_msg, "status_code": 500}
                    return
                if not chunk.get("choices"):
                    continue
                choice = chunk["choices"][0]
                # Tolerate an explicit `"delta": null` instead of crashing the stream.
                delta = choice.get("delta") or {}
                if choice.get("finish_reason"):
                    finish_reason = choice["finish_reason"]
                # Reasoning text (thinking mode) is forwarded as its own delta.
                if delta.get("reasoning_content"):
                    yield {
                        "choices": [{
                            "index": 0,
                            "delta": {
                                "role": "assistant",
                                "reasoning_content": delta["reasoning_content"],
                            },
                            "finish_reason": None,
                        }]
                    }
                if delta.get("content"):
                    yield {
                        "choices": [{
                            "index": 0,
                            "delta": {
                                "role": "assistant",
                                "content": delta["content"],
                            },
                        }]
                    }
                # Tool-call fragments are passed through untouched, one per
                # chunk; nothing is accumulated here.
                for tool_call_chunk in delta.get("tool_calls") or []:
                    yield {
                        "choices": [{
                            "index": 0,
                            "delta": {"tool_calls": [tool_call_chunk]},
                        }]
                    }
        # Closing chunk: empty delta plus the last finish_reason observed.
        yield {
            "choices": [{
                "index": 0,
                "delta": {},
                "finish_reason": finish_reason,
            }]
        }
    except requests.exceptions.Timeout:
        logger.error("[MIMO] Request timeout")
        yield {"error": True, "message": "Request timeout", "status_code": 500}
    except Exception as e:
        logger.error(f"[MIMO] stream response error: {e}")
        import traceback
        logger.error(traceback.format_exc())
        yield {"error": True, "message": str(e), "status_code": 500}
# -------------------- sync --------------------
def _handle_sync_response(self, request_body: dict):
    """Run a non-streaming completion and yield exactly one result dict.

    The method is a generator so callers can consume it the same way as the
    streaming path. On success the yielded dict is a Claude-style assistant
    message: ``content`` is a list of ``thinking`` / ``text`` / ``tool_use``
    blocks and ``stop_reason`` is derived from the API's ``finish_reason``.
    On any failure a single ``{"error": True, ...}`` dict is yielded instead.

    :param request_body: JSON payload; its ``stream`` key is removed in place
        before the request is sent.
    """
    try:
        auth_headers = self._build_headers()
        request_body.pop("stream", None)
        endpoint = f"{self.api_base}/chat/completions"
        reply = requests.post(endpoint, headers=auth_headers, json=request_body, timeout=180)
        if reply.status_code != 200:
            failure_text = reply.text
            logger.error(f"[MIMO] API error: status={reply.status_code}, msg={failure_text}")
            yield {"error": True, "message": failure_text, "status_code": reply.status_code}
            return

        first_choice = reply.json()["choices"][0]
        message = first_choice["message"]
        finish_reason = first_choice["finish_reason"]

        blocks = []
        # Reasoning text becomes a thinking block so the agent layer can
        # persist it and send it back on later tool-call turns.
        reasoning = message.get("reasoning_content")
        if reasoning:
            blocks.append({"type": "thinking", "thinking": reasoning})
        visible_text = message.get("content")
        if visible_text:
            blocks.append({"type": "text", "text": visible_text})
        for call in message.get("tool_calls") or []:
            try:
                parsed_args = json.loads(call["function"]["arguments"])
            except (json.JSONDecodeError, TypeError):
                # Unparseable arguments degrade to an empty input dict.
                parsed_args = {}
            blocks.append({
                "type": "tool_use",
                "id": call["id"],
                "name": call["function"]["name"],
                "input": parsed_args,
            })

        # Map OpenAI finish reasons onto Claude stop reasons; anything else
        # is passed through unchanged.
        stop_reason = (
            "tool_use" if finish_reason == "tool_calls"
            else "end_turn" if finish_reason == "stop"
            else finish_reason
        )
        yield {"role": "assistant", "content": blocks, "stop_reason": stop_reason}
    except requests.exceptions.Timeout:
        logger.error("[MIMO] Request timeout")
        yield {"error": True, "message": "Request timeout", "status_code": 500}
    except Exception as e:
        logger.error(f"[MIMO] sync response error: {e}")
        import traceback
        logger.error(traceback.format_exc())
        yield {"error": True, "message": str(e), "status_code": 500}
# -------------------- format conversion --------------------
def _convert_messages_to_openai_format(self, messages):
"""
将 Claude 格式content blocks转为 OpenAI 格式。
关键约束MiMo 思考模式下,一旦历史包含 tool_calls 的 assistant 轮次,
所有后续 assistant 消息(含工具调用轮)必须回传 reasoning_content
否则 API 返回 400。本地无 trace 时用空字符串回填MiMo 接受字段存在
即可。
"""
if not messages:
return []
has_tool_call_history = False
for msg in messages:
if msg.get("role") != "assistant":
continue
if msg.get("tool_calls"):
has_tool_call_history = True
break
content = msg.get("content")
if isinstance(content, list) and any(
isinstance(b, dict) and b.get("type") == "tool_use" for b in content
):
has_tool_call_history = True
break
converted = []
for msg in messages:
role = msg.get("role")
content = msg.get("content")
if not isinstance(content, list):
if (
role == "assistant"
and isinstance(msg, dict)
and has_tool_call_history
and "reasoning_content" not in msg
):
patched = dict(msg)
patched["reasoning_content"] = ""
converted.append(patched)
else:
converted.append(msg)
continue
if role == "user":
has_tool_result = any(
isinstance(b, dict) and b.get("type") == "tool_result" for b in content
)
if has_tool_result:
text_parts = []
tool_results = []
for block in content:
if not isinstance(block, dict):
continue
if block.get("type") == "text":
text_parts.append(block.get("text", ""))
elif block.get("type") == "tool_result":
tool_call_id = block.get("tool_use_id") or ""
result_content = block.get("content", "")
if not isinstance(result_content, str):
result_content = json.dumps(result_content, ensure_ascii=False)
tool_results.append({
"role": "tool",
"tool_call_id": tool_call_id,
"content": result_content,
})
converted.extend(tool_results)
if text_parts:
converted.append({"role": "user", "content": "\n".join(text_parts)})
else:
# 多模态原样保留image_url / input_audio / video_url 等 block
converted.append(msg)
elif role == "assistant":
openai_msg = {"role": "assistant"}
text_parts = []
tool_calls = []
reasoning_parts = []
for block in content:
if not isinstance(block, dict):
continue
btype = block.get("type")
if btype == "text":
text_parts.append(block.get("text", ""))
elif btype == "tool_use":
tool_calls.append({
"id": block.get("id"),
"type": "function",
"function": {
"name": block.get("name"),
"arguments": json.dumps(block.get("input", {})),
},
})
elif btype == "thinking":
reasoning_parts.append(block.get("thinking", ""))
if text_parts:
openai_msg["content"] = "\n".join(text_parts)
elif not tool_calls:
openai_msg["content"] = ""
if tool_calls:
openai_msg["tool_calls"] = tool_calls
if not text_parts:
openai_msg["content"] = None
if reasoning_parts:
openai_msg["reasoning_content"] = "\n".join(reasoning_parts)
elif has_tool_call_history:
openai_msg["reasoning_content"] = ""
converted.append(openai_msg)
else:
converted.append(msg)
return converted
def _convert_tools_to_openai_format(self, tools):
"""工具定义 Claude 格式 → OpenAI 格式。"""
if not tools:
return None
converted = []
for tool in tools:
if "type" in tool and tool["type"] == "function":
converted.append(tool)
else:
converted.append({
"type": "function",
"function": {
"name": tool.get("name"),
"description": tool.get("description"),
"parameters": tool.get("input_schema", {}),
},
})
return converted
# -------------------- vision --------------------
def call_vision(self, image_url: str, question: str,
                model: Optional[str] = None,
                max_tokens: int = 1000) -> dict:
    """Answer a question about an image via MiMo's OpenAI-compatible /chat/completions.

    :param image_url: URL placed in the request's ``image_url`` block.
    :param question: text prompt sent alongside the image.
    :param model: model to use; when omitted, the configured model is used if
        it is listed in ``MULTIMODAL_MODELS``, otherwise ``const.MIMO_V2_5_PRO``.
    :param max_tokens: sent as ``max_completion_tokens``.
    :return: ``{"model", "content", "usage"}`` on success, or
        ``{"error": True, "message": ...}`` on any failure (never raises).
    """
    try:
        # If the main model cannot do vision (e.g. mimo-v2-flash), switch to
        # mimo-v2.5-pro automatically.
        vision_model = model
        if not vision_model:
            cur = self.args.get("model") or DEFAULT_MODEL
            vision_model = cur if cur in MULTIMODAL_MODELS else const.MIMO_V2_5_PRO
        payload = {
            "model": vision_model,
            "max_completion_tokens": max_tokens,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }],
        }
        headers = self._build_headers()
        resp = requests.post(
            f"{self.api_base}/chat/completions",
            headers=headers, json=payload, timeout=60,
        )
        if resp.status_code != 200:
            return {"error": True, "message": f"HTTP {resp.status_code}: {resp.text[:300]}"}
        data = resp.json()
        # Only a truthy `error` is a failure (`"error": null` is not), and the
        # error may be a plain string rather than an object.
        err = data.get("error")
        if err:
            msg = err.get("message", str(err)) if isinstance(err, dict) else str(err)
            return {"error": True, "message": msg}
        # Missing/empty `choices` and null `message` degrade to empty content
        # instead of raising.
        message = (data.get("choices") or [{}])[0].get("message") or {}
        # Some models put the multimodal answer in reasoning_content rather
        # than content.
        content = message.get("content") or message.get("reasoning_content") or ""
        usage = data.get("usage") or {}
        return {
            "model": vision_model,
            "content": content,
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
    except Exception as e:
        logger.error(f"[MIMO] call_vision error: {e}")
        return {"error": True, "message": str(e)}

View File

@@ -0,0 +1,57 @@
from common.log import logger
from models.session_manager import Session
class MimoSession(Session):
    """Conversation session for MiMo models with size-based history trimming."""

    def __init__(self, session_id, system_prompt=None, model="mimo-v2.5-pro"):
        """Create the session, remember the model name and reset the history."""
        super().__init__(session_id, system_prompt)
        self.model = model
        self.reset()

    def discard_exceeding(self, max_tokens, cur_tokens=None):
        """Drop old messages until the session fits within ``max_tokens``.

        :param max_tokens: upper bound for the session's token count.
        :param cur_tokens: fallback count used only if ``calc_tokens`` fails;
            when it is ``None`` and counting fails, the counting error is
            re-raised.
        :return: the token count after trimming. On the fallback path this is
            a rough figure: ``max_tokens`` is subtracted per removed message.
        """
        precise = True
        try:
            cur_tokens = self.calc_tokens()
        except Exception as e:
            precise = False
            if cur_tokens is None:
                raise e
            logger.debug("Exception when counting tokens precisely for query: {}".format(e))
        while cur_tokens > max_tokens:
            if len(self.messages) > 2:
                # Remove the oldest message after index 0 (index 0 is
                # presumably the system prompt; confirm against Session).
                self.messages.pop(1)
            elif len(self.messages) == 2 and self.messages[1]["role"] == "assistant":
                self.messages.pop(1)
                if precise:
                    cur_tokens = self.calc_tokens()
                else:
                    cur_tokens = cur_tokens - max_tokens
                break
            elif len(self.messages) == 2 and self.messages[1]["role"] == "user":
                # A lone user message that is itself too long is kept as is.
                logger.warning("user message exceed max_tokens. total_tokens={}".format(cur_tokens))
                break
            else:
                logger.debug("max_tokens={}, total_tokens={}, len(messages)={}".format(
                    max_tokens, cur_tokens, len(self.messages)))
                break
            if precise:
                cur_tokens = self.calc_tokens()
            else:
                cur_tokens = cur_tokens - max_tokens
        return cur_tokens

    def calc_tokens(self):
        """Return the size of the current history via ``num_tokens_from_messages``."""
        return num_tokens_from_messages(self.messages, self.model)
def num_tokens_from_messages(messages, model):
    """Estimate the size of ``messages`` by counting characters.

    String content contributes its length; list content contributes the
    length of the ``text`` field of each dict block. Any other content type
    counts as zero. ``model`` is accepted but not used.

    :param messages: iterable of message dicts.
    :param model: model name (unused).
    :return: total character count as an int.
    """
    def _content_size(payload):
        if isinstance(payload, str):
            return len(payload)
        if isinstance(payload, list):
            return sum(len(part.get("text", "")) for part in payload if isinstance(part, dict))
        return 0

    return sum(_content_size(entry.get("content", "")) for entry in messages)

View File

@@ -66,4 +66,8 @@ def create_voice(voice_type):
from voice.zhipuai.zhipuai_voice import ZhipuAIVoice
return ZhipuAIVoice()
elif voice_type == "mimo":
from voice.mimo.mimo_voice import MimoVoice
return MimoVoice()
raise RuntimeError

0
voice/mimo/__init__.py Normal file
View File

109
voice/mimo/mimo_voice.py Normal file
View File

@@ -0,0 +1,109 @@
# encoding:utf-8
"""
小米 MiMo TTS - 基于 mimo-v2.5-tts 模型的语音合成。
通过 /chat/completions 接口实现assistant 消息内容为待合成文本,
audio 字段指定预置音色(如 冰糖/茉莉/苏打/Mia/Chloe 等),返回 base64
编码的音频字节。
文档https://platform.xiaomimimo.com/docs/zh-CN/usage-guide/speech-synthesis-v2.5
注意MiMo 不提供 ASR 端点,因此 voiceToText 不实现。
"""
import base64
import datetime
import os
import random
import requests
from bridge.reply import Reply, ReplyType
from common.log import logger
from config import conf
from voice.voice import Voice
# Base URL used when the `mimo_api_base` config entry is unset.
DEFAULT_API_BASE = "https://api.xiaomimimo.com/v1"
# TTS model used when `text_to_voice_model` is not configured.
DEFAULT_TTS_MODEL = "mimo-v2.5-tts"
DEFAULT_TTS_VOICE = "冰糖" # default voice: the de-facto default on the China cluster
# Passed as `timeout=` to requests.post; requests reads a 2-tuple as (connect, read) seconds.
REQUEST_TIMEOUT = (5, 120)
class MimoVoice(Voice):
    """Voice provider for MiMo: text-to-speech only, via /chat/completions."""

    def __init__(self):
        pass

    def voiceToText(self, voice_file: str):
        """Always return an error reply: speech recognition is not implemented here."""
        # MiMo has no standalone ASR endpoint; use another provider
        # (e.g. openai / zhipu / dashscope) for voice-to-text.
        logger.warning("[MimoVoice] voiceToText is not supported by MiMo API")
        return Reply(ReplyType.ERROR, "MiMo 暂不支持语音识别,请配置其他 voice_to_text provider")

    def textToVoice(self, text: str):
        """Synthesize ``text`` to a WAV file under ``tmp/``.

        :param text: the text to speak.
        :return: ``Reply(ReplyType.VOICE, <file path>)`` on success, otherwise
            a ``Reply(ReplyType.ERROR, ...)``; exceptions are logged, not raised.
        """
        def _synthesis_failed():
            # Generic user-facing failure reply shared by every error branch.
            return Reply(ReplyType.ERROR, "语音合成失败,请稍后再试")

        try:
            token = conf().get("mimo_api_key", "")
            if not token:
                logger.error("[MimoVoice] mimo_api_key is not configured")
                return Reply(ReplyType.ERROR, "未配置 MiMo API key")

            base_url = (conf().get("mimo_api_base") or DEFAULT_API_BASE).rstrip("/")
            tts_model = conf().get("text_to_voice_model") or DEFAULT_TTS_MODEL
            speaker = conf().get("tts_voice_id") or DEFAULT_TTS_VOICE

            # The text to synthesize must go in an assistant message; a user
            # message is optional and would act as a style instruction.
            http_reply = requests.post(
                f"{base_url}/chat/completions",
                headers={
                    "Authorization": f"Bearer {token}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": tts_model,
                    "messages": [
                        {"role": "assistant", "content": text},
                    ],
                    "audio": {
                        "format": "wav",
                        "voice": speaker,
                    },
                },
                timeout=REQUEST_TIMEOUT,
            )
            if http_reply.status_code != 200:
                logger.error(
                    f"[MimoVoice] textToVoice failed: status={http_reply.status_code} "
                    f"body={http_reply.text[:500]} model={tts_model} voice={speaker}"
                )
                return _synthesis_failed()

            body = http_reply.json()
            if "error" in body:
                problem = body["error"]
                detail = problem.get("message", str(problem)) if isinstance(problem, dict) else str(problem)
                logger.error(f"[MimoVoice] textToVoice api error: {detail}")
                return _synthesis_failed()

            first_message = (body.get("choices") or [{}])[0].get("message", {}) or {}
            encoded_audio = (first_message.get("audio") or {}).get("data")
            if not encoded_audio:
                logger.error(f"[MimoVoice] textToVoice empty audio in response: {body}")
                return _synthesis_failed()

            try:
                audio_bytes = base64.b64decode(encoded_audio)
            except Exception as e:
                logger.error(f"[MimoVoice] base64 decode failed: {e}")
                return _synthesis_failed()

            # File name: second-resolution timestamp plus a small random suffix.
            stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            file_name = f"tmp/{stamp}{random.randint(0, 1000)}.wav"
            os.makedirs(os.path.dirname(file_name), exist_ok=True)
            with open(file_name, "wb") as sink:
                sink.write(audio_bytes)
            logger.info(
                f"[MimoVoice] textToVoice model={tts_model} voice={speaker} "
                f"file={file_name} bytes={len(audio_bytes)}"
            )
            return Reply(ReplyType.VOICE, file_name)
        except Exception as e:
            logger.exception(f"[MimoVoice] textToVoice exception: {e}")
            return _synthesis_failed()