From 68ce2e5232b7032059206f65c88495e513769af3 Mon Sep 17 00:00:00 2001 From: zhayujie Date: Thu, 23 Apr 2026 12:39:39 +0800 Subject: [PATCH] feat(skill): multi-provider image generation with auto-fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add Gemini, Seedream (Volcengine Ark), Qwen (DashScope), MiniMax providers to image-generation skill with universal sequential fallback: OpenAI → Gemini → Seedream → Qwen → MiniMax → LinkAI - Each provider filters unsupported size tiers to valid values (e.g. Seedream 1K→2K, Qwen 3K→2K, Gemini 3K→2K) - Pinned model only tries its native provider; auto-routing uses each provider's default model - Support skill-namespaced config (config.skill.image-generation.model → SKILL_IMAGE_GENERATION_MODEL env var) - Add image lightbox (click-to-enlarge) in web console - Add docs for built-in skills (skill-creator, knowledge-wiki, image-generation) under docs/skills/ --- channel/web/static/js/console.js | 37 +- config.py | 39 + docs/docs.json | 25 +- docs/en/skills/image-generation.mdx | 158 +++++ docs/en/skills/knowledge-wiki.mdx | 112 +++ docs/en/skills/skill-creator.mdx | 180 +++++ docs/ja/skills/image-generation.mdx | 158 +++++ docs/ja/skills/knowledge-wiki.mdx | 112 +++ docs/ja/skills/skill-creator.mdx | 180 +++++ docs/skills/image-generation.mdx | 160 +++++ docs/skills/knowledge-wiki.mdx | 112 +++ docs/skills/skill-creator.mdx | 180 +++++ docs/tools/index.mdx | 2 +- docs/tools/vision.mdx | 2 +- skills/image-generation/SKILL.md | 73 +- skills/image-generation/scripts/generate.py | 743 +++++++++++++++++++- 16 files changed, 2189 insertions(+), 84 deletions(-) create mode 100644 docs/en/skills/image-generation.mdx create mode 100644 docs/en/skills/knowledge-wiki.mdx create mode 100644 docs/en/skills/skill-creator.mdx create mode 100644 docs/ja/skills/image-generation.mdx create mode 100644 docs/ja/skills/knowledge-wiki.mdx create mode 100644 docs/ja/skills/skill-creator.mdx create 
mode 100644 docs/skills/image-generation.mdx create mode 100644 docs/skills/knowledge-wiki.mdx create mode 100644 docs/skills/skill-creator.mdx diff --git a/channel/web/static/js/console.js b/channel/web/static/js/console.js index ae155524..22006421 100644 --- a/channel/web/static/js/console.js +++ b/channel/web/static/js/console.js @@ -363,13 +363,32 @@ function _buildVideoHtml(url) { ` ${escapeHtml(fileName)}`; } +function _openImageLightbox(src) { + let overlay = document.getElementById('cow-lightbox'); + if (!overlay) { + overlay = document.createElement('div'); + overlay.id = 'cow-lightbox'; + overlay.style.cssText = 'position:fixed;inset:0;z-index:9999;background:rgba(0,0,0,0.85);display:flex;align-items:center;justify-content:center;cursor:zoom-out;opacity:0;transition:opacity .2s'; + overlay.onclick = () => { overlay.style.opacity = '0'; setTimeout(() => overlay.style.display = 'none', 200); }; + const img = document.createElement('img'); + img.id = 'cow-lightbox-img'; + img.style.cssText = 'max-width:92vw;max-height:92vh;border-radius:8px;box-shadow:0 4px 24px rgba(0,0,0,0.5);object-fit:contain;'; + img.onclick = (e) => e.stopPropagation(); + overlay.appendChild(img); + document.body.appendChild(overlay); + } + overlay.querySelector('#cow-lightbox-img').src = src; + overlay.style.display = 'flex'; + requestAnimationFrame(() => overlay.style.opacity = '1'); +} + function _buildImageHtml(url) { const webUrl = _toWebUrl(url); const safeUrl = webUrl.replace(/"/g, '"'); return `
` + `image` + + `onclick="_openImageLightbox(this.src)" ` + + `style="max-width:520px;width:100%;border-radius:10px;box-shadow:0 2px 8px rgba(0,0,0,0.15);display:block;cursor:zoom-in;">` + `
`; } @@ -413,12 +432,12 @@ function injectImagePreviews(html) { } function _rewriteLocalImgSrc(html) { - return html.replace(/]*?)src="([^"]+)"/gi, (match, pre, src) => { + return html.replace(/]*?)src="([^"]+)"([^>]*?)>/gi, (match, pre, src, post) => { const webSrc = _toWebUrl(src); - if (webSrc !== src) { - return ``; }); } @@ -1189,8 +1208,8 @@ function startSSE(requestId, loadingEl, timestamp, titleInfo) { const imgEl = document.createElement('img'); imgEl.src = item.content; imgEl.alt = 'screenshot'; - imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);'; - imgEl.onclick = () => window.open(item.content, '_blank'); + imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:zoom-in;box-shadow:0 1px 4px rgba(0,0,0,0.1);'; + imgEl.onclick = () => _openImageLightbox(imgEl.src); mediaEl.appendChild(imgEl); scrollChatToBottom(); diff --git a/config.py b/config.py index 2ae5f7ff..7c597ba7 100644 --- a/config.py +++ b/config.py @@ -206,6 +206,10 @@ available_setting = { "agent_max_steps": 20, # Agent模式下单次运行最大决策步数 "enable_thinking": False, # Whether to enable deep thinking for web channel "knowledge": True, # 是否开启知识库功能 + # Per-skill runtime config. Nested keys are flattened to env vars at startup + # using the rule: skill[][] -> SKILL__ + # (e.g. skill["image-generation"].model -> SKILL_IMAGE_GENERATION_MODEL). 
+ "skill": {}, } @@ -384,6 +388,8 @@ def load_config(): "moonshot_api_base": "MOONSHOT_API_BASE", "ark_api_key": "ARK_API_KEY", "ark_api_base": "ARK_API_BASE", + "dashscope_api_key": "DASHSCOPE_API_KEY", + "dashscope_api_base": "DASHSCOPE_API_BASE", # Channel credentials (used by skills that check env vars) "feishu_app_id": "FEISHU_APP_ID", "feishu_app_secret": "FEISHU_APP_SECRET", @@ -404,12 +410,45 @@ def load_config(): if val: os.environ[env_key] = str(val) injected += 1 + + injected += _sync_skill_config_to_env(config.get("skill", {})) + if injected: logger.info("[INIT] Synced {} config values to environment variables".format(injected)) config.load_user_datas() +def _sync_skill_config_to_env(skill_section) -> int: + """Flatten skill-namespaced config into environment variables. + + Mapping rule: ``config["skill"][][]`` -> ``SKILL__`` + (e.g. ``skill["image-generation"].model`` -> ``SKILL_IMAGE_GENERATION_MODEL``). + + This lets subprocess-based skill scripts read their own settings without + importing project code. Existing env vars are NOT overwritten so the + real environment always wins. + + Returns the number of variables actually injected. 
+ """ + if not isinstance(skill_section, dict): + return 0 + injected = 0 + for skill_name, skill_conf in skill_section.items(): + if not isinstance(skill_conf, dict): + continue + name_part = str(skill_name).replace("-", "_").upper() + for key, val in skill_conf.items(): + if val is None or val == "": + continue + env_key = "SKILL_{}_{}".format(name_part, str(key).upper()) + if env_key in os.environ: + continue + os.environ[env_key] = str(val) + injected += 1 + return injected + + def get_root(): return os.path.dirname(os.path.abspath(__file__)) diff --git a/docs/docs.json b/docs/docs.json index 01b9e391..5b4cf1ee 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -133,6 +133,14 @@ "skills/create", "skills/hub" ] + }, + { + "group": "内置技能", + "pages": [ + "skills/skill-creator", + "skills/knowledge-wiki", + "skills/image-generation" + ] } ] }, @@ -306,9 +314,16 @@ "pages": [ "en/skills/index", "en/skills/install", - "en/skills/skill-creator", "en/skills/hub" ] + }, + { + "group": "Built-in Skills", + "pages": [ + "en/skills/skill-creator", + "en/skills/knowledge-wiki", + "en/skills/image-generation" + ] } ] }, @@ -485,6 +500,14 @@ "ja/skills/create", "ja/skills/hub" ] + }, + { + "group": "内蔵スキル", + "pages": [ + "ja/skills/skill-creator", + "ja/skills/knowledge-wiki", + "ja/skills/image-generation" + ] } ] }, diff --git a/docs/en/skills/image-generation.mdx b/docs/en/skills/image-generation.mdx new file mode 100644 index 00000000..49c0ed7d --- /dev/null +++ b/docs/en/skills/image-generation.mdx @@ -0,0 +1,158 @@ +--- +title: image-generation - Image Generation +description: Text-to-image / image-to-image / multi-image fusion with automatic multi-provider routing and fallback +--- + +A general-purpose image generation and editing skill supporting six providers: OpenAI, Gemini, Seedream (Volcengine Ark), Qwen (DashScope), MiniMax, and LinkAI. No need to choose a model manually — the script automatically selects a configured provider based on a fixed priority order. 
+ +## Model Selection + +`image-generation` uses a "fixed priority + automatic fallback" strategy — just configure your keys and it works: + +1. **Priority order**: `OpenAI → Gemini → Seedream → Qwen → MiniMax → LinkAI` +2. **Unconfigured providers are skipped**: only providers with an API key participate +3. **Automatic fallback on failure**: on errors like 401, model not enabled, or network issues, the next provider is tried +4. **Specified model goes first**: if a specific model name is provided, its provider is promoted to the front + +### Supported Models + +| Provider | Models / Aliases | Notes | +| --- | --- | --- | +| OpenAI | `gpt-image-2`, `gpt-image-1` | General-purpose, high quality, supports `quality` parameter | +| Gemini Nano Banana | `nano-banana-2`, `nano-banana-pro`, `nano-banana` | Corresponds to `gemini-3.1-flash`, `gemini-3-pro`, `gemini-2.5-flash` image variants | +| Seedream (Volcengine Ark) | `seedream-5.0-lite`, `seedream-4.5` | Native 2K–4K, up to 14 reference images for fusion | +| Qwen (DashScope) | `qwen-image-2.0`, `qwen-image-2.0-pro` | Strong with Chinese text rendering and text-image layouts | +| MiniMax | `image-01` | Fast and simple image generation | +| LinkAI | Any model | Universal proxy, used as fallback | + + +By default, the Agent does not pick a model — it uses automatic routing. If you want a specific model, just say so in the conversation, e.g. "use seedream to draw a cat" or "generate a poster with gpt-image-2". You can also pin a default model via the "Custom Configuration" section below. + + +## Custom Configuration + +### API Key Setup + +You need **at least one** provider key. Configuring multiple providers enables automatic fallback. There are three ways to set up keys: + +#### Option 1: Automatic Reuse of Existing Keys + +If you have already configured model keys in the web console or `config.json` (e.g. 
`openai_api_key`, `gemini_api_key`, etc.), these keys are **automatically synced** to the corresponding environment variables at startup. In other words, if your chat model works, image generation can use the same key with zero extra configuration. + +#### Option 2: Configure in config.json + +Add the key fields directly to `config.json`: + +```json +{ + "openai_api_key": "sk-xxx", + "openai_api_base": "https://api.openai.com/v1", + "gemini_api_key": "AIza-xxx", + "ark_api_key": "xxx", + "dashscope_api_key": "sk-xxx", + "minimax_api_key": "xxx", + "linkai_api_key": "xxx" +} +``` + +A restart is required after changes. Each key also has a corresponding `*_api_base` field for custom endpoints. + +#### Option 3: Configure via Conversation + +Send an API key in the chat and the Agent will save it to `~/cow/.env` using the `env_config` tool — **no restart needed**. For example: + +``` +Set OPENAI_API_KEY to sk-xxx +``` + +Or: + +``` +Configure ARK_API_KEY as xxx +``` + +### API Key Reference + +| Environment Variable | config.json Field | Provider | Default Base URL | +| --- | --- | --- | --- | +| `OPENAI_API_KEY` | `openai_api_key` | OpenAI | `https://api.openai.com/v1` | +| `GEMINI_API_KEY` | `gemini_api_key` | Gemini | `https://generativelanguage.googleapis.com` | +| `ARK_API_KEY` | `ark_api_key` | Volcengine Ark (Seedream) | `https://ark.cn-beijing.volces.com/api/v3` | +| `DASHSCOPE_API_KEY` | `dashscope_api_key` | Alibaba DashScope (Qwen) | `https://dashscope.aliyuncs.com` | +| `MINIMAX_API_KEY` | `minimax_api_key` | MiniMax | `https://api.minimaxi.com` | +| `LINKAI_API_KEY` | `linkai_api_key` | LinkAI | `https://api.link-ai.tech` | + +### Pinning a Default Model + +To force all image generation through a specific provider's model, add this to `config.json`: + +```json +"skill": { + "image-generation": { + "model": "seedream-5.0-lite" + } +} +``` + +At startup, this is automatically converted to the environment variable `SKILL_IMAGE_GENERATION_MODEL`, and the 
script will always use this model's provider for generation. + +## Enabling and Disabling + +`image-generation` is a built-in skill that **automatically adjusts its status based on API keys**: + +- **Key configured**: the skill is active — the Agent will invoke it when asked to draw +- **Key not configured**: the skill still appears in context (marked as "needs configuration") — the Agent will guide the user to set up a key rather than failing silently + +To control it manually: + +```text +/skill disable image-generation # Disable (won't be invoked even if keys are present) +/skill enable image-generation # Re-enable +``` + +In the terminal: `cow skill disable image-generation` / `cow skill enable image-generation`. + +## Parameters + +| Parameter | Type | Required | Default | Description | +| --- | --- | --- | --- | --- | +| `prompt` | string | Yes | — | Image description | +| `image_url` | string / list | No | null | Input image(s) for editing — local path or URL. Pass multiple for multi-image fusion | +| `quality` | string | No | auto | `low` / `medium` / `high` — only some providers support this | +| `size` | string | No | auto | `512` / `1K` / `2K` / `3K` / `4K`, or pixel value like `1024x1024` | +| `aspect_ratio` | string | No | null | `1:1` / `3:2` / `2:3` / `16:9` / `9:16` / `21:9`; Gemini also supports `1:4` / `4:1` / `1:8` / `8:1` | + + +**Higher quality and larger size cost more and take longer.** + +- For everyday conversations and quick previews, use the defaults (`auto`) or `quality=low` + `size=1K` — roughly 20 seconds +- For posters or when the user explicitly asks for high resolution, use `quality=high` + `size=2K/4K` — may take 1–5 minutes depending on the model + + +## Output + +On success: + +```json +{ + "model": "doubao-seedream-5-0-260128", + "images": [ + {"url": "/path/to/output.png"} + ] +} +``` + +On failure: `{ "error": "..." }`. 
After an error, **do not retry directly** — it is almost always a configuration issue (wrong key, incorrect API base, model not enabled). Have the user fix the configuration first. + +## Common Use Cases + +- **Text-to-image**: generate illustrations, posters, icons, avatars, storyboards, etc. from a description +- **Image-to-image**: change styles, swap elements, add decorations or text on an existing image +- **Multi-image fusion**: combine multiple reference images into one (outfit swaps, character group photos, etc.) + + +- Bash timeout should be set to 600 seconds. Each provider has a 300-second HTTP timeout, but the script may try multiple providers sequentially +- Input images are automatically compressed to ≤ 4 MB with the longest edge ≤ 4096 px +- Gemini / Seedream / Qwen / MiniMax do not support the `quality` parameter — passing it has no effect +- Seedream defaults to 2K; `seedream-5.0-lite` supports up to 3K; `seedream-4.5` supports up to 4K + diff --git a/docs/en/skills/knowledge-wiki.mdx b/docs/en/skills/knowledge-wiki.mdx new file mode 100644 index 00000000..9b54aad0 --- /dev/null +++ b/docs/en/skills/knowledge-wiki.mdx @@ -0,0 +1,112 @@ +--- +title: knowledge-wiki - Knowledge Base +description: Maintain a local structured knowledge base with automatic archiving, categorisation, and cross-referencing +--- + +Organises notes, insights, and reference materials from your conversations into a structured local knowledge base, automatically maintaining an index and cross-references between pages. + +`knowledge-wiki` maintains a `knowledge/` directory in your workspace — essentially the Agent's "second brain". The skill is marked `always: true`, so it is **always loaded** and requires no external dependencies. 
+ +## When It Triggers + +- You share an article, document, or URL that you want to keep for future reference +- A conversation produces conclusions worth retaining long-term +- You want to look up something you accumulated earlier + +## Directory Structure + +``` +knowledge/ +├── index.md # Global index (must be maintained) +├── log.md # Operation log (append-only) +└── <category>/ # Category subdirectories (grouped by content) + └── <slug>.md # Knowledge page (lowercase-hyphenated filename) +``` + +## Three Core Operations + +### 1. Ingest + +When you share some material, the Agent will: + +1. Read and understand the original content, extracting key information +2. Decide which category it belongs to — check `index.md` first; create a new category if none fits +3. Generate a knowledge page at `knowledge/<category>/<slug>.md` +4. Update the index `index.md` and the log `log.md` + +### 2. Synthesise + +When a conversation produces new conclusions or insights: + +1. Create a new knowledge page under an appropriate category +2. Add cross-links to and from related existing pages +3. Update the index and log + +### 3. Query + +When you ask about previously accumulated knowledge: + +1. Search `index.md` for potentially relevant pages +2. Open specific pages with the `read` tool +3. Supplement with `memory_search` if needed +4. Include links to knowledge pages in the answer so you can click through to the source + +## Page Format + +```markdown +# Page Title + +> Source: <source URL or brief description> + +Body content. Link between pages using relative paths: +[Related Page](../category/related-page.md) + +## Key Points + +- ... + +## Related Pages + +- [Page A](../category/page-a.md) — why it's related +``` + + +- `> Source:` records where this knowledge came from. 
Always include it when there is a clear source +- Cross-references are important: when creating or updating a page, remember to add back-links in the related pages too +- **Only link to pages that already exist.** If a concept deserves its own page, create it first, then add the link + + +## Index Format + +`knowledge/index.md` uses a flat list grouped by category, one knowledge page per line: + +```markdown +# Knowledge Index + +## Category A +- [Page Title](category-a/page-slug.md) — one-line summary + +## Category B +- [Page Title](category-b/page-slug.md) — one-line summary +``` + +No tables, no emojis. Category names and organisation can be adjusted freely. + +## Log Format + +`knowledge/log.md` is append-only — newest entries go at the bottom: + +```markdown +## [YYYY-MM-DD] ingest | Page Title +## [YYYY-MM-DD] synthesize | Page Title +``` + +## Writing Guidelines + +- **Filenames**: lowercase with hyphens, e.g. `machine-learning.md` +- **One topic per page** — link related content across pages +- **Update, don't duplicate** — if a page already exists, update it rather than creating a new one +- **Always update the index** `knowledge/index.md` after any change +- **Distill, don't copy** — capture the key points, not the entire source +- **Use full paths when referencing knowledge pages in conversations**, e.g. `[Title](knowledge/<category>/<slug>.md)`. 
Use relative paths only for inter-page links +- **Include links when answering questions based on knowledge pages** so users can dig deeper diff --git a/docs/en/skills/skill-creator.mdx b/docs/en/skills/skill-creator.mdx new file mode 100644 index 00000000..2753cd45 --- /dev/null +++ b/docs/en/skills/skill-creator.mdx @@ -0,0 +1,180 @@ +--- +title: skill-creator - Skill Creator +description: Create, install, and update skills — standardises SKILL.md format and directory structure +--- + +`skill-creator` is a "meta-skill" that helps the Agent create, install, and update other skills, ensuring every skill follows a consistent `SKILL.md` format and directory layout. + +## When It Triggers + +- The user wants to install a skill from a URL or remote repository +- The user wants to create a brand-new skill from scratch +- An existing skill needs upgrading or restructuring + +## What Is a Skill? + +A skill is a reusable instruction set plus optional scripts and assets. It injects domain expertise into the Agent so it can handle specific tasks like a specialist. + +A skill typically contains: + +1. **Specialised workflow** — step-by-step instructions for a category of tasks +2. **Tool usage** — how to call a particular API or process a particular file format +3. **Domain knowledge** — team conventions, business rules, data schemas, etc. +4. **Attached resources** — scripts, reference docs, templates, etc. + + +**Core principle: less is more.** Only write what the Agent wouldn't figure out on its own. For every line you add, ask yourself: is it worth the tokens? + + +## Directory Structure + +``` +skill-name/ +├── SKILL.md # Required: skill definition +│ ├── YAML frontmatter (name / description are mandatory) +│ └── Markdown body (instructions + examples) +└── Optional resources + ├── scripts/ # Executable scripts (Python / Bash, etc.) + ├── references/ # Large reference docs the Agent reads on demand + └── assets/ # Templates, icons, etc. 
used directly in output +``` + +## SKILL.md Specification + +Frontmatter fields in the SKILL.md header: + +| Field | Description | +| --- | --- | +| `name` | Skill name — lowercase with hyphens, must match the directory name | +| `description` | **The most important field.** Clearly state what the skill does and when to use it. The Agent reads this to decide whether to invoke it. All trigger-related descriptions go here, not in the body | +| `metadata.cowagent.requires.bins` | System CLI tools that must be installed | +| `metadata.cowagent.requires.env` | Required environment variables (all must be present) | +| `metadata.cowagent.requires.anyEnv` | Multiple API keys — at least one must be set | +| `metadata.cowagent.requires.anyBins` | Multiple tools — at least one must be installed | +| `metadata.cowagent.always` | Set to `true` to always load, skipping dependency checks | +| `metadata.cowagent.emoji` | Display emoji (optional) | +| `metadata.cowagent.os` | OS restriction, e.g. `["darwin", "linux"]` | + + +The `category` field does not need to be set manually — the system automatically sets it to `skill`. + + +Two ways to declare API key dependencies: + +```yaml +metadata: + cowagent: + requires: + env: ["MYAPI_KEY"] # Must be present +``` + +```yaml +metadata: + cowagent: + requires: + anyEnv: ["OPENAI_API_KEY", "LINKAI_API_KEY"] # At least one +``` + +**Skills are auto-enabled/disabled based on dependencies**: they activate when all required environment variables are present and deactivate when any are missing — no need for manual `/skill enable`. + +## Resource Directories + +| Directory | What goes here | What does NOT go here | +| --- | --- | --- | +| `scripts/` | Code that needs to run repeatedly, or scripts that produce deterministic results | Demo-only code snippets | +| `references/` | Documents **over 500 lines** that genuinely won't fit in SKILL.md (e.g. 
a full DB schema) | General API docs, tutorials, examples | +| `assets/` | Files that appear in the final output (templates, icons, boilerplate, etc.) | Explanatory documentation | + + +**In principle, everything goes in `SKILL.md`** — only split into resource directories when it truly won't fit. + +Do not add `README.md`, `CHANGELOG.md`, or `INSTALLATION_GUIDE.md` to a skill — put everything in `SKILL.md`. Resource directories should only contain scripts that actually run or assets that are actually used. + + +## Installing External Skills + +After installation, the skill lands in `/skills//`. + +| Source | How to install | +| --- | --- | +| URL (single file) | curl / web_fetch | +| URL (zip archive) | Download and extract | +| Local SKILL.md | Read directly | +| Local zip archive | Extract | + +Installation steps: + +1. Locate the `SKILL.md` (may be at the root or in a subdirectory of the archive) +2. Read the `name` from the frontmatter +3. Copy the **entire skill directory** (including `SKILL.md`, `scripts/`, `assets/`, etc.) to `/skills//` +4. If the archive contains an `INSTALL.md` or similar setup script, run it — but the final result must still reside under `/skills//` + +## Creating a Skill from Scratch + +Recommended order: + +1. **Clarify requirements** — ask the user for a few concrete use cases (don't ask too many at once) +2. **Plan the structure** — does this skill need scripts? Reference docs? Template assets? +3. **Scaffold** — use the init script: + + ```bash + scripts/init_skill.py --path /skills [--resources scripts,references,assets] [--examples] + ``` + +4. **Fill in content** — write SKILL.md, add scripts and resources. Always test scripts after writing them +5. **Validate** (optional): + + ```bash + scripts/quick_validate.py /skills/ + ``` + +6. **Iterate** — keep improving based on real-world usage feedback + +## Naming Conventions + +- Use only lowercase letters, digits, and hyphens. Normalise user-given names, e.g. 
`Plan Mode` → `plan-mode` +- Maximum 64 characters +- Keep it short, start with a verb, make it self-explanatory +- Use tool names as prefixes when appropriate, e.g. `gh-address-comments`, `linear-address-issue` +- The directory name and the `name` field must match exactly + +## Three-Level Loading + +Skills are not loaded into context all at once — they use a three-level progressive loading mechanism: + +1. **Metadata** (`name` + `description`) — always in context (~100 words). The Agent uses this to decide whether to invoke the skill +2. **SKILL.md body** — loaded only when the skill is activated; keep it under 500 lines +3. **Resource files** — read on demand by the Agent + +For skills with multiple variants (e.g. multi-cloud deployment), organise like this: + +``` +cloud-deploy/ +├── SKILL.md # Main workflow and provider selection logic +└── references/ + ├── aws.md + ├── gcp.md + └── azure.md +``` + +When the user picks AWS, the Agent only reads `aws.md` — no need to load all three providers. + +## Common Design Patterns + +**Step-by-step**: numbered steps with corresponding scripts. + +```markdown +1. Analyse form structure (run analyze_form.py) +2. Generate field mappings (edit fields.json) +3. Auto-fill the form (run fill_form.py) +``` + +**Branching**: different flows based on user intent. + +```markdown +1. Determine operation type: + **Creating new content?** → follow the "Create" workflow + **Editing existing content?** → follow the "Edit" workflow +``` + +**Template-based**: when output format has strict requirements, include a template in SKILL.md for the Agent to follow. 
diff --git a/docs/ja/skills/image-generation.mdx b/docs/ja/skills/image-generation.mdx new file mode 100644 index 00000000..cafc9eb3 --- /dev/null +++ b/docs/ja/skills/image-generation.mdx @@ -0,0 +1,158 @@ +--- +title: image-generation - 画像生成 +description: テキストから画像生成 / 画像編集 / 複数画像の融合、複数プロバイダーの自動ルーティングとフォールバック対応 +--- + +汎用の画像生成・編集スキルです。OpenAI、Gemini、Seedream(Volcengine Ark)、Qwen(DashScope)、MiniMax、LinkAI の 6 社に対応。モデルを手動で選ぶ必要はなく、固定の優先順位に従って、設定済みのプロバイダーを自動的に選択します。 + +## モデル選択 + +`image-generation` は「固定優先度 + 自動フォールバック」のストラテジーを採用しています。API Key を設定するだけで使えます: + +1. **優先順位**: `OpenAI → Gemini → Seedream → Qwen → MiniMax → LinkAI` +2. **未設定のプロバイダーはスキップ**: API Key が設定されているプロバイダーのみが参加 +3. **失敗時は自動で次へ**: 401、モデル未開通、ネットワークエラーなどの場合、次のプロバイダーを試行 +4. **モデル指定時は前置**: 特定のモデル名を渡すと、そのプロバイダーが最前列に昇格 + +### 対応モデル + +| プロバイダー | モデル / エイリアス | 特徴 | +| --- | --- | --- | +| OpenAI | `gpt-image-2`、`gpt-image-1` | 汎用テキスト→画像、高品質、`quality` パラメータ対応 | +| Gemini Nano Banana | `nano-banana-2`、`nano-banana-pro`、`nano-banana` | `gemini-3.1-flash`、`gemini-3-pro`、`gemini-2.5-flash` の画像バージョン | +| Seedream(Volcengine Ark) | `seedream-5.0-lite`、`seedream-4.5` | ネイティブ 2K–4K、最大 14 枚の参照画像を融合 | +| Qwen(DashScope) | `qwen-image-2.0`、`qwen-image-2.0-pro` | 中国語テキスト描画やテキスト・画像レイアウトに強い | +| MiniMax | `image-01` | シンプルで高速な画像生成 | +| LinkAI | 任意のモデル | 汎用プロキシ、フォールバック用 | + + +デフォルトでは Agent はモデルを選ばず、自動ルーティングを使用します。特定のモデルを使いたい場合は、会話で直接指定してください(例:「seedream で猫を描いて」「gpt-image-2 でポスターを作って」)。下記の「カスタム設定」でデフォルトモデルを固定することもできます。 + + +## カスタム設定 + +### API Key の設定 + +**少なくとも 1 つ**のプロバイダーの Key が必要です。複数設定すると自動フォールバックが有効になります。設定方法は 3 通り: + +#### 方法 1:既存のモデル Key を自動再利用 + +Web コンソールや `config.json` で対話モデルの Key(`openai_api_key`、`gemini_api_key` など)を設定済みの場合、起動時にこれらの Key は対応する環境変数に**自動同期**されます。つまり、対話モデルが使えていれば、画像生成も同じ Key で追加設定なしに利用できます。 + +#### 方法 2:config.json で設定 + +`config.json` に Key フィールドを直接記述: + +```json +{ + "openai_api_key": "sk-xxx", + "openai_api_base": "https://api.openai.com/v1", + "gemini_api_key": "AIza-xxx", + "ark_api_key": 
"xxx", + "dashscope_api_key": "sk-xxx", + "minimax_api_key": "xxx", + "linkai_api_key": "xxx" +} +``` + +変更後は再起動が必要です。各 Key には対応する `*_api_base` フィールドがあり、カスタムエンドポイントを指定できます。 + +#### 方法 3:会話で直接設定 + +チャットで API Key を送信すると、Agent が `env_config` ツールで `~/cow/.env` に保存します。**再起動不要**でただちに反映されます。例: + +``` +OPENAI_API_KEY を sk-xxx に設定して +``` + +または: + +``` +ARK_API_KEY を xxx に設定して +``` + +### API Key 一覧 + +| 環境変数 | config.json フィールド | プロバイダー | デフォルト Base URL | +| --- | --- | --- | --- | +| `OPENAI_API_KEY` | `openai_api_key` | OpenAI | `https://api.openai.com/v1` | +| `GEMINI_API_KEY` | `gemini_api_key` | Gemini | `https://generativelanguage.googleapis.com` | +| `ARK_API_KEY` | `ark_api_key` | Volcengine Ark(Seedream) | `https://ark.cn-beijing.volces.com/api/v3` | +| `DASHSCOPE_API_KEY` | `dashscope_api_key` | Alibaba DashScope(Qwen) | `https://dashscope.aliyuncs.com` | +| `MINIMAX_API_KEY` | `minimax_api_key` | MiniMax | `https://api.minimaxi.com` | +| `LINKAI_API_KEY` | `linkai_api_key` | LinkAI | `https://api.link-ai.tech` | + +### デフォルトモデルの固定 + +すべての画像生成を特定のプロバイダーのモデルで固定したい場合、`config.json` に以下を追加: + +```json +"skill": { + "image-generation": { + "model": "seedream-5.0-lite" + } +} +``` + +起動時にこの設定は環境変数 `SKILL_IMAGE_GENERATION_MODEL` に自動変換され、スクリプトはこのモデルのプロバイダーを常に使用します。 + +## 有効化と無効化 + +`image-generation` は内蔵スキルで、**API Key に基づいてステータスが自動調整**されます: + +- **Key 設定済み**:スキルはアクティブ — Agent は画像生成リクエストを受けると呼び出す +- **Key 未設定**:スキルはコンテキストに表示される(「設定が必要」とマーク)— Agent は呼び出し失敗の代わりに Key の設定を案内する + +手動で制御する場合: + +```text +/skill disable image-generation # 無効化(Key があっても呼び出されない) +/skill enable image-generation # 再有効化 +``` + +ターミナルでは `cow skill disable image-generation` / `cow skill enable image-generation`。 + +## パラメータ + +| パラメータ | 型 | 必須 | デフォルト | 説明 | +| --- | --- | --- | --- | --- | +| `prompt` | string | はい | — | 画像の説明 | +| `image_url` | string / list | いいえ | null | 編集用の入力画像。ローカルパスまたは URL。複数指定で複数画像融合 | +| `quality` | string | いいえ | auto | `low` / `medium` / `high` — 一部のプロバイダーのみ対応 | +| `size` | 
string | いいえ | auto | `512` / `1K` / `2K` / `3K` / `4K`、またはピクセル値(例: `1024x1024`) | +| `aspect_ratio` | string | いいえ | null | `1:1` / `3:2` / `2:3` / `16:9` / `9:16` / `21:9`;Gemini は `1:4` / `4:1` / `1:8` / `8:1` にも対応 | + + +**品質が高いほど・解像度が大きいほど、コストが高く、時間がかかります。** + +- 日常の会話やプレビューにはデフォルト(`auto`)、または `quality=low` + `size=1K` を使用 — 約 20 秒で生成 +- ポスターやユーザーが高解像度を明示的に要求した場合は `quality=high` + `size=2K/4K` — モデルによって 1〜5 分かかる場合があります + + +## 出力 + +成功時: + +```json +{ + "model": "doubao-seedream-5-0-260128", + "images": [ + {"url": "/path/to/output.png"} + ] +} +``` + +失敗時:`{ "error": "..." }`。エラー後は**直接リトライしないでください** — ほぼ確実に設定の問題です(Key の誤り、API ベース URL の不一致、モデル未開通など)。まず設定を修正してから再試行してください。 + +## よくある使い方 + +- **テキスト→画像**:説明からイラスト、ポスター、アイコン、アバター、絵コンテなどを生成 +- **画像→画像**:既存の画像のスタイル変更、要素の入れ替え、装飾やテキストの追加 +- **複数画像の融合**:複数の参照画像を 1 枚に合成(着せ替え、キャラクター集合写真など) + + +- bash タイムアウトは 600 秒に設定してください。各プロバイダーの HTTP タイムアウトは 300 秒ですが、スクリプトが複数のプロバイダーを順番に試行する場合があります +- 入力画像は自動的に 4 MB 以下・最長辺 4096 px 以下に圧縮されます +- Gemini / Seedream / Qwen / MiniMax は `quality` パラメータに対応していません(渡しても無視されます) +- Seedream のデフォルトは 2K。`seedream-5.0-lite` は 3K まで、`seedream-4.5` は 4K まで対応 + diff --git a/docs/ja/skills/knowledge-wiki.mdx b/docs/ja/skills/knowledge-wiki.mdx new file mode 100644 index 00000000..28760c80 --- /dev/null +++ b/docs/ja/skills/knowledge-wiki.mdx @@ -0,0 +1,112 @@ +--- +title: knowledge-wiki - ナレッジベース +description: ローカルの構造化ナレッジベースを管理し、自動でアーカイブ・分類・相互参照を行う +--- + +会話で生まれた資料、アイデア、メモをローカルの構造化ナレッジベースに整理し、インデックスとページ間の相互参照を自動で維持します。 + +`knowledge-wiki` はワークスペース内の `knowledge/` ディレクトリを管理します。Agent の「外部メモリ」のようなものです。`always: true` が設定されているため**常にコンテキストにロード**され、外部依存は不要です。 + +## いつ起動するか + +- 記事、ドキュメント、URL を共有して、後で参照できるように残したいとき +- 会話の中で長期保存に値する結論が出たとき +- 以前蓄積したナレッジを調べたいとき + +## ディレクトリ構成 + +``` +knowledge/ +├── index.md # グローバルインデックス(必ずメンテナンスする) +├── log.md # 操作ログ(追記のみ) +└── / # カテゴリサブディレクトリ(内容ごとにグループ化) + └── .md # ナレッジページ(小文字ハイフン区切りのファイル名) +``` + +## 3 つの基本操作 + +### 1. 収録(Ingest) + +資料を共有すると、Agent は: + +1. 
原文を読んで理解し、重要な情報を抽出 +2. どのカテゴリに属するか判断 — まず `index.md` をチェックし、適切なカテゴリがなければ新規作成 +3. `knowledge/<category>/<slug>.md` にナレッジページを生成 +4. インデックス `index.md` とログ `log.md` を更新 + +### 2. 統合(Synthesize) + +会話の中で新しい結論やインサイトが生まれたとき: + +1. 適切なカテゴリの下に新しいナレッジページを作成 +2. 関連する既存ページに相互リンクを追加 +3. インデックスとログを更新 + +### 3. 検索(Query) + +以前蓄積したナレッジについて質問されたとき: + +1. `index.md` から関連しそうなページを探す +2. `read` ツールで具体的なページを開く +3. 必要に応じて `memory_search` で補完検索 +4. 回答にナレッジページへのリンクを含め、ユーザーが原文を確認できるようにする + +## ページの書き方 + +```markdown +# ページタイトル + +> Source: <ソース URL または簡単な説明> + +本文。ページ間は相対パスでリンク: +[関連ページ](../category/related-page.md) + +## 要点 + +- ... + +## 関連ページ + +- [ページ A](../category/page-a.md) — 関連する理由 +``` + + +- `> Source:` はこのナレッジの出典を記録します。明確な出典がある場合は必ず記載してください +- 相互参照は重要です:ページを作成・更新したら、関連ページにも逆リンクを追加してください +- **既に存在するページにのみリンクしてください**。ある概念が独立ページに値する場合は、先にページを作成してからリンクを追加してください + + +## インデックス形式 + +`knowledge/index.md` はフラットリスト形式で、カテゴリごとにグループ化し、各ナレッジページを 1 行で表します: + +```markdown +# Knowledge Index + +## カテゴリ A +- [ページタイトル](category-a/page-slug.md) — 一行の要約 + +## カテゴリ B +- [ページタイトル](category-b/page-slug.md) — 一行の要約 +``` + +テーブルや絵文字は使いません。カテゴリ名や構成は柔軟に調整できます。 + +## ログ形式 + +`knowledge/log.md` は追記のみ、最新のエントリが一番下: + +```markdown +## [YYYY-MM-DD] ingest | ページタイトル +## [YYYY-MM-DD] synthesize | ページタイトル +``` + +## 執筆ガイドライン + +- **ファイル名**は小文字+ハイフン(例: `machine-learning.md`) +- **1 ページ 1 トピック** — 関連コンテンツはリンクで繋ぐ +- **重複ページを作らず、既存ページを更新する** +- **変更のたびにインデックスを更新する**(`knowledge/index.md`) +- **要点を抽出し、全文をコピーしない** +- **会話中にナレッジページを参照する際はフルパスを使用**(例: `[タイトル](knowledge/<category>/<slug>.md)`)。ページ間の相互リンクのみ相対パスを使用 +- **ナレッジページに基づいて回答する際はリンクを含める** — ユーザーが詳細を確認できるように diff --git a/docs/ja/skills/skill-creator.mdx b/docs/ja/skills/skill-creator.mdx new file mode 100644 index 00000000..130548ec --- /dev/null +++ b/docs/ja/skills/skill-creator.mdx @@ -0,0 +1,180 @@ +--- +title: skill-creator - スキル作成 +description: スキルの作成・インストール・更新、SKILL.md の書き方とディレクトリ構成の標準化 +--- + +`skill-creator` は「メタスキル」です。Agent が他のスキルを作成・インストール・更新する際に呼び出され、すべてのスキルの `SKILL.md` 
の書き方とディレクトリ構成を統一します。 + +## いつ起動するか + +- ユーザーが URL やリモートリポジトリからスキルをインストールしたいとき +- ユーザーが新しいスキルをゼロから作成したいとき +- 既存のスキルをアップグレード・リファクタリングする必要があるとき + +## スキルとは + +スキルは「再利用可能な説明書」にオプションのスクリプトやリソースを加えたものです。特定のドメインの専門知識を Agent に注入し、該当タスクをスペシャリストのように処理できるようにします。 + +スキルには通常、以下が含まれます: + +1. **専門ワークフロー** — ある種のタスクの完全な手順 +2. **ツールの使い方** — 特定の API やファイル形式の処理方法 +3. **ドメイン知識** — チームの規約、ビジネスルール、データ構造など +4. **付属リソース** — スクリプト、参考ドキュメント、テンプレートなど + + +**基本原則:省けるものは省く。** Agent が自力で推測できない内容だけを書きましょう。1 行追加するたびに「このトークンコストに見合うか?」と自問してください。 + + +## ディレクトリ構成 + +``` +skill-name/ +├── SKILL.md # 必須:スキル定義 +│ ├── YAML frontmatter(name / description は必須) +│ └── Markdown 本文(説明 + 例) +└── オプションリソース + ├── scripts/ # 実行可能スクリプト(Python / Bash など) + ├── references/ # 分量が多い参考ドキュメント(Agent が必要時に読む) + └── assets/ # テンプレート、アイコンなど(出力に直接使われるもの) +``` + +## SKILL.md 仕様 + +SKILL.md ヘッダーの `frontmatter` フィールド: + +| フィールド | 説明 | +| --- | --- | +| `name` | スキル名。小文字+ハイフン、ディレクトリ名と一致させる | +| `description` | **最も重要なフィールド**。「このスキルが何をするか」「いつ使うべきか」を明記する。Agent はこれを見て呼び出すかどうかを判断する。トリガーに関する記述はすべてここに書き、本文には書かない | +| `metadata.cowagent.requires.bins` | システムに必要な CLI ツール | +| `metadata.cowagent.requires.env` | 必要な環境変数(すべて揃っている必要がある) | +| `metadata.cowagent.requires.anyEnv` | 複数の API Key のうち 1 つあればよい | +| `metadata.cowagent.requires.anyBins` | 複数のツールのうち 1 つあればよい | +| `metadata.cowagent.always` | `true` にすると常にロードされ、依存チェックをスキップ | +| `metadata.cowagent.emoji` | 表示用の絵文字(任意) | +| `metadata.cowagent.os` | OS 制限、例: `["darwin", "linux"]` | + + +`category` フィールドは手動で設定する必要はありません。システムが自動的に `skill` に設定します。 + + +API Key 依存の宣言方法は 2 通り: + +```yaml +metadata: + cowagent: + requires: + env: ["MYAPI_KEY"] # 必須 +``` + +```yaml +metadata: + cowagent: + requires: + anyEnv: ["OPENAI_API_KEY", "LINKAI_API_KEY"] # いずれか 1 つ +``` + +**スキルは依存関係に基づいて自動的に有効/無効になります**:環境変数が揃えば自動有効、不足すれば自動無効。手動で `/skill enable` する必要はありません。 + +## リソースディレクトリの使い方 + +| ディレクトリ | 入れるもの | 入れないもの | +| --- | --- | --- | +| `scripts/` | 繰り返し実行するコード、確定的な結果が必要なスクリプト | デモ用のコード片 | +| 
`references/` | **500 行超**で SKILL.md に収まらない大きなドキュメント(完全な DB スキーマなど) | 一般的な API ドキュメント、チュートリアル | +| `assets/` | 最終出力に含まれるファイル(テンプレート、アイコン、ボイラープレートなど) | 説明用ドキュメント | + + +**原則としてすべての内容を `SKILL.md` に書きます** — リソースディレクトリに分割するのは本当に収まらない場合だけです。 + +`README.md`、`CHANGELOG.md`、`INSTALLATION_GUIDE.md` などをスキルに追加しないでください。すべて `SKILL.md` に入れましょう。リソースディレクトリには実際に実行するスクリプトや実際に使う素材だけを配置してください。 + + +## 外部スキルのインストール + +インストール後、スキルは `/skills//` に配置されます。 + +| ソース | インストール方法 | +| --- | --- | +| URL(単一ファイル) | curl / web_fetch | +| URL(zip アーカイブ) | ダウンロードして展開 | +| ローカル SKILL.md | 直接読み込み | +| ローカル zip アーカイブ | 展開 | + +インストール手順: + +1. `SKILL.md` を見つける(アーカイブのルートまたはサブディレクトリにある場合がある) +2. frontmatter から `name` を読み取る +3. **スキルディレクトリ全体**(`SKILL.md`、`scripts/`、`assets/` など)を `/skills//` にコピー +4. アーカイブに `INSTALL.md` などのセットアップスクリプトがあれば実行するが、最終的に `/skills//` に収まっている必要がある + +## スキルをゼロから作成 + +推奨手順: + +1. **要件を明確にする** — ユーザーに具体的なユースケースをいくつか挙げてもらう(一度に多く聞きすぎない) +2. **構成を計画する** — スクリプトは必要か?参考ドキュメントは?テンプレートは? +3. **スキャフォールド** — 初期化スクリプトを使用: + + ```bash + scripts/init_skill.py --path /skills [--resources scripts,references,assets] [--examples] + ``` + +4. **内容を埋める** — SKILL.md を書き、スクリプトとリソースを追加。スクリプトは必ず実行テストする +5. **バリデーション**(任意): + + ```bash + scripts/quick_validate.py /skills/ + ``` + +6. **イテレーション** — 実際の使用フィードバックに基づいて継続的に改善 + +## 命名規則 + +- 小文字、数字、ハイフンのみ使用。ユーザーの入力は正規化する(例: `Plan Mode` → `plan-mode`) +- 64 文字以内 +- 短く、動詞で始め、一目で何をするか分かるように +- 必要に応じてツール名をプレフィックスにする(例: `gh-address-comments`、`linear-address-issue`) +- ディレクトリ名と `name` フィールドは完全に一致させる + +## 3 段階ローディング + +スキルは一度にすべてコンテキストに読み込まれるわけではなく、3 段階で必要に応じてロードされます: + +1. **メタ情報**(`name` + `description`) — 常にコンテキスト内(約 100 語)。Agent がスキルを使うかどうかの判断に使用 +2. **SKILL.md 本文** — スキルが有効化されたときだけロード。500 行以内を推奨 +3. 
**リソースファイル** — Agent が必要なときに読み込む + +複数のバリエーション(例: マルチクラウドデプロイ)を持つスキルは次のように整理: + +``` +cloud-deploy/ +├── SKILL.md # メインワークフローとプロバイダー選択ロジック +└── references/ + ├── aws.md + ├── gcp.md + └── azure.md +``` + +ユーザーが AWS を選んだら、Agent は `aws.md` だけを読みます。3 社分のドキュメントをすべてロードする必要はありません。 + +## よくあるデザインパターン + +**ステップ式**:番号付きの手順と対応スクリプト。 + +```markdown +1. フォーム構造を分析(analyze_form.py を実行) +2. フィールドマッピングを生成(fields.json を編集) +3. フォームを自動入力(fill_form.py を実行) +``` + +**分岐式**:ユーザーの意図に応じて異なるフローへ。 + +```markdown +1. 操作タイプを判定: + **新規作成?** → 「作成フロー」へ + **既存の編集?** → 「編集フロー」へ +``` + +**テンプレート式**:出力形式に厳密な要件がある場合、SKILL.md にテンプレートを含め、Agent にそれに従って出力させる。 diff --git a/docs/skills/image-generation.mdx b/docs/skills/image-generation.mdx new file mode 100644 index 00000000..e64cc846 --- /dev/null +++ b/docs/skills/image-generation.mdx @@ -0,0 +1,160 @@ +--- +title: image-generation - 图像生成 +description: 文生图 / 图生图 / 多图融合,支持多家厂商自动路由与回退 +--- + +通用的图像生成与编辑技能,支持 OpenAI、Gemini、Seedream(火山方舟)、Qwen(百炼)、MiniMax、LinkAI 共六家厂商。不需要手动选模型,脚本会按固定优先级自动挑选已配置的厂商来出图。 + +## 模型选择 + +`image-generation` 采用「固定优先级 + 自动回退」的策略,配好 Key 就能用: + +1. **优先级顺序**:`OpenAI → Gemini → Seedream → Qwen → MiniMax → LinkAI` +2. **没配 Key 的跳过**:只有设了 API Key 的厂商才会参与 +3. **失败自动切下一家**:遇到 401、模型未开通、网络异常等错误时,会自动试下一个 +4. 
**指定模型时前置**:如果明确传了某个模型名,对应厂商会被提到最前面先试 + +### 支持的模型 + +| 厂商 | 模型 / 别名 | 特点 | +| --- | --- | --- | +| OpenAI | `gpt-image-2`、`gpt-image-1` | 通用文生图,高质量、高智能,支持 `quality` 参数控制画质 | +| Gemini Nano Banana | `nano-banana-2`、`nano-banana-pro`、`nano-banana` | 对应 `gemini-3.1-flash`、`gemini-3-pro`、`gemini-2.5-flash` 的图像版本 | +| Seedream(火山方舟) | `seedream-5.0-lite`、`seedream-4.5` | 原生 2K–4K,最多 14 张图融合 | +| Qwen(百炼) | `qwen-image-2.0`、`qwen-image-2.0-pro` | 擅长中文排版和图文融合 | +| MiniMax | `image-01` | 简单快速的图片生成 | +| LinkAI | 任意模型 | 通用代理,兜底用 | + + +默认情况下 Agent 不会主动选模型,而是走自动路由。如果你想用某个特定模型,直接在对话里说就行,比如「用 seedream 画一只猫」或「用 gpt-image-2 生成海报」。也可以通过下面的「自定义配置」固定默认模型。 + + +## 自定义配置 + +### API Key 配置 + +至少需要配**一个**厂商的 Key,配多个就能享受自动回退能力。有三种配置方式: + +#### 方式一:已有模型 Key 自动复用 + +如果你在 Web 控制台 或 `config.json` 中配置了对话模型的 Key(比如 `openai_api_key`、`gemini_api_key` 等),启动时这些 Key 会被**自动同步**到对应的环境变量。也就是说,只要你的对话模型能用,图像生成就能直接用同一个 Key,不需要额外配置。 + +#### 方式二:在 config.json 中配置 + +在 `config.json` 中直接写对应的 Key 字段即可,支持的字段如下: + +```json +{ + "openai_api_key": "sk-xxx", + "openai_api_base": "https://api.openai.com/v1", + "gemini_api_key": "AIza-xxx", + "ark_api_key": "xxx", + "dashscope_api_key": "sk-xxx", + "minimax_api_key": "xxx", + "linkai_api_key": "xxx" +} +``` + +修改后需要重启生效。每个 Key 还有对应的 `*_api_base` 字段可以自定义接口地址。 + +#### 方式三:对话中直接配置 + +在对话里发送 API Key,Agent 会通过 `env_config` 工具自动保存到 `~/cow/.env`,**不需要重启**就能生效。例如: + +``` +帮我配置 OPENAI_API_KEY 为 sk-xxx +``` + +或者: + +``` +设置 ARK_API_KEY 为 xxx +``` + +### API Key 一览 + +| 环境变量 | config.json 字段 | 对应厂商 | 默认 Base URL | +| --- | --- | --- | --- | +| `OPENAI_API_KEY` | `openai_api_key` | OpenAI | `https://api.openai.com/v1` | +| `GEMINI_API_KEY` | `gemini_api_key` | Gemini | `https://generativelanguage.googleapis.com` | +| `ARK_API_KEY` | `ark_api_key` | 火山方舟(Seedream) | `https://ark.cn-beijing.volces.com/api/v3` | +| `DASHSCOPE_API_KEY` | `dashscope_api_key` | 阿里百炼(Qwen) | `https://dashscope.aliyuncs.com` | +| `MINIMAX_API_KEY` | `minimax_api_key` | MiniMax | 
`https://api.minimaxi.com` | +| `LINKAI_API_KEY` | `linkai_api_key` | LinkAI | `https://api.link-ai.tech` | + + +### 指定默认模型 + +如果想让所有图像生成固定走某个厂商的模型,可以在 `config.json` 里加: + +```json +"skill": { + "image-generation": { + "model": "seedream-5.0-lite" + } +} +``` + +启动时这段配置会被自动转成环境变量 `SKILL_IMAGE_GENERATION_MODEL`,脚本读到后会固定使用这个模型所在的厂商进行生成。 + + +## 开启和关闭 + +`image-generation` 是内置技能,**会根据 API Key 自动调整状态**: + +- **Key 已配置**:技能正常可用,Agent 收到画图请求时会直接调用 +- **Key 未配置**:技能仍然会出现在上下文中(标记为「需要配置」),Agent 会引导用户去配 Key,而不是直接调用失败 + +如果想手动控制,也可以用命令: + +```text +/skill disable image-generation # 手动关闭(即使有 Key 也不会被调用) +/skill enable image-generation # 重新开启 +``` + +终端里对应的命令是 `cow skill disable image-generation` / `cow skill enable image-generation`。 + +## 参数 + +| 参数 | 类型 | 必填 | 默认 | 说明 | +| --- | --- | --- | --- | --- | +| `prompt` | string | 是 | — | 图像描述 | +| `image_url` | string / list | 否 | null | 编辑用的输入图,支持本地路径或 URL。传多个就是多图融合 | +| `quality` | string | 否 | auto | `low` / `medium` / `high`,只有部分厂商支持 | +| `size` | string | 否 | auto | `512` / `1K` / `2K` / `3K` / `4K`,也可以写像素值如 `1024x1024` | +| `aspect_ratio` | string | 否 | null | `1:1` / `3:2` / `2:3` / `16:9` / `9:16` / `21:9`;Gemini 还支持 `1:4` / `4:1` / `1:8` / `8:1` | + + +**质量越高、分辨率越大,花的钱越多、等的时间越长。** + +- 日常对话和快速预览直接用默认(`auto`),或者 `quality=low` + `size=1K`,大概 20 秒出图 +- 做海报、用户明确要高清的时候再上 `quality=high` + `size=2K/4K`,可能要等 1~5 分钟,取决于不同模型的速度 + + +## 输出 + +成功时返回: + +```json +{ + "model": "doubao-seedream-5-0-260128", + "images": [ + {"url": "/path/to/output.png"} + ] +} +``` + +失败时返回 `{ "error": "..." 
}`。出错后**不要直接重试**——大概率是配置问题(Key 填错、API 地址不对、模型没开通),让用户修好配置再试。 + +## 常见用法 + +- **文生图**:根据描述生成插画、海报、图标、头像、分镜图等 +- **图生图**:在已有图片上改风格、换元素、加装饰、加文字等 +- **多图融合**:把多张参考图合成一张(换装、角色合影等) + + +- bash 超时建议设 600 秒。单个厂商的 HTTP 超时是 300 秒,但脚本可能依次尝试多个厂商 +- 输入的图片会自动压缩到 4MB 以内、最长边不超过 4096px +- Gemini / Seedream / Qwen / MiniMax 不支持 `quality` 参数,传了也没用 +- Seedream 默认出 2K 图,`seedream-5.0-lite` 支持到 3K,`seedream-4.5` 支持到 4K + diff --git a/docs/skills/knowledge-wiki.mdx b/docs/skills/knowledge-wiki.mdx new file mode 100644 index 00000000..40b4d298 --- /dev/null +++ b/docs/skills/knowledge-wiki.mdx @@ -0,0 +1,112 @@ +--- +title: knowledge-wiki - 知识库 +description: 维护本地结构化知识库,自动归档、分类和交叉引用 +--- + +帮你把对话中产生的资料、灵感和零散笔记整理成结构化的本地知识库,自动维护索引和页面之间的交叉引用。 + +`knowledge-wiki` 在工作空间下维护一个 `knowledge/` 目录,相当于 Agent 的「外脑」。技能设置了 `always: true`,会**常驻上下文**,不需要任何外部依赖。 + +## 什么时候会触发 + +- 你分享了一篇文章、一份文档或一个 URL,想要沉淀下来 +- 聊天过程中聊出了值得长期保留的结论 +- 你想查一下之前积累过的知识 + +## 目录结构 + +``` +knowledge/ +├── index.md # 全局索引(必须维护) +├── log.md # 操作日志(只追加) +└── <category>/ # 分类子目录(按内容自由分组) + └── <slug>.md # 知识页(文件名用小写加中划线) +``` + +## 三个核心操作 + +### 1. 收录(Ingest) + +你分享了一段资料时,Agent 会: + +1. 读懂原文,提取关键信息 +2. 按内容决定放到哪个分类下——先看 `index.md` 里有没有合适的分类,没有就新建一个 +3. 生成知识页 `knowledge/<category>/<slug>.md` +4. 更新索引 `index.md` 和日志 `log.md` + +### 2. 综合(Synthesize) + +聊天中产生了新的结论或洞见时: + +1. 在合适的分类下创建新知识页 +2. 给相关的已有页面加上互相指向的链接 +3. 更新索引和日志 + +### 3. 查询(Query) + +你问到以前积累的知识时: + +1. 先从 `index.md` 里找可能相关的页面 +2. 用 `read` 工具打开具体页面 +3. 需要时再用 `memory_search` 补充检索 +4. 回答里会带上知识页的链接,方便你点过去看原文 + +## 知识页怎么写 + +```markdown +# 页面标题 + +> Source: <来源 URL 或简要说明> + +正文内容。页面之间用相对路径链接: +[相关页](../category/related-page.md) + +## 要点 + +- ... 
+ +## 相关页面 + +- [页面 A](../category/page-a.md) — 为什么相关 +``` + + +- `> Source:` 用来记录这条知识的来源。有明确来源时一定要写 +- 交叉引用很重要:创建或更新某页时,记得也去关联页面里补上反向链接 +- **只链接已经存在的页面**。如果某个概念值得单独成页,先建好再加链接 + + +## 索引格式 + +`knowledge/index.md` 采用扁平列表,按分类分组,每个知识页占一行: + +```markdown +# Knowledge Index + +## 分类 A +- [页面标题](category-a/page-slug.md) — 一句话摘要 + +## 分类 B +- [页面标题](category-b/page-slug.md) — 一句话摘要 +``` + +不用表格,不加 emoji。分类怎么起名、怎么组织都可以灵活调整。 + +## 日志格式 + +`knowledge/log.md` 只追加、不修改,最新的写在最下面: + +```markdown +## [YYYY-MM-DD] ingest | 页面标题 +## [YYYY-MM-DD] synthesize | 页面标题 +``` + +## 写作约定 + +- **文件名**用小写加中划线,比如 `machine-learning.md` +- **一页只讲一件事**,需要关联的内容通过链接串起来 +- **有了就更新,不要重复建页** +- **每次改完都要更新索引** `knowledge/index.md` +- **写精华别抄全文**,抓住要点就行 +- **对话里引用知识页时用完整路径**,比如 `[标题](knowledge/<category>/<slug>.md)`。页面之间互相链接才用相对路径 +- **基于知识页回答问题时附上链接**,方便深入查阅 diff --git a/docs/skills/skill-creator.mdx b/docs/skills/skill-creator.mdx new file mode 100644 index 00000000..623a74f6 --- /dev/null +++ b/docs/skills/skill-creator.mdx @@ -0,0 +1,180 @@ +--- +title: skill-creator - 技能创建 +description: 创建、安装、更新技能,规范 SKILL.md 写法与目录结构 +--- + +`skill-creator` 是一个「元技能」,专门用来帮助 Agent 创建、安装和更新其他技能,确保所有技能的 `SKILL.md` 写法和目录结构保持一致。 + +## 什么时候会触发 + +- 用户想从 URL 或远程仓库安装一个技能 +- 用户想从头创建一个全新的技能 +- 需要升级或重构已有技能 + +## 技能是什么 + +简单来说,技能就是一份「可复用的说明书」加上可选的脚本和资源。它给 Agent 注入了某个领域的专业知识,让 Agent 在遇到对应任务时能像专家一样处理。 + +一个技能通常包含以下内容: + +1. **专项工作流** — 某类任务的完整步骤 +2. **工具用法** — 怎么调某种 API 或处理某种文件 +3. **领域知识** — 团队约定、业务规则、数据结构之类 +4. **附带资源** — 脚本、参考文档、模板等 + + +**核心原则:能省则省**。只写 Agent 自己想不到的内容,每加一行都要问自己:值不值得占这些 token? 
+ + +## 目录结构 + +``` +skill-name/ +├── SKILL.md # 必需:技能定义 +│ ├── YAML frontmatter(必填 name / description) +│ └── Markdown 正文(说明 + 示例) +└── 可选资源 + ├── scripts/ # 可执行脚本(Python / Bash 等) + ├── references/ # 内容较多的参考文档,Agent 按需读取 + └── assets/ # 模板、图标等,会直接用在输出里 +``` + +## SKILL.md 规范定义 + +SKILL.md 文件头部的 `frontmatter` 字段: + +| 字段 | 说明 | +| --- | --- | +| `name` | 技能名,小写加中划线,必须和目录名一致 | +| `description` | **最关键的字段**。写清楚「这个技能干什么」和「什么情况下该用它」,Agent 看到这段来决定要不要调它。注意:所有触发相关的描述都放在这里,不要写到正文里 | +| `metadata.cowagent.requires.bins` | 系统里必须装了哪些命令行工具 | +| `metadata.cowagent.requires.env` | 需要哪些环境变量(全部满足才行) | +| `metadata.cowagent.requires.anyEnv` | 多个 API Key 满足一个就行 | +| `metadata.cowagent.requires.anyBins` | 多个工具满足一个就行 | +| `metadata.cowagent.always` | 设为 `true` 会始终加载,不检查依赖 | +| `metadata.cowagent.emoji` | 展示用的 emoji(可选) | +| `metadata.cowagent.os` | 限定系统,如 `["darwin", "linux"]` | + + +`category` 字段不需要手写,系统会自动设成 `skill`。 + + +声明 API Key 依赖有两种写法: + +```yaml +metadata: + cowagent: + requires: + env: ["MYAPI_KEY"] # 必须有 +``` + +```yaml +metadata: + cowagent: + requires: + anyEnv: ["OPENAI_API_KEY", "LINKAI_API_KEY"] # 有一个就行 +``` + +**技能会自动按依赖启禁用**:环境变量齐了就自动启用,缺了就自动禁用,不需要手动 `/skill enable`。 + +## 资源目录怎么用 + +| 目录 | 放什么 | 不要放 | +| --- | --- | --- | +| `scripts/` | 需要反复执行的代码,或需要确定性结果的脚本 | 纯演示用的代码片段 | +| `references/` | **超过 500 行**、SKILL.md 实在塞不下的大文档(比如完整的数据库 Schema) | 普通 API 文档、示例、教程 | +| `assets/` | 会出现在最终产物里的文件(模板、图标、样板代码等) | 说明性文档 | + + +**原则上所有内容都写在 `SKILL.md` 里**,只有确实放不下才拆到资源目录。 + +不要给技能加 `README.md`、`CHANGELOG.md`、`INSTALLATION_GUIDE.md` 之类的文件——全部放进 `SKILL.md`。资源目录里只放真正要跑的脚本或真正要用的素材。 + + +## 安装外部技能 + +安装后最终落在 `/skills//` 目录。 + +| 来源 | 怎么装 | +| --- | --- | +| URL(单文件) | curl / web_fetch 直接拉 | +| URL(zip 包) | 下载解压 | +| 本地 SKILL.md | 直接读 | +| 本地 zip 包 | 解压 | + +安装步骤: + +1. 找到 `SKILL.md`(可能在包的根目录或某个子目录里) +2. 从 frontmatter 里读出 `name` +3. 把**整个技能目录**(包括 `SKILL.md`、`scripts/`、`assets/` 等)复制到 `/skills//` +4. 
如果包里有 `INSTALL.md` 之类的安装脚本,照着跑一遍,但最终结果仍然要落在 `/skills//` 下 + +## 从头创建技能 + +推荐按这个顺序来: + +1. **搞清楚需求** — 让用户举几个具体的使用场景,一次别问太多 +2. **想好结构** — 这个技能需要脚本吗?需要参考文档吗?需要模板素材吗? +3. **生成骨架** — 用初始化脚本: + + ```bash + scripts/init_skill.py --path /skills [--resources scripts,references,assets] [--examples] + ``` + +4. **填充内容** — 写好 SKILL.md、补上脚本和资源。脚本写完一定要实际跑一遍 +5. **格式校验**(可选): + + ```bash + scripts/quick_validate.py /skills/ + ``` + +6. **迭代完善** — 实际用起来之后根据反馈持续改进 + +## 命名规则 + +- 只用小写字母、数字和中划线。用户给的名字需要做标准化处理,比如 `Plan Mode` → `plan-mode` +- 长度别超过 64 个字符 +- 尽量短、用动词开头、一看就知道干什么 +- 必要时用工具名做前缀,比如 `gh-address-comments`、`linear-address-issue` +- 目录名和 `name` 字段必须完全一致 + +## 三级加载机制 + +技能不会一次性全部塞进上下文,而是分三级按需加载: + +1. **元信息**(`name` + `description`)— 常驻上下文,约 100 词。Agent 靠它判断「要不要用这个技能」 +2. **SKILL.md 正文** — 确定要用了才加载,建议控制在 500 行以内 +3. **资源文件** — Agent 需要的时候再读 + +如果一个技能涉及多个变体(比如多云厂商部署),建议这样组织: + +``` +cloud-deploy/ +├── SKILL.md # 主流程和厂商选择逻辑 +└── references/ + ├── aws.md + ├── gcp.md + └── azure.md +``` + +用户选了 AWS,Agent 只需要读 `aws.md`,不用把三家的文档全加载进来。 + +## 常见设计模式 + +**步骤式**:按编号列出操作步骤和对应脚本。 + +```markdown +1. 分析表单结构(运行 analyze_form.py) +2. 生成字段映射(编辑 fields.json) +3. 自动填充表单(运行 fill_form.py) +``` + +**分支式**:根据用户意图走不同流程。 + +```markdown +1. 
判断操作类型: + **新建内容?** → 走「创建流程」 + **编辑已有内容?** → 走「编辑流程」 +``` + +**模板式**:输出格式有严格要求时,在 SKILL.md 里直接给一个样板,让 Agent 照着写。 diff --git a/docs/tools/index.mdx b/docs/tools/index.mdx index 2b981758..dcd0a986 100644 --- a/docs/tools/index.mdx +++ b/docs/tools/index.mdx @@ -50,7 +50,7 @@ description: CowAgent 内置工具系统 搜索互联网获取实时信息 - + 分析图片内容(识别、描述、OCR 文字提取等) diff --git a/docs/tools/vision.mdx b/docs/tools/vision.mdx index 69d7255e..40d9c66b 100644 --- a/docs/tools/vision.mdx +++ b/docs/tools/vision.mdx @@ -1,5 +1,5 @@ --- -title: vision - 图片分析 +title: vision - 图片理解 description: 分析图片内容(识别、描述、OCR 等) --- diff --git a/skills/image-generation/SKILL.md b/skills/image-generation/SKILL.md index 0195d7c7..b55e192c 100644 --- a/skills/image-generation/SKILL.md +++ b/skills/image-generation/SKILL.md @@ -6,12 +6,24 @@ metadata: requires: anyEnv: - OPENAI_API_KEY + - GEMINI_API_KEY + - ARK_API_KEY + - DASHSCOPE_API_KEY + - MINIMAX_API_KEY - LINKAI_API_KEY --- # Image Generation -Generate and edit images using AI models (GPT-Image-2, GPT-Image-1, etc.). +Generate and edit images using AI models. The script automatically picks a backend based on which API keys are configured — **you don't need to specify a model unless the user explicitly names one**. + +Supported models (passed via `model` only when the user asks for a specific one): + +- **OpenAI** — `gpt-image-2`, `gpt-image-1` +- **Gemini Nano Banana** — `nano-banana-2`, `nano-banana-pro`, `nano-banana` +- **Seedream (Volcengine Ark)** — `seedream-5.0-lite`, `seedream-4.5` +- **Qwen (DashScope)** — `qwen-image-2.0`, `qwen-image-2.0-pro` +- **MiniMax** — `image-01` ## Usage @@ -21,18 +33,19 @@ Run `scripts/generate.py` with a JSON argument. The path is relative to this ski python /scripts/generate.py '' ``` -**Set bash timeout to at least 300 seconds**, as image generation can take 30–200s depending on quality/size. 
+**Set bash timeout to at least 600 seconds**, as image generation can take 30–200s per provider, and the script may try multiple providers sequentially. ### Parameters | Parameter | Type | Required | Default | Description | |-----------|------|----------|---------|-------------| | `prompt` | string | yes | — | Image description | -| `model` | string | no | `gpt-image-2` | Model name (`gpt-image-2`, `gpt-image-1`) | -| `image_url` | string / list | no | null | Input image(s) for editing: local file path or URL | -| `quality` | string | no | auto | `low` / `medium` / `high`; omit to let the model choose | -| `size` | string | no | auto | `1K`/`2K`/`4K`, pixel value (`1024x1024`), or omit to let the model choose | -| `aspect_ratio` | string | no | null | `1:1` / `3:2` / `2:3` / `16:9` / `9:16` | +| `image_url` | string / list | no | null | Input image(s) for editing: local file path or URL. Multi-image fusion is supported (pass a list) | +| `quality` | string | no | auto | `low` / `medium` / `high` (only some backends honour this) | +| `size` | string | no | auto | `512` / `1K` / `2K` / `3K` / `4K`, or pixel value (`1024x1024`) | +| `aspect_ratio` | string | no | null | `1:1` / `3:2` / `2:3` / `16:9` / `9:16` / `21:9` (some backends also support extreme ratios like `1:4` / `8:1`) | + +**Higher `quality` and larger `size` cost more and run slower.** Default to omitting both (`auto`) so the model picks a balanced setting. Only raise them when the user explicitly asks for high quality / a poster / print-ready output. For quick previews or chat scenarios prefer `quality=low` + `size=1K`. 
### Example — generate @@ -40,28 +53,26 @@ python /scripts/generate.py '' python /scripts/generate.py '{"prompt": "A corgi astronaut floating in space"}' ``` -With explicit quality/size: +With aspect ratio: ```bash -python /scripts/generate.py '{"prompt": "A corgi astronaut", "quality": "low", "size": "1K", "aspect_ratio": "1:1"}' +python /scripts/generate.py '{"prompt": "Isometric miniature city of Shanghai at sunset", "size": "2K", "aspect_ratio": "16:9"}' ``` ### Important: Editing vs Generating -When the user asks to **edit, modify, or improve an existing image**, you need to pass the original image via `image_url`. Prefer passing **local file paths** directly — the script handles file reading internally. Without `image_url`, the script generates a brand-new image instead of editing. +When the user asks to **edit, modify, or improve an existing image**, pass the original image via `image_url`. Prefer **local file paths** directly — the script handles file reading internally. Without `image_url`, the script generates a brand-new image instead of editing. 
### Example — edit (image-to-image) -Local file (preferred): - ```bash python /scripts/generate.py '{"prompt": "Add a Santa hat to the dog", "image_url": "/path/to/dog.png"}' ``` -URL: +Multi-image fusion — pass a list: ```bash -python /scripts/generate.py '{"prompt": "Make the background blue", "image_url": "https://example.com/photo.png"}' +python /scripts/generate.py '{"prompt": "Combine these characters into a group photo", "image_url": ["/path/a.png", "/path/b.png"]}' ``` ### Output @@ -70,6 +81,7 @@ Prints JSON to stdout: ```json { + "model": "doubao-seedream-5-0-260128", "images": [ {"url": "/path/to/output.png"} ] @@ -86,39 +98,20 @@ On error: } ``` -### Environment Variables +### Setup -| Variable | Required | Description | -|----------|----------|-------------| -| `OPENAI_API_KEY` | yes (unless using LinkAI) | OpenAI API key | -| `OPENAI_API_BASE` | no | Custom API base URL (default: `https://api.openai.com/v1`) | -| `LINKAI_API_KEY` | alt | LinkAI API key (used when `OPENAI_API_KEY` is absent) | -| `LINKAI_API_BASE` | no | LinkAI API base URL | +The script needs **at least one** of these API keys (set via `env_config` or `config.json`): -### Size + Aspect Ratio Resolution +`OPENAI_API_KEY` / `GEMINI_API_KEY` / `ARK_API_KEY` / `DASHSCOPE_API_KEY` / `MINIMAX_API_KEY` / `LINKAI_API_KEY` -`size` and `aspect_ratio` are combined to determine the actual pixel dimensions: - -| size | aspect_ratio | pixels | -|------|-------------|--------| -| `1K` | `1:1` | 1024×1024 | -| `1K` | `3:2` | 1536×1024 | -| `1K` | `2:3` | 1024×1536 | -| `2K` | `1:1` | 2048×2048 | -| `2K` | `16:9` | 2048×1152 | -| `2K` | `9:16` | 1152×2048 | -| `4K` | `16:9` | 3840×2160 | -| `4K` | `9:16` | 2160×3840 | - -When an exact match isn't found, the script tries: exact match → upgrade to higher tier with same ratio → cross-tier match by ratio → tier default. +Each also has an optional `*_API_BASE` for custom endpoints. 
The script automatically picks the first configured backend and falls back to the next if it fails — no need to specify a model. ### Error Handling -The script internally tries all available providers (OpenAI → LinkAI) in sequence. If it returns an error, **do NOT retry with the same or similar parameters** — the failure is a configuration issue (wrong API key, unsupported API base, etc.), not a transient error. Instead, inform the user about the configuration problem and ask them to fix it (e.g. set the correct `OPENAI_API_KEY` / `OPENAI_API_BASE` via `env_config`), then retry after the configuration is updated. +If the script returns an error after trying all configured backends, **do NOT retry with the same parameters** — the failure is almost always a configuration issue (wrong API key, unsupported API base). Tell the user to fix it via `env_config`, then retry. ### Notes -- HTTP timeout is 300s — high-resolution + high-quality generation can take over 200s. -- When `quality` and `size` are omitted, the API uses `auto` — the model picks the best quality/size based on the prompt. -- `quality=low` + `size=1K` is the fastest combination (~20s). Use when speed matters more than fidelity. +- HTTP timeout is 300s — high-resolution generation can take over 200s. +- Omit `quality` / `size` to let the model pick automatically (`auto`). - Input images for editing are auto-compressed to ≤ 4MB / longest edge ≤ 4096px. diff --git a/skills/image-generation/scripts/generate.py b/skills/image-generation/scripts/generate.py index 99e8d24d..17386353 100644 --- a/skills/image-generation/scripts/generate.py +++ b/skills/image-generation/scripts/generate.py @@ -5,8 +5,17 @@ Unified image generation script. Usage: python generate.py '' -Supports GPT-Image-2 / GPT-Image-1 via the OpenAI-compatible Images API. -Designed for easy extension to other providers (Gemini, etc.). 
+Supported model families (each provider is tried in priority order: +OpenAI → Gemini → Seedream → Qwen → MiniMax → LinkAI; missing API keys +are skipped, and the provider that natively owns the requested model is +promoted to the front of the queue): + + - gpt-image-2 / gpt-image-1 → OpenAI + - nano-banana / gemini-*-image-* → Gemini + - doubao-seedream-* / seedream-* → Seedream (Volcengine Ark) + - qwen-image-2.0 / qwen-image-2.0-pro / etc. → Qwen (DashScope) + - image-01 / minimax-image → MiniMax + - any model → LinkAI (universal proxy) Dependencies: requests (stdlib: json, sys, os, base64, io, abc, uuid, pathlib, urllib) """ @@ -16,6 +25,7 @@ import sys import os import base64 import io +import time import uuid import re from abc import ABC, abstractmethod @@ -192,9 +202,14 @@ class ImageProvider(ABC): image_url: str | list | None = None, quality: str | None = None, size: str | None = None, + aspect_ratio: str | None = None, output_dir: str = ".", ) -> list[str]: - """Generate image(s) and return list of local file paths.""" + """Generate image(s) and return list of local file paths. + + `size` may be a tier ("1K" / "2K" / "4K" / "512") or pixels ("WxH"). + Providers that need pixel sizes should call `resolve_size(size, aspect_ratio)`. + """ ... @@ -205,10 +220,12 @@ class ImageProvider(ABC): class OpenAIProvider(ImageProvider): """Provider for OpenAI Image API (generations + edits).""" + DEFAULT_MODEL = "gpt-image-2" + def __init__(self, api_key: str, api_base: str, model: str): self.api_key = api_key self.api_base = api_base.rstrip("/") - self.model = model + self.model = model or self.DEFAULT_MODEL def _headers(self) -> dict: return { @@ -267,11 +284,14 @@ class OpenAIProvider(ImageProvider): image_url=None, quality: str | None = None, size: str | None = None, + aspect_ratio: str | None = None, output_dir: str = ".", ) -> list[str]: + # OpenAI Images API expects pixel size like 1024x1024. 
+ resolved = resolve_size(size, aspect_ratio) if (size or aspect_ratio) else None if image_url: - return self._edit(prompt, image_url=image_url, quality=quality, size=size, output_dir=output_dir) - return self._create(prompt, quality=quality, size=size, output_dir=output_dir) + return self._edit(prompt, image_url=image_url, quality=quality, size=resolved, output_dir=output_dir) + return self._create(prompt, quality=quality, size=resolved, output_dir=output_dir) def _create(self, prompt: str, *, quality: str | None, size: str | None, output_dir: str) -> list[str]: url = f"{self.api_base}/images/generations" @@ -337,10 +357,12 @@ class OpenAIProvider(ImageProvider): class LinkAIProvider(ImageProvider): """Provider for LinkAI unified image generation API.""" + DEFAULT_MODEL = "gpt-image-2" + def __init__(self, api_key: str, api_base: str, model: str): self.api_key = api_key self.api_base = api_base.rstrip("/") - self.model = model + self.model = model or self.DEFAULT_MODEL def generate( self, @@ -349,6 +371,7 @@ class LinkAIProvider(ImageProvider): image_url=None, quality: str | None = None, size: str | None = None, + aspect_ratio: str | None = None, output_dir: str = ".", ) -> list[str]: url = f"{self.api_base}/v1/images/generations" @@ -358,8 +381,12 @@ class LinkAIProvider(ImageProvider): } if quality: payload["quality"] = quality + # LinkAI accepts both pixel sizes (1024x1024) and tier shorthand (1K/2K/4K). + # Pass through whatever the caller gave us; also forward aspect_ratio. 
if size: payload["size"] = size + if aspect_ratio: + payload["aspect_ratio"] = aspect_ratio if image_url: urls = image_url if isinstance(image_url, list) else [image_url] resolved = [] @@ -408,23 +435,654 @@ class LinkAIProvider(ImageProvider): return paths +# --------------------------------------------------------------------------- +# Gemini provider (Nano Banana family — gemini-*-image-*) +# --------------------------------------------------------------------------- + +# Friendly aliases → real Gemini model id +_GEMINI_MODEL_ALIASES = { + "nano-banana": "gemini-2.5-flash-image", + "nano-banana-2": "gemini-3.1-flash-image-preview", + "nano-banana-pro": "gemini-3-pro-image-preview", +} + + +class GeminiProvider(ImageProvider): + """Provider for Google Gemini native image generation (Nano Banana family).""" + + DEFAULT_MODEL = "gemini-3.1-flash-image-preview" # nano-banana-2 + + def __init__(self, api_key: str, api_base: str, model: str): + self.api_key = api_key + self.api_base = api_base.rstrip("/") + self.model = _GEMINI_MODEL_ALIASES.get(model, model or self.DEFAULT_MODEL) + + def generate( + self, + prompt: str, + *, + image_url=None, + quality: str | None = None, # not used; Gemini has no `quality` param + size: str | None = None, + aspect_ratio: str | None = None, + output_dir: str = ".", + ) -> list[str]: + # Build request parts: prompt text + optional inline images + parts: list[dict] = [{"text": prompt}] + if image_url: + urls = image_url if isinstance(image_url, list) else [image_url] + for u in urls: + data = _compress_image(_load_image(u)) + mime = _guess_mime(data) + parts.append({ + "inline_data": { + "mime_type": mime, + "data": base64.b64encode(data).decode(), + } + }) + + payload: dict = { + "contents": [{"parts": parts}], + "generationConfig": {"responseModalities": ["IMAGE"]}, + } + + # Gemini natively supports aspectRatio + imageSize tiers (512/1K/2K/4K). 
+        _GEMINI_VALID_TIERS = {"512", "1K", "2K", "4K"}
+        _GEMINI_TIER_FALLBACK = {"3K": "2K"}
+        image_config: dict = {}
+        if size:
+            if "x" in size.lower():
+                tier = _pixels_to_tier(size)
+            else:
+                tier = size.upper()
+            tier = _GEMINI_TIER_FALLBACK.get(tier, tier)
+            if tier in _GEMINI_VALID_TIERS:
+                image_config["imageSize"] = tier
+        if aspect_ratio:
+            image_config["aspectRatio"] = aspect_ratio
+        elif size and "x" in size.lower():
+            ratio = _pixels_to_ratio(size)
+            if ratio:
+                image_config["aspectRatio"] = ratio
+        if image_config:
+            payload["generationConfig"]["imageConfig"] = image_config
+
+        url = f"{self.api_base}/v1beta/models/{self.model}:generateContent"
+        headers = {
+            "x-goog-api-key": self.api_key,
+            "Content-Type": "application/json",
+        }
+
+        if _HAS_REQUESTS:
+            resp = requests.post(url, headers=headers, json=payload, timeout=300)
+            if resp.status_code >= 400:
+                try:
+                    body = resp.json()
+                    msg = body.get("error", {}).get("message") or resp.text
+                except Exception:
+                    msg = resp.text or resp.reason
+                raise RuntimeError(f"API {resp.status_code}: {msg}")
+            result = resp.json()
+        else:
+            data = json.dumps(payload).encode()
+            req = Request(url, data=data, headers=headers, method="POST")
+            with urlopen(req, timeout=300) as r:
+                result = json.loads(r.read())
+
+        return self._extract_images(result, output_dir)
+
+    @staticmethod
+    def _extract_images(result: dict, output_dir: str) -> list[str]:
+        """Save every inline image found in a generateContent response.
+
+        Walks ``candidates[*].content.parts[*]``, skips parts flagged as
+        ``thought``, base64-decodes ``inlineData``/``inline_data`` payloads and
+        hands the bytes to ``_save_image``. Returns the collected
+        ``_save_image`` results.
+
+        Raises RuntimeError when no image part exists; the message carries the
+        first 200 characters of the first text part when there is one.
+        """
+        paths: list[str] = []
+        for cand in result.get("candidates", []):
+            for part in cand.get("content", {}).get("parts", []):
+                if part.get("thought"):
+                    continue  # skip thinking-stage interim images
+                inline = part.get("inlineData") or part.get("inline_data")
+                if inline and inline.get("data"):
+                    raw = base64.b64decode(inline["data"])
+                    paths.append(_save_image(raw, output_dir))
+        if not paths:
+            # Surface the model's text reply (often a refusal explanation)
+            for cand in result.get("candidates", []):
+                for part in cand.get("content", {}).get("parts", []):
+                    if part.get("text"):
+                        raise RuntimeError(f"Gemini returned no image: {part['text'][:200]}")
+            raise RuntimeError("Gemini returned no image (empty response)")
+        return paths
+
+
+def _guess_mime(data: bytes) -> str:
+    """Guess an image MIME type from the leading magic bytes of *data*.
+
+    Recognises JPEG, WebP and GIF; everything else (including PNG and
+    unrecognised data) is reported as ``image/png``.
+    """
+    if data[:3] == b"\xff\xd8\xff":
+        return "image/jpeg"
+    # RIFF is a generic container (also WAV/AVI); only the "WEBP" form tag at
+    # offset 8 identifies an actual WebP image.
+    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
+        return "image/webp"
+    if data[:6] in (b"GIF87a", b"GIF89a"):
+        return "image/gif"
+    # PNG signature or unknown data: PNG is the default label.
+    return "image/png"
+
+
+def _pixels_to_tier(pixel_str: str) -> str:
+    """Map 'WxH' to nearest Gemini tier (512 / 1K / 2K / 4K).
+
+    The tier is chosen from the longer edge; an unparsable string yields "1K".
+    """
+    try:
+        w, h = (int(x) for x in pixel_str.lower().split("x"))
+        long_edge = max(w, h)
+    except Exception:
+        return "1K"
+    if long_edge <= 768:
+        return "512"
+    if long_edge <= 1536:
+        return "1K"
+    if long_edge <= 3072:
+        return "2K"
+    return "4K"
+
+
+def _pixels_to_ratio(pixel_str: str) -> str | None:
+    """Map 'WxH' to a Gemini-supported aspect ratio string when possible.
+
+    Returns None when the string cannot be parsed, when either dimension is
+    not positive, or when the reduced ratio is not in the supported set.
+    """
+    try:
+        w, h = (int(x) for x in pixel_str.lower().split("x"))
+    except Exception:
+        return None
+    # Non-positive dimensions have no usable ratio. In particular "0x0" makes
+    # gcd() return 0, which previously raised ZeroDivisionError below.
+    if w <= 0 or h <= 0:
+        return None
+    # Reduce to a small ratio
+    from math import gcd
+    g = gcd(w, h)
+    rw, rh = w // g, h // g
+    candidate = f"{rw}:{rh}"
+    supported = {"1:1", "1:4", "1:8", "2:3", "3:2", "3:4", "4:1", "4:3",
+                 "4:5", "5:4", "8:1", "9:16", "16:9", "21:9"}
+    return candidate if candidate in supported else None
+
+
+# ---------------------------------------------------------------------------
+# Seedream provider (Volcengine Ark, OpenAI-compatible /images/generations)
+# ---------------------------------------------------------------------------
+
+# Friendly aliases → real Seedream model id (Ark Model IDs).
+_SEEDREAM_MODEL_ALIASES = {
+    "seedream": "doubao-seedream-5-0-260128",
+    "seedream-lite": "doubao-seedream-5-0-260128",
+    "seedream-5.0": "doubao-seedream-5-0-260128",
+    "seedream-5.0-lite": "doubao-seedream-5-0-260128",
+    "seedream-5-0-lite": "doubao-seedream-5-0-260128",
+    "doubao-seedream-5-0": "doubao-seedream-5-0-260128",
+    "doubao-seedream-5-0-lite": "doubao-seedream-5-0-260128",
+    "seedream-4.5": "doubao-seedream-4-5-251128",
+    "seedream-4-5": "doubao-seedream-4-5-251128",
+    "doubao-seedream-4-5": "doubao-seedream-4-5-251128",
+}
+
+# Seedream supports either a coarse tier ("2K"/"3K"/"4K") or explicit "WxH".
+# Explicit "WxH" / "W*H" values are passed through (separator normalised to
+# "x"). A tier plus aspect ratio (default "1:1") is translated through
+# _SEEDREAM_SIZE_TABLE into the recommended pixel sizes from the Ark docs; the
+# bare tier is only sent when the (tier, ratio) pair is not in the table.
+# Valid size tiers for Seedream (5.0 lite: 2K/3K, 4.5: 2K/4K).
+# Unsupported tiers are mapped to the nearest valid one.
+# NOTE(review): the tier set is not filtered per model, so e.g. "4K" is still
+# sent to a 5.0 lite model — confirm the API tolerates that.
+_SEEDREAM_VALID_TIERS = {"2K", "3K", "4K"}
+_SEEDREAM_TIER_FALLBACK = {"512": "2K", "1K": "2K"}
+_SEEDREAM_SIZE_TABLE = {
+    # (tier, ratio) -> "WxH" recommended pixel sizes (Seedream 5.0 lite + 4.5 share most)
+    ("2K", "1:1"): "2048x2048",
+    ("2K", "3:4"): "1728x2304",
+    ("2K", "4:3"): "2304x1728",
+    ("2K", "16:9"): "2848x1600",
+    ("2K", "9:16"): "1600x2848",
+    ("2K", "3:2"): "2496x1664",
+    ("2K", "2:3"): "1664x2496",
+    ("2K", "21:9"): "3136x1344",
+    ("3K", "1:1"): "3072x3072",
+    ("3K", "3:4"): "2592x3456",
+    ("3K", "4:3"): "3456x2592",
+    ("3K", "16:9"): "4096x2304",
+    ("3K", "9:16"): "2304x4096",
+    ("3K", "2:3"): "2496x3744",
+    ("3K", "3:2"): "3744x2496",
+    ("3K", "21:9"): "4704x2016",
+    ("4K", "1:1"): "4096x4096",
+    ("4K", "3:4"): "3520x4704",
+    ("4K", "4:3"): "4704x3520",
+    ("4K", "16:9"): "5504x3040",
+    ("4K", "9:16"): "3040x5504",
+    ("4K", "2:3"): "3328x4992",
+    ("4K", "3:2"): "4992x3328",
+    ("4K", "21:9"): "6240x2656",
+}
+
+
+class SeedreamProvider(ImageProvider):
+    """Provider for Volcengine Ark Seedream image generation API.
+
+    The endpoint is OpenAI-compatible (POST {base}/images/generations) but
+    accepts an extra `image` field (string or list) for image-to-image and
+    multi-image fusion, plus `sequential_image_generation` / `watermark` flags.
+    Reference docs accept both `2K` shorthand and explicit `WxH` for `size`.
+    """
+
+    DEFAULT_MODEL = "doubao-seedream-5-0-260128"  # seedream 5.0 lite
+
+    def __init__(self, api_key: str, api_base: str, model: str):
+        self.api_key = api_key
+        self.api_base = api_base.rstrip("/")
+        # Alias lookup is case-insensitive; an empty model selects DEFAULT_MODEL.
+        self.model = _SEEDREAM_MODEL_ALIASES.get((model or "").lower(), model or self.DEFAULT_MODEL)
+
+    def generate(
+        self,
+        prompt: str,
+        *,
+        image_url=None,
+        quality: str | None = None,  # not honoured by Seedream
+        size: str | None = None,
+        aspect_ratio: str | None = None,
+        output_dir: str = ".",
+    ) -> list[str]:
+        """Generate image(s) via POST {api_base}/images/generations.
+
+        `image_url` may be a single value or a list; at most 14 entries are
+        sent. Entries that are existing local files are loaded, compressed and
+        inlined as base64 data URIs; anything else is forwarded unchanged.
+        `quality` is accepted for interface parity but never sent.
+
+        Returns the `_save_image` result for every image in the response.
+        Raises RuntimeError on HTTP status >= 400 (requests path), on an
+        `error` object in the response body, or when no image is returned.
+        """
+        url = f"{self.api_base}/images/generations"
+
+        payload: dict = {
+            "model": self.model,
+            "prompt": prompt,
+            "response_format": "url",
+            "watermark": False,
+        }
+
+        # Default to 2K (Seedream 5.0 lite minimum tier), unless caller picks one.
+        seedream_size = self._resolve_seedream_size(size, aspect_ratio)
+        if seedream_size:
+            payload["size"] = seedream_size
+
+        # Image-to-image / multi-image fusion (up to 14 reference images).
+        if image_url:
+            urls = image_url if isinstance(image_url, list) else [image_url]
+            prepared: list[str] = []
+            for u in urls[:14]:
+                if os.path.isfile(u):
+                    data = _compress_image(_load_image(u))
+                    mime = _guess_mime(data)
+                    prepared.append(f"data:{mime};base64,{base64.b64encode(data).decode()}")
+                else:
+                    prepared.append(u)
+            # A single reference is sent as a plain string, several as a list.
+            payload["image"] = prepared if len(prepared) > 1 else prepared[0]
+
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+
+        if _HAS_REQUESTS:
+            resp = requests.post(url, headers=headers, json=payload, timeout=300)
+            if resp.status_code >= 400:
+                try:
+                    body = resp.json()
+                    err = body.get("error") or {}
+                    msg = err.get("message") or body.get("message") or resp.text
+                except Exception:
+                    msg = resp.text or resp.reason
+                raise RuntimeError(f"API {resp.status_code}: {msg}")
+            result = resp.json()
+        else:
+            # stdlib fallback when `requests` is not installed
+            data = json.dumps(payload).encode()
+            req = Request(url, data=data, headers=headers, method="POST")
+            with urlopen(req, timeout=300) as r:
+                result = json.loads(r.read())
+
+        if result.get("error"):
+            err = result["error"]
+            raise RuntimeError(f"Seedream {err.get('code')}: {err.get('message')}")
+
+        paths: list[str] = []
+        for item in result.get("data") or []:
+            u = item.get("url")
+            b64 = item.get("b64_json")
+            if u:
+                paths.append(_save_image(_load_image(u), output_dir))
+            elif b64:
+                paths.append(_save_image(base64.b64decode(b64), output_dir))
+        if not paths:
+            raise RuntimeError(f"Seedream returned no image: {result}")
+        return paths
+
+    @staticmethod
+    def _resolve_seedream_size(size: str | None, aspect_ratio: str | None) -> str | None:
+        """Translate (size, aspect_ratio) into the `size` value sent to Seedream.
+
+        No hints at all → "2K". Explicit pixels ("WxH" or "W*H") are returned
+        with an "x" separator. Otherwise the tier (default "2K"; 512/1K and any
+        unknown tier become "2K") and ratio (default "1:1") are looked up in
+        _SEEDREAM_SIZE_TABLE, falling back to the bare tier when the pair is
+        not listed.
+        """
+        if not size and not aspect_ratio:
+            return "2K"
+        # Explicit pixel values: pass through (normalise separator)
+        if size and "x" in size.lower() and "*" not in size:
+            return size.lower()
+        if size and "*" in size:
+            return size.replace("*", "x")
+        tier = (size or "2K").upper()
+        # Map unsupported tiers (512, 1K) to the nearest valid one
+        tier = _SEEDREAM_TIER_FALLBACK.get(tier, tier)
+        if tier not in _SEEDREAM_VALID_TIERS:
+            tier = "2K"
+        ratio = aspect_ratio or "1:1"
+        if (tier, ratio) in _SEEDREAM_SIZE_TABLE:
+            return _SEEDREAM_SIZE_TABLE[(tier, ratio)]
+        return tier
+
+
+# ---------------------------------------------------------------------------
+# Qwen provider (DashScope multimodal-generation: qwen-image-* family)
+# ---------------------------------------------------------------------------
+
+# Friendly aliases → real Qwen model id
+_QWEN_MODEL_ALIASES = {
+    "qwen": "qwen-image-2.0-pro",
+    "qwen-image": "qwen-image-2.0-pro",
+    "qwen-image-pro": "qwen-image-2.0-pro",
+}
+
+# Qwen pixel-size table (closest match by tier+ratio).
+# qwen-image-2.0(*) supports any WxH between 512*512 and 2048*2048.
+_QWEN_SIZE_TABLE = {
+    # (tier, ratio) -> "W*H"
+    ("1K", "1:1"): "1024*1024",
+    ("1K", "16:9"): "1280*720",
+    ("1K", "9:16"): "720*1280",
+    ("1K", "4:3"): "1184*888",
+    ("1K", "3:4"): "888*1184",
+    ("1K", "3:2"): "1248*832",
+    ("1K", "2:3"): "832*1248",
+    ("2K", "1:1"): "2048*2048",
+    # NOTE(review): the non-square 2K entries have a long edge above 2048 and
+    # are sent as-is — _resolve_qwen_size performs no clamping. Confirm whether
+    # the API limit is per-edge or total pixels.
+    ("2K", "16:9"): "2688*1536",
+    ("2K", "9:16"): "1536*2688",
+    ("2K", "4:3"): "2368*1728",
+    ("2K", "3:4"): "1728*2368",
+}
+
+
+class QwenProvider(ImageProvider):
+    """Provider for Alibaba DashScope Qwen image API (qwen-image-2.0[-pro])."""
+
+    DEFAULT_MODEL = "qwen-image-2.0"
+
+    def __init__(self, api_key: str, api_base: str, model: str):
+        self.api_key = api_key
+        self.api_base = api_base.rstrip("/")
+        # Alias lookup is case-insensitive; an empty model selects DEFAULT_MODEL.
+        self.model = _QWEN_MODEL_ALIASES.get((model or "").lower(), model or self.DEFAULT_MODEL)
+
+    def generate(
+        self,
+        prompt: str,
+        *,
+        image_url=None,
+        quality: str | None = None,  # not supported by Qwen image API
+        size: str | None = None,
+        aspect_ratio: str | None = None,
+        output_dir: str = ".",
+    ) -> list[str]:
+        """Generate image(s) through the DashScope multimodal-generation endpoint.
+
+        Sends one user message whose content is up to 3 reference images
+        (local files inlined as base64 data URIs, other values forwarded
+        unchanged) followed by the text prompt. `quality` is never sent.
+
+        Returns the `_save_image` result for every image part in the reply.
+        Raises RuntimeError on HTTP status >= 400 (requests path), on a truthy
+        `code` field in the response, or when no image is returned.
+        """
+        url = f"{self.api_base}/api/v1/services/aigc/multimodal-generation/generation"
+
+        # Build content array: 0..3 images then a single text part.
+        content: list[dict] = []
+        if image_url:
+            urls = image_url if isinstance(image_url, list) else [image_url]
+            for u in urls[:3]:  # API caps at 3 reference images
+                if os.path.isfile(u):
+                    data = _compress_image(_load_image(u))
+                    mime = _guess_mime(data)
+                    image_field = f"data:{mime};base64,{base64.b64encode(data).decode()}"
+                else:
+                    image_field = u
+                content.append({"image": image_field})
+        content.append({"text": prompt})
+
+        payload: dict = {
+            "model": self.model,
+            "input": {"messages": [{"role": "user", "content": content}]},
+        }
+
+        # Map (size, aspect_ratio) → Qwen "W*H"
+        qwen_size = self._resolve_qwen_size(size, aspect_ratio)
+        if qwen_size:
+            payload["parameters"] = {"size": qwen_size}
+
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+
+        if _HAS_REQUESTS:
+            resp = requests.post(url, headers=headers, json=payload, timeout=300)
+            if resp.status_code >= 400:
+                try:
+                    body = resp.json()
+                    msg = body.get("message") or body.get("error", {}).get("message") or resp.text
+                except Exception:
+                    msg = resp.text or resp.reason
+                raise RuntimeError(f"API {resp.status_code}: {msg}")
+            result = resp.json()
+        else:
+            # stdlib fallback when `requests` is not installed
+            data = json.dumps(payload).encode()
+            req = Request(url, data=data, headers=headers, method="POST")
+            with urlopen(req, timeout=300) as r:
+                result = json.loads(r.read())
+
+        # Business-level errors arrive on HTTP 200 with a `code` field.
+        if result.get("code"):
+            raise RuntimeError(f"Qwen {result.get('code')}: {result.get('message')}")
+
+        paths: list[str] = []
+        choices = (result.get("output") or {}).get("choices") or []
+        for ch in choices:
+            for part in ((ch.get("message") or {}).get("content") or []):
+                u = part.get("image")
+                if u:
+                    paths.append(_save_image(_load_image(u), output_dir))
+        if not paths:
+            raise RuntimeError(f"Qwen returned no image: {result}")
+        return paths
+
+    @staticmethod
+    def _resolve_qwen_size(size: str | None, aspect_ratio: str | None) -> str | None:
+        """Translate (size, aspect_ratio) into Qwen's "W*H" size string.
+
+        No hints → None (no size parameter is sent). Explicit pixels are
+        returned with a "*" separator. Otherwise the tier (default "1K";
+        512 → 1K, 3K/4K → 2K, unknown → 1K) and ratio (default "1:1") are looked
+        up in _QWEN_SIZE_TABLE; a ratio missing for that tier falls back to the
+        tier's square size, i.e. the requested ratio is dropped.
+        """
+        if not size and not aspect_ratio:
+            return None
+        if size and "x" in size.lower() and "*" not in size:
+            return size.lower().replace("x", "*")
+        if size and "*" in size:
+            return size
+        tier = (size or "1K").upper()
+        # Qwen supports 1K and 2K; clamp others
+        _QWEN_TIER_MAP = {"512": "1K", "3K": "2K", "4K": "2K"}
+        tier = _QWEN_TIER_MAP.get(tier, tier)
+        if tier not in ("1K", "2K"):
+            tier = "1K"
+        ratio = aspect_ratio or "1:1"
+        if (tier, ratio) in _QWEN_SIZE_TABLE:
+            return _QWEN_SIZE_TABLE[(tier, ratio)]
+        return _QWEN_SIZE_TABLE.get((tier, "1:1"))
+
+
+# ---------------------------------------------------------------------------
+# MiniMax provider (image-01 family)
+# ---------------------------------------------------------------------------
+
+# Friendly aliases → real MiniMax model id
+_MINIMAX_MODEL_ALIASES = {
+    "minimax": "image-01",
+    "minimax-image": "image-01",
+    "minimax-image-01": "image-01",
+}
+
+_MINIMAX_SUPPORTED_RATIOS = {"1:1", "16:9", "4:3", "3:2", "2:3", "3:4", "9:16", "21:9"}
+
+
+class MinimaxProvider(ImageProvider):
+    """Provider for MiniMax image generation API (image-01)."""
+
+    DEFAULT_MODEL = "image-01"
+
+    def __init__(self, api_key: str, api_base: str, model: str):
+        self.api_key = api_key
+        self.api_base = api_base.rstrip("/")
+        # Alias lookup is case-insensitive; an empty model selects DEFAULT_MODEL.
+        self.model = _MINIMAX_MODEL_ALIASES.get((model or "").lower(), model or self.DEFAULT_MODEL)
+
+    def generate(
+        self,
+        prompt: str,
+        *,
+        image_url=None,
+        quality: str | None = None,  # not supported by MiniMax
+        size: str | None = None,
+        aspect_ratio: str | None = None,
+        output_dir: str = ".",
+    ) -> list[str]:
+        """Generate image(s) via POST {api_base}/v1/image_generation.
+
+        Only an aspect ratio is sent: `aspect_ratio` if given, otherwise one
+        derived from a "WxH" `size`; tier-style sizes ("1K", "2K", ...) and
+        ratios outside _MINIMAX_SUPPORTED_RATIOS are silently dropped.
+        Every entry of `image_url` becomes a `subject_reference` of type
+        "character" (no cap on the count is applied here).
+
+        Returns the `_save_image` result for every base64 or URL image in the
+        reply. Raises RuntimeError on HTTP status >= 400 (requests path), on a
+        non-zero `base_resp.status_code`, or when no image is returned.
+        """
+        url = f"{self.api_base}/v1/image_generation"
+        payload: dict = {
+            "model": self.model,
+            "prompt": prompt,
+            "response_format": "base64",
+        }
+
+        # MiniMax accepts aspect_ratio directly; derive from pixels if needed.
+        ratio = aspect_ratio
+        if not ratio and size and "x" in size.lower():
+            ratio = _pixels_to_ratio(size)
+        if ratio and ratio in _MINIMAX_SUPPORTED_RATIOS:
+            payload["aspect_ratio"] = ratio
+
+        # Image-to-image uses subject_reference; accept URL or local file (→ base64).
+        if image_url:
+            urls = image_url if isinstance(image_url, list) else [image_url]
+            refs = []
+            for u in urls:
+                if os.path.isfile(u):
+                    data = _compress_image(_load_image(u))
+                    mime = _guess_mime(data)
+                    image_file = f"data:{mime};base64,{base64.b64encode(data).decode()}"
+                else:
+                    image_file = u
+                refs.append({"type": "character", "image_file": image_file})
+            payload["subject_reference"] = refs
+
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+
+        if _HAS_REQUESTS:
+            resp = requests.post(url, headers=headers, json=payload, timeout=300)
+            if resp.status_code >= 400:
+                try:
+                    body = resp.json()
+                    msg = body.get("base_resp", {}).get("status_msg") or body.get("error", {}).get("message") or resp.text
+                except Exception:
+                    msg = resp.text or resp.reason
+                raise RuntimeError(f"API {resp.status_code}: {msg}")
+            result = resp.json()
+        else:
+            # stdlib fallback when `requests` is not installed
+            data = json.dumps(payload).encode()
+            req = Request(url, data=data, headers=headers, method="POST")
+            with urlopen(req, timeout=300) as r:
+                result = json.loads(r.read())
+
+        # MiniMax returns business errors inside base_resp even on HTTP 200.
+        base_resp = result.get("base_resp") or {}
+        if base_resp.get("status_code") not in (None, 0):
+            raise RuntimeError(f"MiniMax {base_resp.get('status_code')}: {base_resp.get('status_msg')}")
+
+        data_obj = result.get("data") or {}
+        b64_list = data_obj.get("image_base64") or []
+        urls_list = data_obj.get("image_urls") or []
+
+        paths: list[str] = []
+        for b64 in b64_list:
+            paths.append(_save_image(base64.b64decode(b64), output_dir))
+        for u in urls_list:
+            paths.append(_save_image(_load_image(u), output_dir))
+        if not paths:
+            raise RuntimeError(f"MiniMax returned no image: {result}")
+        return paths
+
+
 # ---------------------------------------------------------------------------
 # Provider factory
 # ---------------------------------------------------------------------------
 
-def _build_providers(model: str) -> list[tuple[str, ImageProvider]]:
-    """Build an ordered list of (label, provider) to try."""
-    openai_key = os.environ.get("OPENAI_API_KEY", "")
-    openai_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1")
-    linkai_key = os.environ.get("LINKAI_API_KEY", "")
-    linkai_base = os.environ.get("LINKAI_API_BASE", "https://api.link-ai.tech")
+# Model-prefix → preferred (native) provider label.
+# When the requested model matches a prefix and that provider has an API key,
+# _build_providers() tries ONLY that provider. If the provider has no key, the
+# model id is dropped and every configured provider is tried in default order.
+_MODEL_PREFERRED_PROVIDER: list[tuple[tuple[str, ...], str]] = [
+    (("gpt-image",), "OpenAI"),
+    (("nano-banana", "gemini-"), "Gemini"),
+    (("seedream", "doubao-seedream"), "Seedream"),
+    (("qwen-image", "qwen"), "Qwen"),
+    (("minimax", "image-01"), "MiniMax"),
+]
 
-    providers = []
-    if openai_key:
-        providers.append(("OpenAI", OpenAIProvider(api_key=openai_key, api_base=openai_base, model=model)))
-    if linkai_key:
-        providers.append(("LinkAI", LinkAIProvider(api_key=linkai_key, api_base=linkai_base, model=model)))
-    return providers
+# Default global priority when the model has no preferred provider.
+_DEFAULT_PROVIDER_ORDER = ["OpenAI", "Gemini", "Seedream", "Qwen", "MiniMax", "LinkAI"]
+
+
+def _preferred_provider(model: str) -> str | None:
+    """Return the provider label whose prefixes match *model*, else None.
+
+    Matching is case-insensitive and uses the first entry of
+    _MODEL_PREFERRED_PROVIDER whose prefix tuple the model id starts with.
+    """
+    m = (model or "").lower()
+    for prefixes, label in _MODEL_PREFERRED_PROVIDER:
+        if m.startswith(prefixes):
+            return label
+    return None
+
+
+def _build_providers(model: str) -> list[tuple[str, ImageProvider]]:
+    """Build an ordered list of (label, provider) to try.
+
+    Behaviour:
+    1. Pinned model, native provider configured: if `model` matches a prefix
+       in _MODEL_PREFERRED_PROVIDER and that provider has an API key, the
+       result is a single-element list holding only that provider. No other
+       backend is tried, since it would not recognise the model id.
+    2. Pinned model, native provider NOT configured (no API key): the
+       explicit model is dropped (`model = ""`) and routing falls through to
+       case 3.
+    3. Auto routing: every provider with a configured API key is returned in
+       the global priority order
+       OpenAI → Gemini → Seedream → Qwen → MiniMax → LinkAI, each constructed
+       with `model` as it stands (empty after case 2; a model id that matches
+       no known prefix is passed to every provider unchanged).
+
+    API keys and base URLs are read from the environment; providers without
+    a key are never instantiated, so the result may be an empty list.
+    """
+    keys = {
+        "OpenAI": os.environ.get("OPENAI_API_KEY", ""),
+        "Gemini": os.environ.get("GEMINI_API_KEY", ""),
+        "Seedream": os.environ.get("ARK_API_KEY", ""),
+        "Qwen": os.environ.get("DASHSCOPE_API_KEY", ""),
+        "MiniMax": os.environ.get("MINIMAX_API_KEY", ""),
+        "LinkAI": os.environ.get("LINKAI_API_KEY", ""),
+    }
+    bases = {
+        "OpenAI": os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1"),
+        "Gemini": os.environ.get("GEMINI_API_BASE", "https://generativelanguage.googleapis.com"),
+        "Seedream": os.environ.get("ARK_API_BASE", "https://ark.cn-beijing.volces.com/api/v3"),
+        "Qwen": os.environ.get("DASHSCOPE_API_BASE", "https://dashscope.aliyuncs.com"),
+        "MiniMax": os.environ.get("MINIMAX_API_BASE", "https://api.minimaxi.com"),
+        "LinkAI": os.environ.get("LINKAI_API_BASE", "https://api.link-ai.tech"),
+    }
+
+    pref = _preferred_provider(model)
+
+    # If a specific model is requested and its native provider has no key,
+    # other backends won't recognise the id → reset to auto routing.
+    if pref and not keys.get(pref):
+        model = ""
+        pref = None
+
+    factories = {
+        "OpenAI": OpenAIProvider,
+        "Gemini": GeminiProvider,
+        "Seedream": SeedreamProvider,
+        "Qwen": QwenProvider,
+        "MiniMax": MinimaxProvider,
+        "LinkAI": LinkAIProvider,
+    }
+    available: dict[str, ImageProvider] = {}
+    for label, key in keys.items():
+        if key:
+            available[label] = factories[label](api_key=key, api_base=bases[label], model=model)
+
+    # When a specific model is pinned, only try its native provider — other
+    # backends won't recognise the model id so retrying them is pointless.
+    if pref and pref in available:
+        return [(pref, available[pref])]
+
+    # Auto routing: try every configured provider in priority order.
+    ordered: list[str] = []
+    for label in _DEFAULT_PROVIDER_ORDER:
+        if label in available:
+            ordered.append(label)
+    return [(label, available[label]) for label in ordered]
 
 
 # ---------------------------------------------------------------------------
@@ -447,40 +1105,59 @@ def main():
         print(json.dumps({"error": "Missing required parameter: prompt"}))
         sys.exit(1)
 
-    model = args.get("model", "gpt-image-2")
+    # Model resolution priority:
+    #   1. Explicit `model` in the call args (agent / user override)
+    #   2. SKILL_IMAGE_GENERATION_MODEL env var (synced from
+    #      config["skill"]["image-generation"]["model"] at startup)
+    #   3. None → fall back to automatic provider routing (try every
+    #      provider with a configured API key in global priority order)
+    model = args.get("model") or os.environ.get("SKILL_IMAGE_GENERATION_MODEL") or ""
     quality = args.get("quality")
-    raw_size = args.get("size")
+    size = args.get("size")
     aspect_ratio = args.get("aspect_ratio")
     image_url = args.get("image_url")
 
-    resolved_size = resolve_size(raw_size, aspect_ratio)
-
     output_dir = os.environ.get("IMAGE_OUTPUT_DIR", os.path.join(os.getcwd(), "images"))
 
     providers = _build_providers(model)
     if not providers:
+        target = f"model '{model}'" if model else "image generation"
         print(json.dumps({
-            "error": "No API key configured. Please set OPENAI_API_KEY or LINKAI_API_KEY via env_config tool, then try again."
+            "error": (
+                f"No API key configured for {target}. "
+                "Set at least one of OPENAI_API_KEY / GEMINI_API_KEY / "
+                "ARK_API_KEY / DASHSCOPE_API_KEY / MINIMAX_API_KEY / "
+                "LINKAI_API_KEY via the env_config tool, then try again."
+            )
         }, ensure_ascii=False))
         sys.exit(1)
 
-    import time
-
     errors = []
     for label, provider in providers:
         try:
-            print(f"[image-generation] Trying {label} (model={model})...", file=sys.stderr)
+            attempt_model = getattr(provider, "model", model) or "auto"
+            print(f"[image-generation] Trying {label} (model={attempt_model})...", file=sys.stderr)
             t0 = time.time()
             paths = provider.generate(
                 prompt,
                 image_url=image_url,
                 quality=quality,
-                size=resolved_size,
+                size=size,
+                aspect_ratio=aspect_ratio,
                 output_dir=output_dir,
             )
             elapsed = time.time() - t0
-            print(f"[image-generation] ✅ {label} succeeded in {elapsed:.1f}s", file=sys.stderr)
-            result = {"images": [{"url": p} for p in paths]}
+            # Resolved model id (after alias expansion) actually sent to the API
+            actual_model = getattr(provider, "model", model)
+            print(
+                f"[image-generation] ✅ {label} succeeded in {elapsed:.1f}s "
+                f"(model={actual_model})",
+                file=sys.stderr,
+            )
+            result = {
+                "model": actual_model,
+                "images": [{"url": p} for p in paths],
+            }
             print(json.dumps(result, ensure_ascii=False))
             return
         except Exception as e:
@@ -493,8 +1170,10 @@ def main():
         "error": f"All providers failed — {hint}. "
         "This is likely an API key or base URL configuration issue. "
         "Do NOT retry with the same parameters. "
-        "Ask the user to verify their OPENAI_API_KEY / OPENAI_API_BASE "
-        "(or LINKAI_API_KEY / LINKAI_API_BASE) settings via env_config."
+        "Ask the user to verify their API key / base URL "
+        "(OPENAI_API_KEY, GEMINI_API_KEY, ARK_API_KEY, "
+        "DASHSCOPE_API_KEY, MINIMAX_API_KEY, or LINKAI_API_KEY) "
+        "via env_config."
     }, ensure_ascii=False))
     sys.exit(1)