feat: add skills and upgrade feishu/dingtalk channel

This commit is contained in:
zhayujie
2026-02-02 00:42:39 +08:00
parent 77c2bfcc1e
commit a8d5309c90
32 changed files with 2931 additions and 200 deletions

View File

@@ -237,8 +237,8 @@ def _build_tooling_section(tools: List[Any], language: str) -> List[str]:
"叙述要求: 保持简洁、信息密度高,避免重复显而易见的步骤。", "叙述要求: 保持简洁、信息密度高,避免重复显而易见的步骤。",
"", "",
"完成标准:", "完成标准:",
"- 确保用户的需求得到实际解决,而不仅仅是制定计划", "- 确保用户的需求得到实际解决,而不仅仅是制定计划",
"- 当任务需要多次工具调用时,持续推进直到完成", "- 当任务需要多次工具调用时,持续推进直到完成, 解决完后向用户报告结果或回复用户的问题",
"- 每次工具调用后,评估是否已获得足够信息来推进或完成任务", "- 每次工具调用后,评估是否已获得足够信息来推进或完成任务",
"- 避免重复调用相同的工具和相同参数获取相同的信息,除非用户明确要求", "- 避免重复调用相同的工具和相同参数获取相同的信息,除非用户明确要求",
"", "",

View File

@@ -360,6 +360,9 @@ class Agent:
# Update agent's message history from executor # Update agent's message history from executor
self.messages = executor.messages self.messages = executor.messages
# Store executor reference for agent_bridge to access files_to_send
self.stream_executor = executor
# Execute all post-process tools # Execute all post-process tools
self._execute_post_process_tools() self._execute_post_process_tools()

View File

@@ -58,6 +58,9 @@ class AgentStreamExecutor:
# Tool failure tracking for retry protection # Tool failure tracking for retry protection
self.tool_failure_history = [] # List of (tool_name, args_hash, success) tuples self.tool_failure_history = [] # List of (tool_name, args_hash, success) tuples
# Track files to send (populated by read tool)
self.files_to_send = [] # List of file metadata dicts
def _emit_event(self, event_type: str, data: dict = None): def _emit_event(self, event_type: str, data: dict = None):
"""Emit event""" """Emit event"""
@@ -191,21 +194,47 @@ class AgentStreamExecutor:
logger.info( logger.info(
f"Memory flush recommended: tokens={current_tokens}, turns={self.agent.memory_manager.flush_manager.turn_count}") f"Memory flush recommended: tokens={current_tokens}, turns={self.agent.memory_manager.flush_manager.turn_count}")
# Call LLM # Call LLM (enable retry_on_empty for better reliability)
assistant_msg, tool_calls = self._call_llm_stream() assistant_msg, tool_calls = self._call_llm_stream(retry_on_empty=True)
final_response = assistant_msg final_response = assistant_msg
# No tool calls, end loop # No tool calls, end loop
if not tool_calls: if not tool_calls:
# 检查是否返回了空响应 # 检查是否返回了空响应
if not assistant_msg: if not assistant_msg:
logger.warning(f"[Agent] LLM returned empty response (no content and no tool calls)") logger.warning(f"[Agent] LLM returned empty response after retry (no content and no tool calls)")
logger.info(f"[Agent] This usually happens when LLM thinks the task is complete after tool execution")
# 生成通用的友好提示 # 如果之前有工具调用,强制要求 LLM 生成文本回复
final_response = ( if turn > 1:
"抱歉,我暂时无法生成回复。请尝试换一种方式描述你的需求,或稍后再试。" logger.info(f"[Agent] Requesting explicit response from LLM...")
)
logger.info(f"Generated fallback response for empty LLM output") # 添加一条消息,明确要求回复用户
self.messages.append({
"role": "user",
"content": [{
"type": "text",
"text": "请向用户说明刚才工具执行的结果或回答用户的问题。"
}]
})
# 再调用一次 LLM
assistant_msg, tool_calls = self._call_llm_stream(retry_on_empty=False)
final_response = assistant_msg
# 如果还是空,才使用 fallback
if not assistant_msg and not tool_calls:
logger.warning(f"[Agent] Still empty after explicit request")
final_response = (
"抱歉,我暂时无法生成回复。请尝试换一种方式描述你的需求,或稍后再试。"
)
logger.info(f"Generated fallback response for empty LLM output")
else:
# 第一轮就空回复,直接 fallback
final_response = (
"抱歉,我暂时无法生成回复。请尝试换一种方式描述你的需求,或稍后再试。"
)
logger.info(f"Generated fallback response for empty LLM output")
else: else:
logger.info(f"💭 {assistant_msg[:150]}{'...' if len(assistant_msg) > 150 else ''}") logger.info(f"💭 {assistant_msg[:150]}{'...' if len(assistant_msg) > 150 else ''}")
@@ -235,6 +264,14 @@ class AgentStreamExecutor:
result = self._execute_tool(tool_call) result = self._execute_tool(tool_call)
tool_results.append(result) tool_results.append(result)
# Check if this is a file to send (from read tool)
if result.get("status") == "success" and isinstance(result.get("result"), dict):
result_data = result.get("result")
if result_data.get("type") == "file_to_send":
# Store file metadata for later sending
self.files_to_send.append(result_data)
logger.info(f"📎 检测到待发送文件: {result_data.get('file_name', result_data.get('path'))}")
# Check for critical error - abort entire conversation # Check for critical error - abort entire conversation
if result.get("status") == "critical_error": if result.get("status") == "critical_error":
logger.error(f"💥 检测到严重错误,终止对话") logger.error(f"💥 检测到严重错误,终止对话")
@@ -392,6 +429,7 @@ class AgentStreamExecutor:
# Streaming response # Streaming response
full_content = "" full_content = ""
tool_calls_buffer = {} # {index: {id, name, arguments}} tool_calls_buffer = {} # {index: {id, name, arguments}}
stop_reason = None # Track why the stream stopped
try: try:
stream = self.model.call_stream(request) stream = self.model.call_stream(request)
@@ -404,21 +442,47 @@ class AgentStreamExecutor:
if isinstance(error_data, dict): if isinstance(error_data, dict):
error_msg = error_data.get("message", chunk.get("message", "Unknown error")) error_msg = error_data.get("message", chunk.get("message", "Unknown error"))
error_code = error_data.get("code", "") error_code = error_data.get("code", "")
error_type = error_data.get("type", "")
else: else:
error_msg = chunk.get("message", str(error_data)) error_msg = chunk.get("message", str(error_data))
error_code = "" error_code = ""
error_type = ""
status_code = chunk.get("status_code", "N/A") status_code = chunk.get("status_code", "N/A")
logger.error(f"API Error: {error_msg} (Status: {status_code}, Code: {error_code})")
logger.error(f"Full error chunk: {chunk}")
# Raise exception with full error message for retry logic # Log error with all available information
raise Exception(f"{error_msg} (Status: {status_code})") logger.error(f"🔴 Stream API Error:")
logger.error(f" Message: {error_msg}")
logger.error(f" Status Code: {status_code}")
logger.error(f" Error Code: {error_code}")
logger.error(f" Error Type: {error_type}")
logger.error(f" Full chunk: {chunk}")
# Check if this is a context overflow error (keyword-based, works for all models)
# Don't rely on specific status codes as different providers use different codes
error_msg_lower = error_msg.lower()
is_overflow = any(keyword in error_msg_lower for keyword in [
'context length exceeded', 'maximum context length', 'prompt is too long',
'context overflow', 'context window', 'too large', 'exceeds model context',
'request_too_large', 'request exceeds the maximum size', 'tokens exceed'
])
if is_overflow:
# Mark as context overflow for special handling
raise Exception(f"[CONTEXT_OVERFLOW] {error_msg} (Status: {status_code})")
else:
# Raise exception with full error message for retry logic
raise Exception(f"{error_msg} (Status: {status_code}, Code: {error_code}, Type: {error_type})")
# Parse chunk # Parse chunk
if isinstance(chunk, dict) and "choices" in chunk: if isinstance(chunk, dict) and "choices" in chunk:
choice = chunk["choices"][0] choice = chunk["choices"][0]
delta = choice.get("delta", {}) delta = choice.get("delta", {})
# Capture finish_reason if present
finish_reason = choice.get("finish_reason")
if finish_reason:
stop_reason = finish_reason
# Handle text content # Handle text content
if "content" in delta and delta["content"]: if "content" in delta and delta["content"]:
@@ -449,9 +513,46 @@ class AgentStreamExecutor:
tool_calls_buffer[index]["arguments"] += func["arguments"] tool_calls_buffer[index]["arguments"] += func["arguments"]
except Exception as e: except Exception as e:
error_str = str(e).lower() error_str = str(e)
error_str_lower = error_str.lower()
# Check if error is context overflow (non-retryable, needs session reset)
# Method 1: Check for special marker (set in stream error handling above)
is_context_overflow = '[context_overflow]' in error_str_lower
# Method 2: Fallback to keyword matching for non-stream errors
if not is_context_overflow:
is_context_overflow = any(keyword in error_str_lower for keyword in [
'context length exceeded', 'maximum context length', 'prompt is too long',
'context overflow', 'context window', 'too large', 'exceeds model context',
'request_too_large', 'request exceeds the maximum size'
])
# Check if error is message format error (incomplete tool_use/tool_result pairs)
# This happens when previous conversation had tool failures
is_message_format_error = any(keyword in error_str_lower for keyword in [
'tool_use', 'tool_result', 'without', 'immediately after',
'corresponding', 'must have', 'each'
]) and 'status: 400' in error_str_lower
if is_context_overflow or is_message_format_error:
error_type = "context overflow" if is_context_overflow else "message format error"
logger.error(f"💥 {error_type} detected: {e}")
# Clear message history to recover
logger.warning("🔄 Clearing conversation history to recover")
self.messages.clear()
# Raise special exception with user-friendly message
if is_context_overflow:
raise Exception(
"抱歉,对话历史过长导致上下文溢出。我已清空历史记录,请重新描述你的需求。"
)
else:
raise Exception(
"抱歉,之前的对话出现了问题。我已清空历史记录,请重新发送你的消息。"
)
# Check if error is retryable (timeout, connection, rate limit, server busy, etc.) # Check if error is retryable (timeout, connection, rate limit, server busy, etc.)
is_retryable = any(keyword in error_str for keyword in [ is_retryable = any(keyword in error_str_lower for keyword in [
'timeout', 'timed out', 'connection', 'network', 'timeout', 'timed out', 'connection', 'network',
'rate limit', 'overloaded', 'unavailable', 'busy', 'retry', 'rate limit', 'overloaded', 'unavailable', 'busy', 'retry',
'429', '500', '502', '503', '504', '512' '429', '500', '502', '503', '504', '512'
@@ -505,11 +606,12 @@ class AgentStreamExecutor:
# Check for empty response and retry once if enabled # Check for empty response and retry once if enabled
if retry_on_empty and not full_content and not tool_calls: if retry_on_empty and not full_content and not tool_calls:
logger.warning(f"⚠️ LLM returned empty response, retrying once...") logger.warning(f"⚠️ LLM returned empty response (stop_reason: {stop_reason}), retrying once...")
self._emit_event("message_end", { self._emit_event("message_end", {
"content": "", "content": "",
"tool_calls": [], "tool_calls": [],
"empty_retry": True "empty_retry": True,
"stop_reason": stop_reason
}) })
# Retry without retry flag to avoid infinite loop # Retry without retry flag to avoid infinite loop
return self._call_llm_stream( return self._call_llm_stream(

View File

@@ -137,6 +137,10 @@ class SkillLoader:
name = frontmatter.get('name', parent_dir_name) name = frontmatter.get('name', parent_dir_name)
description = frontmatter.get('description', '') description = frontmatter.get('description', '')
# Special handling for linkai-agent: dynamically load apps from config.json
if name == 'linkai-agent':
description = self._load_linkai_agent_description(skill_dir, description)
if not description or not description.strip(): if not description or not description.strip():
diagnostics.append(f"Skill {name} has no description: {file_path}") diagnostics.append(f"Skill {name} has no description: {file_path}")
return LoadSkillsResult(skills=[], diagnostics=diagnostics) return LoadSkillsResult(skills=[], diagnostics=diagnostics)
@@ -161,6 +165,45 @@ class SkillLoader:
return LoadSkillsResult(skills=[skill], diagnostics=diagnostics) return LoadSkillsResult(skills=[skill], diagnostics=diagnostics)
def _load_linkai_agent_description(self, skill_dir: str, default_description: str) -> str:
    """
    Build the linkai-agent skill description from the apps listed in its config.

    Reads ``config.json`` from the skill directory, falling back to
    ``config.json.template`` when ``config.json`` does not exist. Each entry of
    the top-level ``apps`` list is rendered as ``name(code: description)``;
    the description part is omitted when the entry has none. Entries that are
    not objects, or that lack ``app_name`` / ``app_code``, are skipped so that
    one malformed entry does not discard the whole app list.

    :param skill_dir: Skill directory
    :param default_description: Default description from SKILL.md
    :return: Dynamic description with app list, or ``default_description`` when
             no config file exists, the file cannot be read or parsed, or it
             yields no usable app entry
    """
    import json  # local import: only this special case needs it

    candidates = (
        os.path.join(skill_dir, "config.json"),
        os.path.join(skill_dir, "config.json.template"),
    )
    config_file = next((path for path in candidates if os.path.exists(path)), None)
    if config_file is None:
        return default_description

    try:
        with open(config_file, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError
        logger.warning(f"[SkillLoader] Failed to load linkai-agent config: {e}")
        return default_description

    if not isinstance(config, dict):
        logger.warning(
            f"[SkillLoader] Failed to load linkai-agent config: expected a JSON object in {config_file}"
        )
        return default_description

    apps = config.get("apps")
    if not apps:
        return default_description
    if not isinstance(apps, list):
        logger.warning(
            f"[SkillLoader] Failed to load linkai-agent config: 'apps' must be a list in {config_file}"
        )
        return default_description

    rendered = []
    skipped = 0
    for app in apps:
        app_name = app.get("app_name") if isinstance(app, dict) else None
        app_code = app.get("app_code") if isinstance(app, dict) else None
        if not app_name or not app_code:
            # Without a name and a code the app cannot be referenced by the agent
            skipped += 1
            continue
        app_description = app.get("app_description")
        if app_description:
            rendered.append(f"{app_name}({app_code}: {app_description})")
        else:
            rendered.append(f"{app_name}({app_code})")

    if skipped:
        logger.warning(
            f"[SkillLoader] Skipped {skipped} malformed linkai-agent app entries in {config_file}"
        )
    if not rendered:
        return default_description

    app_descriptions = "; ".join(rendered)
    return f"Call LinkAI apps/workflows. {app_descriptions}"
def load_all_skills( def load_all_skills(
self, self,
managed_dir: Optional[str] = None, managed_dir: Optional[str] = None,

View File

@@ -8,6 +8,7 @@ from agent.tools.write.write import Write
from agent.tools.edit.edit import Edit from agent.tools.edit.edit import Edit
from agent.tools.bash.bash import Bash from agent.tools.bash.bash import Bash
from agent.tools.ls.ls import Ls from agent.tools.ls.ls import Ls
from agent.tools.send.send import Send
# Import memory tools # Import memory tools
from agent.tools.memory.memory_search import MemorySearchTool from agent.tools.memory.memory_search import MemorySearchTool
@@ -112,6 +113,7 @@ __all__ = [
'Edit', 'Edit',
'Bash', 'Bash',
'Ls', 'Ls',
'Send',
'MemorySearchTool', 'MemorySearchTool',
'MemoryGetTool', 'MemoryGetTool',
'EnvConfig', 'EnvConfig',

View File

@@ -3,12 +3,14 @@ Bash tool - Execute bash commands
""" """
import os import os
import sys
import subprocess import subprocess
import tempfile import tempfile
from typing import Dict, Any from typing import Dict, Any
from agent.tools.base_tool import BaseTool, ToolResult from agent.tools.base_tool import BaseTool, ToolResult
from agent.tools.utils.truncate import truncate_tail, format_size, DEFAULT_MAX_LINES, DEFAULT_MAX_BYTES from agent.tools.utils.truncate import truncate_tail, format_size, DEFAULT_MAX_LINES, DEFAULT_MAX_BYTES
from common.log import logger
class Bash(BaseTool): class Bash(BaseTool):
@@ -60,6 +62,12 @@ IMPORTANT SAFETY GUIDELINES:
if not command: if not command:
return ToolResult.fail("Error: command parameter is required") return ToolResult.fail("Error: command parameter is required")
# Security check: Prevent accessing sensitive config files
if "~/.cow/.env" in command or "~/.cow" in command:
return ToolResult.fail(
"Error: Access denied. API keys and credentials must be accessed through the env_config tool only."
)
# Optional safety check - only warn about extremely dangerous commands # Optional safety check - only warn about extremely dangerous commands
if self.safety_mode: if self.safety_mode:
warning = self._get_safety_warning(command) warning = self._get_safety_warning(command)
@@ -68,7 +76,31 @@ IMPORTANT SAFETY GUIDELINES:
f"Safety Warning: {warning}\n\nIf you believe this command is safe and necessary, please ask the user for confirmation first, explaining what the command does and why it's needed.") f"Safety Warning: {warning}\n\nIf you believe this command is safe and necessary, please ask the user for confirmation first, explaining what the command does and why it's needed.")
try: try:
# Execute command # Prepare environment with .env file variables
env = os.environ.copy()
# Load environment variables from ~/.cow/.env if it exists
env_file = os.path.expanduser("~/.cow/.env")
if os.path.exists(env_file):
try:
from dotenv import dotenv_values
env_vars = dotenv_values(env_file)
env.update(env_vars)
logger.debug(f"[Bash] Loaded {len(env_vars)} variables from {env_file}")
except ImportError:
logger.debug("[Bash] python-dotenv not installed, skipping .env loading")
except Exception as e:
logger.debug(f"[Bash] Failed to load .env: {e}")
# Debug logging
logger.debug(f"[Bash] CWD: {self.cwd}")
logger.debug(f"[Bash] Command: {command[:500]}")
logger.debug(f"[Bash] OPENAI_API_KEY in env: {'OPENAI_API_KEY' in env}")
logger.debug(f"[Bash] SHELL: {env.get('SHELL', 'not set')}")
logger.debug(f"[Bash] Python executable: {sys.executable}")
logger.debug(f"[Bash] Process UID: {os.getuid()}")
# Execute command with inherited environment variables
result = subprocess.run( result = subprocess.run(
command, command,
shell=True, shell=True,
@@ -76,8 +108,50 @@ IMPORTANT SAFETY GUIDELINES:
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
text=True, text=True,
timeout=timeout timeout=timeout,
env=env
) )
logger.debug(f"[Bash] Exit code: {result.returncode}")
logger.debug(f"[Bash] Stdout length: {len(result.stdout)}")
logger.debug(f"[Bash] Stderr length: {len(result.stderr)}")
# Workaround for exit code 126 with no output
if result.returncode == 126 and not result.stdout and not result.stderr:
logger.warning(f"[Bash] Exit 126 with no output - trying alternative execution method")
# Try using argument list instead of shell=True
import shlex
try:
parts = shlex.split(command)
if len(parts) > 0:
logger.info(f"[Bash] Retrying with argument list: {parts[:3]}...")
retry_result = subprocess.run(
parts,
cwd=self.cwd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
timeout=timeout,
env=env
)
logger.debug(f"[Bash] Retry exit code: {retry_result.returncode}, stdout: {len(retry_result.stdout)}, stderr: {len(retry_result.stderr)}")
# If retry succeeded, use retry result
if retry_result.returncode == 0 or retry_result.stdout or retry_result.stderr:
result = retry_result
else:
# Both attempts failed - check if this is openai-image-vision skill
if 'openai-image-vision' in command or 'vision.sh' in command:
# Create a mock result with helpful error message
from types import SimpleNamespace
result = SimpleNamespace(
returncode=1,
stdout='{"error": "图片无法解析", "reason": "该图片格式可能不受支持,或图片文件存在问题", "suggestion": "请尝试其他图片"}',
stderr=''
)
logger.info(f"[Bash] Converted exit 126 to user-friendly image error message for vision skill")
except Exception as retry_err:
logger.warning(f"[Bash] Retry failed: {retry_err}")
# Combine stdout and stderr # Combine stdout and stderr
output = result.stdout output = result.stdout

View File

@@ -27,7 +27,7 @@ class EnvConfig(BaseTool):
name: str = "env_config" name: str = "env_config"
description: str = ( description: str = (
"Manage API keys and skill configurations stored in the workspace .env file. " "Manage API keys and skill configurations securely. "
"Use this tool when user wants to configure API keys (like BOCHA_API_KEY, OPENAI_API_KEY), " "Use this tool when user wants to configure API keys (like BOCHA_API_KEY, OPENAI_API_KEY), "
"view configured keys, or manage skill settings. " "view configured keys, or manage skill settings. "
"Actions: 'set' (add/update key), 'get' (view specific key), 'list' (show all configured keys), 'delete' (remove key). " "Actions: 'set' (add/update key), 'get' (view specific key), 'list' (show all configured keys), 'delete' (remove key). "
@@ -65,16 +65,17 @@ class EnvConfig(BaseTool):
def __init__(self, config: dict = None): def __init__(self, config: dict = None):
self.config = config or {} self.config = config or {}
self.workspace_dir = self.config.get("workspace_dir", os.path.expanduser("~/cow")) # Store env config in ~/.cow directory (outside workspace for security)
self.env_path = os.path.join(self.workspace_dir, '.env') self.env_dir = os.path.expanduser("~/.cow")
self.env_path = os.path.join(self.env_dir, '.env')
self.agent_bridge = self.config.get("agent_bridge") # Reference to AgentBridge for hot reload self.agent_bridge = self.config.get("agent_bridge") # Reference to AgentBridge for hot reload
# Don't create .env file in __init__ to avoid issues during tool discovery # Don't create .env file in __init__ to avoid issues during tool discovery
# It will be created on first use in execute() # It will be created on first use in execute()
def _ensure_env_file(self): def _ensure_env_file(self):
"""Ensure the .env file exists""" """Ensure the .env file exists"""
# Create workspace directory if it doesn't exist # Create ~/.cow directory if it doesn't exist
os.makedirs(self.workspace_dir, exist_ok=True) os.makedirs(self.env_dir, exist_ok=True)
if not os.path.exists(self.env_path): if not os.path.exists(self.env_path):
Path(self.env_path).touch() Path(self.env_path).touch()

View File

@@ -50,6 +50,13 @@ class Ls(BaseTool):
# Resolve path # Resolve path
absolute_path = self._resolve_path(path) absolute_path = self._resolve_path(path)
# Security check: Prevent accessing sensitive config directory
env_config_dir = os.path.expanduser("~/.cow")
if os.path.abspath(absolute_path) == os.path.abspath(env_config_dir):
return ToolResult.fail(
"Error: Access denied. API keys and credentials must be accessed through the env_config tool only."
)
if not os.path.exists(absolute_path): if not os.path.exists(absolute_path):
# Provide helpful hint if using relative path # Provide helpful hint if using relative path
if not os.path.isabs(path) and not path.startswith('~'): if not os.path.isabs(path) and not path.startswith('~'):

View File

@@ -15,7 +15,7 @@ class Read(BaseTool):
"""Tool for reading file contents""" """Tool for reading file contents"""
name: str = "read" name: str = "read"
description: str = f"Read the contents of a file. Supports text files, PDF files, and images (jpg, png, gif, webp). For text files, output is truncated to {DEFAULT_MAX_LINES} lines or {DEFAULT_MAX_BYTES // 1024}KB (whichever is hit first). Use offset/limit for large files." description: str = f"Read or inspect file contents. For text/PDF files, returns content (truncated to {DEFAULT_MAX_LINES} lines or {DEFAULT_MAX_BYTES // 1024}KB). For images/videos/audio, returns metadata only (file info, size, type). Use offset/limit for large text files."
params: dict = { params: dict = {
"type": "object", "type": "object",
@@ -39,10 +39,25 @@ class Read(BaseTool):
def __init__(self, config: dict = None): def __init__(self, config: dict = None):
self.config = config or {} self.config = config or {}
self.cwd = self.config.get("cwd", os.getcwd()) self.cwd = self.config.get("cwd", os.getcwd())
# Supported image formats
self.image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.webp'} # File type categories
# Supported PDF format self.image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg', '.ico'}
self.video_extensions = {'.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv', '.webm', '.m4v'}
self.audio_extensions = {'.mp3', '.wav', '.ogg', '.m4a', '.flac', '.aac', '.wma'}
self.binary_extensions = {'.exe', '.dll', '.so', '.dylib', '.bin', '.dat', '.db', '.sqlite'}
self.archive_extensions = {'.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz'}
self.pdf_extensions = {'.pdf'} self.pdf_extensions = {'.pdf'}
# Readable text formats (will be read with truncation)
self.text_extensions = {
'.txt', '.md', '.markdown', '.rst', '.log', '.csv', '.tsv', '.json', '.xml', '.yaml', '.yml',
'.py', '.js', '.ts', '.java', '.c', '.cpp', '.h', '.hpp', '.go', '.rs', '.rb', '.php',
'.html', '.css', '.scss', '.sass', '.less', '.vue', '.jsx', '.tsx',
'.sh', '.bash', '.zsh', '.fish', '.ps1', '.bat', '.cmd',
'.sql', '.r', '.m', '.swift', '.kt', '.scala', '.clj', '.erl', '.ex',
'.dockerfile', '.makefile', '.cmake', '.gradle', '.properties', '.ini', '.conf', '.cfg',
'.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx' # Office documents
}
def execute(self, args: Dict[str, Any]) -> ToolResult: def execute(self, args: Dict[str, Any]) -> ToolResult:
""" """
@@ -61,6 +76,13 @@ class Read(BaseTool):
# Resolve path # Resolve path
absolute_path = self._resolve_path(path) absolute_path = self._resolve_path(path)
# Security check: Prevent reading sensitive config files
env_config_path = os.path.expanduser("~/.cow/.env")
if os.path.abspath(absolute_path) == os.path.abspath(env_config_path):
return ToolResult.fail(
"Error: Access denied. API keys and credentials must be accessed through the env_config tool only."
)
# Check if file exists # Check if file exists
if not os.path.exists(absolute_path): if not os.path.exists(absolute_path):
# Provide helpful hint if using relative path # Provide helpful hint if using relative path
@@ -78,16 +100,25 @@ class Read(BaseTool):
# Check file type # Check file type
file_ext = Path(absolute_path).suffix.lower() file_ext = Path(absolute_path).suffix.lower()
file_size = os.path.getsize(absolute_path)
# Check if image # Check if image - return metadata for sending
if file_ext in self.image_extensions: if file_ext in self.image_extensions:
return self._read_image(absolute_path, file_ext) return self._read_image(absolute_path, file_ext)
# Check if video/audio/binary/archive - return metadata only
if file_ext in self.video_extensions:
return self._return_file_metadata(absolute_path, "video", file_size)
if file_ext in self.audio_extensions:
return self._return_file_metadata(absolute_path, "audio", file_size)
if file_ext in self.binary_extensions or file_ext in self.archive_extensions:
return self._return_file_metadata(absolute_path, "binary", file_size)
# Check if PDF # Check if PDF
if file_ext in self.pdf_extensions: if file_ext in self.pdf_extensions:
return self._read_pdf(absolute_path, path, offset, limit) return self._read_pdf(absolute_path, path, offset, limit)
# Read text file # Read text file (with truncation for large files)
return self._read_text(absolute_path, path, offset, limit) return self._read_text(absolute_path, path, offset, limit)
def _resolve_path(self, path: str) -> str: def _resolve_path(self, path: str) -> str:
@@ -103,25 +134,56 @@ class Read(BaseTool):
return path return path
return os.path.abspath(os.path.join(self.cwd, path)) return os.path.abspath(os.path.join(self.cwd, path))
def _return_file_metadata(self, absolute_path: str, file_type: str, file_size: int) -> ToolResult:
    """
    Return file metadata for non-readable files (video, audio, binary, etc.)

    The file itself is never opened; only its name, extension and the
    caller-supplied size are used to build the result.

    :param absolute_path: Absolute path to the file
    :param file_type: Type of file (video, audio, binary, etc.)
    :param file_size: File size in bytes
    :return: Successful ToolResult wrapping a metadata dict whose ``type`` is
             ``"<file_type>_metadata"``
    """
    file_name = Path(absolute_path).name
    file_ext = Path(absolute_path).suffix.lower()

    # Extension -> MIME type. Covers every video/audio/archive extension this
    # tool routes here; anything else (e.g. .exe, .db) falls back to
    # application/octet-stream below.
    mime_types = {
        # Video
        '.mp4': 'video/mp4', '.avi': 'video/x-msvideo', '.mov': 'video/quicktime',
        '.mkv': 'video/x-matroska', '.webm': 'video/webm', '.flv': 'video/x-flv',
        '.wmv': 'video/x-ms-wmv', '.m4v': 'video/x-m4v',
        # Audio
        '.mp3': 'audio/mpeg', '.wav': 'audio/wav', '.ogg': 'audio/ogg',
        '.m4a': 'audio/mp4', '.flac': 'audio/flac', '.aac': 'audio/aac',
        '.wma': 'audio/x-ms-wma',
        # Archives
        '.zip': 'application/zip', '.tar': 'application/x-tar',
        '.gz': 'application/gzip', '.rar': 'application/x-rar-compressed',
        '.7z': 'application/x-7z-compressed', '.bz2': 'application/x-bzip2',
        '.xz': 'application/x-xz',
    }
    mime_type = mime_types.get(file_ext, 'application/octet-stream')

    # Format the size once; it is reported both as a field and inside the message
    size_formatted = format_size(file_size)

    result = {
        "type": f"{file_type}_metadata",
        "file_type": file_type,
        "path": absolute_path,
        "file_name": file_name,
        "mime_type": mime_type,
        "size": file_size,
        "size_formatted": size_formatted,
        "message": f"{file_type.capitalize()} 文件: {file_name} ({size_formatted})\n提示: 如果需要发送此文件,请使用 send 工具。"
    }

    return ToolResult.success(result)
def _read_image(self, absolute_path: str, file_ext: str) -> ToolResult: def _read_image(self, absolute_path: str, file_ext: str) -> ToolResult:
""" """
Read image file Read image file - always return metadata only (images should be sent, not read into context)
:param absolute_path: Absolute path to the image file :param absolute_path: Absolute path to the image file
:param file_ext: File extension :param file_ext: File extension
:return: Result containing image information :return: Result containing image metadata for sending
""" """
try: try:
# Read image file
with open(absolute_path, 'rb') as f:
image_data = f.read()
# Get file size # Get file size
file_size = len(image_data) file_size = os.path.getsize(absolute_path)
# Return image information (actual image data can be base64 encoded when needed)
import base64
base64_data = base64.b64encode(image_data).decode('utf-8')
# Determine MIME type # Determine MIME type
mime_type_map = { mime_type_map = {
@@ -133,12 +195,15 @@ class Read(BaseTool):
} }
mime_type = mime_type_map.get(file_ext, 'image/jpeg') mime_type = mime_type_map.get(file_ext, 'image/jpeg')
# Return metadata for images (NOT file_to_send - use send tool to actually send)
result = { result = {
"type": "image", "type": "image_metadata",
"file_type": "image",
"path": absolute_path,
"mime_type": mime_type, "mime_type": mime_type,
"size": file_size, "size": file_size,
"size_formatted": format_size(file_size), "size_formatted": format_size(file_size),
"data": base64_data # Base64 encoded image data "message": f"图片文件: {Path(absolute_path).name} ({format_size(file_size)})\n提示: 如果需要发送此图片,请使用 send 工具。"
} }
return ToolResult.success(result) return ToolResult.success(result)
@@ -157,10 +222,32 @@ class Read(BaseTool):
:return: File content or error message :return: File content or error message
""" """
try: try:
# Check file size first
file_size = os.path.getsize(absolute_path)
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
if file_size > MAX_FILE_SIZE:
# File too large, return metadata only
return ToolResult.success({
"type": "file_to_send",
"file_type": "document",
"path": absolute_path,
"size": file_size,
"size_formatted": format_size(file_size),
"message": f"文件过大 ({format_size(file_size)} > 50MB),无法读取内容。文件路径: {absolute_path}"
})
# Read file # Read file
with open(absolute_path, 'r', encoding='utf-8') as f: with open(absolute_path, 'r', encoding='utf-8') as f:
content = f.read() content = f.read()
# Truncate content if too long (20K characters max for model context)
MAX_CONTENT_CHARS = 20 * 1024 # 20K characters
content_truncated = False
if len(content) > MAX_CONTENT_CHARS:
content = content[:MAX_CONTENT_CHARS]
content_truncated = True
all_lines = content.split('\n') all_lines = content.split('\n')
total_file_lines = len(all_lines) total_file_lines = len(all_lines)
@@ -197,6 +284,10 @@ class Read(BaseTool):
output_text = "" output_text = ""
details = {} details = {}
# Add truncation warning if content was truncated
if content_truncated:
output_text = f"[文件内容已截断到前 {format_size(MAX_CONTENT_CHARS)},完整文件大小: {format_size(file_size)}]\n\n"
if truncation.first_line_exceeds_limit: if truncation.first_line_exceeds_limit:
# First line exceeds 30KB limit # First line exceeds 30KB limit
first_line_size = format_size(len(all_lines[start_line].encode('utf-8'))) first_line_size = format_size(len(all_lines[start_line].encode('utf-8')))

View File

@@ -42,24 +42,26 @@ Agent: [调用 scheduler 工具]
**示例对话:** **示例对话:**
``` ```
用户: 每天早上8点帮我搜索一下当前新闻 用户: 每天早上8点帮我读取一下今日日程
Agent: [调用 scheduler 工具] Agent: [调用 scheduler 工具]
action: create action: create
name: 每日新闻 name: 每日日程
tool_call: tool_call:
tool_name: bocha_search tool_name: read
tool_params: tool_params:
query: 今日新闻 file_path: ~/cow/schedule.txt
result_prefix: 📰 今日新闻播报 result_prefix: 📅 今日日程
schedule_type: cron schedule_type: cron
schedule_value: 0 8 * * * schedule_value: 0 8 * * *
``` ```
**工具调用参数说明:** **工具调用参数说明:**
- `tool_name`: 要调用的工具名称(如 `bocha_search`、`web_fetch` 等) - `tool_name`: 要调用的工具名称(如 `bash`、`read`、`write` 等内置工具)
- `tool_params`: 工具的参数(字典格式) - `tool_params`: 工具的参数(字典格式)
- `result_prefix`: 可选,在结果前添加的前缀文本 - `result_prefix`: 可选,在结果前添加的前缀文本
**注意:** 如果要使用 skills(如 bocha-search),需要通过 `bash` 工具调用 skill 脚本。
### 2. 支持的调度类型 ### 2. 支持的调度类型
#### Cron 表达式 (`cron`) #### Cron 表达式 (`cron`)
@@ -167,7 +169,7 @@ Agent: [调用 scheduler 工具]
```json ```json
{ {
"id": "def456", "id": "def456",
"name": "每日新闻", "name": "每日日程",
"enabled": true, "enabled": true,
"created_at": "2024-01-01T10:00:00", "created_at": "2024-01-01T10:00:00",
"updated_at": "2024-01-01T10:00:00", "updated_at": "2024-01-01T10:00:00",
@@ -177,11 +179,11 @@ Agent: [调用 scheduler 工具]
}, },
"action": { "action": {
"type": "tool_call", "type": "tool_call",
"tool_name": "bocha_search", "tool_name": "read",
"tool_params": { "tool_params": {
"query": "今日新闻" "file_path": "~/cow/schedule.txt"
}, },
"result_prefix": "📰 今日新闻播报", "result_prefix": "📅 今日日程",
"receiver": "wxid_xxx", "receiver": "wxid_xxx",
"receiver_name": "张三", "receiver_name": "张三",
"is_group": false, "is_group": false,
@@ -234,30 +236,29 @@ Agent: [创建 cron: 0 18 * * 1-5]
Agent: [创建 interval: 3600] Agent: [创建 interval: 3600]
``` ```
### 4. 每日新闻推送(动态工具调用) ### 4. 每日日程推送(动态工具调用)
``` ```
用户: 每天早上8点帮我搜索一下当前新闻 用户: 每天早上8点帮我读取今日日程
Agent: ✅ 定时任务创建成功 Agent: ✅ 定时任务创建成功
任务ID: news001 任务ID: schedule001
调度: 每天 8:00 调度: 每天 8:00
工具: bocha_search(query='今日新闻') 工具: read(file_path='~/cow/schedule.txt')
前缀: 📰 今日新闻播报 前缀: 📅 今日日程
``` ```
### 5. 定时天气查询(动态工具调用) ### 5. 定时文件备份(动态工具调用)
``` ```
用户: 每天早上7点查询今天的天气 用户: 每天晚上11点备份工作文件
Agent: [创建 cron: 0 7 * * *] Agent: [创建 cron: 0 23 * * *]
工具: bocha_search(query='今日天气') 工具: bash(command='cp ~/cow/work.txt ~/cow/backup/work_$(date +%Y%m%d).txt')
前缀: 🌤️ 今日天气预报 前缀: ✅ 文件已备份
``` ```
### 6. 周报提醒(动态工具调用 ### 6. 周报提醒(静态消息
``` ```
用户: 每周五下午5点搜索本周热点 用户: 每周五下午5点提醒我写周报
Agent: [创建 cron: 0 17 * * 5] Agent: [创建 cron: 0 17 * * 5]
工具: bocha_search(query='本周热点新闻') 消息: 📊 该写周报了!
前缀: 📊 本周热点回顾
``` ```
### 4. 特定日期提醒 ### 4. 特定日期提醒

View File

@@ -45,10 +45,17 @@ def init_scheduler(agent_bridge) -> bool:
action = task.get("action", {}) action = task.get("action", {})
action_type = action.get("type") action_type = action.get("type")
if action_type == "send_message": if action_type == "agent_task":
_execute_agent_task(task, agent_bridge)
elif action_type == "send_message":
# Legacy support for old tasks
_execute_send_message(task, agent_bridge) _execute_send_message(task, agent_bridge)
elif action_type == "tool_call": elif action_type == "tool_call":
# Legacy support for old tasks
_execute_tool_call(task, agent_bridge) _execute_tool_call(task, agent_bridge)
elif action_type == "skill_call":
# Legacy support for old tasks
_execute_skill_call(task, agent_bridge)
else: else:
logger.warning(f"[Scheduler] Unknown action type: {action_type}") logger.warning(f"[Scheduler] Unknown action type: {action_type}")
except Exception as e: except Exception as e:
@@ -76,6 +83,100 @@ def get_scheduler_service():
return _scheduler_service return _scheduler_service
def _execute_agent_task(task: dict, agent_bridge):
    """
    Run an ``agent_task`` action: pass the stored task description to the
    Agent and forward the Agent's reply to the task's receiver.

    Args:
        task: Task dictionary; its ``action`` entry supplies the description,
            receiver, group flag and channel type
        agent_bridge: AgentBridge instance used to obtain the Agent reply
    """
    import traceback

    try:
        action = task.get("action", {})
        task_description = action.get("task_description")
        receiver = action.get("receiver")
        is_group = action.get("is_group", False)
        channel_type = action.get("channel_type", "unknown")

        # Both the description and the receiver are mandatory
        if not task_description:
            logger.error(f"[Scheduler] Task {task['id']}: No task_description specified")
            return
        if not receiver:
            logger.error(f"[Scheduler] Task {task['id']}: No receiver specified")
            return

        # DingTalk is only warned about; the task itself still runs
        if channel_type == "dingtalk":
            logger.warning(f"[Scheduler] Task {task['id']}: DingTalk channel does not support scheduled messages (Stream mode limitation). Task will execute but message cannot be sent.")

        logger.info(f"[Scheduler] Task {task['id']}: Executing agent task '{task_description}'")

        # Context handed to the Agent; the receiver doubles as the session id
        context = Context(ContextType.TEXT, task_description)
        context["receiver"] = receiver
        context["isgroup"] = is_group
        context["session_id"] = receiver

        # Per-channel additions to the context
        if channel_type == "web":
            import uuid
            context["request_id"] = f"scheduler_{task['id']}_{uuid.uuid4().hex[:8]}"
        elif channel_type == "feishu":
            if is_group:
                context["receive_id_type"] = "chat_id"
            else:
                context["receive_id_type"] = "open_id"
            context["msg"] = None
        elif channel_type == "dingtalk":
            # There is no incoming message object for a scheduled run
            context["msg"] = None
            # Private chats additionally carry the stored sender_staff_id
            staff_id = None if is_group else action.get("dingtalk_sender_staff_id")
            if staff_id:
                context["dingtalk_sender_staff_id"] = staff_id

        # Flag the run as scheduled so the bridge can prevent recursive task creation
        context["is_scheduled_task"] = True

        try:
            reply = agent_bridge.agent_reply(task_description, context=context, on_event=None, clear_history=True)

            if not (reply and reply.content):
                logger.error(f"[Scheduler] Task {task['id']}: No result from agent execution")
                return

            # Deliver the reply through the task's channel
            from channel.channel_factory import create_channel

            try:
                channel = create_channel(channel_type)
                if not channel:
                    logger.error(f"[Scheduler] Failed to create channel: {channel_type}")
                    return

                # The web channel looks up the session by request_id
                if channel_type == "web" and hasattr(channel, 'request_to_session'):
                    request_id = context.get("request_id")
                    if request_id:
                        channel.request_to_session[request_id] = receiver
                        logger.debug(f"[Scheduler] Registered request_id {request_id} -> session {receiver}")

                channel.send(reply, context)
                logger.info(f"[Scheduler] Task {task['id']} executed successfully, result sent to {receiver}")
            except Exception as e:
                logger.error(f"[Scheduler] Failed to send result: {e}")

        except Exception as e:
            logger.error(f"[Scheduler] Failed to execute task via Agent: {e}")
            logger.error(f"[Scheduler] Traceback: {traceback.format_exc()}")

    except Exception as e:
        logger.error(f"[Scheduler] Error in _execute_agent_task: {e}")
        logger.error(f"[Scheduler] Traceback: {traceback.format_exc()}")
def _execute_send_message(task: dict, agent_bridge): def _execute_send_message(task: dict, agent_bridge):
""" """
Execute a send_message action Execute a send_message action
@@ -116,6 +217,17 @@ def _execute_send_message(task: dict, agent_bridge):
# Feishu channel will detect this and send as new message instead of reply # Feishu channel will detect this and send as new message instead of reply
context["msg"] = None context["msg"] = None
logger.debug(f"[Scheduler] Feishu: receive_id_type={context['receive_id_type']}, is_group={is_group}, receiver={receiver}") logger.debug(f"[Scheduler] Feishu: receive_id_type={context['receive_id_type']}, is_group={is_group}, receiver={receiver}")
elif channel_type == "dingtalk":
# DingTalk channel setup
context["msg"] = None
# 如果是单聊,需要传递 sender_staff_id
if not is_group:
sender_staff_id = action.get("dingtalk_sender_staff_id")
if sender_staff_id:
context["dingtalk_sender_staff_id"] = sender_staff_id
logger.debug(f"[Scheduler] DingTalk single chat: sender_staff_id={sender_staff_id}")
else:
logger.warning(f"[Scheduler] Task {task['id']}: DingTalk single chat message missing sender_staff_id")
# Create reply # Create reply
reply = Reply(ReplyType.TEXT, content) reply = Reply(ReplyType.TEXT, content)
@@ -156,8 +268,9 @@ def _execute_tool_call(task: dict, agent_bridge):
""" """
try: try:
action = task.get("action", {}) action = task.get("action", {})
tool_name = action.get("tool_name") # Support both old and new field names
tool_params = action.get("tool_params", {}) tool_name = action.get("call_name") or action.get("tool_name")
tool_params = action.get("call_params") or action.get("tool_params", {})
result_prefix = action.get("result_prefix", "") result_prefix = action.get("result_prefix", "")
receiver = action.get("receiver") receiver = action.get("receiver")
is_group = action.get("is_group", False) is_group = action.get("is_group", False)
@@ -237,6 +350,82 @@ def _execute_tool_call(task: dict, agent_bridge):
logger.error(f"[Scheduler] Error in _execute_tool_call: {e}") logger.error(f"[Scheduler] Error in _execute_tool_call: {e}")
def _execute_skill_call(task: dict, agent_bridge):
    """
    Execute a legacy ``skill_call`` action: ask the Agent to run the skill and
    deliver the Agent's reply to the task's receiver.

    Args:
        task: Task dictionary; its ``action`` entry supplies the skill name,
            skill parameters, optional result prefix, receiver and channel type
        agent_bridge: AgentBridge instance used to obtain the Agent reply
    """
    import traceback

    try:
        action = task.get("action", {})
        # Support both old and new field names
        skill_name = action.get("call_name") or action.get("skill_name")
        # "or {}" also covers an explicit None so .items() below cannot fail
        skill_params = action.get("call_params") or action.get("skill_params") or {}
        result_prefix = action.get("result_prefix", "")
        receiver = action.get("receiver")
        # Actions are stored with the "is_group" key; "isgroup" is kept only as
        # a fallback for any task written with the misspelled key
        is_group = action.get("is_group", action.get("isgroup", False))
        channel_type = action.get("channel_type", "unknown")

        if not skill_name:
            logger.error(f"[Scheduler] Task {task['id']}: No skill_name specified")
            return

        if not receiver:
            logger.error(f"[Scheduler] Task {task['id']}: No receiver specified")
            return

        logger.info(f"[Scheduler] Task {task['id']}: Executing skill '{skill_name}' with params {skill_params}")

        # Build a natural language query for the Agent to execute the skill
        # Format: "Use skill-name skill with k1=v1, k2=v2"
        param_str = ", ".join(f"{k}={v}" for k, v in skill_params.items())
        query = f"Use {skill_name} skill"
        if param_str:
            query += f" with {param_str}"

        # Create context for Agent; the receiver doubles as the session id
        context = Context(ContextType.TEXT, query)
        context["receiver"] = receiver
        context["isgroup"] = is_group
        context["session_id"] = receiver

        # Channel-specific setup (same as for agent_task actions)
        if channel_type == "web":
            import uuid
            request_id = f"scheduler_{task['id']}_{uuid.uuid4().hex[:8]}"
            context["request_id"] = request_id
        elif channel_type == "feishu":
            context["receive_id_type"] = "chat_id" if is_group else "open_id"
            context["msg"] = None
        elif channel_type == "dingtalk":
            # There is no incoming message object for a scheduled run
            context["msg"] = None
            if not is_group:
                sender_staff_id = action.get("dingtalk_sender_staff_id")
                if sender_staff_id:
                    context["dingtalk_sender_staff_id"] = sender_staff_id

        # Mark this as a scheduled execution to prevent recursive task creation
        context["is_scheduled_task"] = True

        # Use Agent to execute the skill
        try:
            reply = agent_bridge.agent_reply(query, context=context, on_event=None, clear_history=True)

            if not (reply and reply.content):
                logger.error(f"[Scheduler] Task {task['id']}: No result from skill execution")
                return

            # Add prefix if specified. Only text replies are prefixed: changing
            # the content of a file/image reply would corrupt its URL.
            # NOTE(review): assumes Reply exposes its ReplyType as `.type` - confirm
            if result_prefix and getattr(reply, "type", ReplyType.TEXT) == ReplyType.TEXT:
                reply.content = f"{result_prefix}\n\n{reply.content}"

            # Actually deliver the result through the task's channel
            from channel.channel_factory import create_channel

            try:
                channel = create_channel(channel_type)
                if not channel:
                    logger.error(f"[Scheduler] Failed to create channel: {channel_type}")
                    return

                # The web channel looks up the session by request_id
                if channel_type == "web" and hasattr(channel, 'request_to_session'):
                    request_id = context.get("request_id")
                    if request_id:
                        channel.request_to_session[request_id] = receiver

                channel.send(reply, context)
                logger.info(f"[Scheduler] Task {task['id']} executed: skill result sent to {receiver}")
            except Exception as e:
                logger.error(f"[Scheduler] Failed to send result: {e}")

        except Exception as e:
            logger.error(f"[Scheduler] Failed to execute skill via Agent: {e}")
            logger.error(f"[Scheduler] Traceback: {traceback.format_exc()}")

    except Exception as e:
        logger.error(f"[Scheduler] Error in _execute_skill_call: {e}")
        logger.error(f"[Scheduler] Traceback: {traceback.format_exc()}")
def attach_scheduler_to_tool(tool, context: Context = None): def attach_scheduler_to_tool(tool, context: Context = None):
""" """
Attach scheduler components to a SchedulerTool instance Attach scheduler components to a SchedulerTool instance

View File

@@ -118,6 +118,34 @@ class SchedulerService:
try: try:
next_run = datetime.fromisoformat(next_run_str) next_run = datetime.fromisoformat(next_run_str)
# Check if task is overdue (e.g., service restart)
if next_run < now:
time_diff = (now - next_run).total_seconds()
# If overdue by more than 5 minutes, skip this run and schedule next
if time_diff > 300: # 5 minutes
logger.warning(f"[Scheduler] Task {task['id']} is overdue by {int(time_diff)}s, skipping and scheduling next run")
# For one-time tasks, disable them
schedule = task.get("schedule", {})
if schedule.get("type") == "once":
self.task_store.update_task(task['id'], {
"enabled": False,
"last_run_at": now.isoformat()
})
logger.info(f"[Scheduler] One-time task {task['id']} expired, disabled")
return False
# For recurring tasks, calculate next run from now
next_next_run = self._calculate_next_run(task, now)
if next_next_run:
self.task_store.update_task(task['id'], {
"next_run_at": next_next_run.isoformat()
})
logger.info(f"[Scheduler] Rescheduled task {task['id']} to {next_next_run}")
return False
return now >= next_run return now >= next_run
except: except:
return False return False

View File

@@ -20,23 +20,16 @@ class SchedulerTool(BaseTool):
name: str = "scheduler" name: str = "scheduler"
description: str = ( description: str = (
"创建、查询和管理定时任务。支持两种任务类型:\n" "创建、查询和管理定时任务。支持固定消息和AI任务两种类型。\n\n"
"1. 静态消息任务:定时发送预定义的消息\n"
"2. 动态工具任务:定时执行工具调用并发送结果(如搜索新闻、查询天气等)\n\n"
"使用方法:\n" "使用方法:\n"
"- 创建静态消息任务action='create', name='任务名', message='消息内容', schedule_type='interval'/'cron'/'once', schedule_value='间隔秒数/cron表达式/时间'\n" "- 创建action='create', name='任务名', message/ai_task='内容', schedule_type='once/interval/cron', schedule_value='...'\n"
"- 创建动态工具任务action='create', name='任务名', tool_call={'tool_name': '工具名', 'tool_params': {...}, 'result_prefix': '前缀'}, schedule_type='interval'/'cron'/'once', schedule_value=''\n" "- 查询action='list' / action='get', task_id='任务ID'\n"
"- 查询列表action='list'\n" "- 管理action='delete/enable/disable', task_id='任务ID'\n\n"
"- 查看详情action='get', task_id='任务ID'\n" "调度类型:\n"
"- 删除任务action='delete', task_id='任务ID'\n" "- once: 一次性任务,支持相对时间(+5s,+10m,+1h,+1d)或ISO时间\n"
"- 启用任务action='enable', task_id='任务ID'\n" "- interval: 固定间隔(秒)如3600表示每小时\n"
"- 禁用任务action='disable', task_id='任务ID'\n\n" "- cron: cron表达式'0 8 * * *'表示每天8点\n\n"
"调度类型说明:\n" "注意:'X秒后'用once+相对时间,'每X秒'用interval"
"- interval: 固定间隔秒数如3600表示每小时\n"
"- cron: cron表达式'0 9 * * *'表示每天9点'*/10 * * * *'表示每10分钟\n"
"- once: 一次性任务ISO时间格式'2024-12-25T09:00:00'\n\n"
"示例每天早上8点搜索新闻\n"
"action='create', name='每日新闻', tool_call={'tool_name': 'bocha_search', 'tool_params': {'query': '今日新闻'}, 'result_prefix': '📰 今日新闻播报'}, schedule_type='cron', schedule_value='0 8 * * *'"
) )
params: dict = { params: dict = {
"type": "object", "type": "object",
@@ -56,26 +49,11 @@ class SchedulerTool(BaseTool):
}, },
"message": { "message": {
"type": "string", "type": "string",
"description": "要发送的静态消息内容 (用于 create 操作与tool_call二选一)" "description": "固定消息内容 (与ai_task二选一)"
}, },
"tool_call": { "ai_task": {
"type": "object", "type": "string",
"description": "要执行的工具调用 (用于 create 操作与message二选一)", "description": "AI任务描述 (与message二选一),如'搜索今日新闻''查询天气'"
"properties": {
"tool_name": {
"type": "string",
"description": "工具名称,如 'bocha_search'"
},
"tool_params": {
"type": "object",
"description": "工具参数"
},
"result_prefix": {
"type": "string",
"description": "结果前缀,如 '今日新闻:'"
}
},
"required": ["tool_name"]
}, },
"schedule_type": { "schedule_type": {
"type": "string", "type": "string",
@@ -84,12 +62,7 @@ class SchedulerTool(BaseTool):
}, },
"schedule_value": { "schedule_value": {
"type": "string", "type": "string",
"description": ( "description": "调度值: cron表达式/间隔秒数/时间(+5s,+10m,+1h或ISO格式)"
"调度值 (用于 create 操作):\n"
"- cron类型: cron表达式'0 9 * * *' (每天9点)'*/10 * * * *' (每10分钟)\n"
"- interval类型: 间隔秒数,如 '3600' (每小时)'10' (每10秒)\n"
"- once类型: ISO时间'2024-12-25T09:00:00'"
)
} }
}, },
"required": ["action"] "required": ["action"]
@@ -151,17 +124,20 @@ class SchedulerTool(BaseTool):
"""Create a new scheduled task""" """Create a new scheduled task"""
name = kwargs.get("name") name = kwargs.get("name")
message = kwargs.get("message") message = kwargs.get("message")
tool_call = kwargs.get("tool_call") ai_task = kwargs.get("ai_task")
schedule_type = kwargs.get("schedule_type") schedule_type = kwargs.get("schedule_type")
schedule_value = kwargs.get("schedule_value") schedule_value = kwargs.get("schedule_value")
# Validate required fields # Validate required fields
if not name: if not name:
return "错误: 缺少任务名称 (name)" return "错误: 缺少任务名称 (name)"
if not message and not tool_call:
return "错误: 必须提供 message 或 tool_call 之一" # Check that exactly one of message/ai_task is provided
if message and tool_call: if not message and not ai_task:
return "错误: message 和 tool_call 不能同时提供,请选择其" return "错误: 必须提供 message(固定消息)或 ai_taskAI任务"
if message and ai_task:
return "错误: message 和 ai_task 只能提供其中一个"
if not schedule_type: if not schedule_type:
return "错误: 缺少调度类型 (schedule_type)" return "错误: 缺少调度类型 (schedule_type)"
if not schedule_value: if not schedule_value:
@@ -181,7 +157,7 @@ class SchedulerTool(BaseTool):
# Create task # Create task
task_id = str(uuid.uuid4())[:8] task_id = str(uuid.uuid4())[:8]
# Build action based on message or tool_call # Build action based on message or ai_task
if message: if message:
action = { action = {
"type": "send_message", "type": "send_message",
@@ -191,19 +167,22 @@ class SchedulerTool(BaseTool):
"is_group": context.get("isgroup", False), "is_group": context.get("isgroup", False),
"channel_type": self.config.get("channel_type", "unknown") "channel_type": self.config.get("channel_type", "unknown")
} }
else: # tool_call else: # ai_task
action = { action = {
"type": "tool_call", "type": "agent_task",
"tool_name": tool_call.get("tool_name"), "task_description": ai_task,
"tool_params": tool_call.get("tool_params", {}),
"result_prefix": tool_call.get("result_prefix", ""),
"receiver": context.get("receiver"), "receiver": context.get("receiver"),
"receiver_name": self._get_receiver_name(context), "receiver_name": self._get_receiver_name(context),
"is_group": context.get("isgroup", False), "is_group": context.get("isgroup", False),
"channel_type": self.config.get("channel_type", "unknown") "channel_type": self.config.get("channel_type", "unknown")
} }
task = { # 针对钉钉单聊,额外存储 sender_staff_id
msg = context.kwargs.get("msg")
if msg and hasattr(msg, 'sender_staff_id') and not context.get("isgroup", False):
action["dingtalk_sender_staff_id"] = msg.sender_staff_id
task_data = {
"id": task_id, "id": task_id,
"name": name, "name": name,
"enabled": True, "enabled": True,
@@ -214,26 +193,21 @@ class SchedulerTool(BaseTool):
} }
# Calculate initial next_run_at # Calculate initial next_run_at
next_run = self._calculate_next_run(task) next_run = self._calculate_next_run(task_data)
if next_run: if next_run:
task["next_run_at"] = next_run.isoformat() task_data["next_run_at"] = next_run.isoformat()
# Save task # Save task
self.task_store.add_task(task) self.task_store.add_task(task_data)
# Format response # Format response
schedule_desc = self._format_schedule_description(schedule) schedule_desc = self._format_schedule_description(schedule)
receiver_desc = task["action"]["receiver_name"] or task["action"]["receiver"] receiver_desc = task_data["action"]["receiver_name"] or task_data["action"]["receiver"]
if message: if message:
content_desc = f"💬 消息: {message}" content_desc = f"💬 固定消息: {message}"
else: else:
tool_name = tool_call.get("tool_name") content_desc = f"🤖 AI任务: {ai_task}"
tool_params_str = str(tool_call.get("tool_params", {}))
prefix = tool_call.get("result_prefix", "")
content_desc = f"🔧 工具调用: {tool_name}({tool_params_str})"
if prefix:
content_desc += f"\n📝 结果前缀: {prefix}"
return ( return (
f"✅ 定时任务创建成功\n\n" f"✅ 定时任务创建成功\n\n"
@@ -353,9 +327,38 @@ class SchedulerTool(BaseTool):
return {"type": "interval", "seconds": seconds} return {"type": "interval", "seconds": seconds}
elif schedule_type == "once": elif schedule_type == "once":
# Parse datetime # Parse datetime - support both relative and absolute time
datetime.fromisoformat(schedule_value)
return {"type": "once", "run_at": schedule_value} # Check if it's relative time (e.g., "+5s", "+10m", "+1h", "+1d")
if schedule_value.startswith("+"):
import re
match = re.match(r'\+(\d+)([smhd])', schedule_value)
if match:
amount = int(match.group(1))
unit = match.group(2)
from datetime import timedelta
now = datetime.now()
if unit == 's': # seconds
target_time = now + timedelta(seconds=amount)
elif unit == 'm': # minutes
target_time = now + timedelta(minutes=amount)
elif unit == 'h': # hours
target_time = now + timedelta(hours=amount)
elif unit == 'd': # days
target_time = now + timedelta(days=amount)
else:
return None
return {"type": "once", "run_at": target_time.isoformat()}
else:
logger.error(f"[SchedulerTool] Invalid relative time format: {schedule_value}")
return None
else:
# Absolute time in ISO format
datetime.fromisoformat(schedule_value)
return {"type": "once", "run_at": schedule_value}
except Exception as e: except Exception as e:
logger.error(f"[SchedulerTool] Invalid schedule: {e}") logger.error(f"[SchedulerTool] Invalid schedule: {e}")

View File

@@ -0,0 +1,3 @@
from .send import Send
__all__ = ['Send']

159
agent/tools/send/send.py Normal file
View File

@@ -0,0 +1,159 @@
"""
Send tool - Send files to the user
"""
import os
from typing import Dict, Any
from pathlib import Path
from agent.tools.base_tool import BaseTool, ToolResult
class Send(BaseTool):
    """
    Tool for sending files to the user.

    The tool does not transmit anything itself: ``execute`` validates the path
    and returns a ``file_to_send`` metadata dict for the caller to act on.
    """

    name: str = "send"
    description: str = "Send a file (image, video, audio, document) to the user. Use this when the user explicitly asks to send/share a file."

    params: dict = {
        "type": "object",
        "properties": {
            "path": {
                "type": "string",
                "description": "Path to the file to send. Can be absolute path or relative to workspace."
            },
            "message": {
                "type": "string",
                "description": "Optional message to accompany the file"
            }
        },
        "required": ["path"]
    }

    def __init__(self, config: dict = None):
        """
        :param config: Optional tool configuration; ``cwd`` is the base
            directory for relative paths (defaults to the process cwd)
        """
        self.config = config or {}
        self.cwd = self.config.get("cwd", os.getcwd())

        # Supported file types, keyed by lower-cased extension
        self.image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg', '.ico'}
        self.video_extensions = {'.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv', '.webm', '.m4v'}
        self.audio_extensions = {'.mp3', '.wav', '.ogg', '.m4a', '.flac', '.aac', '.wma'}
        self.document_extensions = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.txt', '.md'}

    def execute(self, args: Dict[str, Any]) -> ToolResult:
        """
        Execute file send operation

        :param args: Contains file path and optional message
        :return: ToolResult carrying file metadata for the channel to send,
            or a failure result when the path is missing, not a regular file,
            or not readable
        """
        # "or ''" guards against an explicit None value for path
        path = (args.get("path") or "").strip()
        message = args.get("message", "")

        if not path:
            return ToolResult.fail("Error: path parameter is required")

        # Resolve path
        absolute_path = self._resolve_path(path)

        # Check if file exists
        if not os.path.exists(absolute_path):
            return ToolResult.fail(f"Error: File not found: {path}")

        # A directory (or other non-regular entry) exists but cannot be sent
        if not os.path.isfile(absolute_path):
            return ToolResult.fail(f"Error: Not a file: {path}")

        # Check if readable
        if not os.access(absolute_path, os.R_OK):
            return ToolResult.fail(f"Error: File is not readable: {path}")

        # Get file info
        file_ext = Path(absolute_path).suffix.lower()
        file_size = os.path.getsize(absolute_path)
        file_name = Path(absolute_path).name

        # Determine file type
        if file_ext in self.image_extensions:
            file_type = "image"
            mime_type = self._get_image_mime_type(file_ext)
        elif file_ext in self.video_extensions:
            file_type = "video"
            mime_type = self._get_video_mime_type(file_ext)
        elif file_ext in self.audio_extensions:
            file_type = "audio"
            mime_type = self._get_audio_mime_type(file_ext)
        elif file_ext in self.document_extensions:
            file_type = "document"
            mime_type = self._get_document_mime_type(file_ext)
        else:
            # Unknown extension: generic binary file
            file_type = "file"
            mime_type = "application/octet-stream"

        # Return file_to_send metadata
        result = {
            "type": "file_to_send",
            "file_type": file_type,
            "path": absolute_path,
            "file_name": file_name,
            "mime_type": mime_type,
            "size": file_size,
            "size_formatted": self._format_size(file_size),
            "message": message or f"正在发送 {file_name}"
        }

        return ToolResult.success(result)

    def _resolve_path(self, path: str) -> str:
        """Resolve path to absolute path (expands ``~``; relative paths are joined to ``self.cwd``)"""
        path = os.path.expanduser(path)
        if os.path.isabs(path):
            return path
        return os.path.abspath(os.path.join(self.cwd, path))

    def _get_image_mime_type(self, ext: str) -> str:
        """Get MIME type for image (``image/jpeg`` for unmapped extensions)"""
        mime_map = {
            '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
            '.png': 'image/png', '.gif': 'image/gif',
            '.webp': 'image/webp', '.bmp': 'image/bmp',
            '.svg': 'image/svg+xml', '.ico': 'image/x-icon'
        }
        return mime_map.get(ext, 'image/jpeg')

    def _get_video_mime_type(self, ext: str) -> str:
        """Get MIME type for video (``video/mp4`` for unmapped extensions)"""
        mime_map = {
            '.mp4': 'video/mp4', '.avi': 'video/x-msvideo',
            '.mov': 'video/quicktime', '.mkv': 'video/x-matroska',
            '.webm': 'video/webm', '.flv': 'video/x-flv',
            # Previously missing: these supported extensions fell back to video/mp4
            '.wmv': 'video/x-ms-wmv', '.m4v': 'video/x-m4v'
        }
        return mime_map.get(ext, 'video/mp4')

    def _get_audio_mime_type(self, ext: str) -> str:
        """Get MIME type for audio (``audio/mpeg`` for unmapped extensions)"""
        mime_map = {
            '.mp3': 'audio/mpeg', '.wav': 'audio/wav',
            '.ogg': 'audio/ogg', '.m4a': 'audio/mp4',
            '.flac': 'audio/flac', '.aac': 'audio/aac',
            # Previously missing: this supported extension fell back to audio/mpeg
            '.wma': 'audio/x-ms-wma'
        }
        return mime_map.get(ext, 'audio/mpeg')

    def _get_document_mime_type(self, ext: str) -> str:
        """Get MIME type for document (``application/octet-stream`` for unmapped extensions)"""
        mime_map = {
            '.pdf': 'application/pdf',
            '.doc': 'application/msword',
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.xls': 'application/vnd.ms-excel',
            '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            '.ppt': 'application/vnd.ms-powerpoint',
            '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
            '.txt': 'text/plain',
            '.md': 'text/markdown'
        }
        return mime_map.get(ext, 'application/octet-stream')

    def _format_size(self, size_bytes: int) -> str:
        """Format file size in human-readable format (1024-based, one decimal place)"""
        for unit in ['B', 'KB', 'MB', 'GB']:
            if size_bytes < 1024.0:
                return f"{size_bytes:.1f}{unit}"
            size_bytes /= 1024.0
        return f"{size_bytes:.1f}TB"

View File

@@ -2,6 +2,7 @@
Agent Bridge - Integrates Agent system with existing COW bridge Agent Bridge - Integrates Agent system with existing COW bridge
""" """
import os
from typing import Optional, List from typing import Optional, List
from agent.protocol import Agent, LLMModel, LLMRequest from agent.protocol import Agent, LLMModel, LLMRequest
@@ -269,8 +270,11 @@ class AgentBridge:
# Get workspace from config # Get workspace from config
workspace_root = os.path.expanduser(conf().get("agent_workspace", "~/cow")) workspace_root = os.path.expanduser(conf().get("agent_workspace", "~/cow"))
# Load environment variables from workspace .env file # Migrate API keys from config.json to environment variables (if not already set)
env_file = os.path.join(workspace_root, '.env') self._migrate_config_to_env(workspace_root)
# Load environment variables from secure .env file location
env_file = os.path.expanduser("~/.cow/.env")
if os.path.exists(env_file): if os.path.exists(env_file):
try: try:
from dotenv import load_dotenv from dotenv import load_dotenv
@@ -280,9 +284,6 @@ class AgentBridge:
logger.warning("[AgentBridge] python-dotenv not installed, skipping .env file loading") logger.warning("[AgentBridge] python-dotenv not installed, skipping .env file loading")
except Exception as e: except Exception as e:
logger.warning(f"[AgentBridge] Failed to load .env file: {e}") logger.warning(f"[AgentBridge] Failed to load .env file: {e}")
# Migrate API keys from config.json to environment variables (if not already set)
self._migrate_config_to_env(workspace_root)
# Initialize workspace and create template files # Initialize workspace and create template files
from agent.prompt import ensure_workspace, load_context_files, PromptBuilder from agent.prompt import ensure_workspace, load_context_files, PromptBuilder
@@ -377,7 +378,6 @@ class AgentBridge:
if tool_name == "env_config": if tool_name == "env_config":
from agent.tools import EnvConfig from agent.tools import EnvConfig
tool = EnvConfig({ tool = EnvConfig({
"workspace_dir": workspace_root,
"agent_bridge": self # Pass self reference for hot reload "agent_bridge": self # Pass self reference for hot reload
}) })
else: else:
@@ -390,12 +390,6 @@ class AgentBridge:
tool.cwd = file_config.get("cwd", tool.cwd if hasattr(tool, 'cwd') else None) tool.cwd = file_config.get("cwd", tool.cwd if hasattr(tool, 'cwd') else None)
if 'memory_manager' in file_config: if 'memory_manager' in file_config:
tool.memory_manager = file_config['memory_manager'] tool.memory_manager = file_config['memory_manager']
# Apply API key for bocha_search tool
elif tool_name == 'bocha_search':
bocha_api_key = conf().get("bocha_api_key", "")
if bocha_api_key:
tool.config = {"bocha_api_key": bocha_api_key}
tool.api_key = bocha_api_key
tools.append(tool) tools.append(tool)
logger.debug(f"[AgentBridge] Loaded tool: {tool_name}") logger.debug(f"[AgentBridge] Loaded tool: {tool_name}")
except Exception as e: except Exception as e:
@@ -504,8 +498,11 @@ class AgentBridge:
# Get workspace from config # Get workspace from config
workspace_root = os.path.expanduser(conf().get("agent_workspace", "~/cow")) workspace_root = os.path.expanduser(conf().get("agent_workspace", "~/cow"))
# Load environment variables from workspace .env file # Migrate API keys from config.json to environment variables (if not already set)
env_file = os.path.join(workspace_root, '.env') self._migrate_config_to_env(workspace_root)
# Load environment variables from secure .env file location
env_file = os.path.expanduser("~/.cow/.env")
if os.path.exists(env_file): if os.path.exists(env_file):
try: try:
from dotenv import load_dotenv from dotenv import load_dotenv
@@ -609,11 +606,6 @@ class AgentBridge:
tool.cwd = file_config.get("cwd", tool.cwd if hasattr(tool, 'cwd') else None) tool.cwd = file_config.get("cwd", tool.cwd if hasattr(tool, 'cwd') else None)
if 'memory_manager' in file_config: if 'memory_manager' in file_config:
tool.memory_manager = file_config['memory_manager'] tool.memory_manager = file_config['memory_manager']
elif tool_name == 'bocha_search':
bocha_api_key = conf().get("bocha_api_key", "")
if bocha_api_key:
tool.config = {"bocha_api_key": bocha_api_key}
tool.api_key = bocha_api_key
tools.append(tool) tools.append(tool)
except Exception as e: except Exception as e:
logger.warning(f"[AgentBridge] Failed to load tool {tool_name} for session {session_id}: {e}") logger.warning(f"[AgentBridge] Failed to load tool {tool_name} for session {session_id}: {e}")
@@ -767,23 +759,52 @@ class AgentBridge:
if not agent: if not agent:
return Reply(ReplyType.ERROR, "Failed to initialize super agent") return Reply(ReplyType.ERROR, "Failed to initialize super agent")
# Attach context to scheduler tool if present # Filter tools based on context
if context and agent.tools: original_tools = agent.tools
for tool in agent.tools: filtered_tools = original_tools
if tool.name == "scheduler":
try:
from agent.tools.scheduler.integration import attach_scheduler_to_tool
attach_scheduler_to_tool(tool, context)
except Exception as e:
logger.warning(f"[AgentBridge] Failed to attach context to scheduler: {e}")
break
# Use agent's run_stream method # If this is a scheduled task execution, exclude scheduler tool to prevent recursion
response = agent.run_stream( if context and context.get("is_scheduled_task"):
user_message=query, filtered_tools = [tool for tool in agent.tools if tool.name != "scheduler"]
on_event=on_event, agent.tools = filtered_tools
clear_history=clear_history logger.info(f"[AgentBridge] Scheduled task execution: excluded scheduler tool ({len(filtered_tools)}/{len(original_tools)} tools)")
) else:
# Attach context to scheduler tool if present
if context and agent.tools:
for tool in agent.tools:
if tool.name == "scheduler":
try:
from agent.tools.scheduler.integration import attach_scheduler_to_tool
attach_scheduler_to_tool(tool, context)
except Exception as e:
logger.warning(f"[AgentBridge] Failed to attach context to scheduler: {e}")
break
try:
# Use agent's run_stream method
response = agent.run_stream(
user_message=query,
on_event=on_event,
clear_history=clear_history
)
finally:
# Restore original tools
if context and context.get("is_scheduled_task"):
agent.tools = original_tools
# Check if there are files to send (from read tool)
if hasattr(agent, 'stream_executor') and hasattr(agent.stream_executor, 'files_to_send'):
files_to_send = agent.stream_executor.files_to_send
if files_to_send:
# Send the first file (for now, handle one file at a time)
file_info = files_to_send[0]
logger.info(f"[AgentBridge] Sending file: {file_info.get('path')}")
# Clear files_to_send for next request
agent.stream_executor.files_to_send = []
# Return file reply based on file type
return self._create_file_reply(file_info, response, context)
return Reply(ReplyType.TEXT, response) return Reply(ReplyType.TEXT, response)
@@ -791,12 +812,53 @@ class AgentBridge:
logger.error(f"Agent reply error: {e}") logger.error(f"Agent reply error: {e}")
return Reply(ReplyType.ERROR, f"Agent error: {str(e)}") return Reply(ReplyType.ERROR, f"Agent error: {str(e)}")
def _create_file_reply(self, file_info: dict, text_response: str, context: Context = None) -> Reply:
    """
    Create a reply for sending files

    Args:
        file_info: File metadata dict collected during the agent run
            (reads the "file_type", "path", "file_name" and "message" keys)
        text_response: Text response from agent
        context: Context object (currently unused)

    Returns:
        Reply object for file sending: IMAGE_URL for images, FILE for
        documents, and a TEXT reply describing the file otherwise
    """
    file_type = file_info.get("file_type", "file")
    file_path = file_info.get("path")

    # Without a path no file:// URL can be built (it would become
    # "file://None"), so fall back to a plain text reply
    if not file_path:
        logger.warning("[AgentBridge] File metadata has no path, replying with text only")
        return Reply(ReplyType.TEXT, text_response or file_info.get("message", "文件已准备"))

    # For images, use IMAGE_URL type (channel will handle upload)
    if file_type == "image":
        # Convert local path to file:// URL for channel processing
        file_url = f"file://{file_path}"
        logger.info(f"[AgentBridge] Sending image: {file_url}")
        reply = Reply(ReplyType.IMAGE_URL, file_url)
        # Attach text message if present (for channels that support text+image)
        if text_response:
            reply.text_content = text_response  # Store accompanying text
        return reply

    # For documents (PDF, Excel, Word, PPT), use FILE type
    if file_type == "document":
        file_url = f"file://{file_path}"
        logger.info(f"[AgentBridge] Sending document: {file_url}")
        reply = Reply(ReplyType.FILE, file_url)
        reply.file_name = file_info.get("file_name", os.path.basename(file_path))
        return reply

    # For other files (video, audio), we need channel-specific handling
    # For now, return text with file info
    # TODO: Implement video/audio sending when channel supports it
    message = text_response or file_info.get("message", "文件已准备")
    message += f"\n\n[文件: {file_info.get('file_name', file_path)}]"
    return Reply(ReplyType.TEXT, message)
def _migrate_config_to_env(self, workspace_root: str): def _migrate_config_to_env(self, workspace_root: str):
""" """
Migrate API keys from config.json to .env file if not already set Migrate API keys from config.json to .env file if not already set
Args: Args:
workspace_root: Workspace directory path workspace_root: Workspace directory path (not used, kept for compatibility)
""" """
from config import conf from config import conf
import os import os
@@ -810,7 +872,8 @@ class AgentBridge:
"linkai_api_key": "LINKAI_API_KEY", "linkai_api_key": "LINKAI_API_KEY",
} }
env_file = os.path.join(workspace_root, '.env') # Use fixed secure location for .env file
env_file = os.path.expanduser("~/.cow/.env")
# Read existing env vars from .env file # Read existing env vars from .env file
existing_env_vars = {} existing_env_vars = {}
@@ -830,19 +893,25 @@ class AgentBridge:
for config_key, env_key in key_mapping.items(): for config_key, env_key in key_mapping.items():
# Skip if already in .env file # Skip if already in .env file
if env_key in existing_env_vars: if env_key in existing_env_vars:
logger.debug(f"[AgentBridge] Skipping {env_key} - already in .env")
continue continue
# Get value from config.json # Get value from config.json
value = conf().get(config_key, "") value = conf().get(config_key, "")
if value and value.strip(): # Only migrate non-empty values if value and value.strip(): # Only migrate non-empty values
keys_to_migrate[env_key] = value.strip() keys_to_migrate[env_key] = value.strip()
logger.debug(f"[AgentBridge] Will migrate {env_key} from config.json")
else:
logger.debug(f"[AgentBridge] Skipping {env_key} - no value in config.json")
# Write new keys to .env file # Write new keys to .env file
if keys_to_migrate: if keys_to_migrate:
try: try:
# Ensure .env file exists # Ensure ~/.cow directory and .env file exist
env_dir = os.path.dirname(env_file)
if not os.path.exists(env_dir):
os.makedirs(env_dir, exist_ok=True)
if not os.path.exists(env_file): if not os.path.exists(env_file):
os.makedirs(os.path.dirname(env_file), exist_ok=True)
open(env_file, 'a').close() open(env_file, 'a').close()
# Append new keys # Append new keys

View File

@@ -64,15 +64,22 @@ class ChatChannel(Channel):
check_contain(group_name, group_name_keyword_white_list), check_contain(group_name, group_name_keyword_white_list),
] ]
): ):
group_chat_in_one_session = conf().get("group_chat_in_one_session", []) # Check global group_shared_session config first
session_id = cmsg.actual_user_id group_shared_session = conf().get("group_shared_session", True)
if any( if group_shared_session:
[ # All users in the group share the same session
group_name in group_chat_in_one_session,
"ALL_GROUP" in group_chat_in_one_session,
]
):
session_id = group_id session_id = group_id
else:
# Check group-specific whitelist (legacy behavior)
group_chat_in_one_session = conf().get("group_chat_in_one_session", [])
session_id = cmsg.actual_user_id
if any(
[
group_name in group_chat_in_one_session,
"ALL_GROUP" in group_chat_in_one_session,
]
):
session_id = group_id
else: else:
logger.debug(f"No need reply, groupName not in whitelist, group_name={group_name}") logger.debug(f"No need reply, groupName not in whitelist, group_name={group_name}")
return None return None
@@ -283,7 +290,98 @@ class ChatChannel(Channel):
reply = e_context["reply"] reply = e_context["reply"]
if not e_context.is_pass() and reply and reply.type: if not e_context.is_pass() and reply and reply.type:
logger.debug("[chat_channel] ready to send reply: {}, context: {}".format(reply, context)) logger.debug("[chat_channel] ready to send reply: {}, context: {}".format(reply, context))
self._send(reply, context)
# 如果是文本回复,尝试提取并发送图片
if reply.type == ReplyType.TEXT:
self._extract_and_send_images(reply, context)
# 如果是图片回复但带有文本内容,先发文本再发图片
elif reply.type == ReplyType.IMAGE_URL and hasattr(reply, 'text_content') and reply.text_content:
# 先发送文本
text_reply = Reply(ReplyType.TEXT, reply.text_content)
self._send(text_reply, context)
# 短暂延迟后发送图片
time.sleep(0.3)
self._send(reply, context)
else:
self._send(reply, context)
def _extract_and_send_images(self, reply: Reply, context: Context):
    """
    Send a text reply, then send any media it references as separate messages.

    Recognised forms: [图片: /path/to/image.png], [视频: /path/to/video.mp4],
    ![](url), <img src="url">, <video src="url"> and bare http(s) URLs ending
    in a known image or video extension. At most 5 media items are sent.

    Args:
        reply: The text reply; its content is scanned and sent unchanged.
        context: Context passed through to self._send.
    """
    content = reply.content
    if not isinstance(content, str):
        # re.findall would raise on non-string content; nothing to scan,
        # so just deliver the reply as-is.
        self._send(reply, context)
        return

    # (regex, media type) pairs, tried in this order
    patterns = [
        (r'\[图片:\s*([^\]]+)\]', 'image'),  # [图片: /path/to/image.png]
        (r'\[视频:\s*([^\]]+)\]', 'video'),  # [视频: /path/to/video.mp4]
        (r'!\[.*?\]\(([^\)]+)\)', 'image'),  # ![alt](url) - treated as image
        (r'<img[^>]+src=["\']([^"\']+)["\']', 'image'),  # <img src="url">
        (r'<video[^>]+src=["\']([^"\']+)["\']', 'video'),  # <video src="url">
        (r'https?://[^\s]+\.(?:jpg|jpeg|png|gif|webp)', 'image'),  # bare image URL
        (r'https?://[^\s]+\.(?:mp4|avi|mov|wmv|flv)', 'video'),  # bare video URL
    ]

    # Collect in pattern order, keeping the first occurrence of each target
    media_items = []  # [(url, type), ...]
    seen = set()
    for pattern, media_type in patterns:
        for match in re.findall(pattern, content, re.IGNORECASE):
            # Bracket/markdown captures may carry surrounding whitespace
            # (e.g. "[图片: /a.png ]"), which would break os.path.exists.
            url = match.strip()
            if url and url not in seen:
                seen.add(url)
                media_items.append((url, media_type))
    media_items = media_items[:5]

    if not media_items:
        # No media found, send the text normally
        self._send(reply, context)
        return

    logger.info(f"[chat_channel] Extracted {len(media_items)} media item(s) from reply")

    # Send the text first (content left untouched)
    self._send(reply, context)

    # Then send each media item on its own
    for i, (url, media_type) in enumerate(media_items):
        try:
            if url.startswith(('http://', 'https://')):
                # Remote resource
                if media_type == 'video':
                    # Videos go out as FILE replies
                    media_reply = Reply(ReplyType.FILE, url)
                    media_reply.file_name = os.path.basename(url)
                else:
                    media_reply = Reply(ReplyType.IMAGE_URL, url)
            elif os.path.exists(url):
                # Local file: hand it to the channel as a file:// URL
                if media_type == 'video':
                    media_reply = Reply(ReplyType.FILE, f"file://{url}")
                    media_reply.file_name = os.path.basename(url)
                else:
                    media_reply = Reply(ReplyType.IMAGE_URL, f"file://{url}")
            else:
                logger.warning(f"[chat_channel] Media file not found or invalid URL: {url}")
                continue

            # Small delay between items to avoid rate limiting
            if i > 0:
                time.sleep(0.5)
            self._send(media_reply, context)
            logger.info(f"[chat_channel] Sent {media_type} {i+1}/{len(media_items)}: {url[:50]}...")

        except Exception as e:
            logger.error(f"[chat_channel] Failed to send {media_type} {url}: {e}")
def _send(self, reply: Reply, context: Context, retry_cnt=0): def _send(self, reply: Reply, context: Context, retry_cnt=0):
try: try:

View File

@@ -9,6 +9,7 @@ import json
# -*- coding=utf-8 -*- # -*- coding=utf-8 -*-
import logging import logging
import time import time
import requests
import dingtalk_stream import dingtalk_stream
from dingtalk_stream import AckMessage from dingtalk_stream import AckMessage
@@ -107,16 +108,156 @@ class DingTalkChanel(ChatChannel, dingtalk_stream.ChatbotHandler):
conf()["group_name_white_list"] = ["ALL_GROUP"] conf()["group_name_white_list"] = ["ALL_GROUP"]
# 单聊无需前缀 # 单聊无需前缀
conf()["single_chat_prefix"] = [""] conf()["single_chat_prefix"] = [""]
# Access token cache
self._access_token = None
self._access_token_expires_at = 0
# Robot code cache (extracted from incoming messages)
self._robot_code = None
def startup(self): def startup(self):
credential = dingtalk_stream.Credential(self.dingtalk_client_id, self.dingtalk_client_secret) credential = dingtalk_stream.Credential(self.dingtalk_client_id, self.dingtalk_client_secret)
client = dingtalk_stream.DingTalkStreamClient(credential) client = dingtalk_stream.DingTalkStreamClient(credential)
client.register_callback_handler(dingtalk_stream.chatbot.ChatbotMessage.TOPIC, self) client.register_callback_handler(dingtalk_stream.chatbot.ChatbotMessage.TOPIC, self)
client.start_forever() client.start_forever()
def get_access_token(self):
    """
    Return an access_token for the enterprise internal app, cached when possible.

    Docs: https://open.dingtalk.com/document/orgapp/obtain-orgapp-token

    Returns:
        The token string, or None when the request fails or the response
        carries no "accessToken".
    """
    now = time.time()

    # Serve the cached token until its stored expiry timestamp has passed
    if self._access_token and now < self._access_token_expires_at:
        return self._access_token

    # Otherwise request a fresh token with the app credentials
    payload = {
        "appKey": self.dingtalk_client_id,
        "appSecret": self.dingtalk_client_secret
    }

    try:
        resp = requests.post(
            "https://api.dingtalk.com/v1.0/oauth2/accessToken",
            headers={"Content-Type": "application/json"},
            json=payload,
            timeout=10,
        )
        body = resp.json()

        if resp.status_code != 200 or "accessToken" not in body:
            logger.error(f"[DingTalk] Failed to get access token: {body}")
            return None

        self._access_token = body["accessToken"]
        # Store the expiry 300 s early (default lifetime 7200 s) so the
        # token is refreshed before it actually lapses
        self._access_token_expires_at = now + body.get("expireIn", 7200) - 300
        logger.info("[DingTalk] Access token refreshed successfully")
        return self._access_token
    except Exception as e:
        logger.error(f"[DingTalk] Error getting access token: {e}")
        return None
def send_single_message(self, user_id: str, content: str, robot_code: str) -> bool:
    """
    Send a text message to a single user (private chat).

    API: https://open.dingtalk.com/document/orgapp/chatbots-send-one-on-one-chat-messages-in-batches

    Args:
        user_id: Recipient id, sent as the only entry of "userIds".
        content: Message text.
        robot_code: Robot code; required.

    Returns:
        True when the API answers 200 with a "processQueryKey", else False.
    """
    token = self.get_access_token()
    if not token:
        logger.error("[DingTalk] Failed to send single message: Access token not available.")
        return False

    if not robot_code:
        logger.error("[DingTalk] Cannot send single message: robot_code is required")
        return False

    endpoint = "https://api.dingtalk.com/v1.0/robot/oToMessages/batchSend"
    request_headers = {
        "x-acs-dingtalk-access-token": token,
        "Content-Type": "application/json"
    }
    # msgParam is itself a JSON string nested inside the JSON body
    payload = {
        "msgParam": json.dumps({"content": content}),
        "msgKey": "sampleText",
        "userIds": [user_id],
        "robotCode": robot_code
    }

    logger.info(f"[DingTalk] Sending single message to user {user_id} with robot_code {robot_code}")

    try:
        resp = requests.post(endpoint, headers=request_headers, json=payload, timeout=10)
        body = resp.json()
        delivered = resp.status_code == 200 and body.get("processQueryKey")
        if not delivered:
            logger.error(f"[DingTalk] Failed to send single message: {body}")
            return False
        logger.info(f"[DingTalk] Single message sent successfully to {user_id}")
        return True
    except Exception as e:
        logger.error(f"[DingTalk] Error sending single message: {e}")
        return False
def send_group_message(self, conversation_id: str, content: str, robot_code: str = None):
    """
    Proactively send a text message to a group conversation.

    Docs: https://open.dingtalk.com/document/orgapp/the-robot-sends-a-group-message

    Args:
        conversation_id: Conversation id (sent as openConversationId).
        content: Message text.
        robot_code: Robot code. Although the parameter defaults to None,
            a non-empty value is required; without it nothing is sent.

    Returns:
        True when the API answers with HTTP 200, otherwise False.
    """
    token = self.get_access_token()
    if not token:
        logger.error("[DingTalk] Cannot send group message: no access token")
        return False

    # robot_code must be supplied by the caller
    if not robot_code:
        logger.error("[DingTalk] Cannot send group message: robot_code is required")
        return False

    endpoint = "https://api.dingtalk.com/v1.0/robot/groupMessages/send"
    request_headers = {
        "x-acs-dingtalk-access-token": token,
        "Content-Type": "application/json"
    }
    # msgParam is itself a JSON string nested inside the JSON body
    payload = {
        "msgParam": json.dumps({"content": content}),
        "msgKey": "sampleText",
        "openConversationId": conversation_id,
        "robotCode": robot_code
    }

    try:
        resp = requests.post(endpoint, headers=request_headers, json=payload, timeout=10)
        body = resp.json()
        if resp.status_code != 200:
            logger.error(f"[DingTalk] Failed to send group message: {body}")
            return False
        logger.info(f"[DingTalk] Group message sent successfully to {conversation_id}")
        return True
    except Exception as e:
        logger.error(f"[DingTalk] Error sending group message: {e}")
        return False
async def process(self, callback: dingtalk_stream.CallbackMessage): async def process(self, callback: dingtalk_stream.CallbackMessage):
try: try:
incoming_message = dingtalk_stream.ChatbotMessage.from_dict(callback.data) incoming_message = dingtalk_stream.ChatbotMessage.from_dict(callback.data)
# Debug: 打印完整的 event 数据
logger.info(f"[DingTalk] ===== Incoming Message Debug =====")
logger.info(f"[DingTalk] callback.data keys: {callback.data.keys() if hasattr(callback.data, 'keys') else 'N/A'}")
logger.info(f"[DingTalk] incoming_message attributes: {dir(incoming_message)}")
logger.info(f"[DingTalk] robot_code: {getattr(incoming_message, 'robot_code', 'N/A')}")
logger.info(f"[DingTalk] chatbot_corp_id: {getattr(incoming_message, 'chatbot_corp_id', 'N/A')}")
logger.info(f"[DingTalk] chatbot_user_id: {getattr(incoming_message, 'chatbot_user_id', 'N/A')}")
logger.info(f"[DingTalk] conversation_id: {getattr(incoming_message, 'conversation_id', 'N/A')}")
logger.info(f"[DingTalk] Raw callback.data: {callback.data}")
logger.info(f"[DingTalk] =====================================")
image_download_handler = self # 传入方法所在的类实例 image_download_handler = self # 传入方法所在的类实例
dingtalk_msg = DingTalkMessage(incoming_message, image_download_handler) dingtalk_msg = DingTalkMessage(incoming_message, image_download_handler)
@@ -174,8 +315,48 @@ class DingTalkChanel(ChatChannel, dingtalk_stream.ChatbotHandler):
def send(self, reply: Reply, context: Context): def send(self, reply: Reply, context: Context):
receiver = context["receiver"] receiver = context["receiver"]
isgroup = context.kwargs['msg'].is_group
incoming_message = context.kwargs['msg'].incoming_message # Check if msg exists (for scheduled tasks, msg might be None)
msg = context.kwargs.get('msg')
if msg is None:
# 定时任务场景:使用主动发送 API
is_group = context.get("isgroup", False)
logger.info(f"[DingTalk] Sending scheduled task message to {receiver} (is_group={is_group})")
# 使用缓存的 robot_code 或配置的值
robot_code = self._robot_code or conf().get("dingtalk_robot_code")
logger.info(f"[DingTalk] Using robot_code: {robot_code}, cached: {self._robot_code}, config: {conf().get('dingtalk_robot_code')}")
if not robot_code:
logger.error(f"[DingTalk] Cannot send scheduled task: robot_code not available. Please send at least one message to the bot first, or configure dingtalk_robot_code in config.json")
return
# 根据是否群聊选择不同的 API
if is_group:
success = self.send_group_message(receiver, reply.content, robot_code)
else:
# 单聊场景:尝试从 context 中获取 dingtalk_sender_staff_id
sender_staff_id = context.get("dingtalk_sender_staff_id")
if not sender_staff_id:
logger.error(f"[DingTalk] Cannot send single chat scheduled message: sender_staff_id not available in context")
return
logger.info(f"[DingTalk] Sending single message to staff_id: {sender_staff_id}")
success = self.send_single_message(sender_staff_id, reply.content, robot_code)
if not success:
logger.error(f"[DingTalk] Failed to send scheduled task message")
return
# 从正常消息中提取并缓存 robot_code
if hasattr(msg, 'robot_code'):
robot_code = msg.robot_code
if robot_code and robot_code != self._robot_code:
self._robot_code = robot_code
logger.info(f"[DingTalk] Cached robot_code: {robot_code}")
isgroup = msg.is_group
incoming_message = msg.incoming_message
if conf().get("dingtalk_card_enabled"): if conf().get("dingtalk_card_enabled"):
logger.info("[Dingtalk] sendMsg={}, receiver={}".format(reply, receiver)) logger.info("[Dingtalk] sendMsg={}, receiver={}".format(reply, receiver))

View File

@@ -22,6 +22,7 @@ class DingTalkMessage(ChatMessage):
self.create_time = event.create_at self.create_time = event.create_at
self.image_content = event.image_content self.image_content = event.image_content
self.rich_text_content = event.rich_text_content self.rich_text_content = event.rich_text_content
self.robot_code = event.robot_code # 机器人编码
if event.conversation_type == "1": if event.conversation_type == "1":
self.is_group = False self.is_group = False
else: else:

View File

@@ -204,10 +204,36 @@ class FeiShuChanel(ChatChannel):
# 图片上传 # 图片上传
reply_content = self._upload_image_url(reply.content, access_token) reply_content = self._upload_image_url(reply.content, access_token)
if not reply_content: if not reply_content:
logger.warning("[FeiShu] upload file failed") logger.warning("[FeiShu] upload image failed")
return return
msg_type = "image" msg_type = "image"
content_key = "image_key" content_key = "image_key"
elif reply.type == ReplyType.FILE:
# 判断是否为视频文件
file_path = reply.content
if file_path.startswith("file://"):
file_path = file_path[7:]
is_video = file_path.lower().endswith(('.mp4', '.avi', '.mov', '.wmv', '.flv'))
if is_video:
# 视频使用 media 类型
file_key = self._upload_video_url(reply.content, access_token)
if not file_key:
logger.warning("[FeiShu] upload video failed")
return
reply_content = file_key
msg_type = "media"
content_key = "file_key"
else:
# 其他文件使用 file 类型
file_key = self._upload_file_url(reply.content, access_token)
if not file_key:
logger.warning("[FeiShu] upload file failed")
return
reply_content = file_key
msg_type = "file"
content_key = "file_key"
# Check if we can reply to an existing message (need msg_id) # Check if we can reply to an existing message (need msg_id)
can_reply = is_group and msg and hasattr(msg, 'msg_id') and msg.msg_id can_reply = is_group and msg and hasattr(msg, 'msg_id') and msg.msg_id
@@ -260,7 +286,34 @@ class FeiShuChanel(ChatChannel):
def _upload_image_url(self, img_url, access_token): def _upload_image_url(self, img_url, access_token):
logger.debug(f"[WX] start download image, img_url={img_url}") logger.debug(f"[FeiShu] start process image, img_url={img_url}")
# Check if it's a local file path (file:// protocol)
if img_url.startswith("file://"):
local_path = img_url[7:] # Remove "file://" prefix
logger.info(f"[FeiShu] uploading local file: {local_path}")
if not os.path.exists(local_path):
logger.error(f"[FeiShu] local file not found: {local_path}")
return None
# Upload directly from local file
upload_url = "https://open.feishu.cn/open-apis/im/v1/images"
data = {'image_type': 'message'}
headers = {'Authorization': f'Bearer {access_token}'}
with open(local_path, "rb") as file:
upload_response = requests.post(upload_url, files={"image": file}, data=data, headers=headers)
logger.info(f"[FeiShu] upload file, res={upload_response.content}")
response_data = upload_response.json()
if response_data.get("code") == 0:
return response_data.get("data").get("image_key")
else:
logger.error(f"[FeiShu] upload failed: {response_data}")
return None
# Original logic for HTTP URLs
response = requests.get(img_url) response = requests.get(img_url)
suffix = utils.get_path_suffix(img_url) suffix = utils.get_path_suffix(img_url)
temp_name = str(uuid.uuid4()) + "." + suffix temp_name = str(uuid.uuid4()) + "." + suffix
@@ -283,6 +336,207 @@ class FeiShuChanel(ChatChannel):
os.remove(temp_name) os.remove(temp_name)
return upload_response.json().get("data").get("image_key") return upload_response.json().get("data").get("image_key")
def _upload_video_url(self, video_url, access_token):
    """
    Upload video to Feishu and return file_key (for media type messages).

    Supports:
    - file:// URLs for local files
    - http(s):// URLs (downloaded to a temporary file, then uploaded)

    Args:
        video_url: file:// or http(s):// location of the video.
        access_token: Token sent as the Bearer Authorization header.

    Returns:
        The file_key string on success, None on any failure.
    """
    from urllib.parse import unquote, urlparse

    # Upload goes through the file upload API; the message is sent as media type
    upload_url = "https://open.feishu.cn/open-apis/im/v1/files"
    headers = {'Authorization': f'Bearer {access_token}'}
    # The extension map used here only ever contained '.mp4' with 'mp4' as
    # the default, so the upload type is always mp4.
    file_type = 'mp4'

    # For file:// URLs (local files), upload directly
    if video_url.startswith("file://"):
        local_path = video_url[7:]  # Remove file:// prefix
        if not os.path.exists(local_path):
            logger.error(f"[FeiShu] local video file not found: {local_path}")
            return None

        file_name = os.path.basename(local_path)
        data = {'file_type': file_type, 'file_name': file_name}

        try:
            with open(local_path, "rb") as file:
                upload_response = requests.post(
                    upload_url,
                    files={"file": file},
                    data=data,
                    headers=headers,
                    timeout=(5, 60)  # 5s connect, 60s read timeout (videos are larger)
                )
            logger.info(f"[FeiShu] upload video response, status={upload_response.status_code}, res={upload_response.content}")

            response_data = upload_response.json()
            if response_data.get("code") == 0:
                return response_data.get("data").get("file_key")
            logger.error(f"[FeiShu] upload video failed: {response_data}")
            return None
        except Exception as e:
            logger.error(f"[FeiShu] upload video exception: {e}")
            return None

    # For HTTP URLs, download first then upload
    temp_name = None
    try:
        logger.info(f"[FeiShu] Downloading video from URL: {video_url}")
        response = requests.get(video_url, timeout=(5, 60))
        if response.status_code != 200:
            logger.error(f"[FeiShu] download video failed, status={response.status_code}")
            return None

        import uuid
        # Take the name from the URL path only, so a query string or
        # fragment never ends up in the temp file name or the uploaded name.
        file_name = os.path.basename(unquote(urlparse(video_url).path)) or "video.mp4"
        temp_name = str(uuid.uuid4()) + "_" + file_name

        with open(temp_name, "wb") as file:
            file.write(response.content)

        logger.info(f"[FeiShu] Video downloaded, size={len(response.content)} bytes, uploading...")

        data = {'file_type': file_type, 'file_name': file_name}
        with open(temp_name, "rb") as file:
            upload_response = requests.post(upload_url, files={"file": file}, data=data, headers=headers, timeout=(5, 60))
        logger.info(f"[FeiShu] upload video, res={upload_response.content}")

        response_data = upload_response.json()
        if response_data.get("code") == 0:
            return response_data.get("data").get("file_key")
        logger.error(f"[FeiShu] upload video failed: {response_data}")
        return None
    except Exception as e:
        logger.error(f"[FeiShu] upload video from URL exception: {e}")
        return None
    finally:
        # Always clean up the temp file, whichever way the branch exits
        if temp_name and os.path.exists(temp_name):
            try:
                os.remove(temp_name)
            except OSError as e:
                logger.warning(f"[FeiShu] failed to remove temp video file {temp_name}: {e}")
def _upload_file_url(self, file_url, access_token):
    """
    Upload file to Feishu and return its file_key.

    Supports both local files (file://) and HTTP URLs (downloaded to a
    temporary file, then uploaded).

    Args:
        file_url: file:// or http(s):// location of the file.
        access_token: Token sent as the Bearer Authorization header.

    Returns:
        The file_key string on success, None on any failure.
    """
    from urllib.parse import unquote, urlparse

    logger.debug(f"[FeiShu] start process file, file_url={file_url}")

    upload_url = "https://open.feishu.cn/open-apis/im/v1/files"
    headers = {'Authorization': f'Bearer {access_token}'}
    # Feishu supports: opus, mp4, pdf, doc, xls, ppt, stream (other types)
    file_type_map = {
        '.opus': 'opus',
        '.mp4': 'mp4',
        '.pdf': 'pdf',
        '.doc': 'doc', '.docx': 'doc',
        '.xls': 'xls', '.xlsx': 'xls',
        '.ppt': 'ppt', '.pptx': 'ppt',
    }

    # Check if it's a local file path (file:// protocol)
    if file_url.startswith("file://"):
        local_path = file_url[7:]  # Remove "file://" prefix
        logger.info(f"[FeiShu] uploading local file: {local_path}")

        if not os.path.exists(local_path):
            logger.error(f"[FeiShu] local file not found: {local_path}")
            return None

        file_name = os.path.basename(local_path)
        file_ext = os.path.splitext(file_name)[1].lower()
        file_type = file_type_map.get(file_ext, 'stream')  # Default to stream for other types
        data = {'file_type': file_type, 'file_name': file_name}

        try:
            with open(local_path, "rb") as file:
                upload_response = requests.post(
                    upload_url,
                    files={"file": file},
                    data=data,
                    headers=headers,
                    timeout=(5, 30)  # 5s connect, 30s read timeout
                )
            logger.info(f"[FeiShu] upload file response, status={upload_response.status_code}, res={upload_response.content}")

            response_data = upload_response.json()
            if response_data.get("code") == 0:
                return response_data.get("data").get("file_key")
            logger.error(f"[FeiShu] upload file failed: {response_data}")
            return None
        except Exception as e:
            logger.error(f"[FeiShu] upload file exception: {e}")
            return None

    # For HTTP URLs, download first then upload
    temp_name = None
    try:
        response = requests.get(file_url, timeout=(5, 30))
        if response.status_code != 200:
            logger.error(f"[FeiShu] download file failed, status={response.status_code}")
            return None

        import uuid
        # Take the name from the URL path only (no query string/fragment),
        # and never send an empty file_name for URLs ending in "/".
        file_name = os.path.basename(unquote(urlparse(file_url).path)) or "file"
        temp_name = str(uuid.uuid4()) + "_" + file_name

        with open(temp_name, "wb") as file:
            file.write(response.content)

        file_ext = os.path.splitext(file_name)[1].lower()
        file_type = file_type_map.get(file_ext, 'stream')
        data = {'file_type': file_type, 'file_name': file_name}

        with open(temp_name, "rb") as file:
            # Same timeout as the local-file branch so a stalled upload
            # cannot block the sender indefinitely
            upload_response = requests.post(
                upload_url,
                files={"file": file},
                data=data,
                headers=headers,
                timeout=(5, 30)
            )
        logger.info(f"[FeiShu] upload file, res={upload_response.content}")

        response_data = upload_response.json()
        if response_data.get("code") == 0:
            return response_data.get("data").get("file_key")
        logger.error(f"[FeiShu] upload file failed: {response_data}")
        return None
    except Exception as e:
        logger.error(f"[FeiShu] upload file from URL exception: {e}")
        return None
    finally:
        # Remove the temp file on every exit path, including exceptions
        if temp_name and os.path.exists(temp_name):
            try:
                os.remove(temp_name)
            except OSError as e:
                logger.warning(f"[FeiShu] failed to remove temp file {temp_name}: {e}")
def _compose_context(self, ctype: ContextType, content, **kwargs): def _compose_context(self, ctype: ContextType, content, **kwargs):
context = Context(ctype, content) context = Context(ctype, content)
context.kwargs = kwargs context.kwargs = kwargs
@@ -291,13 +545,18 @@ class FeiShuChanel(ChatChannel):
cmsg = context["msg"] cmsg = context["msg"]
# Set session_id based on chat type to ensure proper session isolation # Set session_id based on chat type
if cmsg.is_group: if cmsg.is_group:
# Group chat: combine user_id and group_id to create unique session per user per group # Group chat: check if group_shared_session is enabled
# This ensures: if conf().get("group_shared_session", True):
# - Same user in different groups have separate conversation histories # All users in the group share the same session context
# - Same user in private chat and group chat have separate histories context["session_id"] = cmsg.other_user_id # group_id
context["session_id"] = f"{cmsg.from_user_id}:{cmsg.other_user_id}" else:
# Each user has their own session within the group
# This ensures:
# - Same user in different groups have separate conversation histories
# - Same user in private chat and group chat have separate histories
context["session_id"] = f"{cmsg.from_user_id}:{cmsg.other_user_id}"
else: else:
# Private chat: use user_id only # Private chat: use user_id only
context["session_id"] = cmsg.from_user_id context["session_id"] = cmsg.from_user_id

View File

@@ -1,10 +1,12 @@
from bridge.context import ContextType from bridge.context import ContextType
from channel.chat_message import ChatMessage from channel.chat_message import ChatMessage
import json import json
import os
import requests import requests
from common.log import logger from common.log import logger
from common.tmp_dir import TmpDir from common.tmp_dir import TmpDir
from common import utils from common import utils
from config import conf
class FeishuMessage(ChatMessage): class FeishuMessage(ChatMessage):
@@ -22,6 +24,99 @@ class FeishuMessage(ChatMessage):
self.ctype = ContextType.TEXT self.ctype = ContextType.TEXT
content = json.loads(msg.get('content')) content = json.loads(msg.get('content'))
self.content = content.get("text").strip() self.content = content.get("text").strip()
elif msg_type == "image":
# 单张图片消息,不处理和存储
self.ctype = ContextType.IMAGE
content = json.loads(msg.get("content"))
image_key = content.get("image_key")
# 仅记录图片key不下载
self.content = f"[图片: {image_key}]"
logger.info(f"[FeiShu] Received single image message, key={image_key}, skipped download")
elif msg_type == "post":
# 富文本消息,可能包含图片、文本等多种元素
content = json.loads(msg.get("content"))
# 飞书富文本消息结构content 直接包含 title 和 content 数组
# 不是嵌套在 post 字段下
title = content.get("title", "")
content_list = content.get("content", [])
logger.info(f"[FeiShu] Post message - title: '{title}', content_list length: {len(content_list)}")
# 收集所有图片和文本
image_keys = []
text_parts = []
if title:
text_parts.append(title)
for block in content_list:
logger.debug(f"[FeiShu] Processing block: {block}")
# block 本身就是元素列表
if not isinstance(block, list):
continue
for element in block:
element_tag = element.get("tag")
logger.debug(f"[FeiShu] Element tag: {element_tag}, element: {element}")
if element_tag == "img":
# 找到图片元素
image_key = element.get("image_key")
if image_key:
image_keys.append(image_key)
elif element_tag == "text":
# 文本元素
text_content = element.get("text", "")
if text_content:
text_parts.append(text_content)
logger.info(f"[FeiShu] Parsed - images: {len(image_keys)}, text_parts: {text_parts}")
# 富文本消息统一作为文本消息处理
self.ctype = ContextType.TEXT
if image_keys:
# 如果包含图片,下载并在文本中引用本地路径
workspace_root = os.path.expanduser(conf().get("agent_workspace", "~/cow"))
tmp_dir = os.path.join(workspace_root, "tmp")
os.makedirs(tmp_dir, exist_ok=True)
# 保存图片路径映射
self.image_paths = {}
for image_key in image_keys:
image_path = os.path.join(tmp_dir, f"{image_key}.png")
self.image_paths[image_key] = image_path
def _download_images():
for image_key, image_path in self.image_paths.items():
url = f"https://open.feishu.cn/open-apis/im/v1/messages/{self.msg_id}/resources/{image_key}"
headers = {"Authorization": "Bearer " + access_token}
params = {"type": "image"}
response = requests.get(url=url, headers=headers, params=params)
if response.status_code == 200:
with open(image_path, "wb") as f:
f.write(response.content)
logger.info(f"[FeiShu] Image downloaded from post message, key={image_key}, path={image_path}")
else:
logger.error(f"[FeiShu] Failed to download image from post, key={image_key}, status={response.status_code}")
# 立即下载图片,不使用延迟下载
# 因为 TEXT 类型消息不会调用 prepare()
_download_images()
# 构建消息内容:文本 + 图片路径
content_parts = []
if text_parts:
content_parts.append("\n".join(text_parts).strip())
for image_key, image_path in self.image_paths.items():
content_parts.append(f"[图片: {image_path}]")
self.content = "\n".join(content_parts)
logger.info(f"[FeiShu] Received post message with {len(image_keys)} image(s) and text: {self.content}")
else:
# 纯文本富文本消息
self.content = "\n".join(text_parts).strip() if text_parts else "[富文本消息]"
logger.info(f"[FeiShu] Received post message (text only): {self.content}")
elif msg_type == "file": elif msg_type == "file":
self.ctype = ContextType.FILE self.ctype = ContextType.FILE
content = json.loads(msg.get("content")) content = json.loads(msg.get("content"))

View File

@@ -20,9 +20,7 @@
"Agent测试群", "Agent测试群",
"ChatGPT测试群2" "ChatGPT测试群2"
], ],
"image_create_prefix": [ "image_create_prefix": [""],
"画"
],
"speech_recognition": true, "speech_recognition": true,
"group_speech_recognition": false, "group_speech_recognition": false,
"voice_reply_voice": false, "voice_reply_voice": false,

View File

@@ -35,6 +35,7 @@ available_setting = {
"group_name_white_list": ["ChatGPT测试群", "ChatGPT测试群2"], # 开启自动回复的群名称列表 "group_name_white_list": ["ChatGPT测试群", "ChatGPT测试群2"], # 开启自动回复的群名称列表
"group_name_keyword_white_list": [], # 开启自动回复的群名称关键词列表 "group_name_keyword_white_list": [], # 开启自动回复的群名称关键词列表
"group_chat_in_one_session": ["ChatGPT测试群"], # 支持会话上下文共享的群名称 "group_chat_in_one_session": ["ChatGPT测试群"], # 支持会话上下文共享的群名称
"group_shared_session": True, # 群聊是否共享会话上下文所有成员共享默认为True。False时每个用户在群内有独立会话
"nick_name_black_list": [], # 用户昵称黑名单 "nick_name_black_list": [], # 用户昵称黑名单
"group_welcome_msg": "", # 配置新人进群固定欢迎语,不配置则使用随机风格欢迎 "group_welcome_msg": "", # 配置新人进群固定欢迎语,不配置则使用随机风格欢迎
"trigger_by_self": False, # 是否允许机器人触发 "trigger_by_self": False, # 是否允许机器人触发

View File

@@ -365,6 +365,7 @@ class ClaudeAPIBot(Bot, OpenAIImage):
# Track tool use state # Track tool use state
tool_uses_map = {} # {index: {id, name, input}} tool_uses_map = {} # {index: {id, name, input}}
current_tool_use_index = -1 current_tool_use_index = -1
stop_reason = None # Track stop reason from Claude
try: try:
# Make streaming HTTP request # Make streaming HTTP request
@@ -440,6 +441,12 @@ class ClaudeAPIBot(Bot, OpenAIImage):
tool_uses_map[current_tool_use_index]["input"] += delta.get("partial_json", "") tool_uses_map[current_tool_use_index]["input"] += delta.get("partial_json", "")
elif event_type == "message_delta": elif event_type == "message_delta":
# Extract stop_reason from delta
delta = event.get("delta", {})
if "stop_reason" in delta:
stop_reason = delta.get("stop_reason")
logger.info(f"[Claude] Stream stop_reason: {stop_reason}")
# Message complete - yield tool calls if any # Message complete - yield tool calls if any
if tool_uses_map: if tool_uses_map:
for idx in sorted(tool_uses_map.keys()): for idx in sorted(tool_uses_map.keys()):
@@ -462,9 +469,13 @@ class ClaudeAPIBot(Bot, OpenAIImage):
} }
}] }]
}, },
"finish_reason": None "finish_reason": stop_reason
}] }]
} }
elif event_type == "message_stop":
# Final event - log completion
logger.debug(f"[Claude] Stream completed with stop_reason: {stop_reason}")
except json.JSONDecodeError: except json.JSONDecodeError:
continue continue

View File

@@ -0,0 +1,297 @@
# LinkAI Agent Skill
这个 skill 允许你调用 LinkAI 平台上的多个应用(App)和工作流(Workflow),通过简单的配置即可集成多个智能体能力。
## 特性
- **多应用支持** - 在一个配置文件中管理多个 LinkAI 应用/工作流
- **动态加载** - skill 系统加载时自动从 `config.json` 读取应用列表
- **自动技能描述** - 所有配置的应用会自动添加到技能描述中
- **模型切换** - 可以为每个请求指定不同的模型
- **知识库集成** - 支持应用绑定的知识库
- **插件能力** - 支持应用启用的各类插件
- **工作流执行** - 支持执行复杂的多步骤工作流
## 快速开始
### 1. 配置 API Key
```bash
env_config(action="set", key="LINKAI_API_KEY", value="your-linkai-api-key")
```
获取 API Key: https://link-ai.tech/console/interface
### 2. 配置应用列表
将 `config.json.template` 复制为 `config.json`:
```bash
cp config.json.template config.json
```
编辑 `config.json`,添加你的应用/工作流:
```json
{
"apps": [
{
"app_code": "G7z6vKwp",
"app_name": "通用助手",
"app_description": "通用AI助手可以回答各类问题"
},
{
"app_code": "your_kb_app",
"app_name": "产品文档助手",
"app_description": "基于产品文档知识库的问答助手"
},
{
"app_code": "your_workflow",
"app_name": "数据分析工作流",
"app_description": "执行数据清洗、分析和可视化的完整工作流"
}
]
}
```
**注意:** 修改 `config.json` 后,Agent 在下次加载技能时会自动读取新配置。
### 3. 调用应用
```bash
bash scripts/call.sh "G7z6vKwp" "What is artificial intelligence?"
```
## 使用示例
### 基础调用
```bash
# 调用默认模型
bash scripts/call.sh "G7z6vKwp" "解释一下量子计算"
```
### 指定模型
```bash
# 使用 GPT-4.1 模型
bash scripts/call.sh "G7z6vKwp" "写一篇关于AI的文章" "LinkAI-4.1"
# 使用 DeepSeek 模型
bash scripts/call.sh "G7z6vKwp" "帮我写代码" "deepseek-chat"
# 使用 Claude 模型
bash scripts/call.sh "G7z6vKwp" "分析这段文本" "claude-4-sonnet"
```
### 调用工作流
```bash
# 工作流会按照配置的节点顺序执行
bash scripts/call.sh "workflow_code" "输入数据或问题"
```
## ⚠️ 重要提示
### 超时配置
LinkAI 应用(特别是视频/图片生成、复杂工作流)可能需要较长时间处理。
**脚本内置超时:**
- 默认:120 秒(适合大多数场景)
- 可通过第 5 个参数自定义:`bash scripts/call.sh <app_code> <question> "" "false" "180"`
**推荐超时时间:**
- **文本问答**:120 秒(默认)
- **图片生成**:120-180 秒
- **视频生成**:180-300 秒
Agent 调用时会自动设置合适的超时时间。
## 配置说明
### config.json 字段
| 字段 | 类型 | 说明 |
|------|------|------|
| `app_code` | string | 应用或工作流的唯一标识码,从 LinkAI 控制台获取 |
| `app_name` | string | 应用名称,会显示在技能描述中 |
| `app_description` | string | 应用功能描述,帮助 Agent 理解何时使用该应用 |
### 获取 app_code
1. 登录 [LinkAI 控制台](https://link-ai.tech/console)
2. 进入「应用管理」或「工作流管理」
3. 选择要集成的应用/工作流
4. 在应用详情页找到 `app_code`
## 支持的模型
LinkAI 支持多种主流 AI 模型:
**OpenAI 系列:**
- `LinkAI-4.1` - GPT-4.1 (1000K 上下文)
- `LinkAI-4.1-mini` - GPT-4.1 mini (1000K)
- `LinkAI-4.1-nano` - GPT-4.1 nano (1000K)
- `LinkAI-4o` - GPT-4o (128K)
- `LinkAI-4o-mini` - GPT-4o mini (128K)
**DeepSeek 系列:**
- `deepseek-chat` - DeepSeek-V3 对话模型 (64K)
- `deepseek-reasoner` - DeepSeek-R1 推理模型 (64K)
**Claude 系列:**
- `claude-4-sonnet` - Claude 4 Sonnet (200K)
- `claude-3-7-sonnet` - Claude 3.7 (200K)
- `claude-3-5-sonnet` - Claude 3.5 (200K)
**Google 系列:**
- `gemini-2.5-pro` - Gemini 2.5 Pro (1000K)
- `gemini-2.0-flash` - Gemini 2.0 Flash (1000K)
**国产模型:**
- `qwen3` - 通义千问3 (128K)
- `wenxin-4.5` - 文心一言4.5 (8K)
- `doubao-1.5-pro-256k` - 豆包1.5 (256K)
- `glm-4-plus` - 智谱GLM-4-Plus (4K)
完整模型列表:https://link-ai.tech/console/models
## 应用类型
### 1. 普通应用
配置了系统提示词和参数的标准对话应用,可以:
- 设置角色和性格
- 绑定知识库
- 启用插件(图像识别、网页搜索、代码执行等)
### 2. 知识库应用
基于特定知识库的问答应用,适合:
- 企业内部知识库
- 产品文档问答
- 客户支持
### 3. 工作流
多步骤的自动化流程,可以:
- 串联多个处理节点
- 条件分支
- 循环处理
- 调用外部 API
## 响应格式
### 成功响应
```json
{
"app_code": "G7z6vKwp",
"content": "人工智能AI是计算机科学的一个分支...",
"usage": {
"prompt_tokens": 10,
"completion_tokens": 150,
"total_tokens": 160
}
}
```
### 错误响应
```json
{
"error": "LinkAI API error",
"message": "应用不存在",
"response": { ... }
}
```
## 常见错误
### LINKAI_API_KEY environment variable is not set
**原因:** 未配置 API Key
**解决:** 使用 `env_config` 工具设置 LINKAI_API_KEY
### 应用不存在 (402)
**原因:** app_code 不正确或应用已删除
**解决:** 检查 app_code 是否正确,确认应用存在
### 无访问权限 (403)
**原因:** 尝试访问他人的私有应用
**解决:** 确保应用是公开的或你是创建者
### 账号积分额度不足 (406)
**原因:** LinkAI 账户余额不足
**解决:** 前往控制台充值
### 内容审核不通过 (409)
**原因:** 请求或响应包含敏感内容
**解决:** 修改输入内容,避免敏感词
## 技术实现
### 自动技能描述生成
当 skill 系统加载 `linkai-agent` 时,会自动:
1. 读取 `config.json` 中的应用列表
2. 将每个应用的 name 和 description 动态添加到技能描述中
3. Agent 加载时会看到完整的应用列表
这是在 `agent/skills/loader.py` 中实现的特殊处理。
### 工作流程
```
用户配置 config.json
Agent 启动/重新加载技能
SkillLoader 检测到 linkai-agent
动态读取 config.json
生成包含所有应用描述的 description
Agent 看到所有可用应用的完整信息
用户请求触发
Agent 根据描述选择合适的应用
调用 call.sh <app_code> <question>
LinkAI API 处理并返回结果
```
## 最佳实践
1. **清晰的描述** - 为每个应用写清晰、具体的描述,帮助 Agent 理解应用用途
2. **合理分工** - 不同应用负责不同领域,避免功能重叠
3. **无需重启** - 修改 config.json 后,Agent 下次加载技能时会自动更新
4. **模型选择** - 根据任务复杂度选择合适的模型
5. **知识库优化** - 为专业领域的应用绑定相关知识库
## 扩展用法
### 在 Agent 系统中使用
当 Agent 系统加载这个 skill 时,会自动从 `config.json` 读取应用列表并生成描述:
```
Call LinkAI apps/workflows. 通用助手(G7z6vKwp: 通用AI助手可以回答各类问题); 产品文档助手(kb_app_001: 基于产品文档知识库的问答助手); 数据分析工作流(wf_002: 执行数据清洗、分析和可视化的完整工作流)
```
Agent 会根据用户问题自动选择最合适的应用进行调用。
## 相关链接
- LinkAI 平台: https://link-ai.tech
- API 文档: https://docs.link-ai.tech
- 控制台: https://link-ai.tech/console
- 模型列表: https://link-ai.tech/console/models
- 应用广场: https://link-ai.tech/square
## License
Part of the chatgpt-on-wechat project.

View File

@@ -0,0 +1,165 @@
---
name: linkai-agent
description: Call LinkAI applications and workflows. Use bash command to execute like 'bash <base_dir>/scripts/call.sh <app_code> <question>'.
homepage: https://link-ai.tech
metadata:
emoji: 🤖
requires:
bins: ["curl"]
env: ["LINKAI_API_KEY"]
primaryEnv: "LINKAI_API_KEY"
---
# LinkAI Agent Caller
Call LinkAI applications and workflows through API. Supports multiple apps/workflows configured in config.json.
The available apps are dynamically loaded from `config.json` at skill loading time.
## Setup
This skill requires a LinkAI API key. If not configured:
1. Get your API key from https://link-ai.tech/console/api-keys
2. Set the key using: `env_config(action="set", key="LINKAI_API_KEY", value="your-key")`
## Configuration
1. Copy `config.json.template` to `config.json`
2. Configure your apps/workflows:
```json
{
"apps": [
{
"app_code": "your_app_code",
"app_name": "App Name",
"app_description": "What this app does"
}
]
}
```
3. The skill description will be automatically updated when the agent loads this skill
## Usage
**Important**: Scripts are located relative to this skill's base directory.
When you see this skill in `<available_skills>`, note the `<base_dir>` path.
**CRITICAL**: Always use `bash` command to execute the script:
```bash
# General pattern (MUST start with bash):
bash "<base_dir>/scripts/call.sh" "<app_code>" "<question>" [model] [stream] [timeout]
# DO NOT execute the script directly like this (WRONG):
# "<base_dir>/scripts/call.sh" ...
# Parameters:
# - app_code: LinkAI app or workflow code (required)
# - question: User question (required)
# - model: Override model (optional, uses app default if not specified)
# - stream: Enable streaming (true/false, default: false)
# - timeout: curl timeout in seconds (default: 120, recommended for video/image generation)
```
**IMPORTANT - Timeout Configuration**:
- The script has a **default timeout of 120 seconds** (suitable for most cases)
- For complex tasks (video generation, large workflows), pass a longer timeout as the 5th parameter
- The bash tool also needs sufficient timeout - set its `timeout` parameter accordingly
- Example: `bash(command="bash <script> <app_code> <question> '' 'false' 180", timeout=200)`
## Examples
### Call an app (script uses its default 120s curl timeout)
```bash
bash(command='bash "<base_dir>/scripts/call.sh" "G7z6vKwp" "What is AI?"', timeout=60)
```
### Call an app with specific model
```bash
bash(command='bash "<base_dir>/scripts/call.sh" "G7z6vKwp" "Explain machine learning" "LinkAI-4.1"', timeout=60)
```
### Call a workflow with custom timeout (video generation)
```bash
# Pass timeout as 5th parameter to script, and set bash timeout slightly longer
bash(command='bash "<base_dir>/scripts/call.sh" "workflow_code" "Generate a sunset video" "" "false" "180"', timeout=200)
```
```bash
bash "<base_dir>/scripts/call.sh" "workflow_code" "Analyze this data: ..."
```
## Supported Models
You can specify any LinkAI supported model:
- `LinkAI-4.1` - Latest GPT-4.1 model (1000K context)
- `LinkAI-4.1-mini` - GPT-4.1 mini (1000K context)
- `LinkAI-4o` - GPT-4o model (128K context)
- `LinkAI-4o-mini` - GPT-4o mini (128K context)
- `deepseek-chat` - DeepSeek-V3 (64K context)
- `deepseek-reasoner` - DeepSeek-R1 reasoning model
- `claude-4-sonnet` - Claude 4 Sonnet (200K context)
- `gemini-2.5-pro` - Gemini 2.5 Pro (1000K context)
- And many more...
Full model list: https://link-ai.tech/console/models
## Response Format
Success response:
```json
{
"app_code": "G7z6vKwp",
"content": "AI stands for Artificial Intelligence...",
"usage": {
"prompt_tokens": 10,
"completion_tokens": 50,
"total_tokens": 60
}
}
```
Error response:
```json
{
"error": "Error description",
"message": "Detailed error message"
}
```
## Features
- **Multiple Apps**: Configure and call multiple LinkAI apps/workflows
- **Dynamic Loading**: Apps are loaded from config.json at runtime
- **Model Override**: Optionally specify model per request
- **Streaming Support**: Enable streaming output
- **Knowledge Base**: Apps can use configured knowledge bases
- **Plugins**: Apps can use enabled plugins (image recognition, web search, etc.)
- **Workflows**: Execute complex multi-step workflows
## Notes
- Each app/workflow maintains its own configuration (prompt, model, temperature, etc.)
- Apps can have knowledge bases attached for domain-specific Q&A
- Workflows execute from start node to end node and return final output
- Token usage and costs depend on the model used
- See LinkAI documentation for pricing: https://link-ai.tech/console/funds
- The skill description is automatically generated from config.json when loaded
## Troubleshooting
**"LINKAI_API_KEY environment variable is not set"**
- Use env_config tool to set the API key
**"app_code is required"**
- Make sure you're passing the app_code as the first parameter
**"应用不存在" (App not found)**
- Check that the app_code is correct
- Ensure you have access to the app
**"账号积分额度不足" (Insufficient credits)**
- Top up your LinkAI account credits

View File

@@ -0,0 +1,14 @@
{
"apps": [
{
"app_code": "your_app_code_2",
"app_name": "知识库助手",
"app_description": "基于特定领域知识库提供智能问答的知识助手"
},
{
"app_code": "your_workflow_code",
"app_name": "数据分析工作流",
"app_description": "用于数据分析任务的工作流程"
}
]
}

View File

@@ -0,0 +1,138 @@
#!/usr/bin/env bash
# LinkAI Agent Caller
# API Docs: https://api.link-ai.tech/v1/chat/completions
#
# Usage: bash call.sh <app_code> <question> [model] [stream] [timeout]
#   app_code  LinkAI app or workflow code (required)
#   question  user message, sent as the single "user" chat message (required)
#   model     optional model override; left out of the request when empty
#   stream    "true" or "false" (default: false)
#   timeout   curl --max-time in seconds (default: 120)
#
# Requires LINKAI_API_KEY in the environment. Results and errors are printed
# on stdout; every failure path exits with status 1.
set -euo pipefail

app_code="${1:-}"
question="${2:-}"
model="${3:-}"
stream="${4:-false}"
timeout="${5:-120}"  # Default 120 seconds for video/image generation

# json_escape <text>: print <text> escaped for use inside a JSON string literal.
# Handles backslash, double quote, newline, carriage return and tab; other
# control characters are passed through unchanged.
json_escape() {
  local s="$1"
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# extract_string_field <json> <key>: print the raw (still JSON-escaped) value of
# the first  "<key>": "..."  pair found in <json>, or nothing when absent.
# Escaped quotes inside the value are skipped instead of ending the match.
# `|| true` keeps a non-match from aborting the script under errexit/pipefail.
extract_string_field() {
  local json="$1" key="$2"
  local prefix='"'"$key"'"[[:space:]]*:[[:space:]]*"'
  { grep -oE "${prefix}"'([^"\\]|\\.)*"' <<<"$json" || true; } |
    head -n 1 |
    sed -E "s/^${prefix}(.*)\"\$/\\1/"
}

# extract_number_field <json> <key>: print the first integer value stored under
# <key>, or nothing when the key is absent or not a plain integer.
extract_number_field() {
  local json="$1" key="$2"
  { grep -oE "\"$key\"[[:space:]]*:[[:space:]]*[0-9]+" <<<"$json" || true; } |
    head -n 1 |
    grep -oE '[0-9]+$' || true
}

if [ -z "$app_code" ]; then
  echo '{"error": "app_code is required", "usage": "bash call.sh <app_code> <question> [model] [stream] [timeout]"}'
  exit 1
fi
if [ -z "$question" ]; then
  echo '{"error": "question is required", "usage": "bash call.sh <app_code> <question> [model] [stream] [timeout]"}'
  exit 1
fi
if [ -z "${LINKAI_API_KEY:-}" ]; then
  echo '{"error": "LINKAI_API_KEY environment variable is not set", "help": "Use env_config to set LINKAI_API_KEY"}'
  exit 1
fi

# "stream" is written into the request body as a bare JSON literal, so anything
# other than true/false would produce an invalid request.
case "$stream" in
  true | false) ;;
  *)
    echo '{"error": "stream must be true or false", "usage": "bash call.sh <app_code> <question> [model] [stream] [timeout]"}'
    exit 1
    ;;
esac

# API endpoint
api_url="https://api.link-ai.tech/v1/chat/completions"

# Build JSON request body; every user-supplied string is escaped so quotes,
# backslashes or newlines in the question cannot break (or inject into) the JSON.
model_field=""
if [ -n "$model" ]; then
  model_field="\"model\": \"$(json_escape "$model")\", "
fi
request_body="{\"app_code\": \"$(json_escape "$app_code")\", ${model_field}\"messages\": [{\"role\": \"user\", \"content\": \"$(json_escape "$question")\"}], \"stream\": $stream}"

# Call LinkAI API. The `|| curl_exit_code=$?` form is required: under `set -e`
# a plain failing assignment would abort before the error could be reported.
curl_exit_code=0
response=$(curl -sS --max-time "$timeout" \
  -X POST \
  -H "Authorization: Bearer $LINKAI_API_KEY" \
  -H "Content-Type: application/json" \
  -d "$request_body" \
  "$api_url" 2>&1) || curl_exit_code=$?
if [ "$curl_exit_code" -ne 0 ]; then
  printf '%s\n' "{\"error\": \"Failed to call LinkAI API\", \"details\": \"$(json_escape "$response")\"}"
  exit 1
fi

# Simple JSON validation
if [[ ! "$response" =~ ^[[:space:]]*[\{\[] ]]; then
  printf '%s\n' "{\"error\": \"Invalid JSON response from API\", \"response\": \"$(json_escape "$response")\"}"
  exit 1
fi

# Check for API error (top-level error only, not content_filter_result)
if grep -q '^[[:space:]]*{[[:space:]]*"error"[[:space:]]*:' <<<"$response" ||
  grep -q '"error"[[:space:]]*:[[:space:]]*{[^}]*"code"[[:space:]]*:[[:space:]]*"[^"]*"[^}]*"message"' <<<"$response"; then
  # Make sure it's not just content_filter_result inside choices
  if ! grep -q '"choices"[[:space:]]*:[[:space:]]*\[' <<<"$response"; then
    # Extracted values stay JSON-escaped, so they can be re-embedded verbatim.
    error_msg=$(extract_string_field "$response" "message")
    error_code=$(extract_string_field "$response" "code")
    if [ -z "$error_msg" ]; then
      error_msg="Unknown API error"
    fi
    # Provide friendly error message for content filter
    if [ "$error_code" = "content_filter_error" ] || grep -qi "content.*filter" <<<"$error_msg"; then
      printf '%s\n' "{\"error\": \"内容安全审核\", \"message\": \"您的问题或应用返回的内容触发了LinkAI的安全审核机制请换一种方式提问或检查应用配置\", \"details\": \"$error_msg\"}"
    else
      printf '%s\n' "{\"error\": \"LinkAI API error\", \"message\": \"$error_msg\", \"code\": \"$error_code\"}"
    fi
    exit 1
  fi
fi

# For non-stream mode, extract and format the response
if [ "$stream" = "false" ]; then
  content=$(extract_string_field "$response" "content")
  prompt_tokens=$(extract_number_field "$response" "prompt_tokens")
  completion_tokens=$(extract_number_field "$response" "completion_tokens")
  total_tokens=$(extract_number_field "$response" "total_tokens")
  if [ -n "$content" ]; then
    # The content is emitted exactly as the API escaped it: un-escaping \n or \"
    # before re-embedding would make this output invalid JSON.
    printf '{\n  "app_code": "%s",\n  "content": "%s",\n  "usage": {\n    "prompt_tokens": %s,\n    "completion_tokens": %s,\n    "total_tokens": %s\n  }\n}\n' \
      "$(json_escape "$app_code")" "$content" \
      "${prompt_tokens:-0}" "${completion_tokens:-0}" "${total_tokens:-0}"
  else
    # Return full response if we can't extract content
    printf '%s\n' "$response"
  fi
else
  # For stream mode, return raw response (caller needs to handle streaming)
  printf '%s\n' "$response"
fi

View File

@@ -0,0 +1,168 @@
# OpenAI Image Vision - Usage Examples
## Setup
Set up your API credentials using the agent's env_config tool:
```bash
# Set your OpenAI API key
env_config(action="set", key="OPENAI_API_KEY", value="sk-your-api-key-here")
# Optional: Set custom API base URL (for proxy or compatible services)
env_config(action="set", key="OPENAI_API_BASE", value="https://api.openai.com/v1")
```
## Example 1: Analyze a Local Image
```bash
bash scripts/vision.sh "/path/to/photo.jpg" "What's in this image?"
```
**Expected Output:**
```json
{
"model": "gpt-4.1-mini",
"content": "The image shows a beautiful landscape with mountains in the background and a lake in the foreground. The sky is clear with some clouds, and there are trees along the shoreline.",
"usage": {
"prompt_tokens": 1234,
"completion_tokens": 45,
"total_tokens": 1279
}
}
```
## Example 2: Analyze an Image from URL
```bash
bash scripts/vision.sh "https://example.com/image.jpg" "Describe this image in detail"
```
## Example 3: Extract Text (OCR)
```bash
bash scripts/vision.sh "document.png" "Extract all text from this image"
```
**Use Case:** Extract text from screenshots, scanned documents, or photos of text.
## Example 4: Identify Objects
```bash
bash scripts/vision.sh "scene.jpg" "List all objects you can identify in this image"
```
## Example 5: Analyze Colors and Composition
```bash
bash scripts/vision.sh "artwork.jpg" "Describe the color palette and composition of this image"
```
## Example 6: Count Items
```bash
bash scripts/vision.sh "crowd.jpg" "How many people are in this image?"
```
## Example 7: Use Different Models
```bash
# Use gpt-4.1-mini (default, latest mini model)
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4.1-mini"
# Use gpt-4.1 (most capable, best for complex analysis)
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4.1"
# Use gpt-4o-mini (previous mini model)
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4o-mini"
```
## Example 8: Complex Analysis
```bash
bash scripts/vision.sh "product.jpg" "Analyze this product image. Describe the product, its features, colors, and suggest what kind of marketing copy would work well for it."
```
## Example 9: Safety and Content Moderation
```bash
bash scripts/vision.sh "content.jpg" "Is there any inappropriate or unsafe content in this image?"
```
## Example 10: Technical Analysis
```bash
bash scripts/vision.sh "diagram.png" "Explain what this technical diagram represents and how it works"
```
## Integration with Agent
When the agent loads this skill, it will be available in the `<available_skills>` section. The agent can use it like:
```bash
bash "<base_dir>/scripts/vision.sh" "user_uploaded_image.jpg" "What's in this image?"
```
The `<base_dir>` will be automatically provided by the skill system.
## Error Handling Examples
### Missing API Key
```bash
$ bash scripts/vision.sh "image.jpg" "What is this?"
{"error": "OPENAI_API_KEY environment variable is not set", "help": "Visit https://platform.openai.com/api-keys to get an API key"}
```
### File Not Found
```bash
$ bash scripts/vision.sh "nonexistent.jpg" "What is this?"
{"error": "Image file not found", "path": "nonexistent.jpg"}
```
### Unsupported Format
```bash
$ bash scripts/vision.sh "file.bmp" "What is this?"
{"error": "Unsupported image format", "extension": "bmp", "supported": ["jpg", "jpeg", "png", "gif", "webp"]}
```
### Missing Parameters
```bash
$ bash scripts/vision.sh
{"error": "Image path or URL is required", "usage": "bash vision.sh <image_path_or_url> <question> [model]"}
```
## Tips for Best Results
1. **Be Specific**: Ask clear, specific questions about what you want to know
2. **Image Quality**: Higher quality images generally produce better results
3. **Model Selection**:
- Use `gpt-4.1` for complex analysis requiring highest accuracy
- Use `gpt-4.1-mini` (default) for most tasks - latest mini model with good balance
4. **Text Extraction**: For OCR tasks, ensure text is clearly visible and not too small
5. **Multiple Aspects**: You can ask about multiple things in one question
6. **Context**: Provide context in your question if needed (e.g., "This is a medical scan, what do you see?")
## Performance Notes
- **Local Files**: Automatically base64-encoded, adds ~33% size overhead
- **URLs**: Passed directly to API, no encoding overhead
- **Timeout**: 60 seconds for API calls
- **Max Tokens**: 1000 tokens for responses (configurable in script)
- **Rate Limits**: Subject to your OpenAI API plan
## Supported Image Formats
✅ JPEG (`.jpg`, `.jpeg`)
✅ PNG (`.png`)
✅ GIF (`.gif`)
✅ WebP (`.webp`)
❌ BMP, TIFF, SVG, and other formats are not supported
## Cost Considerations
Vision API calls cost more than text-only calls because they include image tokens. Costs vary by:
- Model used (gpt-4.1 vs gpt-4.1-mini)
- Image size and resolution
- Length of response
Check OpenAI's pricing page for current rates: https://openai.com/pricing

View File

@@ -0,0 +1,178 @@
# OpenAI Image Vision Skill
This skill enables image analysis using OpenAI's Vision API (GPT-4 Vision models).
## Features
- ✅ Analyze images from local files or URLs
- ✅ Support for multiple image formats (JPEG, PNG, GIF, WebP)
- ✅ Automatic base64 encoding for local files
- ✅ Direct URL passing for remote images
- ✅ Configurable model selection
- ✅ Custom API base URL support
- ✅ Pure bash/curl implementation (no Python dependencies)
## Quick Start
1. **Set up API credentials using env_config:**
```bash
env_config(action="set", key="OPENAI_API_KEY", value="sk-your-api-key-here")
# Optional: custom API base
env_config(action="set", key="OPENAI_API_BASE", value="https://api.openai.com/v1")
```
2. **Analyze an image:**
```bash
bash scripts/vision.sh "/path/to/photo.jpg" "What's in this image?"
```
3. **Analyze from URL:**
```bash
bash scripts/vision.sh "https://example.com/image.jpg" "Describe this image"
```
```bash
bash scripts/vision.sh "/path/to/image.jpg" "What's in this image?"
```
3. **Analyze from URL:**
```bash
bash scripts/vision.sh "https://example.com/image.jpg" "Describe this image"
```
## Usage Examples
### Basic image analysis
```bash
bash scripts/vision.sh "photo.jpg" "What objects can you see?"
```
### Text extraction (OCR)
```bash
bash scripts/vision.sh "document.png" "Extract all text from this image"
```
### Detailed description
```bash
bash scripts/vision.sh "scene.jpg" "Describe this scene in detail, including colors, mood, and composition"
```
### Using different models
```bash
# Use gpt-4.1-mini (default, latest mini model)
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4.1-mini"
# Use gpt-4.1 (most capable, latest model)
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4.1"
# Use gpt-4o-mini (previous mini model)
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4o-mini"
```
## Environment Variables
| Variable | Required | Default | Description |
|----------|----------|---------|-------------|
| `OPENAI_API_KEY` | Yes | - | Your OpenAI API key |
| `OPENAI_API_BASE` | No | `https://api.openai.com/v1` | Custom API base URL |
## Response Format
Success response:
```json
{
"model": "gpt-4.1-mini",
"content": "The image shows a beautiful sunset over mountains...",
"usage": {
"prompt_tokens": 1234,
"completion_tokens": 567,
"total_tokens": 1801
}
}
```
Error response:
```json
{
"error": "Error description",
"details": "Additional information"
}
```
## Supported Models
- `gpt-4.1-mini` (default) - Latest mini model, fast and cost-effective
- `gpt-4.1` - Latest GPT-4 variant, most capable
- `gpt-4o-mini` - Previous generation mini model
- `gpt-4-turbo` - Previous generation turbo model
## Supported Image Formats
- JPEG (`.jpg`, `.jpeg`)
- PNG (`.png`)
- GIF (`.gif`)
- WebP (`.webp`)
## Technical Details
- **Implementation**: Pure bash script using curl and base64
- **Timeout**: 60 seconds for API calls
- **Max tokens**: 1000 tokens for responses
- **Image handling**:
- Local files are automatically base64-encoded
- URLs are passed directly to the API
- MIME types are auto-detected from file extensions
## Error Handling
The script handles various error cases:
- Missing required parameters
- Missing API key
- File not found
- Unsupported image formats
- API errors
- Network timeouts
- Invalid JSON responses
## Integration with Agent System
When loaded by the agent system, this skill will appear in `<available_skills>` with a `<base_dir>` path. Use it like:
```bash
bash "<base_dir>/scripts/vision.sh" "image.jpg" "What's in this image?"
```
The agent will automatically:
- Load environment variables from `~/.cow/.env`
- Provide the correct `<base_dir>` path
- Handle skill discovery and registration
## Notes
- Images are sent to OpenAI's servers for processing
- Large images may be automatically resized by the API
- Rate limits depend on your OpenAI API plan
- Token usage includes both the image and text in the prompt
- Base64 encoding increases the size of local images by ~33%
## Troubleshooting
**"OPENAI_API_KEY environment variable is not set"**
- Set the environment variable using env_config tool
- Or use the agent's env_config tool
**"Image file not found"**
- Check the file path is correct
- Use absolute paths or paths relative to current directory
**"Unsupported image format"**
- Only JPEG, PNG, GIF, and WebP are supported
- Check the file extension matches the actual format
**"Failed to call OpenAI API"**
- Check your internet connection
- Verify the API key is valid
- Check if custom API base URL is correct
## License
Part of the chatgpt-on-wechat project.

View File

@@ -0,0 +1,119 @@
---
name: openai-image-vision
description: Analyze images using OpenAI's Vision API. Use bash command to execute the vision script like 'bash <base_dir>/scripts/vision.sh <image> <question>'. Can understand image content, objects, text, colors, and answer questions about images.
homepage: https://platform.openai.com/docs/guides/vision
metadata:
emoji: 👁️
requires:
bins: ["curl", "base64"]
env: ["OPENAI_API_KEY"]
primaryEnv: "OPENAI_API_KEY"
---
# OpenAI Image Vision
Analyze images using OpenAI's GPT-4 Vision API. The model can understand visual elements including objects, shapes, colors, textures, and text within images.
## Setup
This skill requires an OpenAI API key. If not configured:
1. Get your API key from https://platform.openai.com/api-keys
2. Set the key using: `env_config(action="set", key="OPENAI_API_KEY", value="your-key")`
Optional: Set custom API base URL (default: https://api.openai.com/v1):
```bash
env_config(action="set", key="OPENAI_API_BASE", value="your-base-url")
```
## Usage
**Important**: Scripts are located relative to this skill's base directory.
When you see this skill in `<available_skills>`, note the `<base_dir>` path.
**CRITICAL**: Always use `bash` command to execute the script:
```bash
# General pattern (MUST start with bash):
bash "<base_dir>/scripts/vision.sh" "<image_path_or_url>" "<question>" [model]
# DO NOT execute the script directly like this (WRONG):
# "<base_dir>/scripts/vision.sh" ...
# Parameters:
# - image_path_or_url: Local image file path or HTTP(S) URL (required)
# - question: Question to ask about the image (required)
# - model: OpenAI model to use (default: gpt-4.1-mini)
# Options: gpt-4.1-mini, gpt-4.1, gpt-4o-mini, gpt-4-turbo
```
## Examples
### Analyze a local image
```bash
bash "<base_dir>/scripts/vision.sh" "/path/to/image.jpg" "What's in this image?"
```
### Analyze an image from URL
```bash
bash "<base_dir>/scripts/vision.sh" "https://example.com/image.jpg" "Describe this image in detail"
```
### Use specific model
```bash
bash "<base_dir>/scripts/vision.sh" "/path/to/photo.png" "What colors are prominent?" "gpt-4o-mini"
```
### Extract text from image
```bash
bash "<base_dir>/scripts/vision.sh" "/path/to/document.jpg" "Extract all text from this image"
```
### Analyze multiple aspects
```bash
bash "<base_dir>/scripts/vision.sh" "image.jpg" "List all objects you can see and describe the overall scene"
```
## Supported Image Formats
- JPEG (.jpg, .jpeg)
- PNG (.png)
- GIF (.gif)
- WebP (.webp)
**Performance Optimization**: Files larger than 1MB are compressed to 800px (longest side) to avoid command-line parameter limits, provided `sips` (macOS) or ImageMagick `convert` is installed; otherwise the original file is sent unchanged. Downscaling may lose fine detail such as very small text.
## Response Format
The script returns a JSON response:
```json
{
"model": "gpt-4.1-mini",
"content": "The image shows...",
"usage": {
"prompt_tokens": 1234,
"completion_tokens": 567,
"total_tokens": 1801
}
}
```
Or in case of error:
```json
{
"error": "Error description",
"details": "Additional error information"
}
```
## Notes
- **Image size**: Local images larger than 1MB are resized to 800px (longest side) when `sips` or ImageMagick `convert` is available
- **Timeout**: 60 seconds for API calls
- **Rate limits**: Subject to your OpenAI API plan limits
- **Privacy**: Images are sent to OpenAI's servers for processing
- **Local files**: Automatically converted to base64 for API submission
- **URLs**: Can be passed directly to the API without downloading

View File

@@ -0,0 +1,233 @@
#!/usr/bin/env bash
# OpenAI Vision API wrapper
# API Docs: https://platform.openai.com/docs/guides/vision
#
# Usage: bash vision.sh <image_path_or_url> <question> [model]
#   image_path_or_url  local image file, or an http(s) URL passed through as-is
#   question           question to ask about the image
#   model              model name (default: gpt-4.1-mini)
#
# Requires OPENAI_API_KEY; OPENAI_API_BASE optionally overrides the endpoint.
# Results and errors are printed on stdout; every failure path exits 1.
set -euo pipefail

image_input="${1:-}"
question="${2:-}"
model="${3:-gpt-4.1-mini}"

# Path of the temporary down-scaled copy (empty when none was created).
# It is removed on every exit path by the EXIT trap below.
temp_compressed=""
cleanup() {
  if [ -n "$temp_compressed" ]; then
    rm -f "$temp_compressed"
  fi
}
trap cleanup EXIT

# json_escape <text>: print <text> escaped for use inside a JSON string literal.
# Handles backslash, double quote, newline, carriage return and tab; other
# control characters are passed through unchanged.
json_escape() {
  local s="$1"
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# extract_string_field <json> <key>: print the raw (still JSON-escaped) value of
# the first  "<key>": "..."  pair found in <json>, or nothing when absent.
# Escaped quotes inside the value are skipped instead of ending the match.
# `|| true` keeps a non-match from aborting the script under errexit/pipefail.
extract_string_field() {
  local json="$1" key="$2"
  local prefix='"'"$key"'"[[:space:]]*:[[:space:]]*"'
  { grep -oE "${prefix}"'([^"\\]|\\.)*"' <<<"$json" || true; } |
    head -n 1 |
    sed -E "s/^${prefix}(.*)\"\$/\\1/"
}

# extract_number_field <json> <key>: print the first integer value stored under
# <key>, or nothing when the key is absent or not a plain integer.
extract_number_field() {
  local json="$1" key="$2"
  { grep -oE "\"$key\"[[:space:]]*:[[:space:]]*[0-9]+" <<<"$json" || true; } |
    head -n 1 |
    grep -oE '[0-9]+$' || true
}

if [ -z "$image_input" ]; then
  echo '{"error": "Image path or URL is required", "usage": "bash vision.sh <image_path_or_url> <question> [model]"}'
  exit 1
fi
if [ -z "$question" ]; then
  echo '{"error": "Question is required", "usage": "bash vision.sh <image_path_or_url> <question> [model]"}'
  exit 1
fi
if [ -z "${OPENAI_API_KEY:-}" ]; then
  echo '{"error": "OPENAI_API_KEY environment variable is not set", "help": "Visit https://platform.openai.com/api-keys to get an API key"}'
  exit 1
fi

# Set API base URL (default to OpenAI's official endpoint)
api_base="${OPENAI_API_BASE:-https://api.openai.com/v1}"
# Remove trailing slash if present
api_base="${api_base%/}"

# Determine if input is a URL or local file
if [[ "$image_input" =~ ^https?:// ]]; then
  # It's a URL - pass it to the API directly (escaped for JSON)
  image_url_json=$(json_escape "$image_input")
else
  # It's a local file - need to encode as base64
  if [ ! -f "$image_input" ]; then
    printf '%s\n' "{\"error\": \"Image file not found\", \"path\": \"$(json_escape "$image_input")\"}"
    exit 1
  fi

  # Files larger than 1MB are down-scaled first to keep the request small
  file_size=$(wc -c < "$image_input" | tr -d ' ')
  max_size=1048576  # 1MB
  image_to_encode="$image_input"
  if [ "$file_size" -gt "$max_size" ]; then
    # Compression is best-effort: if mktemp or the resize tool fails, the
    # original file is sent instead of aborting the script.
    temp_compressed=$(mktemp "${TMPDIR:-/tmp}/vision_compressed_XXXXXX.jpg") || temp_compressed=""
    if [ -n "$temp_compressed" ]; then
      # Use sips (macOS) or convert (ImageMagick) to compress
      if command -v sips &> /dev/null; then
        # macOS: resize to max 800px on longest side
        if sips -Z 800 "$image_input" --out "$temp_compressed" &> /dev/null; then
          image_to_encode="$temp_compressed"
          >&2 echo "[vision.sh] Compressed large image ($((file_size / 1024))KB) to avoid parameter limit"
        fi
      elif command -v convert &> /dev/null; then
        # Linux: use ImageMagick
        if convert "$image_input" -resize '800x800>' "$temp_compressed" 2>/dev/null; then
          image_to_encode="$temp_compressed"
          >&2 echo "[vision.sh] Compressed large image ($((file_size / 1024))KB) to avoid parameter limit"
        fi
      fi
    fi
  fi

  # Detect image format from file extension
  extension="${image_to_encode##*.}"
  extension_lower=$(echo "$extension" | tr '[:upper:]' '[:lower:]')
  case "$extension_lower" in
    jpg | jpeg) mime_type="image/jpeg" ;;
    png) mime_type="image/png" ;;
    gif) mime_type="image/gif" ;;
    webp) mime_type="image/webp" ;;
    *)
      printf '%s\n' "{\"error\": \"Unsupported image format\", \"extension\": \"$(json_escape "$extension")\", \"supported\": [\"jpg\", \"jpeg\", \"png\", \"gif\", \"webp\"]}"
      exit 1
      ;;
  esac

  # Encode image to base64
  if ! command -v base64 &> /dev/null; then
    echo '{"error": "base64 command not found", "help": "Please install base64 utility"}'
    exit 1
  fi
  # `base64 -i FILE` is the macOS form; plain `base64 FILE` is the fallback.
  # GNU base64 wraps its output at 76 columns, so line breaks are stripped:
  # a raw newline inside the JSON string would invalidate the request.
  base64_image=$({ base64 -i "$image_to_encode" 2>/dev/null || base64 "$image_to_encode" 2>/dev/null; } | tr -d '\r\n') || base64_image=""
  if [ -z "$base64_image" ]; then
    printf '%s\n' "{\"error\": \"Failed to encode image to base64\", \"path\": \"$(json_escape "$image_input")\"}"
    exit 1
  fi

  # Base64 text and the fixed MIME types contain nothing that needs escaping
  image_url_json="data:$mime_type;base64,$base64_image"
fi

# Build JSON request body; model and question are escaped in both the URL and
# the local-file case so special characters cannot break the JSON.
request_body="{\"model\": \"$(json_escape "$model")\", \"messages\": [{\"role\": \"user\", \"content\": [{\"type\": \"text\", \"text\": \"$(json_escape "$question")\"}, {\"type\": \"image_url\", \"image_url\": {\"url\": \"$image_url_json\"}}]}], \"max_tokens\": 1000}"

# Call OpenAI API. The body is fed through stdin (--data-binary @-) because a
# base64 image easily exceeds the OS limit on a single command-line argument.
# `|| curl_exit_code=$?` keeps `set -e` from aborting before the error report.
curl_exit_code=0
response=$(curl -sS --max-time 60 \
  -X POST \
  -H "Authorization: Bearer $OPENAI_API_KEY" \
  -H "Content-Type: application/json" \
  --data-binary @- \
  "$api_base/chat/completions" 2>&1 <<<"$request_body") || curl_exit_code=$?
if [ "$curl_exit_code" -ne 0 ]; then
  printf '%s\n' "{\"error\": \"Failed to call OpenAI API\", \"details\": \"$(json_escape "$response")\"}"
  exit 1
fi

# Simple JSON validation - check if response starts with { or [
if [[ ! "$response" =~ ^[[:space:]]*[\{\[] ]]; then
  printf '%s\n' "{\"error\": \"Invalid JSON response from API\", \"response\": \"$(json_escape "$response")\"}"
  exit 1
fi

# Check for API error (look for "error" field in response)
if grep -q '"error"[[:space:]]*:[[:space:]]*{' <<<"$response"; then
  # Extract error message if possible (stays JSON-escaped, safe to re-embed)
  error_msg=$(extract_string_field "$response" "message")
  if [ -z "$error_msg" ]; then
    error_msg="Unknown API error"
  fi
  printf '%s\n' "{\"error\": \"OpenAI API error\", \"message\": \"$error_msg\", \"response\": $response}"
  exit 1
fi

# Extract the content from the response
# The response structure is: choices[0].message.content
content=$(extract_string_field "$response" "content")

# Extract usage information
prompt_tokens=$(extract_number_field "$response" "prompt_tokens")
completion_tokens=$(extract_number_field "$response" "completion_tokens")
total_tokens=$(extract_number_field "$response" "total_tokens")

# Build simplified response
if [ -n "$content" ]; then
  # The content is emitted exactly as the API escaped it: un-escaping \n or \"
  # before re-embedding would make this output invalid JSON.
  printf '{\n  "model": "%s",\n  "content": "%s",\n  "usage": {\n    "prompt_tokens": %s,\n    "completion_tokens": %s,\n    "total_tokens": %s\n  }\n}\n' \
    "$(json_escape "$model")" "$content" \
    "${prompt_tokens:-0}" "${completion_tokens:-0}" "${total_tokens:-0}"
else
  # If we can't extract content, return the full response
  printf '%s\n' "$response"
fi