From ccb9030d3cab180a22894ef84753ae47ffb22a13 Mon Sep 17 00:00:00 2001 From: zhayujie Date: Mon, 9 Mar 2026 10:13:48 +0800 Subject: [PATCH] refactor: convert web-fetch from skill to native tool --- agent/tools/__init__.py | 11 ++++ agent/tools/web_fetch/__init__.py | 0 agent/tools/web_fetch/web_fetch.py | 98 ++++++++++++++++++++++++++++++ skills/skill-creator/SKILL.md | 2 +- skills/web-fetch/SKILL.md | 56 ----------------- skills/web-fetch/scripts/fetch.sh | 54 ---------------- 6 files changed, 110 insertions(+), 111 deletions(-) create mode 100644 agent/tools/web_fetch/__init__.py create mode 100644 agent/tools/web_fetch/web_fetch.py delete mode 100644 skills/web-fetch/SKILL.md delete mode 100755 skills/web-fetch/scripts/fetch.sh diff --git a/agent/tools/__init__.py b/agent/tools/__init__.py index acf28f98..5c2cc206 100644 --- a/agent/tools/__init__.py +++ b/agent/tools/__init__.py @@ -55,6 +55,15 @@ def _import_optional_tools(): except Exception as e: logger.error(f"[Tools] WebSearch failed to load: {e}") + # WebFetch Tool + try: + from agent.tools.web_fetch.web_fetch import WebFetch + tools['WebFetch'] = WebFetch + except ImportError as e: + logger.error(f"[Tools] WebFetch not loaded - missing dependency: {e}") + except Exception as e: + logger.error(f"[Tools] WebFetch failed to load: {e}") + return tools # Load optional tools @@ -62,6 +71,7 @@ _optional_tools = _import_optional_tools() EnvConfig = _optional_tools.get('EnvConfig') SchedulerTool = _optional_tools.get('SchedulerTool') WebSearch = _optional_tools.get('WebSearch') +WebFetch = _optional_tools.get('WebFetch') GoogleSearch = _optional_tools.get('GoogleSearch') FileSave = _optional_tools.get('FileSave') Terminal = _optional_tools.get('Terminal') @@ -102,6 +112,7 @@ __all__ = [ 'EnvConfig', 'SchedulerTool', 'WebSearch', + 'WebFetch', # Optional tools (may be None if dependencies not available) # 'BrowserTool' ] diff --git a/agent/tools/web_fetch/__init__.py b/agent/tools/web_fetch/__init__.py new file 
"""
Web Fetch tool - Fetch and extract readable content from web pages.
"""

import re
from html import unescape
from typing import Dict, Any
from urllib.parse import urlparse

import requests

from agent.tools.base_tool import BaseTool, ToolResult
from common.log import logger


# Per-request timeout, in seconds, passed to requests.get().
DEFAULT_TIMEOUT = 10

# Browser-like headers sent with every request.
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}


class WebFetch(BaseTool):
    """Tool for fetching and extracting readable content from web pages.

    The page is downloaded with ``requests`` and reduced to plain text with
    regular expressions (no HTML parser): script/style blocks and tags are
    dropped, entities are decoded and whitespace is normalised.
    """

    name: str = "web_fetch"
    description: str = (
        "Fetch and extract readable text content from a web page URL. "
    )

    # JSON-schema style description of the accepted arguments.
    params: dict = {
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "The HTTP/HTTPS URL to fetch"
            }
        },
        "required": ["url"]
    }

    def __init__(self, config: dict = None):
        """Store the optional tool configuration (an empty dict when omitted)."""
        self.config = config or {}

    def execute(self, args: Dict[str, Any]) -> ToolResult:
        """Fetch ``args["url"]`` and return its title and text content.

        Args:
            args: Tool arguments; ``url`` must be an http:// or https:// URL.

        Returns:
            ``ToolResult.success`` with ``"Title: ...\\n\\nContent:\\n..."`` on
            success, otherwise ``ToolResult.fail`` with an error message
            (missing/invalid URL, timeout, connection failure, HTTP error
            status, or any other request failure).
        """
        # A missing, None or non-string value is treated like an empty URL
        # instead of raising AttributeError on .strip().
        raw_url = args.get("url")
        url = raw_url.strip() if isinstance(raw_url, str) else ""
        if not url:
            return ToolResult.fail("Error: 'url' parameter is required")

        try:
            parsed = urlparse(url)
        except ValueError:
            # urlparse rejects some malformed inputs (e.g. a broken IPv6 literal).
            return ToolResult.fail("Error: Invalid URL (must start with http:// or https://)")
        if parsed.scheme not in ("http", "https"):
            return ToolResult.fail("Error: Invalid URL (must start with http:// or https://)")

        # NOTE(review): the URL comes from the model/user and redirects are
        # followed, so this can reach internal hosts; restrict if that matters.
        try:
            response = requests.get(
                url,
                headers=DEFAULT_HEADERS,
                timeout=DEFAULT_TIMEOUT,
                allow_redirects=True,
            )
            response.raise_for_status()
        except requests.Timeout:
            return ToolResult.fail(f"Error: Request timed out after {DEFAULT_TIMEOUT}s")
        except requests.ConnectionError:
            return ToolResult.fail(f"Error: Failed to connect to {parsed.netloc}")
        except requests.HTTPError as e:
            return ToolResult.fail(f"Error: HTTP {e.response.status_code} for URL: {url}")
        except Exception as e:
            return ToolResult.fail(f"Error: Failed to fetch URL: {e}")

        # When the server declares no charset, requests assumes ISO-8859-1 for
        # text/* bodies, which garbles UTF-8 pages; use the detected encoding.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding

        page = response.text
        title = self._extract_title(page)
        text = self._extract_text(page)

        return ToolResult.success(f"Title: {title}\n\nContent:\n{text}")

    @staticmethod
    def _extract_title(html: str) -> str:
        """Return the text of the first ``<title>`` element, or ``"Untitled"``.

        Entities are decoded and internal whitespace/newlines are collapsed to
        single spaces; an empty title also yields ``"Untitled"``.
        """
        match = re.search(r"<title\b[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
        if not match:
            return "Untitled"
        title = " ".join(unescape(match.group(1)).split())
        return title or "Untitled"

    @staticmethod
    def _extract_text(html: str) -> str:
        """Reduce an HTML document to whitespace-normalised plain text."""
        # Remove HTML comments first; they may contain '>' and confuse tag stripping
        text = re.sub(r"<!--.*?-->", "", html, flags=re.DOTALL)
        # Remove script and style blocks together with their contents
        text = re.sub(
            r"<(script|style)\b[^>]*>.*?</\1\s*>",
            "",
            text,
            flags=re.IGNORECASE | re.DOTALL,
        )
        # Remove remaining HTML tags
        text = re.sub(r"<[^>]+>", "", text)
        # Decode HTML entities in a single pass (avoids double-decoding "&amp;lt;")
        text = unescape(text)
        # Collapse runs of non-newline whitespace (including non-breaking spaces)
        text = re.sub(r"[^\S\n]+", " ", text)
        # Strip each line BEFORE collapsing blank lines, so whitespace-only
        # lines count as blank and do not survive as long runs of newlines
        text = "\n".join(line.strip() for line in text.splitlines())
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()
Check the saved SKILL.md for an installation/setup section — if it defines additional steps (e.g., downloading scripts, installing dependencies), execute them; otherwise installation is complete diff --git a/skills/web-fetch/SKILL.md b/skills/web-fetch/SKILL.md deleted file mode 100644 index 39315fb0..00000000 --- a/skills/web-fetch/SKILL.md +++ /dev/null @@ -1,56 +0,0 @@ ---- -name: web-fetch -description: Fetch and extract readable content from web pages. Use for lightweight page access without browser automation. -homepage: https://github.com/zhayujie/chatgpt-on-wechat -metadata: - emoji: 🌐 - requires: - bins: ["curl"] - always: true ---- - -# Web Fetch - -Fetch and extract readable content from web pages using curl and basic text processing. - -## Usage - -**Important**: Scripts are located relative to this skill's base directory. - -When you see this skill in ``, note the `` path. - -```bash -# General pattern: -bash "/scripts/fetch.sh" [output_file] - -# Example (replace with actual path from skill listing): -bash "~/chatgpt-on-wechat/skills/web-fetch/scripts/fetch.sh" "https://example.com" -``` - -**Parameters:** -- `url`: The HTTP/HTTPS URL to fetch (required) -- `output_file`: Optional file to save the output (default: stdout) - -**Returns:** -- Extracted page content with title and text - -## Examples - -### Fetch a web page -```bash -bash "/scripts/fetch.sh" "https://example.com" -``` - -### Save to file -```bash -bash "/scripts/fetch.sh" "https://example.com" output.txt -cat output.txt -``` - -## Notes - -- Uses curl for HTTP requests (timeout: 10s) -- Extracts title and basic text content -- Removes HTML tags and scripts -- Works with any standard web page -- No external dependencies beyond curl diff --git a/skills/web-fetch/scripts/fetch.sh b/skills/web-fetch/scripts/fetch.sh deleted file mode 100755 index 1713b263..00000000 --- a/skills/web-fetch/scripts/fetch.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash -# Fetch and extract readable 
content from a web page - -set -euo pipefail - -url="${1:-}" -output_file="${2:-}" - -if [ -z "$url" ]; then - echo "Error: URL is required" - echo "Usage: bash fetch.sh [output_file]" - exit 1 -fi - -# Validate URL -if [[ ! "$url" =~ ^https?:// ]]; then - echo "Error: Invalid URL (must start with http:// or https://)" - exit 1 -fi - -# Fetch the page with curl -html=$(curl -sS -L --max-time 10 \ - -H "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" \ - -H "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \ - "$url" 2>&1) || { - echo "Error: Failed to fetch URL: $url" - exit 1 -} - -# Extract title -title=$(echo "$html" | grep -oP '(?<=).*?(?=)' | head -1 || echo "Untitled") - -# Remove script and style tags -text=$(echo "$html" | sed 's/]*>.*<\/script>//gI' | sed 's/]*>.*<\/style>//gI') - -# Remove HTML tags -text=$(echo "$text" | sed 's/<[^>]*>//g') - -# Clean up whitespace -text=$(echo "$text" | tr -s ' ' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') - -# Format output -result="Title: $title - -Content: -$text" - -# Output to file or stdout -if [ -n "$output_file" ]; then - echo "$result" > "$output_file" - echo "Content saved to: $output_file" -else - echo "$result" -fi