refactor: convert web-fetch from skill to native tool

2026-07-17 11:07:11 +08:00 · 2026-03-09 10:13:48 +08:00
parent 8623287ac1
commit ccb9030d3c
6 changed files with 110 additions and 111 deletions
--- a/agent/tools/__init__.py
+++ b/agent/tools/__init__.py
@@ -55,6 +55,15 @@ def _import_optional_tools():
    except Exception as e:
        logger.error(f"[Tools] WebSearch failed to load: {e}")
    # WebFetch Tool
    try:
        from agent.tools.web_fetch.web_fetch import WebFetch
        tools['WebFetch'] = WebFetch
    except ImportError as e:
        logger.error(f"[Tools] WebFetch not loaded - missing dependency: {e}")
    except Exception as e:
        logger.error(f"[Tools] WebFetch failed to load: {e}")
    return tools
 # Load optional tools
@@ -62,6 +71,7 @@ _optional_tools = _import_optional_tools()
 # Module-level aliases for the optional tools. dict.get() returns None for any
 # key that _import_optional_tools() did not register (its import failed and was
 # only logged), so each of these names may be None at runtime.
 EnvConfig = _optional_tools.get('EnvConfig')
 SchedulerTool = _optional_tools.get('SchedulerTool')
 WebSearch = _optional_tools.get('WebSearch')
 WebFetch = _optional_tools.get('WebFetch')
 GoogleSearch = _optional_tools.get('GoogleSearch')
 FileSave = _optional_tools.get('FileSave')
 Terminal = _optional_tools.get('Terminal')
@@ -102,6 +112,7 @@ __all__ = [
    'EnvConfig',
    'SchedulerTool',
    'WebSearch',
    'WebFetch',
    # Optional tools (may be None if dependencies not available)
    # 'BrowserTool'
 ]
--- a/agent/tools/web_fetch/__init__.py
+++ b/agent/tools/web_fetch/__init__.py
--- a/agent/tools/web_fetch/web_fetch.py
+++ b/agent/tools/web_fetch/web_fetch.py
@@ -0,0 +1,98 @@
 """
 Web Fetch tool - Fetch and extract readable content from web pages.
 """
 import re
 from typing import Dict, Any
 from urllib.parse import urlparse
 import requests
 from agent.tools.base_tool import BaseTool, ToolResult
 from common.log import logger
 DEFAULT_TIMEOUT = 10
 DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
 }
 class WebFetch(BaseTool):
    """Tool that downloads a web page and returns its title and readable text.

    The page is fetched with ``requests`` and reduced to plain text with
    regular expressions (no HTML parser), so the result is a best-effort
    approximation of the visible content.
    """

    # Tool identifier and description.
    name: str = "web_fetch"
    description: str = (
        "Fetch and extract readable text content from a web page URL. "
    )
    # JSON-Schema style description of the arguments accepted by execute().
    params: dict = {
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "The HTTP/HTTPS URL to fetch"
            }
        },
        "required": ["url"]
    }

    def __init__(self, config: dict = None):
        """Store the optional tool configuration (an empty dict when omitted)."""
        self.config = config or {}

    def execute(self, args: Dict[str, Any]) -> ToolResult:
        """Fetch the page at ``args["url"]`` and return its title and text.

        Args:
            args: Tool arguments. ``url`` must be an ``http://`` or
                ``https://`` URL given as a string.

        Returns:
            ``ToolResult.success`` carrying ``Title: ...`` followed by the
            extracted content, or ``ToolResult.fail`` with an error message
            when the argument is invalid or the request fails.
        """
        raw_url = args.get("url")
        # A null or non-string value must produce a tool error rather than an
        # AttributeError from calling .strip() on it.
        url = raw_url.strip() if isinstance(raw_url, str) else ""
        if not url:
            return ToolResult.fail("Error: 'url' parameter is required")

        # urlparse raises ValueError for some malformed inputs (for example an
        # unterminated IPv6 literal); treat that as an invalid URL.
        try:
            parsed = urlparse(url)
        except ValueError:
            parsed = None
        if parsed is None or parsed.scheme not in ("http", "https"):
            return ToolResult.fail("Error: Invalid URL (must start with http:// or https://)")

        # NOTE(review): the URL comes straight from the tool arguments and no
        # restriction on internal/private hosts is applied here, nor is the
        # response size bounded - confirm this is acceptable for deployment.
        try:
            response = requests.get(
                url,
                headers=DEFAULT_HEADERS,
                timeout=DEFAULT_TIMEOUT,
                allow_redirects=True,
            )
            response.raise_for_status()
        except requests.Timeout:
            # Must precede ConnectionError: a connect timeout is both.
            return ToolResult.fail(f"Error: Request timed out after {DEFAULT_TIMEOUT}s")
        except requests.ConnectionError:
            return ToolResult.fail(f"Error: Failed to connect to {parsed.netloc}")
        except requests.HTTPError as e:
            return ToolResult.fail(f"Error: HTTP {e.response.status_code} for URL: {url}")
        except Exception as e:
            return ToolResult.fail(f"Error: Failed to fetch URL: {e}")

        # When a text/* response declares no charset, requests falls back to
        # ISO-8859-1, which garbles UTF-8/GBK pages. Use the encoding detected
        # from the body in that case.
        content_type = response.headers.get("Content-Type", "")
        declared = (response.encoding or "").lower()
        if "charset" not in content_type.lower() and declared == "iso-8859-1":
            response.encoding = response.apparent_encoding

        html = response.text
        title = self._extract_title(html)
        text = self._extract_text(html)
        return ToolResult.success(f"Title: {title}\n\nContent:\n{text}")

    @staticmethod
    def _extract_title(html: str) -> str:
        """Return the text of the first ``<title>`` element, or ``"Untitled"``.

        Entities are decoded and internal whitespace (including newlines) is
        collapsed to single spaces; an empty title also yields ``"Untitled"``.
        """
        # Local import: the ``html`` parameter shadows the module name.
        from html import unescape

        match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
        if not match:
            return "Untitled"
        title = " ".join(unescape(match.group(1)).split())
        return title or "Untitled"

    @staticmethod
    def _extract_text(html: str) -> str:
        """Reduce an HTML document to readable plain text.

        Script/style blocks, comments and tags are removed, entities are
        decoded, and whitespace is normalised so that paragraphs are
        separated by at most one blank line.
        """
        # Local import: the ``html`` parameter shadows the module name.
        from html import unescape

        flags = re.IGNORECASE | re.DOTALL
        # Remove script and style blocks together with their content.
        text = re.sub(r"<script\b[^>]*>.*?</script\s*>", "", html, flags=flags)
        text = re.sub(r"<style\b[^>]*>.*?</style\s*>", "", text, flags=flags)
        # Remove comments first: they may contain '>' (e.g. conditional
        # comments), which the generic tag pattern below would not consume.
        text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
        # Turn line breaks and the end of block-level elements into newlines
        # so that adjacent paragraphs / list items are not glued together.
        text = re.sub(
            r"<(?:br|/p|/div|/li|/tr|/h[1-6]|/ul|/ol|/table|/section|/article|/blockquote|/pre)\b[^>]*>",
            "\n",
            text,
            flags=re.IGNORECASE,
        )
        # Remove the remaining (inline) tags without inserting a separator.
        text = re.sub(r"<[^>]+>", "", text)
        # Decode entities in a single pass after tag removal: this covers
        # numeric and named entities and avoids double-decoding "&amp;lt;".
        text = unescape(text)
        # Collapse runs of non-newline whitespace (this also maps the
        # non-breaking space produced by &nbsp; to a plain space).
        text = re.sub(r"[^\S\n]+", " ", text)
        # Strip each line BEFORE collapsing newlines; otherwise lines holding
        # only a space keep long runs of blank lines from being collapsed.
        text = "\n".join(line.strip() for line in text.splitlines())
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()
--- a/skills/skill-creator/SKILL.md
+++ b/skills/skill-creator/SKILL.md
@@ -95,7 +95,7 @@ Do NOT create auxiliary documentation files:
 ## Installing a Skill from URL
-1. Fetch the URL content (curl or web-fetch skill)
+1. Fetch the URL content (curl or web_fetch tool)
 2. Extract `name` from YAML frontmatter
 3. Create directory `<workspace>/skills/<name>/` and save content as `SKILL.md`
 4. Check the saved SKILL.md for an installation/setup section — if it defines additional steps (e.g., downloading scripts, installing dependencies), execute them; otherwise installation is complete
--- a/skills/web-fetch/SKILL.md
+++ b/skills/web-fetch/SKILL.md
@@ -1,56 +0,0 @@
 ---
 name: web-fetch
 description: Fetch and extract readable content from web pages. Use for lightweight page access without browser automation.
 homepage: https://github.com/zhayujie/chatgpt-on-wechat
 metadata:
  emoji: 🌐
  requires:
    bins: ["curl"]
  always: true
 ---
 # Web Fetch
 Fetch and extract readable content from web pages using curl and basic text processing.
 ## Usage
 **Important**: Scripts are located relative to this skill's base directory.
 When you see this skill in `<available_skills>`, note the `<base_dir>` path.
 ```bash
 # General pattern:
 bash "<base_dir>/scripts/fetch.sh" <url> [output_file]
 # Example (replace <base_dir> with actual path from skill listing):
 bash "$HOME/chatgpt-on-wechat/skills/web-fetch/scripts/fetch.sh" "https://example.com"
 ```
 **Parameters:**
 - `url`: The HTTP/HTTPS URL to fetch (required)
 - `output_file`: Optional file to save the output (default: stdout)
 **Returns:**
 - Extracted page content with title and text
 ## Examples
 ### Fetch a web page
 ```bash
 bash "<base_dir>/scripts/fetch.sh" "https://example.com"
 ```
 ### Save to file
 ```bash
 bash "<base_dir>/scripts/fetch.sh" "https://example.com" output.txt
 cat output.txt
 ```
 ## Notes
 - Uses curl for HTTP requests (timeout: 10s)
 - Extracts title and basic text content
 - Removes HTML tags and scripts
 - Works with any standard web page
 - No external dependencies beyond curl
--- a/skills/web-fetch/scripts/fetch.sh
+++ b/skills/web-fetch/scripts/fetch.sh
@@ -1,54 +0,0 @@
 #!/usr/bin/env bash
 # Fetch a web page and print its title and tag-stripped text.
 #
 # Usage:  bash fetch.sh <url> [output_file]
 #   url          HTTP/HTTPS URL to fetch (required)
 #   output_file  optional path; when given the result is written there
 #                instead of stdout
 # Exits with status 1 when the URL is missing/invalid or the fetch fails.
 # Note that the error messages below are written to stdout, not stderr.
 # Abort on command failure (-e), on use of an unset variable (-u), and when
 # any stage of a pipeline fails (pipefail).
 set -euo pipefail
 # Default both positional arguments to empty so that -u does not abort when
 # they are omitted.
 url="${1:-}"
 output_file="${2:-}"
 if [ -z "$url" ]; then
    echo "Error: URL is required"
    echo "Usage: bash fetch.sh <url> [output_file]"
    exit 1
 fi
 # Validate URL: only the http:// and https:// schemes are accepted
 if [[ ! "$url" =~ ^https?:// ]]; then
    echo "Error: Invalid URL (must start with http:// or https://)"
    exit 1
 fi
 # Fetch the page with curl: quiet but still reporting errors (-sS), following
 # redirects (-L), giving up after 10 seconds. curl's stderr is merged into the
 # captured output (2>&1); on failure that output is discarded and only the
 # generic message below is printed.
 html=$(curl -sS -L --max-time 10 \
    -H "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" \
    -H "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \
    "$url" 2>&1) || {
    echo "Error: Failed to fetch URL: $url"
    exit 1
 }
 # Extract title: first text found between a bare <title> tag and </title> on
 # one line. Because of pipefail, a grep that matches nothing fails the
 # pipeline and the "Untitled" fallback is used.
 # NOTE(review): the lookbehind does not match <title> tags carrying
 # attributes, and -P (PCRE) is not available in every grep implementation -
 # confirm on the target platforms.
 title=$(echo "$html" | grep -oP '(?<=<title>).*?(?=</title>)' | head -1 || echo "Untitled")
 # Remove script and style tags together with their content.
 # NOTE(review): sed works line by line, so an element spanning several lines
 # is not removed here; the I (case-insensitive) flag looks GNU-specific -
 # verify.
 text=$(echo "$html" | sed 's/<script[^>]*>.*<\/script>//gI' | sed 's/<style[^>]*>.*<\/style>//gI')
 # Remove HTML tags (anything between '<' and the next '>')
 text=$(echo "$text" | sed 's/<[^>]*>//g')
 # Clean up whitespace: squeeze runs of spaces, then trim each line
 text=$(echo "$text" | tr -s ' ' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
 # Format output as a multi-line string: title first, then the content
 result="Title: $title
 Content:
 $text"
 # Output to file or stdout; when writing to a file only a confirmation line
 # is printed
 if [ -n "$output_file" ]; then
    echo "$result" > "$output_file"
    echo "Content saved to: $output_file"
 else
    echo "$result"
 fi