mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-06-02 00:57:41 +08:00
refactor: convert web-fetch from skill to native tool
This commit is contained in:
@@ -55,6 +55,15 @@ def _import_optional_tools():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"[Tools] WebSearch failed to load: {e}")
|
logger.error(f"[Tools] WebSearch failed to load: {e}")
|
||||||
|
|
||||||
|
# WebFetch Tool
|
||||||
|
try:
|
||||||
|
from agent.tools.web_fetch.web_fetch import WebFetch
|
||||||
|
tools['WebFetch'] = WebFetch
|
||||||
|
except ImportError as e:
|
||||||
|
logger.error(f"[Tools] WebFetch not loaded - missing dependency: {e}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"[Tools] WebFetch failed to load: {e}")
|
||||||
|
|
||||||
return tools
|
return tools
|
||||||
|
|
||||||
# Load optional tools
|
# Load optional tools
|
||||||
@@ -62,6 +71,7 @@ _optional_tools = _import_optional_tools()
|
|||||||
EnvConfig = _optional_tools.get('EnvConfig')
|
EnvConfig = _optional_tools.get('EnvConfig')
|
||||||
SchedulerTool = _optional_tools.get('SchedulerTool')
|
SchedulerTool = _optional_tools.get('SchedulerTool')
|
||||||
WebSearch = _optional_tools.get('WebSearch')
|
WebSearch = _optional_tools.get('WebSearch')
|
||||||
|
WebFetch = _optional_tools.get('WebFetch')
|
||||||
GoogleSearch = _optional_tools.get('GoogleSearch')
|
GoogleSearch = _optional_tools.get('GoogleSearch')
|
||||||
FileSave = _optional_tools.get('FileSave')
|
FileSave = _optional_tools.get('FileSave')
|
||||||
Terminal = _optional_tools.get('Terminal')
|
Terminal = _optional_tools.get('Terminal')
|
||||||
@@ -102,6 +112,7 @@ __all__ = [
|
|||||||
'EnvConfig',
|
'EnvConfig',
|
||||||
'SchedulerTool',
|
'SchedulerTool',
|
||||||
'WebSearch',
|
'WebSearch',
|
||||||
|
'WebFetch',
|
||||||
# Optional tools (may be None if dependencies not available)
|
# Optional tools (may be None if dependencies not available)
|
||||||
# 'BrowserTool'
|
# 'BrowserTool'
|
||||||
]
|
]
|
||||||
|
|||||||
0
agent/tools/web_fetch/__init__.py
Normal file
0
agent/tools/web_fetch/__init__.py
Normal file
98
agent/tools/web_fetch/web_fetch.py
Normal file
98
agent/tools/web_fetch/web_fetch.py
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
"""
|
||||||
|
Web Fetch tool - Fetch and extract readable content from web pages.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import Dict, Any
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from agent.tools.base_tool import BaseTool, ToolResult
|
||||||
|
from common.log import logger
|
||||||
|
|
||||||
|
|
||||||
|
# Seconds to wait for the remote server before giving up.
DEFAULT_TIMEOUT = 10

# Browser-like headers sent with every request.
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

# Tags whose boundaries separate blocks of text. They are turned into line
# breaks before the remaining tags are stripped, so that text from adjacent
# elements (e.g. consecutive <li> or <p>) is not fused into one word.
_BLOCK_TAG_PATTERN = (
    r"</?(?:address|article|aside|blockquote|br|dd|div|dl|dt|figcaption|"
    r"figure|footer|form|h[1-6]|head|header|hr|li|main|nav|ol|p|pre|"
    r"section|table|td|th|title|tr|ul)\b[^>]*>"
)


class WebFetch(BaseTool):
    """Tool for fetching and extracting readable content from web pages.

    The page is downloaded with ``requests`` and reduced to plain text with
    regular expressions (no HTML parser is involved), so the extraction is
    best-effort.
    """

    name: str = "web_fetch"
    description: str = (
        "Fetch and extract readable text content from a web page URL. "
    )

    params: dict = {
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "The HTTP/HTTPS URL to fetch"
            }
        },
        "required": ["url"]
    }

    def __init__(self, config: dict = None):
        """Store the optional tool configuration (an empty dict when omitted)."""
        self.config = config or {}

    def execute(self, args: Dict[str, Any]) -> ToolResult:
        """Download the page at ``args["url"]`` and return its title and text.

        Args:
            args: Tool arguments. ``"url"`` must be a non-empty string using
                the ``http`` or ``https`` scheme.

        Returns:
            ``ToolResult.success`` carrying a ``Title:`` line followed by a
            ``Content:`` section on success; ``ToolResult.fail`` with an
            error message when the URL is missing or invalid, the request
            times out, the connection fails, the server answers with an HTTP
            error status, or any other request error occurs.
        """
        raw_url = args.get("url")
        # A null or non-string value must produce the normal "required"
        # error rather than an AttributeError on .strip().
        url = raw_url.strip() if isinstance(raw_url, str) else ""
        if not url:
            return ToolResult.fail("Error: 'url' parameter is required")

        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https"):
            return ToolResult.fail("Error: Invalid URL (must start with http:// or https://)")

        try:
            response = requests.get(
                url,
                headers=DEFAULT_HEADERS,
                timeout=DEFAULT_TIMEOUT,
                allow_redirects=True,
            )
            response.raise_for_status()
        except requests.Timeout:
            return ToolResult.fail(f"Error: Request timed out after {DEFAULT_TIMEOUT}s")
        except requests.ConnectionError:
            return ToolResult.fail(f"Error: Failed to connect to {parsed.netloc}")
        except requests.HTTPError as e:
            return ToolResult.fail(f"Error: HTTP {e.response.status_code} for URL: {url}")
        except Exception as e:
            return ToolResult.fail(f"Error: Failed to fetch URL: {e}")

        # requests assumes ISO-8859-1 for text/* responses whose Content-Type
        # header carries no charset, which garbles UTF-8/GBK pages. When the
        # server did not declare a charset, detect it from the body instead.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding

        html = response.text
        title = self._extract_title(html)
        text = self._extract_text(html)

        return ToolResult.success(f"Title: {title}\n\nContent:\n{text}")

    @staticmethod
    def _extract_title(html: str) -> str:
        """Return the text of the first ``<title>`` element, or ``"Untitled"``.

        Entities are decoded and runs of whitespace (including line breaks)
        are collapsed to single spaces. A missing or empty title yields
        ``"Untitled"``.
        """
        # Imports the stdlib module by name; the ``html`` parameter is a
        # separate local and is not affected.
        from html import unescape

        match = re.search(r"<title\b[^>]*>(.*?)</title\s*>", html, re.IGNORECASE | re.DOTALL)
        if not match:
            return "Untitled"
        title = " ".join(unescape(match.group(1)).split())
        return title or "Untitled"

    @staticmethod
    def _extract_text(html: str) -> str:
        """Reduce an HTML document to readable plain text.

        Script and style blocks, comments and all tags are removed, block
        level tags become line breaks, entities are decoded, and whitespace
        is normalised: single spaces within a line, at most one blank line
        between paragraphs, no leading or trailing whitespace.
        """
        # Imports the stdlib module by name; the ``html`` parameter is a
        # separate local and is not affected.
        from html import unescape

        block_flags = re.IGNORECASE | re.DOTALL

        # Remove script and style blocks together with their contents.
        text = re.sub(r"<script\b[^>]*>.*?</script\s*>", "", html, flags=block_flags)
        text = re.sub(r"<style\b[^>]*>.*?</style\s*>", "", text, flags=block_flags)
        # Remove comments explicitly: they may contain '>' and would
        # otherwise survive the generic tag stripping in pieces.
        text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)

        # Keep element boundaries visible, then drop every remaining tag.
        text = re.sub(_BLOCK_TAG_PATTERN, "\n", text, flags=re.IGNORECASE)
        text = re.sub(r"<[^>]+>", "", text)

        # Decode entities only after the tags are gone, so an escaped
        # "&lt;b&gt;" stays literal text instead of being stripped as a tag.
        text = unescape(text)

        # Collapse runs of non-newline whitespace (this also covers the
        # non-breaking space produced by &nbsp;) into a single space.
        text = re.sub(r"[^\S\n]+", " ", text)

        # Strip each line first, then limit blank-line runs; doing it in the
        # opposite order lets whitespace-only lines reintroduce long gaps.
        text = "\n".join(line.strip() for line in text.splitlines())
        text = re.sub(r"\n{3,}", "\n\n", text)

        return text.strip()
|
||||||
@@ -95,7 +95,7 @@ Do NOT create auxiliary documentation files:
|
|||||||
|
|
||||||
## Installing a Skill from URL
|
## Installing a Skill from URL
|
||||||
|
|
||||||
1. Fetch the URL content (curl or web-fetch skill)
|
1. Fetch the URL content (curl or web_fetch tool)
|
||||||
2. Extract `name` from YAML frontmatter
|
2. Extract `name` from YAML frontmatter
|
||||||
3. Create directory `<workspace>/skills/<name>/` and save content as `SKILL.md`
|
3. Create directory `<workspace>/skills/<name>/` and save content as `SKILL.md`
|
||||||
4. Check the saved SKILL.md for an installation/setup section — if it defines additional steps (e.g., downloading scripts, installing dependencies), execute them; otherwise installation is complete
|
4. Check the saved SKILL.md for an installation/setup section — if it defines additional steps (e.g., downloading scripts, installing dependencies), execute them; otherwise installation is complete
|
||||||
|
|||||||
@@ -1,56 +0,0 @@
|
|||||||
---
|
|
||||||
name: web-fetch
|
|
||||||
description: Fetch and extract readable content from web pages. Use for lightweight page access without browser automation.
|
|
||||||
homepage: https://github.com/zhayujie/chatgpt-on-wechat
|
|
||||||
metadata:
|
|
||||||
emoji: 🌐
|
|
||||||
requires:
|
|
||||||
bins: ["curl"]
|
|
||||||
always: true
|
|
||||||
---
|
|
||||||
|
|
||||||
# Web Fetch
|
|
||||||
|
|
||||||
Fetch and extract readable content from web pages using curl and basic text processing.
|
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
**Important**: Scripts are located relative to this skill's base directory.
|
|
||||||
|
|
||||||
When you see this skill in `<available_skills>`, note the `<base_dir>` path.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# General pattern:
|
|
||||||
bash "<base_dir>/scripts/fetch.sh" <url> [output_file]
|
|
||||||
|
|
||||||
# Example (replace <base_dir> with actual path from skill listing):
|
|
||||||
bash "$HOME/chatgpt-on-wechat/skills/web-fetch/scripts/fetch.sh" "https://example.com"
|
|
||||||
```
|
|
||||||
|
|
||||||
**Parameters:**
|
|
||||||
- `url`: The HTTP/HTTPS URL to fetch (required)
|
|
||||||
- `output_file`: Optional file to save the output (default: stdout)
|
|
||||||
|
|
||||||
**Returns:**
|
|
||||||
- Extracted page content with title and text
|
|
||||||
|
|
||||||
## Examples
|
|
||||||
|
|
||||||
### Fetch a web page
|
|
||||||
```bash
|
|
||||||
bash "<base_dir>/scripts/fetch.sh" "https://example.com"
|
|
||||||
```
|
|
||||||
|
|
||||||
### Save to file
|
|
||||||
```bash
|
|
||||||
bash "<base_dir>/scripts/fetch.sh" "https://example.com" output.txt
|
|
||||||
cat output.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
## Notes
|
|
||||||
|
|
||||||
- Uses curl for HTTP requests (timeout: 10s)
|
|
||||||
- Extracts title and basic text content
|
|
||||||
- Removes HTML tags and scripts
|
|
||||||
- Works with any standard web page
|
|
||||||
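- No external dependencies beyond curl and the standard grep/sed utilities (the script uses GNU extensions: `grep -P` and the sed `I` flag)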
|
|
||||||
@@ -1,54 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Fetch a web page and print its title followed by a plain-text rendering
# of its content.
#
# Usage: bash fetch.sh <url> [output_file]
#   url          HTTP/HTTPS URL to fetch (required)
#   output_file  optional path; when given, the result is written there and
#                only a confirmation line is printed
#
# Exits with status 1 when the URL is missing or invalid, or when the
# download fails (including HTTP 4xx/5xx responses).
#
# NOTE: relies on GNU grep (-P) and GNU sed (the I flag).

set -euo pipefail

url="${1:-}"
output_file="${2:-}"

if [ -z "$url" ]; then
    echo "Error: URL is required"
    echo "Usage: bash fetch.sh <url> [output_file]"
    exit 1
fi

# Validate URL
if [[ ! "$url" =~ ^https?:// ]]; then
    echo "Error: Invalid URL (must start with http:// or https://)"
    exit 1
fi

# Fetch the page with curl.
#   --fail turns HTTP 4xx/5xx into a curl failure instead of handing the
#   server's error page to the text extraction below.
#   stderr is deliberately NOT redirected into the substitution, so curl
#   diagnostics can never become part of the page content.
html=$(curl -sS -L --fail --max-time 10 \
    -H "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" \
    -H "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \
    "$url") || {
    echo "Error: Failed to fetch URL: $url"
    exit 1
}

# Extract the first <title>, accepting attributes and any letter case.
# `|| true` stops errexit/pipefail from aborting when nothing matches or when
# head closes the pipe early; the fallback is applied afterwards so that
# "Untitled" can never be appended to a real title.
title=$(printf '%s\n' "$html" | grep -oiP '<title[^>]*>\K.*?(?=</title>)' | head -n 1 || true)
title="${title:-Untitled}"

# Remove script and style tags.
# NOTE: sed works line by line and `.*` is greedy, so only blocks that open
# and close on the same line are removed, and everything between the first
# opening tag and the last closing tag on that line goes with them.
text=$(printf '%s\n' "$html" | sed 's/<script[^>]*>.*<\/script>//gI' | sed 's/<style[^>]*>.*<\/style>//gI')

# Remove HTML tags
text=$(printf '%s\n' "$text" | sed 's/<[^>]*>//g')

# Clean up whitespace: squeeze repeated spaces, trim both ends of each line
text=$(printf '%s\n' "$text" | tr -s ' ' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')

# Format output
result="Title: $title

Content:
$text"

# Output to file or stdout
if [ -n "$output_file" ]; then
    printf '%s\n' "$result" > "$output_file"
    echo "Content saved to: $output_file"
else
    printf '%s\n' "$result"
fi
|
|
||||||
Reference in New Issue
Block a user