mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-06-02 00:57:41 +08:00
feat(browser): persistent login + CDP attach mode #2809
Browser sessions now reuse a Chromium user profile across runs by default
(`~/.cow/browser_profile`), so users only log in to a site once.
Three launch modes are selectable via `tools.browser` in config.json:
- persistent (default): Playwright Chromium with a persistent user_data_dir
- cdp: attach to an externally launched real Chrome via `cdp_endpoint`
(full fingerprints, ideal for sites with strict bot detection)
- fresh: clean context every run, set `persistent: false`
Also:
- Self-heal when the user closes the browser window mid-session: detect
closed page/context/browser via close listeners and exception scanning,
then transparently relaunch on the next request.
- Graceful CDP shutdown: disconnect only, never kill the user's Chrome.
- Friendly errors when the CDP endpoint is unreachable or the persistent
profile is locked, so the LLM can guide the user instead of looping.
- Fix tool config being silently overwritten by workspace config in
AgentInitializer; per-tool user settings (e.g. browser.cdp_endpoint)
are now merged instead of replaced.
- Update zh / en / ja docs with the new login-persistence section,
including the Chrome 137+ requirement to pair --remote-debugging-port
with a dedicated --user-data-dir.
This commit is contained in:
@@ -15,6 +15,10 @@ import threading
|
||||
from typing import Optional, Dict, Any, List, Callable
|
||||
|
||||
from common.log import logger
|
||||
from common.utils import expand_path
|
||||
|
||||
|
||||
_DEFAULT_USER_DATA_DIR = "~/.cow/browser_profile"
|
||||
|
||||
try:
|
||||
from playwright.sync_api import sync_playwright, Browser, BrowserContext, Page, Playwright
|
||||
@@ -212,6 +216,21 @@ _SNAPSHOT_JS = """
|
||||
)
|
||||
|
||||
|
||||
_BROWSER_DEAD_HINTS = (
|
||||
"has been closed",
|
||||
"browser has disconnected",
|
||||
"target closed",
|
||||
"browser closed",
|
||||
"context or browser has been closed",
|
||||
)
|
||||
|
||||
|
||||
def _is_browser_dead_error(err: Exception) -> bool:
|
||||
"""Return True if *err* indicates the browser / page died out from under us."""
|
||||
msg = str(err).lower()
|
||||
return any(h in msg for h in _BROWSER_DEAD_HINTS)
|
||||
|
||||
|
||||
def _should_use_headless() -> bool:
|
||||
"""Decide headless mode: headless on Linux servers without display, headed elsewhere."""
|
||||
if sys.platform in ("win32", "darwin"):
|
||||
@@ -302,11 +321,38 @@ class BrowserService:
|
||||
self._context = None
|
||||
self._page = None
|
||||
|
||||
# Launch mode: one of "fresh" | "persistent" | "cdp".
|
||||
# - cdp: connect to an externally launched Chrome via CDP endpoint.
|
||||
# - persistent: launch with launch_persistent_context using a user_data_dir
|
||||
# so cookies / login state survive across runs (default).
|
||||
# - fresh: classic launch + new_context, clean state every run.
|
||||
cdp_endpoint = self._config.get("cdp_endpoint") or ""
|
||||
persistent_flag = self._config.get("persistent", True)
|
||||
user_data_dir_cfg = self._config.get("user_data_dir")
|
||||
if user_data_dir_cfg is None:
|
||||
user_data_dir_cfg = _DEFAULT_USER_DATA_DIR
|
||||
|
||||
self._cdp_endpoint: str = cdp_endpoint.strip() if isinstance(cdp_endpoint, str) else ""
|
||||
if self._cdp_endpoint:
|
||||
self._launch_mode = "cdp"
|
||||
self._user_data_dir: str = ""
|
||||
elif persistent_flag and user_data_dir_cfg:
|
||||
self._launch_mode = "persistent"
|
||||
self._user_data_dir = expand_path(str(user_data_dir_cfg))
|
||||
else:
|
||||
self._launch_mode = "fresh"
|
||||
self._user_data_dir = ""
|
||||
|
||||
# Idle auto-release
|
||||
idle_cfg = self._config.get("idle_timeout")
|
||||
self._idle_timeout: float = float(idle_cfg) if idle_cfg is not None else self._IDLE_TIMEOUT_DEFAULT
|
||||
self._idle_timer: Optional[threading.Timer] = None
|
||||
|
||||
# Set when the browser / page is detected to have died externally
|
||||
# (e.g. user manually closed the window). The next _submit() will then
|
||||
# tear down the stale thread and relaunch.
|
||||
self._needs_restart = False
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Background-thread lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
@@ -354,6 +400,12 @@ class BrowserService:
|
||||
result_slot["value"] = fn(*args, **kwargs)
|
||||
except Exception as e:
|
||||
result_slot["error"] = e
|
||||
if _is_browser_dead_error(e):
|
||||
self._needs_restart = True
|
||||
logger.warning(
|
||||
f"[Browser] Detected closed page/context ({e}); "
|
||||
"will relaunch on next request."
|
||||
)
|
||||
finally:
|
||||
result_slot["event"].set()
|
||||
|
||||
@@ -375,7 +427,7 @@ class BrowserService:
|
||||
result_slot["event"].set()
|
||||
|
||||
def _launch_browser(self):
|
||||
"""Launch Chromium on the background thread."""
|
||||
"""Launch / connect Chromium on the background thread."""
|
||||
if self._headless is None:
|
||||
headless_cfg = self._config.get("headless")
|
||||
self._headless = headless_cfg if headless_cfg is not None else _should_use_headless()
|
||||
@@ -390,27 +442,132 @@ class BrowserService:
|
||||
|
||||
viewport_w = self._config.get("viewport_width", 1280)
|
||||
viewport_h = self._config.get("viewport_height", 720)
|
||||
viewport = {"width": viewport_w, "height": viewport_h}
|
||||
user_agent = (
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/131.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
self._playwright = sync_playwright().start()
|
||||
logger.info(f"[Browser] Launching Chromium (headless={self._headless})")
|
||||
|
||||
if self._launch_mode == "cdp":
|
||||
self._connect_cdp(viewport)
|
||||
elif self._launch_mode == "persistent":
|
||||
self._launch_persistent(launch_args, viewport, user_agent)
|
||||
else:
|
||||
self._launch_fresh(launch_args, viewport, user_agent)
|
||||
|
||||
logger.info("[Browser] Browser ready")
|
||||
|
||||
def _launch_fresh(self, launch_args: List[str], viewport: Dict[str, int], user_agent: str):
    """Classic launch: brand new Chromium with an empty context.

    Stores the launched browser, a new context and a new page on the
    instance, then registers the close listeners.

    Args:
        launch_args: Extra command-line arguments passed to Chromium.
        viewport: ``{"width": ..., "height": ...}`` for the new context.
        user_agent: User-Agent string for the new context.
    """
    logger.info(f"[Browser] Launching Chromium (fresh, headless={self._headless})")
    self._browser = self._playwright.chromium.launch(
        headless=self._headless,
        args=launch_args,
    )
    # viewport / user_agent are passed exactly once, using the values prepared
    # by the caller; repeating the keywords is a SyntaxError and the old
    # viewport_w / viewport_h locals do not exist in this method.
    self._context = self._browser.new_context(
        viewport=viewport,
        user_agent=user_agent,
    )
    self._page = self._context.new_page()
    # "Browser ready" is logged by _launch_browser once this method returns,
    # so it is not logged a second time here.
    self._wire_close_listeners()
|
||||
|
||||
def _launch_persistent(self, launch_args: List[str], viewport: Dict[str, int], user_agent: str):
    """Launch Chromium with a persistent user_data_dir so login state survives.

    Creates the profile directory if needed, opens a persistent context on
    it, and reuses the first page of that context (creating one only when
    the context has none).

    Args:
        launch_args: Extra command-line arguments passed to Chromium.
        viewport: ``{"width": ..., "height": ...}`` for the context.
        user_agent: User-Agent string for the context.

    Raises:
        RuntimeError: When the launch error looks like the profile being
            locked by another Chromium process. Any other launch error is
            re-raised unchanged.
    """
    os.makedirs(self._user_data_dir, exist_ok=True)
    logger.info(
        f"[Browser] Launching Chromium (persistent, headless={self._headless}, "
        f"profile={self._user_data_dir})"
    )
    try:
        self._context = self._playwright.chromium.launch_persistent_context(
            user_data_dir=self._user_data_dir,
            headless=self._headless,
            args=launch_args,
            viewport=viewport,
            user_agent=user_agent,
        )
    except Exception as e:
        # Profile is locked when another Chromium instance already holds it.
        # Bare "lock" / "profile" substrings are too broad: "lock" also matches
        # "block" or "clock", and "profile" matches the default directory name
        # (browser_profile) whenever the path shows up in the message. Drop the
        # path from the text and look for lock-specific phrases instead.
        # NOTE(review): marker list is based on Chromium's ProcessSingleton /
        # SingletonLock wording — extend if other phrasings are observed.
        msg = str(e).lower().replace(self._user_data_dir.lower(), "")
        lock_markers = (
            "singletonlock",
            "processsingleton",
            "profile appears to be in use",
            "profile is already in use",
        )
        if any(marker in msg for marker in lock_markers):
            raise RuntimeError(
                f"Browser profile '{self._user_data_dir}' is in use by another process. "
                "Close the other Chromium / cow instance, or set a different "
                "tools.browser.user_data_dir."
            ) from e
        raise

    # Persistent context has no parent Browser handle; reuse the auto-created page.
    self._browser = None
    pages = self._context.pages
    self._page = pages[0] if pages else self._context.new_page()
    self._wire_close_listeners()
|
||||
|
||||
def _connect_cdp(self, viewport: Dict[str, int]):
    """Attach to an existing Chrome started with --remote-debugging-port.

    Connects to ``self._cdp_endpoint``, adopts the first existing context
    (or creates one with *viewport* when there is none), adopts the first
    page of that context (or opens one), and registers the close listeners.

    Args:
        viewport: ``{"width": ..., "height": ...}``; only used when a new
            context has to be created.

    Raises:
        RuntimeError: When the connection was refused, i.e. nothing is
            listening on the endpoint. Any other connection error is
            re-raised unchanged.
    """
    endpoint = self._cdp_endpoint
    logger.info(f"[Browser] Connecting to existing Chrome via CDP: {endpoint}")
    try:
        self._browser = self._playwright.chromium.connect_over_cdp(endpoint)
    except Exception as e:
        msg = str(e).lower()
        # Only a refused connection means "Chrome is not running". Matching the
        # bare word "connect" is not usable here: it is also a substring of the
        # API name connect_over_cdp, which Playwright typically includes in the
        # error text, so every failure would be reported as unreachable.
        # "refused" also covers ECONNREFUSED.
        if "refused" in msg:
            raise RuntimeError(
                f"Cannot reach Chrome at {endpoint}. The CDP browser is not "
                "running. Ask the user to launch Chrome with "
                "--remote-debugging-port and --user-data-dir, then retry. "
                "Do not retry this tool until the user confirms."
            ) from e
        raise

    contexts = self._browser.contexts
    if contexts:
        self._context = contexts[0]
    else:
        self._context = self._browser.new_context(viewport=viewport)

    pages = self._context.pages
    self._page = pages[0] if pages else self._context.new_page()
    self._wire_close_listeners()
|
||||
|
||||
def _wire_close_listeners(self):
|
||||
"""Mark needs_restart whenever the browser / context / page dies externally."""
|
||||
def _on_dead(_obj=None):
|
||||
self._needs_restart = True
|
||||
|
||||
try:
|
||||
if self._browser:
|
||||
self._browser.on("disconnected", _on_dead)
|
||||
if self._context:
|
||||
self._context.on("close", _on_dead)
|
||||
if self._page:
|
||||
self._page.on("close", _on_dead)
|
||||
except Exception as e:
|
||||
logger.debug(f"[Browser] Failed to wire close listeners: {e}")
|
||||
|
||||
def _shutdown_browser(self):
|
||||
"""Shut down all Playwright resources on the background thread."""
|
||||
"""Shut down Playwright resources on the background thread.
|
||||
|
||||
Mode-specific behavior:
|
||||
- cdp: only disconnect the Playwright client; leave the user's Chrome
|
||||
and its tabs untouched (do NOT close the context).
|
||||
- persistent: close the persistent context (no separate browser handle).
|
||||
- fresh: close context, then browser.
|
||||
"""
|
||||
self._cancel_idle_timer()
|
||||
|
||||
if self._launch_mode == "cdp":
|
||||
# For CDP, browser.close() only detaches the Playwright client;
|
||||
# the user's Chrome process and its tabs stay alive.
|
||||
try:
|
||||
if self._browser:
|
||||
self._browser.close()
|
||||
except Exception as e:
|
||||
logger.debug(f"[Browser] cdp disconnect error: {e}")
|
||||
else:
|
||||
for obj, label in [
|
||||
(self._context, "context"),
|
||||
(self._browser, "browser"),
|
||||
@@ -420,6 +577,7 @@ class BrowserService:
|
||||
obj.close()
|
||||
except Exception as e:
|
||||
logger.debug(f"[Browser] {label} close error: {e}")
|
||||
|
||||
try:
|
||||
if self._playwright:
|
||||
self._playwright.stop()
|
||||
@@ -433,6 +591,13 @@ class BrowserService:
|
||||
|
||||
def _submit(self, fn: Callable, *args, **kwargs):
|
||||
"""Submit *fn* to the background thread and block until it completes."""
|
||||
# If the browser died externally (e.g. user closed the window), tear
|
||||
# down the stale thread first so _start_thread() will relaunch fresh.
|
||||
if self._needs_restart:
|
||||
logger.info("[Browser] Restarting after detecting closed browser")
|
||||
self.close()
|
||||
self._needs_restart = False
|
||||
|
||||
self._start_thread()
|
||||
|
||||
if not self._alive:
|
||||
@@ -481,6 +646,7 @@ class BrowserService:
|
||||
self._cancel_idle_timer()
|
||||
with self._lock:
|
||||
if not self._alive:
|
||||
self._needs_restart = False
|
||||
return
|
||||
self._alive = False
|
||||
t = self._thread
|
||||
@@ -490,6 +656,7 @@ class BrowserService:
|
||||
t.join(timeout=10)
|
||||
with self._lock:
|
||||
self._thread = None
|
||||
self._needs_restart = False
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Actions (each method is dispatched to the background thread)
|
||||
|
||||
@@ -4,6 +4,15 @@ Browser tool - Control a Chromium browser for web navigation and interaction.
|
||||
Uses Playwright under the hood. Browser instance is lazily started on first
|
||||
use, reused across tool calls within the same session, and cleaned up via
|
||||
close().
|
||||
|
||||
Launch modes (configured under `tools.browser` in config.json):
|
||||
- persistent (default): Chromium runs with a persistent user_data_dir
|
||||
(default `~/.cow/browser_profile`), so cookies and login state survive
|
||||
across runs. The user only needs to log in once.
|
||||
- cdp: When `cdp_endpoint` is set, attach to an externally launched Chrome
|
||||
via the Chrome DevTools Protocol. Lets the agent reuse the user's real
|
||||
browser (with all logins / extensions / true fingerprints).
|
||||
- fresh: Set `persistent` to false to fall back to a clean context every run.
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -26,6 +35,9 @@ class BrowserTool(BaseTool):
|
||||
"Workflow: navigate (auto-includes snapshot with element refs) → click/fill/select by ref → snapshot to verify.\n\n"
|
||||
"Use snapshot as the primary way to read pages. Use screenshot + send to show key results to the user. "
|
||||
"For login/CAPTCHA/authorization etc., screenshot and ask the user for help. "
|
||||
"Login state is persisted across sessions (cookies / localStorage are kept in a "
|
||||
"user profile directory), so once the user logs in to a site, the agent can keep "
|
||||
"using it without logging in again."
|
||||
)
|
||||
|
||||
params: dict = {
|
||||
|
||||
@@ -377,12 +377,18 @@ class AgentInitializer:
|
||||
tool = tool_manager.create_tool(tool_name)
|
||||
|
||||
if tool:
|
||||
# Apply workspace config to file operation tools
|
||||
# Apply workspace config to file operation tools.
|
||||
# Merge into the existing tool.config (set by ToolManager from
|
||||
# config.json's `tools.<name>` section) instead of replacing
|
||||
# it, otherwise per-tool user configs (e.g. browser.cdp_endpoint)
|
||||
# would be silently dropped.
|
||||
if tool_name in ['read', 'write', 'edit', 'bash', 'grep', 'find', 'ls', 'web_fetch', 'send', 'browser']:
|
||||
tool.config = file_config
|
||||
tool.cwd = file_config.get("cwd", getattr(tool, 'cwd', None))
|
||||
if 'memory_manager' in file_config:
|
||||
tool.memory_manager = file_config['memory_manager']
|
||||
merged_config = dict(getattr(tool, 'config', None) or {})
|
||||
merged_config.update(file_config)
|
||||
tool.config = merged_config
|
||||
tool.cwd = merged_config.get("cwd", getattr(tool, 'cwd', None))
|
||||
if 'memory_manager' in merged_config:
|
||||
tool.memory_manager = merged_config['memory_manager']
|
||||
tools.append(tool)
|
||||
except Exception as e:
|
||||
logger.warning(f"[AgentInitializer] Failed to load tool {tool_name}: {e}")
|
||||
|
||||
@@ -1,25 +1,172 @@
|
||||
---
|
||||
title: browser - Browser
|
||||
description: Access and interact with web pages
|
||||
description: Control a browser to access and interact with web pages
|
||||
---
|
||||
|
||||
Use a browser to access and interact with web pages, supports JavaScript-rendered dynamic pages.
|
||||
Control a Chromium browser for web navigation, element interaction and content extraction. Supports JavaScript-rendered pages and uses a compact DOM snapshot so the Agent can efficiently understand page structure.
|
||||
|
||||
## Dependencies
|
||||
## Installation
|
||||
|
||||
| Dependency | Install Command |
|
||||
| --- | --- |
|
||||
| `browser-use` ≥ 0.1.40 | `pip install browser-use` |
|
||||
| `markdownify` | `pip install markdownify` |
|
||||
| `playwright` + chromium | `pip install playwright && playwright install chromium` |
|
||||
<Tabs>
|
||||
<Tab title="CLI install (recommended)">
|
||||
```bash
|
||||
cow install-browser
|
||||
```
|
||||
|
||||
This command will:
|
||||
- Install the `playwright` Python package (with auto-fallback for older systems)
|
||||
- Install system dependencies on Linux
|
||||
- Download the Chromium browser (Linux servers automatically use the headless build)
|
||||
- Detect China-mainland networks and use mirror acceleration
|
||||
</Tab>
|
||||
<Tab title="Manual install">
|
||||
```bash
|
||||
pip install playwright
|
||||
playwright install chromium
|
||||
```
|
||||
|
||||
On Linux servers, install system dependencies as well:
|
||||
```bash
|
||||
sudo playwright install-deps chromium
|
||||
```
|
||||
|
||||
On older systems (e.g. Ubuntu 18.04, glibc < 2.28), install a compatible version:
|
||||
```bash
|
||||
pip install playwright==1.28.0
|
||||
python -m playwright install chromium
|
||||
```
|
||||
|
||||
To accelerate the Chromium download from China:
|
||||
```bash
|
||||
export PLAYWRIGHT_DOWNLOAD_HOST=https://registry.npmmirror.com/-/binary/playwright
|
||||
python -m playwright install chromium
|
||||
```
|
||||
</Tab>
|
||||
</Tabs>
|
||||
|
||||
<Note>
|
||||
1. Supported on Ubuntu 20.04+, Debian 10+, macOS and Windows. Older systems such as Ubuntu 18.04 will fall back to a compatible version automatically.
|
||||
2. The browser tool has heavy dependencies (~300MB) and is optional. For lightweight web content retrieval, use the `web_fetch` tool.
|
||||
</Note>
|
||||
|
||||
## Workflow
|
||||
|
||||
A typical browser workflow for the Agent:
|
||||
|
||||
1. **`navigate`** — Open the target URL
|
||||
2. **`snapshot`** — Get a compact DOM with auto-numbered interactive elements (`ref`)
|
||||
3. **`click` / `fill` / `select`** — Operate elements by `ref`
|
||||
4. **`snapshot`** — Snapshot again to verify the result
|
||||
|
||||
## Supported Actions
|
||||
|
||||
| Action | Description | Key parameters |
|
||||
| --- | --- | --- |
|
||||
| `navigate` | Open URL | `url` |
|
||||
| `snapshot` | Get structured page text (primary way) | `selector` (optional) |
|
||||
| `click` | Click an element | `ref` or `selector` |
|
||||
| `fill` | Fill text into an input | `ref` or `selector`, `text` |
|
||||
| `select` | Select a dropdown option | `ref` or `selector`, `value` |
|
||||
| `scroll` | Scroll the page | `direction` (up/down/left/right) |
|
||||
| `screenshot` | Save a screenshot to the workspace | `full_page` |
|
||||
| `wait` | Wait for an element or timeout | `selector`, `timeout` |
|
||||
| `press` | Press a key (Enter, Tab, etc.) | `key` |
|
||||
| `back` / `forward` | Browser back / forward | - |
|
||||
| `get_text` | Get an element's text content | `selector` |
|
||||
| `evaluate` | Run JavaScript | `script` |
|
||||
|
||||
## Use Cases
|
||||
|
||||
- Access specific URLs to get page content
|
||||
- Interact with web page elements (click, type, etc.)
|
||||
- Verify deployed web pages
|
||||
- Scrape dynamic content requiring JS rendering
|
||||
- Access a URL to retrieve dynamic page content
|
||||
- Fill in forms and log in
|
||||
- Operate web elements (click buttons, select options, etc.)
|
||||
- Verify the result of a deployed web page
|
||||
- Scrape content that requires JS rendering
|
||||
|
||||
## Run Mode
|
||||
|
||||
The browser picks a mode based on the runtime environment:
|
||||
|
||||
| Environment | Mode |
|
||||
| --- | --- |
|
||||
| macOS / Windows | Headed (browser window visible) |
|
||||
| Linux desktop (with DISPLAY) | Headed |
|
||||
| Linux server (no DISPLAY) | Headless |
|
||||
|
||||
You can override it in `config.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"tools": {
|
||||
"browser": {
|
||||
"headless": true
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Persistent Login
|
||||
|
||||
**Log in to a target site once and the Agent can keep using it.** Two ways are supported:
|
||||
|
||||
### Option 1: Persistent mode (default)
|
||||
|
||||
Works out of the box. Login state is saved under `~/.cow/browser_profile`. No configuration needed.
|
||||
|
||||
To disable persistence and start with a clean environment every time:
|
||||
|
||||
```json
|
||||
{
|
||||
"tools": {
|
||||
"browser": {
|
||||
"persistent": false
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Option 2: CDP mode (attach to real Chrome)
|
||||
|
||||
Have the Agent connect to a separately launched real Chrome (instead of the Chromium bundled with Playwright) for full browser fingerprints. Useful for sites with strict bot detection.
|
||||
|
||||
Launch Chrome with a debugging port and a dedicated user data directory:
|
||||
|
||||
<Tabs>
|
||||
<Tab title="macOS">
|
||||
```bash
|
||||
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" \
|
||||
--remote-debugging-port=9222 \
|
||||
--user-data-dir="$HOME/.cow/chrome-cdp"
|
||||
```
|
||||
</Tab>
|
||||
<Tab title="Linux">
|
||||
```bash
|
||||
google-chrome \
|
||||
--remote-debugging-port=9222 \
|
||||
--user-data-dir="$HOME/.cow/chrome-cdp"
|
||||
```
|
||||
</Tab>
|
||||
<Tab title="Windows">
|
||||
```powershell
|
||||
& "C:\Program Files\Google\Chrome\Application\chrome.exe" `
|
||||
--remote-debugging-port=9222 `
|
||||
--user-data-dir="$env:USERPROFILE\.cow\chrome-cdp"
|
||||
```
|
||||
</Tab>
|
||||
</Tabs>
|
||||
|
||||
Then point the Agent at the endpoint in `config.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"tools": {
|
||||
"browser": {
|
||||
"cdp_endpoint": "http://localhost:9222"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
<Note>
|
||||
The browser tool has heavy dependencies. If not needed, skip installation. For lightweight web content retrieval, use the `web_fetch` tool instead.
|
||||
Chrome 137+ requires `--remote-debugging-port` to be paired with a dedicated `--user-data-dir`. As a result, the CDP-launched Chrome **cannot directly reuse the login state of your daily Chrome**; you'll need to log in once inside this dedicated profile.
|
||||
</Note>
|
||||
|
||||
@@ -1,25 +1,172 @@
|
||||
---
|
||||
title: browser - ブラウザ
|
||||
description: Webページへのアクセスと操作
|
||||
description: ブラウザを操作してWebページにアクセス・操作する
|
||||
---
|
||||
|
||||
ブラウザを使用してWebページにアクセス・操作します。JavaScriptでレンダリングされる動的ページにも対応しています。
|
||||
Chromiumブラウザを操作してWebページのナビゲーション、要素操作、コンテンツ取得を行います。JavaScriptでレンダリングされる動的ページに対応し、簡略化したDOMスナップショットによりAgentが効率的にページ構造を理解できます。
|
||||
|
||||
## 依存関係
|
||||
## インストール
|
||||
|
||||
| 依存関係 | インストールコマンド |
|
||||
| --- | --- |
|
||||
| `browser-use` ≥ 0.1.40 | `pip install browser-use` |
|
||||
| `markdownify` | `pip install markdownify` |
|
||||
| `playwright` + chromium | `pip install playwright && playwright install chromium` |
|
||||
<Tabs>
|
||||
<Tab title="CLIインストール(推奨)">
|
||||
```bash
|
||||
cow install-browser
|
||||
```
|
||||
|
||||
このコマンドは以下を自動で実行します:
|
||||
- `playwright` Pythonパッケージのインストール(古いシステムでは互換バージョンに自動フォールバック)
|
||||
- Linuxにおけるシステム依存のインストール
|
||||
- Chromiumブラウザのダウンロード(Linuxサーバーでは自動的にヘッドレス軽量版を使用)
|
||||
- 中国本土ネットワークの自動検知とミラー高速化
|
||||
</Tab>
|
||||
<Tab title="手動インストール">
|
||||
```bash
|
||||
pip install playwright
|
||||
playwright install chromium
|
||||
```
|
||||
|
||||
Linuxサーバーではシステム依存も必要:
|
||||
```bash
|
||||
sudo playwright install-deps chromium
|
||||
```
|
||||
|
||||
古いシステム(例: Ubuntu 18.04、glibc < 2.28)では互換バージョンをインストール:
|
||||
```bash
|
||||
pip install playwright==1.28.0
|
||||
python -m playwright install chromium
|
||||
```
|
||||
|
||||
中国からChromiumのダウンロードを高速化したい場合:
|
||||
```bash
|
||||
export PLAYWRIGHT_DOWNLOAD_HOST=https://registry.npmmirror.com/-/binary/playwright
|
||||
python -m playwright install chromium
|
||||
```
|
||||
</Tab>
|
||||
</Tabs>
|
||||
|
||||
<Note>
|
||||
1. Ubuntu 20.04+、Debian 10+、macOS、Windowsをサポート。Ubuntu 18.04などの古いシステムでは互換バージョンに自動フォールバックします。
|
||||
2. ブラウザToolは依存関係が大きい(約300MB)ため、不要な場合はインストールを省略できます。軽量なWebコンテンツ取得には `web_fetch` Toolをご利用ください。
|
||||
</Note>
|
||||
|
||||
## ワークフロー
|
||||
|
||||
Agentがブラウザを使う典型的な流れ:
|
||||
|
||||
1. **`navigate`** — 対象URLを開く
|
||||
2. **`snapshot`** — 簡略化したDOMを取得し、操作可能な要素には自動で番号(`ref`)が付く
|
||||
3. **`click` / `fill` / `select`** — `ref`で要素を操作する
|
||||
4. **`snapshot`** — 再度スナップショットを取得して結果を確認
|
||||
|
||||
## サポートされる操作
|
||||
|
||||
| 操作 | 説明 | 主なパラメータ |
|
||||
| --- | --- | --- |
|
||||
| `navigate` | URLを開く | `url` |
|
||||
| `snapshot` | 構造化されたページテキストを取得(主な利用方法) | `selector`(任意) |
|
||||
| `click` | 要素をクリック | `ref` または `selector` |
|
||||
| `fill` | 入力欄にテキストを入力 | `ref` または `selector`、`text` |
|
||||
| `select` | プルダウンから選択 | `ref` または `selector`、`value` |
|
||||
| `scroll` | ページをスクロール | `direction`(up/down/left/right) |
|
||||
| `screenshot` | スクリーンショットをワークスペースに保存 | `full_page` |
|
||||
| `wait` | 要素または時間を待機 | `selector`、`timeout` |
|
||||
| `press` | キー入力(Enter、Tabなど) | `key` |
|
||||
| `back` / `forward` | ブラウザの戻る/進む | - |
|
||||
| `get_text` | 要素のテキストを取得 | `selector` |
|
||||
| `evaluate` | JavaScriptを実行 | `script` |
|
||||
|
||||
## ユースケース
|
||||
|
||||
- 特定のURLにアクセスしてページ内容を取得
|
||||
- Webページの要素を操作(クリック、入力など)
|
||||
- デプロイされたWebページの検証
|
||||
- 指定URLにアクセスして動的コンテンツを取得
|
||||
- フォーム入力やログイン操作
|
||||
- Web要素の操作(ボタンクリック、項目選択など)
|
||||
- デプロイ後のWebページ動作確認
|
||||
- JSレンダリングが必要な動的コンテンツのスクレイピング
|
||||
|
||||
## 動作モード
|
||||
|
||||
実行環境に応じてブラウザのモードが自動選択されます:
|
||||
|
||||
| 環境 | モード |
|
||||
| --- | --- |
|
||||
| macOS / Windows | ヘッドモード(ブラウザウィンドウを表示) |
|
||||
| Linuxデスクトップ(DISPLAYあり) | ヘッドモード |
|
||||
| Linuxサーバー(DISPLAYなし) | ヘッドレスモード |
|
||||
|
||||
`config.json`で手動上書き可能:
|
||||
|
||||
```json
|
||||
{
|
||||
"tools": {
|
||||
"browser": {
|
||||
"headless": true
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## ログイン状態の永続化
|
||||
|
||||
**対象サイトに一度ログインすれば、Agentは以降そのまま利用できます。** 2つの方法があります:
|
||||
|
||||
### 方法1: Persistentモード(デフォルト)
|
||||
|
||||
設定不要、すぐに利用可能。ログイン情報は `~/.cow/browser_profile` に保存されます。
|
||||
|
||||
毎回クリーンな環境で起動したい場合は、永続化を無効化:
|
||||
|
||||
```json
|
||||
{
|
||||
"tools": {
|
||||
"browser": {
|
||||
"persistent": false
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 方法2: CDPモード(既存のChromeに接続)
|
||||
|
||||
Playwright付属のChromiumではなく、別途起動した本物のChromeにAgentを接続させることで、完全なブラウザフィンガープリントが得られます。Bot検知が厳しいサイトに有効です。
|
||||
|
||||
Chromeをデバッグポートと専用のユーザーデータディレクトリ付きで起動します:
|
||||
|
||||
<Tabs>
|
||||
<Tab title="macOS">
|
||||
```bash
|
||||
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" \
|
||||
--remote-debugging-port=9222 \
|
||||
--user-data-dir="$HOME/.cow/chrome-cdp"
|
||||
```
|
||||
</Tab>
|
||||
<Tab title="Linux">
|
||||
```bash
|
||||
google-chrome \
|
||||
--remote-debugging-port=9222 \
|
||||
--user-data-dir="$HOME/.cow/chrome-cdp"
|
||||
```
|
||||
</Tab>
|
||||
<Tab title="Windows">
|
||||
```powershell
|
||||
& "C:\Program Files\Google\Chrome\Application\chrome.exe" `
|
||||
--remote-debugging-port=9222 `
|
||||
--user-data-dir="$env:USERPROFILE\.cow\chrome-cdp"
|
||||
```
|
||||
</Tab>
|
||||
</Tabs>
|
||||
|
||||
`config.json` で接続先を指定:
|
||||
|
||||
```json
|
||||
{
|
||||
"tools": {
|
||||
"browser": {
|
||||
"cdp_endpoint": "http://localhost:9222"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
<Note>
|
||||
ブラウザToolは依存関係が大きいため、不要な場合はインストールを省略できます。軽量なWebコンテンツ取得には、代わりに `web_fetch` Toolをご利用ください。
|
||||
Chrome 137以降では `--remote-debugging-port` を専用の `--user-data-dir` と組み合わせる必要があるため、CDPで起動するChromeは**普段使いのChromeのログイン状態をそのまま流用できません**。専用プロファイル内で一度ログインし直す必要があります。
|
||||
</Note>
|
||||
|
||||
@@ -45,7 +45,8 @@ description: 控制浏览器访问和操作网页
|
||||
</Tabs>
|
||||
|
||||
<Note>
|
||||
支持 Ubuntu 20.04+、Debian 10+、macOS、Windows。Ubuntu 18.04 等旧系统会自动降级安装兼容版本。
|
||||
1. 支持 Ubuntu 20.04+、Debian 10+、macOS、Windows。Ubuntu 18.04 等旧系统会自动降级安装兼容版本。
|
||||
2. 浏览器工具依赖较重(约300MB),为可选安装。轻量的网页内容获取可使用 `web_fetch` 工具。
|
||||
</Note>
|
||||
|
||||
## 工作流程
|
||||
@@ -104,6 +105,68 @@ Agent 使用浏览器的典型流程:
|
||||
}
|
||||
```
|
||||
|
||||
## 登录态持久化
|
||||
|
||||
**只需登录一次目标网站,Agent 后续可直接使用**。提供两种方式:
|
||||
|
||||
### 方式一:Persistent 模式(默认)
|
||||
|
||||
开箱即用,登录信息保存在 `~/.cow/browser_profile`。无需任何配置。
|
||||
|
||||
如需关闭持久化模式,每次都用纯净环境:
|
||||
|
||||
```json
|
||||
{
|
||||
"tools": {
|
||||
"browser": {
|
||||
"persistent": false
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 方式二:CDP 模式(接管真实 Chrome)
|
||||
|
||||
让 Agent 连接独立启动的真实 Chrome(而非 Playwright 自带的 Chromium),获得完整浏览器指纹,适合反爬严格的网站。
|
||||
|
||||
启动 Chrome 时加上调试端口和独立用户目录:
|
||||
|
||||
<Tabs>
|
||||
<Tab title="macOS">
|
||||
```bash
|
||||
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" \
|
||||
--remote-debugging-port=9222 \
|
||||
--user-data-dir="$HOME/.cow/chrome-cdp"
|
||||
```
|
||||
</Tab>
|
||||
<Tab title="Linux">
|
||||
```bash
|
||||
google-chrome \
|
||||
--remote-debugging-port=9222 \
|
||||
--user-data-dir="$HOME/.cow/chrome-cdp"
|
||||
```
|
||||
</Tab>
|
||||
<Tab title="Windows">
|
||||
```powershell
|
||||
& "C:\Program Files\Google\Chrome\Application\chrome.exe" `
|
||||
--remote-debugging-port=9222 `
|
||||
--user-data-dir="$env:USERPROFILE\.cow\chrome-cdp"
|
||||
```
|
||||
</Tab>
|
||||
</Tabs>
|
||||
|
||||
在 `config.json` 中配置端点:
|
||||
|
||||
```json
|
||||
{
|
||||
"tools": {
|
||||
"browser": {
|
||||
"cdp_endpoint": "http://localhost:9222"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
<Note>
|
||||
浏览器工具依赖较重(~300MB),如不需要可不安装。轻量的网页内容获取可使用 `web_fetch` 工具。
|
||||
Chrome 137+ 限制 `--remote-debugging-port` 必须搭配独立 `--user-data-dir`,因此 CDP 启动的 Chrome **无法直接复用你日常 Chrome 的登录态**,需要在独立目录中重新登录一次。
|
||||
</Note>
|
||||
|
||||
Reference in New Issue
Block a user