chatgpt-on-wechat/agent/tools/browser/browser_tool.py

"""
Browser tool - Control a Chromium browser for web navigation and interaction.

Uses Playwright under the hood. Browser instance is lazily started on first
use, reused across tool calls within the same session, and cleaned up via
close().
"""

import json
import os
from typing import Dict, Any, Optional

from agent.tools.base_tool import BaseTool, ToolResult
from agent.tools.browser.browser_service import BrowserService
from common.log import logger


class BrowserTool(BaseTool):
    """Single tool exposing all browser actions via an 'action' parameter."""

    name: str = "browser"
    description: str = (
        "Control a browser to navigate web pages, interact with elements, and extract content. "
        "Actions: navigate, snapshot, click, fill, select, scroll, screenshot, wait, back, forward, "
        "get_text, press, evaluate.\n\n"
        "Workflow: navigate (auto-includes snapshot with element refs) → click/fill/select by ref → snapshot to verify.\n\n"
        "Use snapshot as the primary way to read pages. Use screenshot + send to show key results to the user. "
        "For login/CAPTCHA/authorization etc., screenshot and ask the user for help."
    )

    params: dict = {
        "type": "object",
        "properties": {
            "action": {
                "type": "string",
                "description": (
                    "The browser action to perform. One of: "
                    "navigate, snapshot, click, fill, select, scroll, "
                    "screenshot, wait, back, forward, get_text, press, evaluate"
                ),
                "enum": [
                    "navigate", "snapshot", "click", "fill", "select", "scroll",
                    "screenshot", "wait", "back", "forward", "get_text", "press",
                    "evaluate"
                ]
            },
            "url": {
                "type": "string",
                "description": "URL to navigate to (for 'navigate' action)"
            },
            "ref": {
                "type": "integer",
                "description": "Element ref number from snapshot (for click/fill/select)"
            },
            "selector": {
                "type": "string",
                "description": "CSS selector as fallback when ref is unavailable (for click/fill/select/wait/get_text)"
            },
            "text": {
                "type": "string",
                "description": "Text to type (for 'fill' action)"
            },
            "value": {
                "type": "string",
                "description": "Option value (for 'select' action)"
            },
            "key": {
                "type": "string",
                "description": "Key to press, e.g. Enter, Tab, Escape (for 'press' action)"
            },
            "direction": {
                "type": "string",
                "description": "Scroll direction: up, down, left, right (for 'scroll' action, default: down)"
            },
            "script": {
                "type": "string",
                "description": "JavaScript code to execute (for 'evaluate' action)"
            },
            "full_page": {
                "type": "boolean",
                "description": "Capture full page screenshot (for 'screenshot' action, default: false)"
            },
            "timeout": {
                "type": "integer",
                "description": "Timeout in milliseconds (optional, default varies by action)"
            }
        },
        "required": ["action"]
    }

    _shared_service: Optional[BrowserService] = None

    def __init__(self, config: dict = None):
        self.config = config or {}
        self.cwd = self.config.get("cwd", os.getcwd())
        self._service: Optional[BrowserService] = None

    def _get_service(self) -> BrowserService:
        """Get or create the browser service, sharing across copies."""
        if self._service is not None:
            return self._service

        # Reuse shared service across tool copies within the same session
        if BrowserTool._shared_service is not None:
            self._service = BrowserTool._shared_service
            return self._service

        self._service = BrowserService(self.config)
        BrowserTool._shared_service = self._service
        return self._service

    def execute(self, args: Dict[str, Any]) -> ToolResult:
        action = args.get("action", "").strip().lower()
        if not action:
            return ToolResult.fail("Error: 'action' parameter is required")

        handler = self._ACTION_MAP.get(action)
        if not handler:
            valid = ", ".join(sorted(self._ACTION_MAP.keys()))
            return ToolResult.fail(f"Unknown action '{action}'. Valid actions: {valid}")

        try:
            return handler(self, args)
        except Exception as e:
            logger.error(f"[Browser] Action '{action}' error: {e}")
            return ToolResult.fail(f"Browser error ({action}): {e}")

    # ------------------------------------------------------------------
    # Action handlers
    # ------------------------------------------------------------------

    def _do_navigate(self, args: Dict[str, Any]) -> ToolResult:
        url = args.get("url", "").strip()
        if not url:
            return ToolResult.fail("Error: 'url' is required for navigate action")
        if not url.startswith(("http://", "https://")):
            url = "https://" + url
        timeout = args.get("timeout", 30000)
        service = self._get_service()
        result = service.navigate(url, timeout=timeout)
        if "error" in result:
            return ToolResult.fail(result["error"])
        # Auto-snapshot after navigation so the agent gets page content in one call
        snapshot_text = service.snapshot()
        return ToolResult.success(
            f"Navigated to: {result['url']}\nTitle: {result['title']}\nStatus: {result['status']}\n\n"
            f"--- Page Snapshot ---\n{snapshot_text}"
        )

    def _do_snapshot(self, args: Dict[str, Any]) -> ToolResult:
        selector = args.get("selector")
        text = self._get_service().snapshot(selector=selector)
        return ToolResult.success(text)

    def _do_click(self, args: Dict[str, Any]) -> ToolResult:
        ref = args.get("ref")
        selector = args.get("selector")
        timeout = args.get("timeout", 5000)
        result = self._get_service().click(ref=ref, selector=selector, timeout=timeout)
        if "error" in result:
            return ToolResult.fail(result["error"])
        return ToolResult.success(f"Clicked successfully. Use 'snapshot' to see updated page.")

    def _do_fill(self, args: Dict[str, Any]) -> ToolResult:
        text = args.get("text", "")
        ref = args.get("ref")
        selector = args.get("selector")
        timeout = args.get("timeout", 5000)
        if not text and text != "":
            return ToolResult.fail("Error: 'text' is required for fill action")
        result = self._get_service().fill(text, ref=ref, selector=selector, timeout=timeout)
        if "error" in result:
            return ToolResult.fail(result["error"])
        return ToolResult.success(f"Filled text into element. Use 'snapshot' to verify.")

    def _do_select(self, args: Dict[str, Any]) -> ToolResult:
        value = args.get("value", "")
        ref = args.get("ref")
        selector = args.get("selector")
        timeout = args.get("timeout", 5000)
        if not value:
            return ToolResult.fail("Error: 'value' is required for select action")
        result = self._get_service().select(value, ref=ref, selector=selector, timeout=timeout)
        if "error" in result:
            return ToolResult.fail(result["error"])
        return ToolResult.success(f"Selected option '{value}'.")

    def _do_scroll(self, args: Dict[str, Any]) -> ToolResult:
        direction = args.get("direction", "down")
        amount = args.get("timeout", 500)  # reuse timeout field or default
        if "amount" in args:
            amount = args["amount"]
        result = self._get_service().scroll(direction=direction, amount=amount)
        if "error" in result:
            return ToolResult.fail(result["error"])
        pos = f"scrollY={result.get('scrollY', '?')}/{result.get('scrollHeight', '?')}"
        return ToolResult.success(f"Scrolled {direction}. Position: {pos}")

    def _do_screenshot(self, args: Dict[str, Any]) -> ToolResult:
        full_page = args.get("full_page", False)
        filepath = self._get_service().screenshot(full_page=full_page, cwd=self.cwd)
        return ToolResult.success(f"Screenshot saved to: {filepath}")

    def _do_wait(self, args: Dict[str, Any]) -> ToolResult:
        selector = args.get("selector")
        timeout = args.get("timeout", 5000)
        result = self._get_service().wait(selector=selector, timeout=timeout)
        if "error" in result:
            return ToolResult.fail(result["error"])
        return ToolResult.success(f"Wait completed.")

    def _do_back(self, args: Dict[str, Any]) -> ToolResult:
        result = self._get_service().go_back()
        if "error" in result:
            return ToolResult.fail(result["error"])
        return ToolResult.success(f"Navigated back to: {result['url']}")

    def _do_forward(self, args: Dict[str, Any]) -> ToolResult:
        result = self._get_service().go_forward()
        if "error" in result:
            return ToolResult.fail(result["error"])
        return ToolResult.success(f"Navigated forward to: {result['url']}")

    def _do_get_text(self, args: Dict[str, Any]) -> ToolResult:
        selector = args.get("selector", "").strip()
        if not selector:
            return ToolResult.fail("Error: 'selector' is required for get_text action")
        result = self._get_service().get_text(selector)
        if "error" in result:
            return ToolResult.fail(result["error"])
        return ToolResult.success(result["text"])

    def _do_press(self, args: Dict[str, Any]) -> ToolResult:
        key = args.get("key", "").strip()
        if not key:
            return ToolResult.fail("Error: 'key' is required for press action")
        result = self._get_service().press(key)
        if "error" in result:
            return ToolResult.fail(result["error"])
        return ToolResult.success(f"Pressed key: {key}")

    def _do_evaluate(self, args: Dict[str, Any]) -> ToolResult:
        script = args.get("script", "").strip()
        if not script:
            return ToolResult.fail("Error: 'script' is required for evaluate action")
        result = self._get_service().evaluate(script)
        if "error" in result:
            return ToolResult.fail(result["error"])
        val = result.get("result")
        if isinstance(val, (dict, list)):
            return ToolResult.success(json.dumps(val, ensure_ascii=False, indent=2))
        return ToolResult.success(str(val) if val is not None else "(no return value)")

    # Action dispatch table
    _ACTION_MAP = {
        "navigate": _do_navigate,
        "snapshot": _do_snapshot,
        "click": _do_click,
        "fill": _do_fill,
        "select": _do_select,
        "scroll": _do_scroll,
        "screenshot": _do_screenshot,
        "wait": _do_wait,
        "back": _do_back,
        "forward": _do_forward,
        "get_text": _do_get_text,
        "press": _do_press,
        "evaluate": _do_evaluate,
    }

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def copy(self):
        """Share browser instance across tool copies (avoids re-launching)."""
        new_tool = BrowserTool(self.config)
        new_tool.model = self.model
        new_tool.context = getattr(self, "context", None)
        new_tool.cwd = self.cwd
        new_tool._service = self._service
        return new_tool

    def close(self):
        """Release browser resources."""
        if self._service:
            self._service.close()
            self._service = None
        BrowserTool._shared_service = None
        logger.info("[Browser] BrowserTool closed")