diff --git a/agent/tools/web_fetch/web_fetch.py b/agent/tools/web_fetch/web_fetch.py index a08fb8b4..87f6b39c 100644 --- a/agent/tools/web_fetch/web_fetch.py +++ b/agent/tools/web_fetch/web_fetch.py @@ -29,7 +29,7 @@ DEFAULT_HEADERS = { # Supported document file extensions PDF_SUFFIXES: Set[str] = {".pdf"} -WORD_SUFFIXES: Set[str] = {".doc", ".docx"} +WORD_SUFFIXES: Set[str] = {".docx"} TEXT_SUFFIXES: Set[str] = {".txt", ".md", ".markdown", ".rst", ".csv", ".tsv", ".log"} SPREADSHEET_SUFFIXES: Set[str] = {".xls", ".xlsx"} PPT_SUFFIXES: Set[str] = {".ppt", ".pptx"} @@ -56,7 +56,7 @@ class WebFetch(BaseTool): description: str = ( "Fetch content from a URL. For web pages, extracts readable text. " "For document files (PDF, Word, TXT, Markdown, Excel, PPT), downloads and parses the file content. " - "Supported file types: .pdf, .doc, .docx, .txt, .md, .csv, .xls, .xlsx, .ppt, .pptx" + "Supported file types: .pdf, .docx, .txt, .md, .csv, .xls, .xlsx, .ppt, .pptx" ) params: dict = { @@ -226,29 +226,16 @@ class WebFetch(BaseTool): return "\n\n".join(text_parts) def _parse_word(self, file_path: str) -> str: - """Extract text from Word documents (.doc/.docx).""" - suffix = os.path.splitext(file_path)[-1].lower() - - if suffix == ".docx": - try: - from docx import Document - except ImportError: - raise ImportError( - "python-docx library is required for .docx parsing. Install with: pip install python-docx" - ) - doc = Document(file_path) - paragraphs = [p.text for p in doc.paragraphs if p.text.strip()] - return "\n\n".join(paragraphs) - - # .doc format - try textract or fallback + """Extract text from Word documents (.docx).""" try: - import textract - text = textract.process(file_path).decode("utf-8") - return text + from docx import Document except ImportError: raise ImportError( - "textract library is required for .doc parsing. Install with: pip install textract" + "python-docx library is required for .docx parsing. Install with: pip install python-docx" ) + doc = Document(file_path) + paragraphs = [p.text for p in doc.paragraphs if p.text.strip()] + return "\n\n".join(paragraphs) def _parse_text(self, file_path: str) -> str: """Read plain text files (txt, md, csv, etc.).""" @@ -344,7 +331,6 @@ class WebFetch(BaseTool): """Check if Content-Type indicates a binary/document response.""" binary_types = [ "application/pdf", - "application/msword", "application/vnd.openxmlformats", "application/vnd.ms-excel", "application/vnd.ms-powerpoint", @@ -358,7 +344,6 @@ class WebFetch(BaseTool): ct_lower = content_type.lower() suffix_map = { "application/pdf": ".pdf", - "application/msword": ".doc", "application/vnd.openxmlformats-officedocument.wordprocessingml": ".docx", "application/vnd.ms-excel": ".xls", "application/vnd.openxmlformats-officedocument.spreadsheetml": ".xlsx", diff --git a/requirements-optional.txt b/requirements-optional.txt index 5208c7be..0707f119 100644 --- a/requirements-optional.txt +++ b/requirements-optional.txt @@ -29,3 +29,9 @@ google-generativeai # tencentcloud sdk tencentcloud-sdk-python>=3.0.0 + +# file parsing (web_fetch document support) +pypdf +python-docx +openpyxl +python-pptx