File size: 4,208 Bytes
1397957
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from typing import Dict, Any
import httpx
from .tool import BaseTool, ToolContext, ToolResult


# Upper bound on the number of characters of page content returned to the caller.
_MAX_CONTENT_CHARS = 50000

# Marker appended to the output whenever the content was cut at the limit above.
_TRUNCATION_NOTICE = "\n\n[Content truncated...]"


class WebFetchTool(BaseTool):
    """Tool that downloads a URL and returns the page as markdown, text or raw HTML.

    Every failure (bad URL, HTTP error status, transport error, conversion
    error) is reported through a ``ToolResult`` rather than raised.
    """

    @property
    def id(self) -> str:
        """Identifier of this tool."""
        return "webfetch"

    @property
    def description(self) -> str:
        """Human-readable summary of what the tool does."""
        return (
            "Fetch content from a URL and convert it to readable text or markdown. "
            "Use this when you need to read the content of a specific web page."
        )

    @property
    def parameters(self) -> Dict[str, Any]:
        """JSON-schema style description of the accepted arguments."""
        return {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The URL to fetch"
                },
                "format": {
                    "type": "string",
                    "enum": ["text", "markdown", "html"],
                    "description": "Output format (default: markdown)",
                    "default": "markdown"
                }
            },
            "required": ["url"]
        }

    @staticmethod
    def _normalize_url(url: Any) -> str:
        """Return *url* as an absolute http(s) URL.

        Surrounding whitespace is stripped. A value without a ``scheme://``
        prefix gets ``https://`` prepended; an explicit ``http``/``https``
        scheme (matched case-insensitively) is kept as given.

        Raises:
            ValueError: if *url* is not a non-empty string, or names a scheme
                other than http/https.
        """
        import re

        if not isinstance(url, str) or not url.strip():
            raise ValueError("a non-empty 'url' string is required")

        candidate = url.strip()
        scheme_match = re.match(r"([A-Za-z][A-Za-z0-9+.\-]*)://", candidate)
        if scheme_match is None:
            # Bare host such as "example.com/page" or "localhost:8000".
            return "https://" + candidate

        scheme = scheme_match.group(1)
        if scheme.lower() not in ("http", "https"):
            # Prefixing "https://" here would yield nonsense like "https://ftp://...".
            raise ValueError(f"unsupported URL scheme '{scheme}'")
        return candidate

    async def execute(self, args: Dict[str, Any], ctx: ToolContext) -> ToolResult:
        """Fetch ``args["url"]`` and return its content in the requested format.

        Args:
            args: Tool arguments. ``url`` is required; ``format`` is one of
                ``"text"``, ``"markdown"`` or ``"html"`` and defaults to
                ``"markdown"``. Any other value is treated as markdown.
            ctx: Execution context; not used by this tool.

        Returns:
            A ``ToolResult`` whose ``output`` is the (possibly truncated) page
            content on success, or an error description on failure. On
            failure, ``metadata["error"]`` describes the problem.
        """
        raw_url = args.get("url")
        output_format = args.get("format", "markdown")

        try:
            url = self._normalize_url(raw_url)
        except ValueError as e:
            return ToolResult(
                title=f"Fetch failed: {raw_url}",
                output=f"Error: {str(e)}",
                metadata={"error": "invalid_url"}
            )

        try:
            async with httpx.AsyncClient(follow_redirects=True, timeout=30.0) as client:
                response = await client.get(
                    url,
                    headers={
                        "User-Agent": "Mozilla/5.0 (compatible; OpenCode-API/1.0)"
                    }
                )
                response.raise_for_status()
                html_content = response.text

            if output_format == "html":
                # Not pre-sliced here, so the shared truncation step below can
                # tell that the page was cut and append the notice.
                content = html_content
            elif output_format == "text":
                content = self._html_to_text(html_content)
            else:  # markdown
                content = self._html_to_markdown(html_content)

            truncated = len(content) > _MAX_CONTENT_CHARS
            if truncated:
                content = content[:_MAX_CONTENT_CHARS] + _TRUNCATION_NOTICE

            return ToolResult(
                title=f"Fetched: {url}",
                output=content,
                metadata={
                    "url": url,
                    "format": output_format,
                    "length": len(content),
                    "truncated": truncated,
                }
            )

        except httpx.HTTPStatusError as e:
            return ToolResult(
                title=f"Fetch failed: {url}",
                output=f"HTTP Error {e.response.status_code}: {e.response.reason_phrase}",
                metadata={"error": "http_error", "status_code": e.response.status_code}
            )
        except httpx.RequestError as e:
            return ToolResult(
                title=f"Fetch failed: {url}",
                output=f"Request error: {str(e)}",
                metadata={"error": "request_error"}
            )
        except Exception as e:
            # Last-resort boundary: conversion or unexpected client errors are
            # reported to the caller instead of propagating.
            return ToolResult(
                title=f"Fetch failed: {url}",
                output=f"Error: {str(e)}",
                metadata={"error": str(e)}
            )

    def _html_to_text(self, html: str) -> str:
        """Convert an HTML document to plain text.

        Uses BeautifulSoup when it is installed, dropping ``script``,
        ``style``, ``nav``, ``footer`` and ``header`` elements and joining the
        remaining text with newlines. Otherwise falls back to a regex-based
        tag stripper that collapses all whitespace to single spaces.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            BeautifulSoup = None

        if BeautifulSoup is not None:
            soup = BeautifulSoup(html, "html.parser")

            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()

            return soup.get_text(separator="\n", strip=True)

        import re
        # Imported by name because the parameter ``html`` shadows the module.
        from html import unescape

        text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
        text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
        text = re.sub(r"<[^>]+>", " ", text)
        # Decode entities only after tags are stripped, so escaped markup such
        # as "&lt;b&gt;" survives as literal text instead of being removed.
        text = unescape(text)
        text = re.sub(r"\s+", " ", text)
        return text.strip()

    def _html_to_markdown(self, html: str) -> str:
        """Convert an HTML document to markdown.

        Uses ``html2text`` when it is installed (links kept, images ignored,
        no line wrapping); otherwise falls back to ``_html_to_text``.
        """
        try:
            import html2text
        except ImportError:
            return self._html_to_text(html)

        converter = html2text.HTML2Text()
        converter.ignore_links = False
        converter.ignore_images = True
        converter.body_width = 0  # 0 disables hard wrapping of long lines
        return converter.handle(html)