from __future__ import annotations

import json
import os
from typing import Optional

import requests
from smolagents.tools import Tool


class BrightDataScraperTool(Tool):
    name = "brightdata_web_scraper"
    description = """
    Scrape any webpage and return content in Markdown format.
    This tool can bypass bot detection and CAPTCHAs.
    Use this when you need to extract content from websites.
    """
    output_type = "string"

    def __init__(self) -> None:
        # Declare the input schema before running the base Tool initialization.
        self.inputs = {
            "url": {
                "type": "string",
                "description": "The URL of the webpage to scrape",
            }
        }
        super().__init__()

    def forward(self, url) -> str:
        url_str = self._coerce_url_input(url)
        if not url_str:
            return json.dumps({"error": "No valid URL provided"})

        api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker1")
        if not api_token:
            raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")

        # Ask Bright Data's request endpoint to fetch the page through the
        # unlocker zone and return it converted to Markdown.
        api_url = "https://api.brightdata.com/request"
        headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
        }
        payload = {
            "url": url_str,
            "zone": unlocker_zone,
            "format": "raw",
            "data_format": "markdown",
        }
        try:
            response = requests.post(api_url, json=payload, headers=headers, timeout=30)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as exc:
            # Surface any HTTP error body so the caller can see what went wrong.
            details = exc.response.text if getattr(exc, "response", None) is not None else ""
            return json.dumps({"error": str(exc), "details": details})

    def _coerce_url_input(self, raw) -> Optional[str]:
        # Accept plain strings, strings containing a serialized dict, and
        # dict payloads (e.g. file-upload style inputs with an "orig_name").
        if isinstance(raw, str):
            if raw.lstrip().startswith("{"):
                parsed = self._parse_file_dict_string(raw)
                if parsed is not None:
                    return self._coerce_url_input(parsed)
            return self._ensure_scheme(raw)
        if isinstance(raw, dict):
            orig_name = raw.get("orig_name")
            if isinstance(orig_name, str) and orig_name:
                return self._ensure_scheme(orig_name)
            url_value = raw.get("url")
            if isinstance(url_value, str):
                if url_value.startswith(("http://", "https://")):
                    return url_value
            return None
        return None

    def _ensure_scheme(self, value: str) -> Optional[str]:
        # Default to https:// when the caller passes a bare domain.
        value = value.strip()
        if not value:
            return None
        if value.startswith(("http://", "https://")):
            return value
        return f"https://{value}"

    def _parse_file_dict_string(self, value: str) -> Optional[dict]:
        import ast

        try:
            parsed = ast.literal_eval(value)
            return parsed if isinstance(parsed, dict) else None
        except (ValueError, SyntaxError):
            return None