Spaces:
Sleeping
Sleeping
File size: 3,231 Bytes
1a9158e 925a131 59e2cd4 1a9158e eabf5d8 1a9158e fbd126d 1a9158e fbd126d 1a9158e fbd126d 2c4cae4 59e2cd4 2c4cae4 fbd126d 1a9158e fbd126d 2c4cae4 fbd126d 1a9158e fbd126d 1a9158e ebf4777 2c4cae4 59e2cd4 f268ab5 2c4cae4 f268ab5 2c4cae4 59e2cd4 810f177 59e2cd4 2c4cae4 59e2cd4 0acfa35 f860ade f268ab5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
from __future__ import annotations
import json
import os
from typing import Optional
import requests
from smolagents.tools import Tool
class BrightDataScraperTool(Tool):
    """smolagents Tool that fetches a webpage through the Bright Data
    "request" API and returns the page content rendered as Markdown.

    Configuration comes from the environment:
      - BRIGHT_DATA_API_TOKEN (required) — bearer token for the API.
      - BRIGHT_DATA_UNLOCKER_ZONE (optional) — zone name, defaults to
        "web_unlocker1".
    """

    name = "brightdata_web_scraper"
    description = """
Scrape any webpage and return content in Markdown format.
This tool can bypass bot detection and CAPTCHAs.
Use this when you need to extract content from websites.
"""
    output_type = "string"

    def __init__(self) -> None:
        # inputs must be populated before super().__init__() so the base
        # Tool class can validate the schema during initialization.
        self.inputs = {
            "url": {
                "type": "string",
                "description": "The URL of the webpage to scrape",
            }
        }
        super().__init__()

    def forward(self, url) -> str:
        """Scrape *url* and return the page content as Markdown text.

        Request/URL failures are reported as a JSON-encoded error string
        (not raised), so the calling agent can inspect and recover.

        Args:
            url: A URL string, a Gradio upload dict, or a stringified
                upload dict (see _coerce_url_input).

        Returns:
            The Markdown page content on success, or a JSON object string
            of the form {"error": ..., "details": ...} on failure.

        Raises:
            ValueError: if BRIGHT_DATA_API_TOKEN is not set.
        """
        url_str = self._coerce_url_input(url)
        if not url_str:
            return json.dumps({"error": "No valid URL provided"})

        api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker1")
        if not api_token:
            raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")

        api_url = "https://api.brightdata.com/request"
        headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
        }
        payload = {
            "url": url_str,
            "zone": unlocker_zone,
            "format": "raw",
            "data_format": "markdown",
        }
        try:
            response = requests.post(api_url, json=payload, headers=headers, timeout=30)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as exc:
            # Include the API's own response body (when present) so the
            # agent sees why the request was rejected, not just the status.
            details = exc.response.text if getattr(exc, "response", None) is not None else ""
            return json.dumps({"error": str(exc), "details": details})

    def _coerce_url_input(self, raw) -> Optional[str]:
        """Normalize the tool input into a plain URL string, or None.

        Gradio may pass a plain URL string, a stringified dict describing
        an upload, or that dict itself; all three shapes are accepted.
        """
        if isinstance(raw, str):
            if raw.strip().startswith("{") and "orig_name" in raw:
                parsed = self._parse_file_dict_string(raw)
                if not parsed:
                    # BUGFIX: previously this fell back to
                    # _ensure_scheme(raw), producing a nonsense URL such
                    # as "https://{'orig_name': ...}". An unparseable (or
                    # empty) dict string is invalid input — reject it.
                    return None
                raw = parsed
            else:
                return self._ensure_scheme(raw)
        if isinstance(raw, dict):
            orig_name = raw.get("orig_name")
            if isinstance(orig_name, str) and orig_name:
                return self._ensure_scheme(orig_name)
            url_value = raw.get("url")
            if isinstance(url_value, str) and url_value.startswith(("http://", "https://")):
                # Only fully-qualified URLs are usable here; a relative
                # Gradio file path cannot be scraped remotely.
                return url_value
            return None
        return None

    def _ensure_scheme(self, url: str) -> str:
        """Return *url* unchanged if it has a scheme, else prefix https://."""
        if url.startswith(("http://", "https://")):
            return url
        return f"https://{url}"

    def _parse_file_dict_string(self, value: str) -> Optional[dict]:
        """Safely parse a stringified Python dict; return None on failure.

        Uses ast.literal_eval (never eval) because *value* is untrusted
        user/Gradio input.
        """
        import ast

        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return None
        return parsed if isinstance(parsed, dict) else None
|