Spaces:
Sleeping
Sleeping
File size: 3,231 Bytes
1a9158e 925a131 59e2cd4 1a9158e eabf5d8 1a9158e fbd126d 1a9158e fbd126d 1a9158e fbd126d 2c4cae4 59e2cd4 2c4cae4 fbd126d 1a9158e fbd126d 2c4cae4 fbd126d 1a9158e fbd126d 1a9158e ebf4777 2c4cae4 59e2cd4 f268ab5 2c4cae4 f268ab5 2c4cae4 59e2cd4 810f177 59e2cd4 2c4cae4 59e2cd4 0acfa35 f860ade f268ab5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
from __future__ import annotations
import json
import os
from typing import Optional
import requests
from smolagents.tools import Tool
class BrightDataScraperTool(Tool):
    """smolagents Tool that fetches a webpage through the Bright Data
    "request" API and returns the page content rendered as Markdown.

    Configuration comes from the environment:
      - BRIGHT_DATA_API_TOKEN (required) — bearer token for the API.
      - BRIGHT_DATA_UNLOCKER_ZONE (optional) — zone name, defaults to
        "web_unlocker1".
    """

    name = "brightdata_web_scraper"
    description = """
Scrape any webpage and return content in Markdown format.
This tool can bypass bot detection and CAPTCHAs.
Use this when you need to extract content from websites.
"""
    output_type = "string"

    def __init__(self) -> None:
        # inputs must be populated before super().__init__() so the base
        # Tool class can validate the schema during initialization.
        self.inputs = {
            "url": {
                "type": "string",
                "description": "The URL of the webpage to scrape",
            }
        }
        super().__init__()

    def forward(self, url) -> str:
        """Scrape *url* and return the page content as Markdown text.

        Request/URL failures are reported as a JSON-encoded error string
        (not raised), so the calling agent can inspect and recover.

        Args:
            url: A URL string, a Gradio upload dict, or a stringified
                upload dict (see _coerce_url_input).

        Returns:
            The Markdown page content on success, or a JSON object string
            of the form {"error": ..., "details": ...} on failure.

        Raises:
            ValueError: if BRIGHT_DATA_API_TOKEN is not set.
        """
        url_str = self._coerce_url_input(url)
        if not url_str:
            return json.dumps({"error": "No valid URL provided"})

        api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker1")
        if not api_token:
            raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")

        api_url = "https://api.brightdata.com/request"
        headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
        }
        payload = {
            "url": url_str,
            "zone": unlocker_zone,
            "format": "raw",
            "data_format": "markdown",
        }
        try:
            response = requests.post(api_url, json=payload, headers=headers, timeout=30)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as exc:
            # Include the API's own response body (when present) so the
            # agent sees why the request was rejected, not just the status.
            details = exc.response.text if getattr(exc, "response", None) is not None else ""
            return json.dumps({"error": str(exc), "details": details})

    def _coerce_url_input(self, raw) -> Optional[str]:
        """Normalize the tool input into a plain URL string, or None.

        Gradio may pass a plain URL string, a stringified dict describing
        an upload, or that dict itself; all three shapes are accepted.
        """
        if isinstance(raw, str):
            if raw.strip().startswith("{") and "orig_name" in raw:
                parsed = self._parse_file_dict_string(raw)
                if not parsed:
                    # BUGFIX: previously this fell back to
                    # _ensure_scheme(raw), producing a nonsense URL such
                    # as "https://{'orig_name': ...}". An unparseable (or
                    # empty) dict string is invalid input — reject it.
                    return None
                raw = parsed
            else:
                return self._ensure_scheme(raw)
        if isinstance(raw, dict):
            orig_name = raw.get("orig_name")
            if isinstance(orig_name, str) and orig_name:
                return self._ensure_scheme(orig_name)
            url_value = raw.get("url")
            if isinstance(url_value, str) and url_value.startswith(("http://", "https://")):
                # Only fully-qualified URLs are usable here; a relative
                # Gradio file path cannot be scraped remotely.
                return url_value
            return None
        return None

    def _ensure_scheme(self, url: str) -> str:
        """Return *url* unchanged if it has a scheme, else prefix https://."""
        if url.startswith(("http://", "https://")):
            return url
        return f"https://{url}"

    def _parse_file_dict_string(self, value: str) -> Optional[dict]:
        """Safely parse a stringified Python dict; return None on failure.

        Uses ast.literal_eval (never eval) because *value* is untrusted
        user/Gradio input.
        """
        import ast

        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return None
        return parsed if isinstance(parsed, dict) else None
|