Spaces:
Sleeping
Sleeping
File size: 1,700 Bytes
0397cdb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
from smolagents import Tool
import requests
import os
from dotenv import load_dotenv
# Load environment variables from a .env file if one is present, so that the
# BRIGHT_DATA_API_TOKEN and BRIGHT_DATA_UNLOCKER_ZONE values read via
# os.getenv() in BrightDataScraperTool.forward can be supplied that way.
load_dotenv()
class BrightDataScraperTool(Tool):
    """Tool that asks the Bright Data request API for a webpage as Markdown.

    Configuration is read from the environment on every call:

    * ``BRIGHT_DATA_API_TOKEN`` (required) - bearer token sent to the API.
    * ``BRIGHT_DATA_UNLOCKER_ZONE`` (optional) - zone name, defaults to
      ``"web_unlocker_1"``.
    """

    name = "brightdata_web_scraper"
    # The text is kept flush-left so that no source indentation leaks into
    # the description string itself.
    description = """
Scrape any webpage and return content in Markdown format.
This tool can bypass bot detection and CAPTCHAs.
Use this when you need to extract content from websites.
"""
    inputs = {
        "url": {
            "type": "string",
            "description": "The URL of the webpage to scrape",
        }
    }
    output_type = "string"

    # Upper bound, in seconds, on how long a single API call may block.
    # `requests` never times out on its own, so without this a stalled
    # connection would hang the caller indefinitely. Override on a subclass
    # or instance if a different limit is needed.
    request_timeout = 120

    def forward(self, url: str) -> str:
        """Scrape a webpage using Bright Data's API.

        Args:
            url: The URL to scrape.

        Returns:
            The response body returned by the API (Markdown is requested via
            ``data_format``), or a string starting with
            ``"Error scraping URL: "`` if the HTTP request fails, times out,
            or comes back with an error status.

        Raises:
            ValueError: If ``BRIGHT_DATA_API_TOKEN`` is not set.
        """
        api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker_1")

        if not api_token:
            raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")

        api_url = "https://api.brightdata.com/request"
        headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
        }
        payload = {
            "url": url,
            "zone": unlocker_zone,
            "format": "raw",
            "data_format": "markdown",
        }

        try:
            response = requests.post(
                api_url,
                json=payload,
                headers=headers,
                timeout=self.request_timeout,
            )
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Network failures, timeouts and non-2xx statuses are reported as
            # text rather than raised, so the caller always gets a string.
            return f"Error scraping URL: {e}"
        else:
            return response.text
|