Spaces:

BrightData
/

brightdata-ai-agent

Running

App Files Files Community

meirk-brd committed 8 days ago

Commit

0397cdb

1 Parent(s): abb4f97

Merge remote README with local changes

Browse files

Files changed (10) hide show

.gitignore +5 -0
README.md +21 -4
app.py +75 -0
brightdata_datasets.py +615 -0
brightdata_scraper.py +59 -0
brightdata_search.py +91 -0
requirements.txt +5 -0
test_datasets.py +47 -0
test_scraper.py +14 -0
test_search.py +20 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+.env
+venv/
+__pycache__/
+*.pyc
+.DS_Store

README.md CHANGED Viewed

@@ -1,12 +1,29 @@
 ---
-title: Brightdata Ai Agent
-emoji: 🦀
-colorFrom: gray
-colorTo: pink
 sdk: gradio
 sdk_version: 6.0.2
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Bright Data AI Agent
+emoji: 🌐
+colorFrom: blue
+colorTo: green
 sdk: gradio
 sdk_version: 6.0.2
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---
+# Bright Data AI Agent
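+An AI agent powered by Bright Data APIs for web scraping, web search, and structured dataset lookups.
+## Features
+- **Web Search**: Search Google, Bing, or Yandex
+- **Web Scraping**: Extract content from any webpage
+- **Bot Protection Bypass**: Automatically handles CAPTCHAs and bot detection
+- **Datasets**: Read structured data from prebuilt Bright Data datasets (e.g., `amazon_product`, `google_maps_reviews`, `linkedin_company_profile`)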
+## Setup
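+Set the following secrets in your Space settings:
+- `BRIGHT_DATA_API_TOKEN`: Your Bright Data API token
+- `BRIGHT_DATA_UNLOCKER_ZONE`: Your unlocker zone name (default: `web_unlocker1`)
+- `HF_TOKEN`: Your Hugging Face token, used to authenticate the inference model that drives the agent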
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import gradio as gr
+from smolagents import CodeAgent
+from smolagents.models import InferenceClientModel
+from brightdata_scraper import BrightDataScraperTool
+from brightdata_search import BrightDataSearchTool
+from brightdata_datasets import BrightDataDatasetTool
+# Initialize tools
+# Each tool is created once at import time; the same instances are handed to the agent below.
+scraper_tool = BrightDataScraperTool()
+search_tool = BrightDataSearchTool()
+dataset_tool = BrightDataDatasetTool()
+# Initialize the agent with a Hugging Face Inference model
+# Requires HF_TOKEN in the environment for authentication.
+model = InferenceClientModel(model_id="deepseek-ai/DeepSeek-V3.2")
+# The module-level `agent` is the single object run_agent() calls for every UI request.
+# max_steps and the instructions string are both set to keep runs short (answer early, avoid repeated tool calls).
+# NOTE(review): add_base_tools=True presumably registers smolagents' built-in tools as well - confirm against the installed version.
+agent = CodeAgent(
+    tools=[scraper_tool, search_tool, dataset_tool],
+    model=model,
+    add_base_tools=True,
+    max_steps=4,
+    instructions="Answer with the first satisfactory result; do not call the same tool repeatedly once you have the needed data. Use final_answer() as soon as you can."
+)
+def run_agent(task: str) -> str:
+    """Run the agent with the given task."""
+    try:
+        result = agent.run(task)
+        return str(result)
+    except Exception as e:
+        return f"Error: {str(e)}"
+# Create Gradio interface
+# Layout: a title, an explanatory Markdown block, then a two-column row
+# (task input + button on the left, result box on the right) and example prompts.
+with gr.Blocks(title="Bright Data AI Agent") as demo:
+    gr.Markdown("# Bright Data AI Agent")
+    gr.Markdown(
+        """
+    This agent can help you with web scraping, search, and quick access to Bright Data datasets.
+    **Available capabilities:**
+    - Search Google, Bing, or Yandex
+    - Scrape any webpage (bypasses bot detection)
+    - Read structured data from 40+ prebuilt datasets (e.g., amazon_product, google_maps_reviews, linkedin_company_profile)
+    **Example tasks:**
+    - "Search for recent AI news on Google"
+    - "Scrape the content from https://example.com"
+    - "Fetch google_maps_reviews for this place URL with the last 7 days"
+    """
+    )
+    with gr.Row():
+        with gr.Column():
+            task_input = gr.Textbox(label="Task", placeholder="Enter your task here...", lines=3)
+            submit_btn = gr.Button("Run Agent", variant="primary")
+        with gr.Column():
+            output = gr.Textbox(label="Result", lines=15, max_lines=30)
+    # Clicking the button passes the task text to run_agent and writes its return value to the result box.
+    submit_btn.click(fn=run_agent, inputs=[task_input], outputs=[output])
+    # Example prompts that populate the task textbox.
+    gr.Examples(
+        examples=[
+            ["Search for 'latest developments in AI' on Google"],
+            ["Scrape the content from https://example.com"],
+            ["What are the top Python programming tutorials?"],
+        ],
+        inputs=[task_input],
+    )
+# Only start the server when the module is executed directly, not when imported.
+if __name__ == "__main__":
+    demo.launch()

brightdata_datasets.py ADDED Viewed

	@@ -0,0 +1,615 @@

+from smolagents import Tool
+import json
+import os
+import time
+import requests
+from typing import Dict, Any
+from dotenv import load_dotenv
+# Load environment variables from .env if present
+load_dotenv()
+def _build_description(description_lines) -> str:
+    """Join multiline descriptions defined as lists.
+    Args:
+        description_lines: Iterable of strings, one per description line.
+    Returns:
+        The lines joined with newline characters (no trailing newline).
+    """
+    return "\n".join(description_lines)
+# Dataset catalogue mirrored from the MCP implementation (JS version).
+# Each entry defines the dataset_id, the required inputs, optional defaults,
+# and optional fixed values that are injected automatically.
+DATASETS: Dict[str, Dict[str, Any]] = {
+    "amazon_product": {
+        "dataset_id": "gd_l7q7dkf244hwjntr0",
+        "description": _build_description(
+            [
+                "Quickly read structured amazon product data.",
+                "Requires a valid product URL with /dp/ in it.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "amazon_product_reviews": {
+        "dataset_id": "gd_le8e811kzy4ggddlq",
+        "description": _build_description(
+            [
+                "Quickly read structured amazon product review data.",
+                "Requires a valid product URL with /dp/ in it.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "amazon_product_search": {
+        "dataset_id": "gd_lwdb4vjm1ehb499uxs",
+        "description": _build_description(
+            [
+                "Quickly read structured amazon product search data.",
+                "Requires a valid search keyword and amazon domain URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["keyword", "url"],
+        "fixed_values": {"pages_to_search": "1"},
+    },
+    "walmart_product": {
+        "dataset_id": "gd_l95fol7l1ru6rlo116",
+        "description": _build_description(
+            [
+                "Quickly read structured walmart product data.",
+                "Requires a valid product URL with /ip/ in it.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "walmart_seller": {
+        "dataset_id": "gd_m7ke48w81ocyu4hhz0",
+        "description": _build_description(
+            [
+                "Quickly read structured walmart seller data.",
+                "Requires a valid walmart seller URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "ebay_product": {
+        "dataset_id": "gd_ltr9mjt81n0zzdk1fb",
+        "description": _build_description(
+            [
+                "Quickly read structured ebay product data.",
+                "Requires a valid ebay product URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "homedepot_products": {
+        "dataset_id": "gd_lmusivh019i7g97q2n",
+        "description": _build_description(
+            [
+                "Quickly read structured homedepot product data.",
+                "Requires a valid homedepot product URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "zara_products": {
+        "dataset_id": "gd_lct4vafw1tgx27d4o0",
+        "description": _build_description(
+            [
+                "Quickly read structured zara product data.",
+                "Requires a valid zara product URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "etsy_products": {
+        "dataset_id": "gd_ltppk0jdv1jqz25mz",
+        "description": _build_description(
+            [
+                "Quickly read structured etsy product data.",
+                "Requires a valid etsy product URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "bestbuy_products": {
+        "dataset_id": "gd_ltre1jqe1jfr7cccf",
+        "description": _build_description(
+            [
+                "Quickly read structured bestbuy product data.",
+                "Requires a valid bestbuy product URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "linkedin_person_profile": {
+        "dataset_id": "gd_l1viktl72bvl7bjuj0",
+        "description": _build_description(
+            [
+                "Quickly read structured linkedin people profile data.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "linkedin_company_profile": {
+        "dataset_id": "gd_l1vikfnt1wgvvqz95w",
+        "description": _build_description(
+            [
+                "Quickly read structured linkedin company profile data.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "linkedin_job_listings": {
+        "dataset_id": "gd_lpfll7v5hcqtkxl6l",
+        "description": _build_description(
+            [
+                "Quickly read structured linkedin job listings data.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "linkedin_posts": {
+        "dataset_id": "gd_lyy3tktm25m4avu764",
+        "description": _build_description(
+            [
+                "Quickly read structured linkedin posts data.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "linkedin_people_search": {
+        "dataset_id": "gd_m8d03he47z8nwb5xc",
+        "description": _build_description(
+            [
+                "Quickly read structured linkedin people search data.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url", "first_name", "last_name"],
+    },
+    "crunchbase_company": {
+        "dataset_id": "gd_l1vijqt9jfj7olije",
+        "description": _build_description(
+            [
+                "Quickly read structured crunchbase company data.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "zoominfo_company_profile": {
+        "dataset_id": "gd_m0ci4a4ivx3j5l6nx",
+        "description": _build_description(
+            [
+                "Quickly read structured ZoomInfo company profile data.",
+                "Requires a valid ZoomInfo company URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "instagram_profiles": {
+        "dataset_id": "gd_l1vikfch901nx3by4",
+        "description": _build_description(
+            [
+                "Quickly read structured Instagram profile data.",
+                "Requires a valid Instagram URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "instagram_posts": {
+        "dataset_id": "gd_lk5ns7kz21pck8jpis",
+        "description": _build_description(
+            [
+                "Quickly read structured Instagram post data.",
+                "Requires a valid Instagram URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "instagram_reels": {
+        "dataset_id": "gd_lyclm20il4r5helnj",
+        "description": _build_description(
+            [
+                "Quickly read structured Instagram reel data.",
+                "Requires a valid Instagram URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "instagram_comments": {
+        "dataset_id": "gd_ltppn085pokosxh13",
+        "description": _build_description(
+            [
+                "Quickly read structured Instagram comments data.",
+                "Requires a valid Instagram URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "facebook_posts": {
+        "dataset_id": "gd_lyclm1571iy3mv57zw",
+        "description": _build_description(
+            [
+                "Quickly read structured Facebook post data.",
+                "Requires a valid Facebook post URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "facebook_marketplace_listings": {
+        "dataset_id": "gd_lvt9iwuh6fbcwmx1a",
+        "description": _build_description(
+            [
+                "Quickly read structured Facebook marketplace listing data.",
+                "Requires a valid Facebook marketplace listing URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "facebook_company_reviews": {
+        "dataset_id": "gd_m0dtqpiu1mbcyc2g86",
+        "description": _build_description(
+            [
+                "Quickly read structured Facebook company reviews data.",
+                "Requires a valid Facebook company URL and number of reviews.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url", "num_of_reviews"],
+    },
+    "facebook_events": {
+        "dataset_id": "gd_m14sd0to1jz48ppm51",
+        "description": _build_description(
+            [
+                "Quickly read structured Facebook events data.",
+                "Requires a valid Facebook event URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "tiktok_profiles": {
+        "dataset_id": "gd_l1villgoiiidt09ci",
+        "description": _build_description(
+            [
+                "Quickly read structured Tiktok profiles data.",
+                "Requires a valid Tiktok profile URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "tiktok_posts": {
+        "dataset_id": "gd_lu702nij2f790tmv9h",
+        "description": _build_description(
+            [
+                "Quickly read structured Tiktok post data.",
+                "Requires a valid Tiktok post URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "tiktok_shop": {
+        "dataset_id": "gd_m45m1u911dsa4274pi",
+        "description": _build_description(
+            [
+                "Quickly read structured Tiktok shop data.",
+                "Requires a valid Tiktok shop product URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "tiktok_comments": {
+        "dataset_id": "gd_lkf2st302ap89utw5k",
+        "description": _build_description(
+            [
+                "Quickly read structured Tiktok comments data.",
+                "Requires a valid Tiktok video URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "google_maps_reviews": {
+        "dataset_id": "gd_luzfs1dn2oa0teb81",
+        "description": _build_description(
+            [
+                "Quickly read structured Google maps reviews data.",
+                "Requires a valid Google maps URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url", "days_limit"],
+        "defaults": {"days_limit": "3"},
+    },
+    "google_shopping": {
+        "dataset_id": "gd_ltppk50q18kdw67omz",
+        "description": _build_description(
+            [
+                "Quickly read structured Google shopping data.",
+                "Requires a valid Google shopping product URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "google_play_store": {
+        "dataset_id": "gd_lsk382l8xei8vzm4u",
+        "description": _build_description(
+            [
+                "Quickly read structured Google play store data.",
+                "Requires a valid Google play store app URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "apple_app_store": {
+        "dataset_id": "gd_lsk9ki3u2iishmwrui",
+        "description": _build_description(
+            [
+                "Quickly read structured apple app store data.",
+                "Requires a valid apple app store app URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "reuter_news": {
+        "dataset_id": "gd_lyptx9h74wtlvpnfu",
+        "description": _build_description(
+            [
+                "Quickly read structured reuter news data.",
+                "Requires a valid reuter news report URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "github_repository_file": {
+        "dataset_id": "gd_lyrexgxc24b3d4imjt",
+        "description": _build_description(
+            [
+                "Quickly read structured github repository data.",
+                "Requires a valid github repository file URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "yahoo_finance_business": {
+        "dataset_id": "gd_lmrpz3vxmz972ghd7",
+        "description": _build_description(
+            [
+                "Quickly read structured yahoo finance business data.",
+                "Requires a valid yahoo finance business URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "x_posts": {
+        "dataset_id": "gd_lwxkxvnf1cynvib9co",
+        "description": _build_description(
+            [
+                "Quickly read structured X post data.",
+                "Requires a valid X post URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "zillow_properties_listing": {
+        "dataset_id": "gd_lfqkr8wm13ixtbd8f5",
+        "description": _build_description(
+            [
+                "Quickly read structured zillow properties listing data.",
+                "Requires a valid zillow properties listing URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "booking_hotel_listings": {
+        "dataset_id": "gd_m5mbdl081229ln6t4a",
+        "description": _build_description(
+            [
+                "Quickly read structured booking hotel listings data.",
+                "Requires a valid booking hotel listing URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "youtube_profiles": {
+        "dataset_id": "gd_lk538t2k2p1k3oos71",
+        "description": _build_description(
+            [
+                "Quickly read structured youtube profiles data.",
+                "Requires a valid youtube profile URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "youtube_comments": {
+        "dataset_id": "gd_lk9q0ew71spt1mxywf",
+        "description": _build_description(
+            [
+                "Quickly read structured youtube comments data.",
+                "Requires a valid youtube video URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url", "num_of_comments"],
+        "defaults": {"num_of_comments": "10"},
+    },
+    "reddit_posts": {
+        "dataset_id": "gd_lvz8ah06191smkebj4",
+        "description": _build_description(
+            [
+                "Quickly read structured reddit posts data.",
+                "Requires a valid reddit post URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+    "youtube_videos": {
+        "dataset_id": "gd_lk56epmy2i5g7lzu0k",
+        "description": _build_description(
+            [
+                "Quickly read structured YouTube videos data.",
+                "Requires a valid YouTube video URL.",
+                "This can be a cache lookup, so it can be more reliable than scraping.",
+            ]
+        ),
+        "inputs": ["url"],
+    },
+}
+class BrightDataDatasetTool(Tool):
+    name = "brightdata_dataset_fetch"
+    description = (
+        "Trigger a Bright Data dataset collection and poll until the snapshot is ready. "
+        "Choose a dataset key (e.g., amazon_product, linkedin_company_profile, google_maps_reviews) "
+        "and pass the required parameters as JSON."
+    )
+    inputs = {
+        "dataset": {
+            "type": "string",
+            "description": f"Dataset key. Options: {', '.join(sorted(DATASETS.keys()))}",
+        },
+        "params_json": {
+            "type": "string",
+            "description": "JSON string with the required inputs for the chosen dataset",
+        },
+    }
+    output_type = "string"
+    def _prepare_payload(self, dataset_key: str, params: Dict[str, Any]) -> Dict[str, Any]:
+        """Validate required fields, apply defaults, and merge fixed values."""
+        config = DATASETS[dataset_key]
+        payload = {}
+        defaults = config.get("defaults", {})
+        fixed_values = config.get("fixed_values", {})
+        for field in config["inputs"]:
+            if field in params:
+                payload[field] = params[field]
+            elif field in defaults:
+                payload[field] = defaults[field]
+            else:
+                raise ValueError(f"Missing required field '{field}' for dataset '{dataset_key}'")
+        # Apply fixed values that should always be sent
+        payload.update(fixed_values)
+        return payload
+    def forward(self, dataset: str, params_json: str) -> str:
+        """
+        Trigger a dataset run and poll until results are ready.
+        Args:
+            dataset: The dataset key from DATASETS.
+            params_json: JSON string containing required inputs for the dataset.
+        Returns:
+            JSON string of the snapshot data once ready.
+        """
+        api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
+        if not api_token:
+            raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")
+        if dataset not in DATASETS:
+            raise ValueError(f"Unknown dataset '{dataset}'. Valid options: {', '.join(sorted(DATASETS.keys()))}")
+        try:
+            params = json.loads(params_json) if params_json else {}
+        except json.JSONDecodeError as exc:
+            raise ValueError(f"params_json is not valid JSON: {exc}") from exc
+        payload = self._prepare_payload(dataset, params)
+        dataset_id = DATASETS[dataset]["dataset_id"]
+        trigger_url = "https://api.brightdata.com/datasets/v3/trigger"
+        trigger_headers = {
+            "Authorization": f"Bearer {api_token}",
+            "Content-Type": "application/json",
+        }
+        trigger_response = requests.post(
+            trigger_url,
+            params={"dataset_id": dataset_id, "include_errors": "true"},
+            json=[payload],
+            headers=trigger_headers,
+            timeout=60,
+        )
+        trigger_response.raise_for_status()
+        snapshot_id = trigger_response.json().get("snapshot_id")
+        if not snapshot_id:
+            raise RuntimeError("No snapshot ID returned from Bright Data.")
+        # Poll for completion (up to 10 minutes, matching MCP logic)
+        snapshot_url = f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}"
+        max_attempts = 600
+        attempts = 0
+        while attempts < max_attempts:
+            try:
+                response = requests.get(
+                    snapshot_url,
+                    params={"format": "json"},
+                    headers={"Authorization": f"Bearer {api_token}"},
+                    timeout=30,
+                )
+                # If Bright Data returns an error response we don't want to loop forever
+                if response.status_code == 400:
+                    response.raise_for_status()
+                data = response.json()
+                if isinstance(data, list):
+                    return json.dumps(data, indent=2)
+                status = data.get("status") if isinstance(data, dict) else None
+                if status not in {"running", "building"}:
+                    return json.dumps(data, indent=2)
+                attempts += 1
+                time.sleep(1)
+            except requests.exceptions.RequestException as exc:
+                # Mirror JS logic: tolerate transient failures, but break on 400
+                if getattr(getattr(exc, "response", None), "status_code", None) == 400:
+                    raise
+                attempts += 1
+                time.sleep(1)
+        raise TimeoutError(f"Timeout waiting for snapshot {snapshot_id} after {max_attempts} seconds")

brightdata_scraper.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from smolagents import Tool
+import requests
+import os
+from dotenv import load_dotenv
+# Load environment variables from .env if present
+load_dotenv()
+class BrightDataScraperTool(Tool):
+    name = "brightdata_web_scraper"
+    description = """
+    Scrape any webpage and return content in Markdown format.
+    This tool can bypass bot detection and CAPTCHAs.
+    Use this when you need to extract content from websites.
+    """
+    inputs = {
+        "url": {
+            "type": "string",
+            "description": "The URL of the webpage to scrape",
+        }
+    }
+    output_type = "string"
+    def forward(self, url: str) -> str:
+        """
+        Scrape a webpage using Bright Data's API.
+        Args:
+            url: The URL to scrape
+        Returns:
+            The scraped content in Markdown format
+        """
+        api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
+        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker_1")
+        if not api_token:
+            raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")
+        api_url = "https://api.brightdata.com/request"
+        headers = {
+            "Authorization": f"Bearer {api_token}",
+            "Content-Type": "application/json",
+        }
+        payload = {
+            "url": url,
+            "zone": unlocker_zone,
+            "format": "raw",
+            "data_format": "markdown",
+        }
+        try:
+            response = requests.post(api_url, json=payload, headers=headers)
+            response.raise_for_status()
+            return response.text
+        except requests.exceptions.RequestException as e:
+            return f"Error scraping URL: {str(e)}"

brightdata_search.py ADDED Viewed

	@@ -0,0 +1,91 @@

+from smolagents import Tool
+import requests
+import json
+import os
+from dotenv import load_dotenv
+# Load environment variables from .env if present
+load_dotenv()
+class BrightDataSearchTool(Tool):
+    name = "brightdata_search_engine"
+    description = """
+    Search Google, Bing, or Yandex and get structured results.
+    Returns search results with URLs, titles, and descriptions.
+    Ideal for gathering current information and news.
+    """
+    inputs = {
+        "query": {
+            "type": "string",
+            "description": "The search query",
+        },
+        "engine": {
+            "type": "string",
+            "description": "Search engine to use: 'google', 'bing', or 'yandex'. Default is 'google'",
+            "nullable": True,
+            "default": "google",
+        },
+    }
+    output_type = "string"
+    def forward(self, query: str, engine: str = "google") -> str:
+        """
+        Search using Bright Data's search API.
+        Args:
+            query: The search query.
+            engine: Search engine to use (google, bing, or yandex).
+        Returns:
+            JSON string with search results or markdown for non-Google engines.
+        """
+        api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
+        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker1")
+        if not api_token:
+            raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")
+        search_urls = {
+            "google": f"https://www.google.com/search?q={requests.utils.quote(query)}&brd_json=1",
+            "bing": f"https://www.bing.com/search?q={requests.utils.quote(query)}",
+            "yandex": f"https://yandex.com/search/?text={requests.utils.quote(query)}",
+        }
+        search_url = search_urls.get(engine.lower(), search_urls["google"])
+        is_google = engine.lower() == "google"
+        api_url = "https://api.brightdata.com/request"
+        headers = {
+            "Authorization": f"Bearer {api_token}",
+            "Content-Type": "application/json",
+        }
+        payload = {
+            "url": search_url,
+            "zone": unlocker_zone,
+            "format": "raw",
+        }
+        if not is_google:
+            payload["data_format"] = "markdown"
+        try:
+            response = requests.post(api_url, json=payload, headers=headers)
+            response.raise_for_status()
+            if is_google:
+                data = response.json()
+                results = {
+                    "organic": data.get("organic", []),
+                    "images": [img.get("link") for img in data.get("images", [])],
+                    "related": data.get("related", []),
+                    "ai_overview": data.get("ai_overview"),
+                }
+                return json.dumps(results, indent=2)
+            # Return markdown for Bing/Yandex
+            return response.text
+        except requests.exceptions.RequestException as e:
+            return json.dumps({"error": str(e)})

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+smolagents>=1.0.0
+huggingface_hub>=0.20.0
+requests>=2.31.0
+python-dotenv>=1.0.0
+gradio>=4.0.0

test_datasets.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import json
+from brightdata_datasets import BrightDataDatasetTool
+def main():
+    dataset_tool = BrightDataDatasetTool()
+    # Example dataset and params; change these as needed for quick manual testing.
+    dataset_key = "google_maps_reviews"
+    params = {
+        "url": "https://www.google.com/maps/place/Google+Sydney+-+Pirrama+Road/@-33.866489,151.1958561,17z/data=!4m8!3m7!1s0x6b12ae37b47f5b37:0x8eaddfcd1b32ca52!8m2!3d-33.866489!4d151.1958561!9m1!1b1!16s%2Fg%2F1td76qvq?entry=ttu&g_ep=EgoyMDI1MTIwMi4wIKXMDSoASAFQAw%3D%3D",
+        "days_limit": "3",
+    }
+    result = dataset_tool.forward(dataset_key, json.dumps(params))
+    print("Dataset response keys / status:")
+    try:
+        parsed = json.loads(result)
+    except json.JSONDecodeError:
+        print("Non-JSON response, raw output (first 2000 chars):")
+        print(result[:2000])
+        return
+    # Response can be a bare list or a dict depending on dataset.
+    if isinstance(parsed, list):
+        print(f"Top-level type: list; items: {len(parsed)}")
+        if parsed:
+            print("First item sample:")
+            print(json.dumps(parsed[0], indent=2)[:1000])
+        return
+    print(f"Top-level keys: {list(parsed.keys())}")
+    items = parsed.get("items") or parsed.get("data") or parsed.get("records") or parsed.get("result")
+    if isinstance(items, list):
+        print(f"Items count: {len(items)}")
+        if items:
+            print("First item sample:")
+            print(json.dumps(items[0], indent=2)[:1000])
+    else:
+        print("No iterable items found. Raw JSON (first 2000 chars):")
+        print(json.dumps(parsed, indent=2)[:2000])
+if __name__ == "__main__":
+    main()

test_scraper.py ADDED Viewed

	@@ -0,0 +1,14 @@

+from brightdata_scraper import BrightDataScraperTool
+def main():
+    scraper = BrightDataScraperTool()
+    url = "https://en.wikipedia.org/wiki/Meir_Kadosh"
+    result = scraper.forward(url)
+    print("Scraped Content (first 500 chars):")
+    print(result)
+if __name__ == "__main__":
+    main()

test_search.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from brightdata_search import BrightDataSearchTool
+import json
+def main():
+    search_tool = BrightDataSearchTool()
+    query = "Python programming tutorials"
+    result = search_tool.forward(query, engine="google")
+    print("Search Results (Google) summary:")
+    parsed = json.loads(result)
+    organic = parsed.get("organic", [])
+    print(f"Found {len(organic)} organic results")
+    print(organic)
+if __name__ == "__main__":
+    main()