Spaces:

BrightData
/

brightdata-scraper-tool

Sleeping

meirk-brd committed 16 days ago

Commit

1a9158e

1 Parent(s): 324e5f6

refactor

Files changed (2) hide show

app.py CHANGED Viewed

@@ -1,5 +1,14 @@
 from smolagents import launch_gradio_demo
 from tool import BrightDataScraperTool
-tool = BrightDataScraperTool()
-launch_gradio_demo(tool)

+from __future__ import annotations
 from smolagents import launch_gradio_demo
 from tool import BrightDataScraperTool
+def main() -> None:
+    tool = BrightDataScraperTool()
+    launch_gradio_demo(tool)
+if __name__ == "__main__":
+    main()

tool.py CHANGED Viewed

@@ -1,33 +1,33 @@
-from typing import Any, Optional
-from smolagents.tools import Tool
 import os
 import requests
 class BrightDataScraperTool(Tool):
-    name = "brightdata_web_unlocker"
     description = """
     Scrape any webpage and return content in Markdown format.
     This tool can bypass bot detection and CAPTCHAs.
     Use this when you need to extract content from websites.
     """
-    inputs = {'url': {'type': 'string', 'description': 'The URL of the webpage to scrape'}}
     output_type = "string"
-    def forward(self, url: str) -> str:
-        """
-        Scrape a webpage using Bright Data's API.
-        Args:
-            url: The URL to scrape
-        Returns:
-            The scraped content in Markdown format
-        """
-        import os
-        import requests
         api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
-        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker_1")
         if not api_token:
             raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")
@@ -46,11 +46,8 @@ class BrightDataScraperTool(Tool):
         }
         try:
-            response = requests.post(api_url, json=payload, headers=headers)
             response.raise_for_status()
             return response.text
-        except requests.exceptions.RequestException as e:
-            return f"Error scraping URL: {str(e)}"
-    def __init__(self, *args, **kwargs):
-        self.is_initialized = False

+from __future__ import annotations
+import json
 import os
 import requests
+from smolagents.tools import Tool
 class BrightDataScraperTool(Tool):
+    name = "brightdata_web_scraper"
     description = """
     Scrape any webpage and return content in Markdown format.
     This tool can bypass bot detection and CAPTCHAs.
     Use this when you need to extract content from websites.
     """
     output_type = "string"
+    def __init__(self) -> None:
+        self.inputs = {
+            "url": {
+                "type": "string",
+                "description": "The URL of the webpage to scrape",
+            }
+        }
+        super().__init__()
+    def forward(self, url: str) -> str:
         api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
+        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker1")
         if not api_token:
             raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")
         }
         try:
+            response = requests.post(api_url, json=payload, headers=headers, timeout=30)
             response.raise_for_status()
             return response.text
+        except requests.exceptions.RequestException as exc:
+            return json.dumps({"error": str(exc)})