meirk-brd committed on
Commit
1a9158e
·
1 Parent(s): 324e5f6
Files changed (2) hide show
  1. app.py +11 -2
  2. tool.py +20 -23
app.py CHANGED
@@ -1,5 +1,14 @@
 
 
1
  from smolagents import launch_gradio_demo
 
2
  from tool import BrightDataScraperTool
3
 
4
- tool = BrightDataScraperTool()
5
- launch_gradio_demo(tool)
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
  from smolagents import launch_gradio_demo
4
+
5
  from tool import BrightDataScraperTool
6
 
7
+
8
+ def main() -> None:
9
+ tool = BrightDataScraperTool()
10
+ launch_gradio_demo(tool)
11
+
12
+
13
+ if __name__ == "__main__":
14
+ main()
tool.py CHANGED
@@ -1,33 +1,33 @@
1
- from typing import Any, Optional
2
- from smolagents.tools import Tool
 
3
  import os
 
4
  import requests
 
 
5
 
6
  class BrightDataScraperTool(Tool):
7
- name = "brightdata_web_unlocker"
8
  description = """
9
  Scrape any webpage and return content in Markdown format.
10
  This tool can bypass bot detection and CAPTCHAs.
11
  Use this when you need to extract content from websites.
12
  """
13
- inputs = {'url': {'type': 'string', 'description': 'The URL of the webpage to scrape'}}
14
  output_type = "string"
15
 
16
- def forward(self, url: str) -> str:
17
- """
18
- Scrape a webpage using Bright Data's API.
19
-
20
- Args:
21
- url: The URL to scrape
22
-
23
- Returns:
24
- The scraped content in Markdown format
25
- """
26
- import os
27
- import requests
28
 
 
29
  api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
30
- unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker_1")
31
 
32
  if not api_token:
33
  raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")
@@ -46,11 +46,8 @@ class BrightDataScraperTool(Tool):
46
  }
47
 
48
  try:
49
- response = requests.post(api_url, json=payload, headers=headers)
50
  response.raise_for_status()
51
  return response.text
52
- except requests.exceptions.RequestException as e:
53
- return f"Error scraping URL: {str(e)}"
54
-
55
- def __init__(self, *args, **kwargs):
56
- self.is_initialized = False
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
  import os
5
+
6
  import requests
7
+ from smolagents.tools import Tool
8
+
9
 
10
  class BrightDataScraperTool(Tool):
11
+ name = "brightdata_web_scraper"
12
  description = """
13
  Scrape any webpage and return content in Markdown format.
14
  This tool can bypass bot detection and CAPTCHAs.
15
  Use this when you need to extract content from websites.
16
  """
 
17
  output_type = "string"
18
 
19
+ def __init__(self) -> None:
20
+ self.inputs = {
21
+ "url": {
22
+ "type": "string",
23
+ "description": "The URL of the webpage to scrape",
24
+ }
25
+ }
26
+ super().__init__()
 
 
 
 
27
 
28
+ def forward(self, url: str) -> str:
29
  api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
30
+ unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker1")
31
 
32
  if not api_token:
33
  raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")
 
46
  }
47
 
48
  try:
49
+ response = requests.post(api_url, json=payload, headers=headers, timeout=30)
50
  response.raise_for_status()
51
  return response.text
52
+ except requests.exceptions.RequestException as exc:
53
+ return json.dumps({"error": str(exc)})