Commit 0397cdb · 1 Parent(s): abb4f97
meirk-brd committed: Merge remote README with local changes
.gitignore ADDED
@@ -0,0 +1,5 @@
+ .env
+ venv/
+ __pycache__/
+ *.pyc
+ .DS_Store
README.md CHANGED
@@ -1,12 +1,29 @@
  ---
- title: Brightdata Ai Agent
- emoji: 🦀
- colorFrom: gray
- colorTo: pink
+ title: Bright Data AI Agent
+ emoji: 🌐
+ colorFrom: blue
+ colorTo: green
  sdk: gradio
  sdk_version: 6.0.2
  app_file: app.py
  pinned: false
+ license: apache-2.0
  ---

+ # Bright Data AI Agent
+
+ An AI agent powered by Bright Data APIs for web scraping and search.
+
+ ## Features
+
+ - **Web Search**: Search Google, Bing, or Yandex
+ - **Web Scraping**: Extract content from any webpage
+ - **Bot Protection Bypass**: Automatically handles CAPTCHAs and bot detection
+
+ ## Setup
+
+ Set the following secrets in your Space settings:
+ - `BRIGHT_DATA_API_TOKEN`: Your Bright Data API token
+ - `BRIGHT_DATA_UNLOCKER_ZONE`: Your unlocker zone name (default: web_unlocker1)
+
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
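For local runs outside a Space, the same two secrets can live in a `.env` file (excluded from version control by the new `.gitignore`); the tool modules call `load_dotenv()`, so they pick the values up automatically. A minimal sketch with placeholder values (not real credentials):

```python
# Local-run sketch: the same secrets can sit in a .env file next to the code.
# Example .env contents (placeholders):
#   BRIGHT_DATA_API_TOKEN=your-bright-data-api-token
#   BRIGHT_DATA_UNLOCKER_ZONE=web_unlocker1
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env into the process environment, if the file exists
print("token set:", bool(os.getenv("BRIGHT_DATA_API_TOKEN")))
```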
app.py ADDED
@@ -0,0 +1,75 @@
+ import gradio as gr
+ from smolagents import CodeAgent
+ from smolagents.models import InferenceClientModel
+ from brightdata_scraper import BrightDataScraperTool
+ from brightdata_search import BrightDataSearchTool
+ from brightdata_datasets import BrightDataDatasetTool
+
+ # Initialize tools
+ scraper_tool = BrightDataScraperTool()
+ search_tool = BrightDataSearchTool()
+ dataset_tool = BrightDataDatasetTool()
+
+ # Initialize the agent with a Hugging Face Inference model
+ # Requires HF_TOKEN in the environment for authentication.
+ model = InferenceClientModel(model_id="deepseek-ai/DeepSeek-V3.2")
+
+ agent = CodeAgent(
+     tools=[scraper_tool, search_tool, dataset_tool],
+     model=model,
+     add_base_tools=True,
+     max_steps=4,
+     instructions="Answer with the first satisfactory result; do not call the same tool repeatedly once you have the needed data. Use final_answer() as soon as you can."
+ )
+
+
+ def run_agent(task: str) -> str:
+     """Run the agent with the given task."""
+     try:
+         result = agent.run(task)
+         return str(result)
+     except Exception as e:
+         return f"Error: {str(e)}"
+
+
+ # Create Gradio interface
+ with gr.Blocks(title="Bright Data AI Agent") as demo:
+     gr.Markdown("# Bright Data AI Agent")
+     gr.Markdown(
+         """
+         This agent can help you with web scraping, search, and quick access to Bright Data datasets.
+
+         **Available capabilities:**
+         - Search Google, Bing, or Yandex
+         - Scrape any webpage (bypasses bot detection)
+         - Read structured data from 40+ prebuilt datasets (e.g., amazon_product, google_maps_reviews, linkedin_company_profile)
+
+         **Example tasks:**
+         - "Search for recent AI news on Google"
+         - "Scrape the content from https://example.com"
+         - "Fetch google_maps_reviews for this place URL with the last 7 days"
+         """
+     )
+
+     with gr.Row():
+         with gr.Column():
+             task_input = gr.Textbox(label="Task", placeholder="Enter your task here...", lines=3)
+             submit_btn = gr.Button("Run Agent", variant="primary")
+
+         with gr.Column():
+             output = gr.Textbox(label="Result", lines=15, max_lines=30)
+
+     submit_btn.click(fn=run_agent, inputs=[task_input], outputs=[output])
+
+     gr.Examples(
+         examples=[
+             ["Search for 'latest developments in AI' on Google"],
+             ["Scrape the content from https://example.com"],
+             ["What are the top Python programming tutorials?"],
+         ],
+         inputs=[task_input],
+     )
+
+
+ if __name__ == "__main__":
+     demo.launch()
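The Gradio UI is only a thin wrapper around `run_agent()`, so the agent can also be exercised from a plain Python session. A minimal sketch, assuming `HF_TOKEN` and `BRIGHT_DATA_API_TOKEN` are set in the environment (importing `app` builds the agent but does not launch the UI, since `demo.launch()` sits under the `__main__` guard):

```python
# Sketch: call the agent directly, without starting the Gradio server.
from app import run_agent

print(run_agent("Search for recent AI news on Google"))
```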
brightdata_datasets.py ADDED
@@ -0,0 +1,615 @@
+ from smolagents import Tool
+ import json
+ import os
+ import time
+ import requests
+ from typing import Dict, Any
+ from dotenv import load_dotenv
+
+ # Load environment variables from .env if present
+ load_dotenv()
+
+
+ def _build_description(description_lines):
+     """Join multiline descriptions defined as lists."""
+     return "\n".join(description_lines)
+
+
+ # Dataset catalogue mirrored from the MCP implementation (JS version).
+ # Each entry defines the dataset_id, the required inputs, optional defaults,
+ # and optional fixed values that are injected automatically.
+ DATASETS: Dict[str, Dict[str, Any]] = {
+     "amazon_product": {
+         "dataset_id": "gd_l7q7dkf244hwjntr0",
+         "description": _build_description(
+             [
+                 "Quickly read structured amazon product data.",
+                 "Requires a valid product URL with /dp/ in it.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "amazon_product_reviews": {
+         "dataset_id": "gd_le8e811kzy4ggddlq",
+         "description": _build_description(
+             [
+                 "Quickly read structured amazon product review data.",
+                 "Requires a valid product URL with /dp/ in it.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "amazon_product_search": {
+         "dataset_id": "gd_lwdb4vjm1ehb499uxs",
+         "description": _build_description(
+             [
+                 "Quickly read structured amazon product search data.",
+                 "Requires a valid search keyword and amazon domain URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["keyword", "url"],
+         "fixed_values": {"pages_to_search": "1"},
+     },
+     "walmart_product": {
+         "dataset_id": "gd_l95fol7l1ru6rlo116",
+         "description": _build_description(
+             [
+                 "Quickly read structured walmart product data.",
+                 "Requires a valid product URL with /ip/ in it.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "walmart_seller": {
+         "dataset_id": "gd_m7ke48w81ocyu4hhz0",
+         "description": _build_description(
+             [
+                 "Quickly read structured walmart seller data.",
+                 "Requires a valid walmart seller URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "ebay_product": {
+         "dataset_id": "gd_ltr9mjt81n0zzdk1fb",
+         "description": _build_description(
+             [
+                 "Quickly read structured ebay product data.",
+                 "Requires a valid ebay product URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "homedepot_products": {
+         "dataset_id": "gd_lmusivh019i7g97q2n",
+         "description": _build_description(
+             [
+                 "Quickly read structured homedepot product data.",
+                 "Requires a valid homedepot product URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "zara_products": {
+         "dataset_id": "gd_lct4vafw1tgx27d4o0",
+         "description": _build_description(
+             [
+                 "Quickly read structured zara product data.",
+                 "Requires a valid zara product URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "etsy_products": {
+         "dataset_id": "gd_ltppk0jdv1jqz25mz",
+         "description": _build_description(
+             [
+                 "Quickly read structured etsy product data.",
+                 "Requires a valid etsy product URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "bestbuy_products": {
+         "dataset_id": "gd_ltre1jqe1jfr7cccf",
+         "description": _build_description(
+             [
+                 "Quickly read structured bestbuy product data.",
+                 "Requires a valid bestbuy product URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "linkedin_person_profile": {
+         "dataset_id": "gd_l1viktl72bvl7bjuj0",
+         "description": _build_description(
+             [
+                 "Quickly read structured linkedin people profile data.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "linkedin_company_profile": {
+         "dataset_id": "gd_l1vikfnt1wgvvqz95w",
+         "description": _build_description(
+             [
+                 "Quickly read structured linkedin company profile data.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "linkedin_job_listings": {
+         "dataset_id": "gd_lpfll7v5hcqtkxl6l",
+         "description": _build_description(
+             [
+                 "Quickly read structured linkedin job listings data.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "linkedin_posts": {
+         "dataset_id": "gd_lyy3tktm25m4avu764",
+         "description": _build_description(
+             [
+                 "Quickly read structured linkedin posts data.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "linkedin_people_search": {
+         "dataset_id": "gd_m8d03he47z8nwb5xc",
+         "description": _build_description(
+             [
+                 "Quickly read structured linkedin people search data.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url", "first_name", "last_name"],
+     },
+     "crunchbase_company": {
+         "dataset_id": "gd_l1vijqt9jfj7olije",
+         "description": _build_description(
+             [
+                 "Quickly read structured crunchbase company data.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "zoominfo_company_profile": {
+         "dataset_id": "gd_m0ci4a4ivx3j5l6nx",
+         "description": _build_description(
+             [
+                 "Quickly read structured ZoomInfo company profile data.",
+                 "Requires a valid ZoomInfo company URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "instagram_profiles": {
+         "dataset_id": "gd_l1vikfch901nx3by4",
+         "description": _build_description(
+             [
+                 "Quickly read structured Instagram profile data.",
+                 "Requires a valid Instagram URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "instagram_posts": {
+         "dataset_id": "gd_lk5ns7kz21pck8jpis",
+         "description": _build_description(
+             [
+                 "Quickly read structured Instagram post data.",
+                 "Requires a valid Instagram URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "instagram_reels": {
+         "dataset_id": "gd_lyclm20il4r5helnj",
+         "description": _build_description(
+             [
+                 "Quickly read structured Instagram reel data.",
+                 "Requires a valid Instagram URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "instagram_comments": {
+         "dataset_id": "gd_ltppn085pokosxh13",
+         "description": _build_description(
+             [
+                 "Quickly read structured Instagram comments data.",
+                 "Requires a valid Instagram URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "facebook_posts": {
+         "dataset_id": "gd_lyclm1571iy3mv57zw",
+         "description": _build_description(
+             [
+                 "Quickly read structured Facebook post data.",
+                 "Requires a valid Facebook post URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "facebook_marketplace_listings": {
+         "dataset_id": "gd_lvt9iwuh6fbcwmx1a",
+         "description": _build_description(
+             [
+                 "Quickly read structured Facebook marketplace listing data.",
+                 "Requires a valid Facebook marketplace listing URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "facebook_company_reviews": {
+         "dataset_id": "gd_m0dtqpiu1mbcyc2g86",
+         "description": _build_description(
+             [
+                 "Quickly read structured Facebook company reviews data.",
+                 "Requires a valid Facebook company URL and number of reviews.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url", "num_of_reviews"],
+     },
+     "facebook_events": {
+         "dataset_id": "gd_m14sd0to1jz48ppm51",
+         "description": _build_description(
+             [
+                 "Quickly read structured Facebook events data.",
+                 "Requires a valid Facebook event URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "tiktok_profiles": {
+         "dataset_id": "gd_l1villgoiiidt09ci",
+         "description": _build_description(
+             [
+                 "Quickly read structured Tiktok profiles data.",
+                 "Requires a valid Tiktok profile URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "tiktok_posts": {
+         "dataset_id": "gd_lu702nij2f790tmv9h",
+         "description": _build_description(
+             [
+                 "Quickly read structured Tiktok post data.",
+                 "Requires a valid Tiktok post URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "tiktok_shop": {
+         "dataset_id": "gd_m45m1u911dsa4274pi",
+         "description": _build_description(
+             [
+                 "Quickly read structured Tiktok shop data.",
+                 "Requires a valid Tiktok shop product URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "tiktok_comments": {
+         "dataset_id": "gd_lkf2st302ap89utw5k",
+         "description": _build_description(
+             [
+                 "Quickly read structured Tiktok comments data.",
+                 "Requires a valid Tiktok video URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "google_maps_reviews": {
+         "dataset_id": "gd_luzfs1dn2oa0teb81",
+         "description": _build_description(
+             [
+                 "Quickly read structured Google maps reviews data.",
+                 "Requires a valid Google maps URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url", "days_limit"],
+         "defaults": {"days_limit": "3"},
+     },
+     "google_shopping": {
+         "dataset_id": "gd_ltppk50q18kdw67omz",
+         "description": _build_description(
+             [
+                 "Quickly read structured Google shopping data.",
+                 "Requires a valid Google shopping product URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "google_play_store": {
+         "dataset_id": "gd_lsk382l8xei8vzm4u",
+         "description": _build_description(
+             [
+                 "Quickly read structured Google play store data.",
+                 "Requires a valid Google play store app URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "apple_app_store": {
+         "dataset_id": "gd_lsk9ki3u2iishmwrui",
+         "description": _build_description(
+             [
+                 "Quickly read structured apple app store data.",
+                 "Requires a valid apple app store app URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "reuter_news": {
+         "dataset_id": "gd_lyptx9h74wtlvpnfu",
+         "description": _build_description(
+             [
+                 "Quickly read structured reuter news data.",
+                 "Requires a valid reuter news report URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "github_repository_file": {
+         "dataset_id": "gd_lyrexgxc24b3d4imjt",
+         "description": _build_description(
+             [
+                 "Quickly read structured github repository data.",
+                 "Requires a valid github repository file URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "yahoo_finance_business": {
+         "dataset_id": "gd_lmrpz3vxmz972ghd7",
+         "description": _build_description(
+             [
+                 "Quickly read structured yahoo finance business data.",
+                 "Requires a valid yahoo finance business URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "x_posts": {
+         "dataset_id": "gd_lwxkxvnf1cynvib9co",
+         "description": _build_description(
+             [
+                 "Quickly read structured X post data.",
+                 "Requires a valid X post URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "zillow_properties_listing": {
+         "dataset_id": "gd_lfqkr8wm13ixtbd8f5",
+         "description": _build_description(
+             [
+                 "Quickly read structured zillow properties listing data.",
+                 "Requires a valid zillow properties listing URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "booking_hotel_listings": {
+         "dataset_id": "gd_m5mbdl081229ln6t4a",
+         "description": _build_description(
+             [
+                 "Quickly read structured booking hotel listings data.",
+                 "Requires a valid booking hotel listing URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "youtube_profiles": {
+         "dataset_id": "gd_lk538t2k2p1k3oos71",
+         "description": _build_description(
+             [
+                 "Quickly read structured youtube profiles data.",
+                 "Requires a valid youtube profile URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "youtube_comments": {
+         "dataset_id": "gd_lk9q0ew71spt1mxywf",
+         "description": _build_description(
+             [
+                 "Quickly read structured youtube comments data.",
+                 "Requires a valid youtube video URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url", "num_of_comments"],
+         "defaults": {"num_of_comments": "10"},
+     },
+     "reddit_posts": {
+         "dataset_id": "gd_lvz8ah06191smkebj4",
+         "description": _build_description(
+             [
+                 "Quickly read structured reddit posts data.",
+                 "Requires a valid reddit post URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+     "youtube_videos": {
+         "dataset_id": "gd_lk56epmy2i5g7lzu0k",
+         "description": _build_description(
+             [
+                 "Quickly read structured YouTube videos data.",
+                 "Requires a valid YouTube video URL.",
+                 "This can be a cache lookup, so it can be more reliable than scraping.",
+             ]
+         ),
+         "inputs": ["url"],
+     },
+ }
+
+
+ class BrightDataDatasetTool(Tool):
+     name = "brightdata_dataset_fetch"
+     description = (
+         "Trigger a Bright Data dataset collection and poll until the snapshot is ready. "
+         "Choose a dataset key (e.g., amazon_product, linkedin_company_profile, google_maps_reviews) "
+         "and pass the required parameters as JSON."
+     )
+     inputs = {
+         "dataset": {
+             "type": "string",
+             "description": f"Dataset key. Options: {', '.join(sorted(DATASETS.keys()))}",
+         },
+         "params_json": {
+             "type": "string",
+             "description": "JSON string with the required inputs for the chosen dataset",
+         },
+     }
+     output_type = "string"
+
+     def _prepare_payload(self, dataset_key: str, params: Dict[str, Any]) -> Dict[str, Any]:
+         """Validate required fields, apply defaults, and merge fixed values."""
+         config = DATASETS[dataset_key]
+         payload = {}
+
+         defaults = config.get("defaults", {})
+         fixed_values = config.get("fixed_values", {})
+
+         for field in config["inputs"]:
+             if field in params:
+                 payload[field] = params[field]
+             elif field in defaults:
+                 payload[field] = defaults[field]
+             else:
+                 raise ValueError(f"Missing required field '{field}' for dataset '{dataset_key}'")
+
+         # Apply fixed values that should always be sent
+         payload.update(fixed_values)
+         return payload
+
+     def forward(self, dataset: str, params_json: str) -> str:
+         """
+         Trigger a dataset run and poll until results are ready.
+
+         Args:
+             dataset: The dataset key from DATASETS.
+             params_json: JSON string containing required inputs for the dataset.
+
+         Returns:
+             JSON string of the snapshot data once ready.
+         """
+         api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
+         if not api_token:
+             raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")
+
+         if dataset not in DATASETS:
+             raise ValueError(f"Unknown dataset '{dataset}'. Valid options: {', '.join(sorted(DATASETS.keys()))}")
+
+         try:
+             params = json.loads(params_json) if params_json else {}
+         except json.JSONDecodeError as exc:
+             raise ValueError(f"params_json is not valid JSON: {exc}") from exc
+
+         payload = self._prepare_payload(dataset, params)
+         dataset_id = DATASETS[dataset]["dataset_id"]
+
+         trigger_url = "https://api.brightdata.com/datasets/v3/trigger"
+         trigger_headers = {
+             "Authorization": f"Bearer {api_token}",
+             "Content-Type": "application/json",
+         }
+
+         trigger_response = requests.post(
+             trigger_url,
+             params={"dataset_id": dataset_id, "include_errors": "true"},
+             json=[payload],
+             headers=trigger_headers,
+             timeout=60,
+         )
+         trigger_response.raise_for_status()
+         snapshot_id = trigger_response.json().get("snapshot_id")
+
+         if not snapshot_id:
+             raise RuntimeError("No snapshot ID returned from Bright Data.")
+
+         # Poll for completion (up to 10 minutes, matching MCP logic)
+         snapshot_url = f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}"
+         max_attempts = 600
+         attempts = 0
+
+         while attempts < max_attempts:
+             try:
+                 response = requests.get(
+                     snapshot_url,
+                     params={"format": "json"},
+                     headers={"Authorization": f"Bearer {api_token}"},
+                     timeout=30,
+                 )
+
+                 # If Bright Data returns an error response we don't want to loop forever
+                 if response.status_code == 400:
+                     response.raise_for_status()
+
+                 data = response.json()
+                 if isinstance(data, list):
+                     return json.dumps(data, indent=2)
+
+                 status = data.get("status") if isinstance(data, dict) else None
+                 if status not in {"running", "building"}:
+                     return json.dumps(data, indent=2)
+
+                 attempts += 1
+                 time.sleep(1)
+
+             except requests.exceptions.RequestException as exc:
+                 # Mirror JS logic: tolerate transient failures, but break on 400
+                 if getattr(getattr(exc, "response", None), "status_code", None) == 400:
+                     raise
+                 attempts += 1
+                 time.sleep(1)
+
+         raise TimeoutError(f"Timeout waiting for snapshot {snapshot_id} after {max_attempts} seconds")
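`_prepare_payload()` is where each dataset's `defaults` and `fixed_values` come into play, and it makes no API call, so it can be checked offline. A minimal sketch with hypothetical inputs (the place URL and keyword are placeholders):

```python
# Offline sketch: only payload validation and merging, no network traffic.
from brightdata_datasets import BrightDataDatasetTool

tool = BrightDataDatasetTool()

# google_maps_reviews falls back to the default days_limit="3".
print(tool._prepare_payload("google_maps_reviews", {"url": "https://www.google.com/maps/place/..."}))

# amazon_product_search injects the fixed pages_to_search="1" value.
print(tool._prepare_payload("amazon_product_search", {"keyword": "usb hub", "url": "https://www.amazon.com"}))
```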
brightdata_scraper.py ADDED
@@ -0,0 +1,59 @@
+ from smolagents import Tool
+ import requests
+ import os
+ from dotenv import load_dotenv
+
+ # Load environment variables from .env if present
+ load_dotenv()
+
+
+ class BrightDataScraperTool(Tool):
+     name = "brightdata_web_scraper"
+     description = """
+     Scrape any webpage and return content in Markdown format.
+     This tool can bypass bot detection and CAPTCHAs.
+     Use this when you need to extract content from websites.
+     """
+     inputs = {
+         "url": {
+             "type": "string",
+             "description": "The URL of the webpage to scrape",
+         }
+     }
+     output_type = "string"
+
+     def forward(self, url: str) -> str:
+         """
+         Scrape a webpage using Bright Data's API.
+
+         Args:
+             url: The URL to scrape
+
+         Returns:
+             The scraped content in Markdown format
+         """
+         api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
+         unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker1")
+
+         if not api_token:
+             raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")
+
+         api_url = "https://api.brightdata.com/request"
+         headers = {
+             "Authorization": f"Bearer {api_token}",
+             "Content-Type": "application/json",
+         }
+
+         payload = {
+             "url": url,
+             "zone": unlocker_zone,
+             "format": "raw",
+             "data_format": "markdown",
+         }
+
+         try:
+             response = requests.post(api_url, json=payload, headers=headers, timeout=60)
+             response.raise_for_status()
+             return response.text
+         except requests.exceptions.RequestException as e:
+             return f"Error scraping URL: {str(e)}"
brightdata_search.py ADDED
@@ -0,0 +1,91 @@
+ from smolagents import Tool
+ import requests
+ import json
+ import os
+ from dotenv import load_dotenv
+
+ # Load environment variables from .env if present
+ load_dotenv()
+
+
+ class BrightDataSearchTool(Tool):
+     name = "brightdata_search_engine"
+     description = """
+     Search Google, Bing, or Yandex and get structured results.
+     Returns search results with URLs, titles, and descriptions.
+     Ideal for gathering current information and news.
+     """
+     inputs = {
+         "query": {
+             "type": "string",
+             "description": "The search query",
+         },
+         "engine": {
+             "type": "string",
+             "description": "Search engine to use: 'google', 'bing', or 'yandex'. Default is 'google'",
+             "nullable": True,
+             "default": "google",
+         },
+     }
+     output_type = "string"
+
+     def forward(self, query: str, engine: str = "google") -> str:
+         """
+         Search using Bright Data's search API.
+
+         Args:
+             query: The search query.
+             engine: Search engine to use (google, bing, or yandex).
+
+         Returns:
+             JSON string with search results or markdown for non-Google engines.
+         """
+         api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
+         unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker1")
+
+         if not api_token:
+             raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")
+
+         search_urls = {
+             "google": f"https://www.google.com/search?q={requests.utils.quote(query)}&brd_json=1",
+             "bing": f"https://www.bing.com/search?q={requests.utils.quote(query)}",
+             "yandex": f"https://yandex.com/search/?text={requests.utils.quote(query)}",
+         }
+
+         search_url = search_urls.get(engine.lower(), search_urls["google"])
+         is_google = engine.lower() == "google"
+
+         api_url = "https://api.brightdata.com/request"
+         headers = {
+             "Authorization": f"Bearer {api_token}",
+             "Content-Type": "application/json",
+         }
+
+         payload = {
+             "url": search_url,
+             "zone": unlocker_zone,
+             "format": "raw",
+         }
+
+         if not is_google:
+             payload["data_format"] = "markdown"
+
+         try:
+             response = requests.post(api_url, json=payload, headers=headers, timeout=60)
+             response.raise_for_status()
+
+             if is_google:
+                 data = response.json()
+                 results = {
+                     "organic": data.get("organic", []),
+                     "images": [img.get("link") for img in data.get("images", [])],
+                     "related": data.get("related", []),
+                     "ai_overview": data.get("ai_overview"),
+                 }
+                 return json.dumps(results, indent=2)
+
+             # Return markdown for Bing/Yandex
+             return response.text
+
+         except requests.exceptions.RequestException as e:
+             return json.dumps({"error": str(e)})
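Because Google queries are sent with `brd_json=1`, the tool returns structured JSON for Google but raw Markdown for Bing and Yandex. A minimal usage sketch, assuming `BRIGHT_DATA_API_TOKEN` is set:

```python
# Sketch: contrast the two return formats of BrightDataSearchTool.
import json
from brightdata_search import BrightDataSearchTool

search = BrightDataSearchTool()

google_raw = search.forward("latest developments in AI", engine="google")
organic = json.loads(google_raw).get("organic", [])
print(f"Google: {len(organic)} organic results (JSON)")

bing_markdown = search.forward("latest developments in AI", engine="bing")
print("Bing (Markdown preview):", bing_markdown[:200])
```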
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ smolagents>=1.0.0
+ huggingface_hub>=0.20.0
+ requests>=2.31.0
+ python-dotenv>=1.0.0
+ gradio>=4.0.0
test_datasets.py ADDED
@@ -0,0 +1,47 @@
+ import json
+ from brightdata_datasets import BrightDataDatasetTool
+
+
+ def main():
+     dataset_tool = BrightDataDatasetTool()
+
+     # Example dataset and params; change these as needed for quick manual testing.
+     dataset_key = "google_maps_reviews"
+     params = {
+         "url": "https://www.google.com/maps/place/Google+Sydney+-+Pirrama+Road/@-33.866489,151.1958561,17z/data=!4m8!3m7!1s0x6b12ae37b47f5b37:0x8eaddfcd1b32ca52!8m2!3d-33.866489!4d151.1958561!9m1!1b1!16s%2Fg%2F1td76qvq?entry=ttu&g_ep=EgoyMDI1MTIwMi4wIKXMDSoASAFQAw%3D%3D",
+         "days_limit": "3",
+     }
+
+     result = dataset_tool.forward(dataset_key, json.dumps(params))
+
+     print("Dataset response keys / status:")
+     try:
+         parsed = json.loads(result)
+     except json.JSONDecodeError:
+         print("Non-JSON response, raw output (first 2000 chars):")
+         print(result[:2000])
+         return
+
+     # Response can be a bare list or a dict depending on dataset.
+     if isinstance(parsed, list):
+         print(f"Top-level type: list; items: {len(parsed)}")
+         if parsed:
+             print("First item sample:")
+             print(json.dumps(parsed[0], indent=2)[:1000])
+         return
+
+     print(f"Top-level keys: {list(parsed.keys())}")
+
+     items = parsed.get("items") or parsed.get("data") or parsed.get("records") or parsed.get("result")
+     if isinstance(items, list):
+         print(f"Items count: {len(items)}")
+         if items:
+             print("First item sample:")
+             print(json.dumps(items[0], indent=2)[:1000])
+     else:
+         print("No iterable items found. Raw JSON (first 2000 chars):")
+         print(json.dumps(parsed, indent=2)[:2000])
+
+
+ if __name__ == "__main__":
+     main()
test_scraper.py ADDED
@@ -0,0 +1,14 @@
+ from brightdata_scraper import BrightDataScraperTool
+
+ def main():
+     scraper = BrightDataScraperTool()
+
+     url = "https://en.wikipedia.org/wiki/Meir_Kadosh"
+     result = scraper.forward(url)
+
+     print("Scraped Content (first 500 chars):")
+     print(result[:500])
+
+
+ if __name__ == "__main__":
+     main()
test_search.py ADDED
@@ -0,0 +1,20 @@
+ from brightdata_search import BrightDataSearchTool
+ import json
+
+
+ def main():
+     search_tool = BrightDataSearchTool()
+
+     query = "Python programming tutorials"
+     result = search_tool.forward(query, engine="google")
+
+     print("Search Results (Google) summary:")
+     parsed = json.loads(result)
+     organic = parsed.get("organic", [])
+     print(f"Found {len(organic)} organic results")
+
+     print(organic)
+
+
+ if __name__ == "__main__":
+     main()