BrightData commited on
Commit
6fd2cde
·
verified ·
1 Parent(s): 4f9a62c

Add Bright Data Dataset Tool

Browse files
Files changed (1) hide show
  1. tool.py +2 -2
tool.py CHANGED
@@ -1,9 +1,9 @@
1
  from typing import Any, Optional
2
  from smolagents.tools import Tool
3
  import json
4
- import time
5
  import requests
6
  import os
 
7
 
8
  class BrightDataDatasetTool(Tool):
9
  name = "brightdata_dataset_fetch"
@@ -15,7 +15,7 @@ class BrightDataDatasetTool(Tool):
15
  self.datasets = globals().get("DATASETS")
16
  if not self.datasets:
17
  import json
18
- fallback_json = """{"amazon_product": {"dataset_id": "gd_l7q7dkf244hwjntr0", "description": "Quickly read structured amazon product data.\nRequires a valid product URL with /dp/ in it.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "amazon_product_reviews": {"dataset_id": "gd_le8e811kzy4ggddlq", "description": "Quickly read structured amazon product review data.\nRequires a valid product URL with /dp/ in it.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "amazon_product_search": {"dataset_id": "gd_lwdb4vjm1ehb499uxs", "description": "Quickly read structured amazon product search data.\nRequires a valid search keyword and amazon domain URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["keyword", "url"], "fixed_values": {"pages_to_search": "1"}}, "walmart_product": {"dataset_id": "gd_l95fol7l1ru6rlo116", "description": "Quickly read structured walmart product data.\nRequires a valid product URL with /ip/ in it.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "walmart_seller": {"dataset_id": "gd_m7ke48w81ocyu4hhz0", "description": "Quickly read structured walmart seller data.\nRequires a valid walmart seller URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "ebay_product": {"dataset_id": "gd_ltr9mjt81n0zzdk1fb", "description": "Quickly read structured ebay product data.\nRequires a valid ebay product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "homedepot_products": {"dataset_id": "gd_lmusivh019i7g97q2n", "description": "Quickly read structured homedepot product data.\nRequires a valid homedepot product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "zara_products": {"dataset_id": "gd_lct4vafw1tgx27d4o0", "description": "Quickly read structured zara product data.\nRequires a valid zara product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "etsy_products": {"dataset_id": "gd_ltppk0jdv1jqz25mz", "description": "Quickly read structured etsy product data.\nRequires a valid etsy product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "bestbuy_products": {"dataset_id": "gd_ltre1jqe1jfr7cccf", "description": "Quickly read structured bestbuy product data.\nRequires a valid bestbuy product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_person_profile": {"dataset_id": "gd_l1viktl72bvl7bjuj0", "description": "Quickly read structured linkedin people profile data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_company_profile": {"dataset_id": "gd_l1vikfnt1wgvvqz95w", "description": "Quickly read structured linkedin company profile data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_job_listings": {"dataset_id": "gd_lpfll7v5hcqtkxl6l", "description": "Quickly read structured linkedin job listings data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_posts": {"dataset_id": "gd_lyy3tktm25m4avu764", "description": "Quickly read structured linkedin posts data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_people_search": {"dataset_id": "gd_m8d03he47z8nwb5xc", "description": "Quickly read structured linkedin people search data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url", "first_name", "last_name"]}, "crunchbase_company": {"dataset_id": "gd_l1vijqt9jfj7olije", "description": "Quickly read structured crunchbase company data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "zoominfo_company_profile": {"dataset_id": "gd_m0ci4a4ivx3j5l6nx", "description": "Quickly read structured ZoomInfo company profile data.\nRequires a valid ZoomInfo company URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "instagram_profiles": {"dataset_id": "gd_l1vikfch901nx3by4", "description": "Quickly read structured Instagram profile data.\nRequires a valid Instagram URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "instagram_posts": {"dataset_id": "gd_lk5ns7kz21pck8jpis", "description": "Quickly read structured Instagram post data.\nRequires a valid Instagram URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "instagram_reels": {"dataset_id": "gd_lyclm20il4r5helnj", "description": "Quickly read structured Instagram reel data.\nRequires a valid Instagram URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "instagram_comments": {"dataset_id": "gd_ltppn085pokosxh13", "description": "Quickly read structured Instagram comments data.\nRequires a valid Instagram URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "facebook_posts": {"dataset_id": "gd_lyclm1571iy3mv57zw", "description": "Quickly read structured Facebook post data.\nRequires a valid Facebook post URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "facebook_marketplace_listings": {"dataset_id": "gd_lvt9iwuh6fbcwmx1a", "description": "Quickly read structured Facebook marketplace listing data.\nRequires a valid Facebook marketplace listing URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "facebook_company_reviews": {"dataset_id": "gd_m0dtqpiu1mbcyc2g86", "description": "Quickly read structured Facebook company reviews data.\nRequires a valid Facebook company URL and number of reviews.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url", "num_of_reviews"]}, "facebook_events": {"dataset_id": "gd_m14sd0to1jz48ppm51", "description": "Quickly read structured Facebook events data.\nRequires a valid Facebook event URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "tiktok_profiles": {"dataset_id": "gd_l1villgoiiidt09ci", "description": "Quickly read structured Tiktok profiles data.\nRequires a valid Tiktok profile URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "tiktok_posts": {"dataset_id": "gd_lu702nij2f790tmv9h", "description": "Quickly read structured Tiktok post data.\nRequires a valid Tiktok post URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "tiktok_shop": {"dataset_id": "gd_m45m1u911dsa4274pi", "description": "Quickly read structured Tiktok shop data.\nRequires a valid Tiktok shop product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "tiktok_comments": {"dataset_id": "gd_lkf2st302ap89utw5k", "description": "Quickly read structured Tiktok comments data.\nRequires a valid Tiktok video URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "google_maps_reviews": {"dataset_id": "gd_luzfs1dn2oa0teb81", "description": "Quickly read structured Google maps reviews data.\nRequires a valid Google maps URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url", "days_limit"], "defaults": {"days_limit": "3"}}, "google_shopping": {"dataset_id": "gd_ltppk50q18kdw67omz", "description": "Quickly read structured Google shopping data.\nRequires a valid Google shopping product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "google_play_store": {"dataset_id": "gd_lsk382l8xei8vzm4u", "description": "Quickly read structured Google play store data.\nRequires a valid Google play store app URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "apple_app_store": {"dataset_id": "gd_lsk9ki3u2iishmwrui", "description": "Quickly read structured apple app store data.\nRequires a valid apple app store app URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "reuter_news": {"dataset_id": "gd_lyptx9h74wtlvpnfu", "description": "Quickly read structured reuter news data.\nRequires a valid reuter news report URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "github_repository_file": {"dataset_id": "gd_lyrexgxc24b3d4imjt", "description": "Quickly read structured github repository data.\nRequires a valid github repository file URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "yahoo_finance_business": {"dataset_id": "gd_lmrpz3vxmz972ghd7", "description": "Quickly read structured yahoo finance business data.\nRequires a valid yahoo finance business URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "x_posts": {"dataset_id": "gd_lwxkxvnf1cynvib9co", "description": "Quickly read structured X post data.\nRequires a valid X post URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "zillow_properties_listing": {"dataset_id": "gd_lfqkr8wm13ixtbd8f5", "description": "Quickly read structured zillow properties listing data.\nRequires a valid zillow properties listing URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "booking_hotel_listings": {"dataset_id": "gd_m5mbdl081229ln6t4a", "description": "Quickly read structured booking hotel listings data.\nRequires a valid booking hotel listing URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "youtube_profiles": {"dataset_id": "gd_lk538t2k2p1k3oos71", "description": "Quickly read structured youtube profiles data.\nRequires a valid youtube profile URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "youtube_comments": {"dataset_id": "gd_lk9q0ew71spt1mxywf", "description": "Quickly read structured youtube comments data.\nRequires a valid youtube video URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url", "num_of_comments"], "defaults": {"num_of_comments": "10"}}, "reddit_posts": {"dataset_id": "gd_lvz8ah06191smkebj4", "description": "Quickly read structured reddit posts data.\nRequires a valid reddit post URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "youtube_videos": {"dataset_id": "gd_lk56epmy2i5g7lzu0k", "description": "Quickly read structured YouTube videos data.\nRequires a valid YouTube video URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}}"""
19
  self.datasets = json.loads(fallback_json)
20
  self.inputs = {
21
  "dataset": {
 
1
  from typing import Any, Optional
2
  from smolagents.tools import Tool
3
  import json
 
4
  import requests
5
  import os
6
+ import time
7
 
8
  class BrightDataDatasetTool(Tool):
9
  name = "brightdata_dataset_fetch"
 
15
  self.datasets = globals().get("DATASETS")
16
  if not self.datasets:
17
  import json
18
+ fallback_json = r'{"amazon_product": {"dataset_id": "gd_l7q7dkf244hwjntr0", "description": "Quickly read structured amazon product data.\nRequires a valid product URL with /dp/ in it.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "amazon_product_reviews": {"dataset_id": "gd_le8e811kzy4ggddlq", "description": "Quickly read structured amazon product review data.\nRequires a valid product URL with /dp/ in it.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "amazon_product_search": {"dataset_id": "gd_lwdb4vjm1ehb499uxs", "description": "Quickly read structured amazon product search data.\nRequires a valid search keyword and amazon domain URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["keyword", "url"], "fixed_values": {"pages_to_search": "1"}}, "walmart_product": {"dataset_id": "gd_l95fol7l1ru6rlo116", "description": "Quickly read structured walmart product data.\nRequires a valid product URL with /ip/ in it.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "walmart_seller": {"dataset_id": "gd_m7ke48w81ocyu4hhz0", "description": "Quickly read structured walmart seller data.\nRequires a valid walmart seller URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "ebay_product": {"dataset_id": "gd_ltr9mjt81n0zzdk1fb", "description": "Quickly read structured ebay product data.\nRequires a valid ebay product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "homedepot_products": {"dataset_id": "gd_lmusivh019i7g97q2n", "description": "Quickly read structured homedepot product data.\nRequires a valid homedepot product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "zara_products": {"dataset_id": "gd_lct4vafw1tgx27d4o0", "description": "Quickly read structured zara product data.\nRequires a valid zara product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "etsy_products": {"dataset_id": "gd_ltppk0jdv1jqz25mz", "description": "Quickly read structured etsy product data.\nRequires a valid etsy product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "bestbuy_products": {"dataset_id": "gd_ltre1jqe1jfr7cccf", "description": "Quickly read structured bestbuy product data.\nRequires a valid bestbuy product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_person_profile": {"dataset_id": "gd_l1viktl72bvl7bjuj0", "description": "Quickly read structured linkedin people profile data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_company_profile": {"dataset_id": "gd_l1vikfnt1wgvvqz95w", "description": "Quickly read structured linkedin company profile data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_job_listings": {"dataset_id": "gd_lpfll7v5hcqtkxl6l", "description": "Quickly read structured linkedin job listings data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_posts": {"dataset_id": "gd_lyy3tktm25m4avu764", "description": "Quickly read structured linkedin posts data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_people_search": {"dataset_id": "gd_m8d03he47z8nwb5xc", "description": "Quickly read structured linkedin people search data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url", "first_name", "last_name"]}, "crunchbase_company": {"dataset_id": "gd_l1vijqt9jfj7olije", "description": "Quickly read structured crunchbase company data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "zoominfo_company_profile": {"dataset_id": "gd_m0ci4a4ivx3j5l6nx", "description": "Quickly read structured ZoomInfo company profile data.\nRequires a valid ZoomInfo company URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "instagram_profiles": {"dataset_id": "gd_l1vikfch901nx3by4", "description": "Quickly read structured Instagram profile data.\nRequires a valid Instagram URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "instagram_posts": {"dataset_id": "gd_lk5ns7kz21pck8jpis", "description": "Quickly read structured Instagram post data.\nRequires a valid Instagram URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "instagram_reels": {"dataset_id": "gd_lyclm20il4r5helnj", "description": "Quickly read structured Instagram reel data.\nRequires a valid Instagram URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "instagram_comments": {"dataset_id": "gd_ltppn085pokosxh13", "description": "Quickly read structured Instagram comments data.\nRequires a valid Instagram URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "facebook_posts": {"dataset_id": "gd_lyclm1571iy3mv57zw", "description": "Quickly read structured Facebook post data.\nRequires a valid Facebook post URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "facebook_marketplace_listings": {"dataset_id": "gd_lvt9iwuh6fbcwmx1a", "description": "Quickly read structured Facebook marketplace listing data.\nRequires a valid Facebook marketplace listing URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "facebook_company_reviews": {"dataset_id": "gd_m0dtqpiu1mbcyc2g86", "description": "Quickly read structured Facebook company reviews data.\nRequires a valid Facebook company URL and number of reviews.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url", "num_of_reviews"]}, "facebook_events": {"dataset_id": "gd_m14sd0to1jz48ppm51", "description": "Quickly read structured Facebook events data.\nRequires a valid Facebook event URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "tiktok_profiles": {"dataset_id": "gd_l1villgoiiidt09ci", "description": "Quickly read structured Tiktok profiles data.\nRequires a valid Tiktok profile URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "tiktok_posts": {"dataset_id": "gd_lu702nij2f790tmv9h", "description": "Quickly read structured Tiktok post data.\nRequires a valid Tiktok post URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "tiktok_shop": {"dataset_id": "gd_m45m1u911dsa4274pi", "description": "Quickly read structured Tiktok shop data.\nRequires a valid Tiktok shop product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "tiktok_comments": {"dataset_id": "gd_lkf2st302ap89utw5k", "description": "Quickly read structured Tiktok comments data.\nRequires a valid Tiktok video URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "google_maps_reviews": {"dataset_id": "gd_luzfs1dn2oa0teb81", "description": "Quickly read structured Google maps reviews data.\nRequires a valid Google maps URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url", "days_limit"], "defaults": {"days_limit": "3"}}, "google_shopping": {"dataset_id": "gd_ltppk50q18kdw67omz", "description": "Quickly read structured Google shopping data.\nRequires a valid Google shopping product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "google_play_store": {"dataset_id": "gd_lsk382l8xei8vzm4u", "description": "Quickly read structured Google play store data.\nRequires a valid Google play store app URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "apple_app_store": {"dataset_id": "gd_lsk9ki3u2iishmwrui", "description": "Quickly read structured apple app store data.\nRequires a valid apple app store app URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "reuter_news": {"dataset_id": "gd_lyptx9h74wtlvpnfu", "description": "Quickly read structured reuter news data.\nRequires a valid reuter news report URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "github_repository_file": {"dataset_id": "gd_lyrexgxc24b3d4imjt", "description": "Quickly read structured github repository data.\nRequires a valid github repository file URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "yahoo_finance_business": {"dataset_id": "gd_lmrpz3vxmz972ghd7", "description": "Quickly read structured yahoo finance business data.\nRequires a valid yahoo finance business URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "x_posts": {"dataset_id": "gd_lwxkxvnf1cynvib9co", "description": "Quickly read structured X post data.\nRequires a valid X post URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "zillow_properties_listing": {"dataset_id": "gd_lfqkr8wm13ixtbd8f5", "description": "Quickly read structured zillow properties listing data.\nRequires a valid zillow properties listing URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "booking_hotel_listings": {"dataset_id": "gd_m5mbdl081229ln6t4a", "description": "Quickly read structured booking hotel listings data.\nRequires a valid booking hotel listing URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "youtube_profiles": {"dataset_id": "gd_lk538t2k2p1k3oos71", "description": "Quickly read structured youtube profiles data.\nRequires a valid youtube profile URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "youtube_comments": {"dataset_id": "gd_lk9q0ew71spt1mxywf", "description": "Quickly read structured youtube comments data.\nRequires a valid youtube video URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url", "num_of_comments"], "defaults": {"num_of_comments": "10"}}, "reddit_posts": {"dataset_id": "gd_lvz8ah06191smkebj4", "description": "Quickly read structured reddit posts data.\nRequires a valid reddit post URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "youtube_videos": {"dataset_id": "gd_lk56epmy2i5g7lzu0k", "description": "Quickly read structured YouTube videos data.\nRequires a valid YouTube video URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}}'
19
  self.datasets = json.loads(fallback_json)
20
  self.inputs = {
21
  "dataset": {