Spaces:
Runtime error
Runtime error
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
| import concurrent.futures | |
class SearchClient:
    """Query a web-search API and download the visible text of each hit.

    Two vendors are supported, selected by the ``vendor`` string:

    * ``"google"`` -- Google Custom Search JSON API (needs ``engine_id`` and
      ``api_key``).
    * ``"bing"`` -- Bing Web Search v7 (needs ``api_key``).

    Any other vendor value is accepted by the constructor, but
    :meth:`search` then returns the string ``"Invalid vendor"``.
    """

    # Seconds to wait on a single HTTP request.  Without a timeout a single
    # unresponsive host would block its worker thread indefinitely.
    DEFAULT_TIMEOUT = 10

    def __init__(self, vendor, engine_id=None, api_key=None):
        """Store the vendor and build the request endpoint/headers for it.

        Args:
            vendor: ``"google"`` or ``"bing"``.
            engine_id: Custom search engine id (``cx``); used by Google only.
            api_key: API key for the selected vendor.
        """
        self.vendor = vendor
        # Always define both attributes so later attribute access never
        # fails, whichever vendor (or unknown vendor) was given.
        self.endpoint = None
        self.headers = {}
        if vendor == "google":
            self.endpoint = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={engine_id}"
        elif vendor == "bing":
            self.endpoint = "https://api.bing.microsoft.com/v7.0/search"
            self.headers = {
                "Ocp-Apim-Subscription-Key": api_key,
            }

    @staticmethod
    def _extract_text_from_link(link, timeout=DEFAULT_TIMEOUT):
        """Download ``link`` and return its text with whitespace collapsed.

        This is a static method on purpose: it takes no ``self``, yet it is
        invoked through an instance (``self._extract_text_from_link``) by
        :meth:`_fetch_text_from_links`.  As a plain function that bound call
        passed the instance as an extra argument and raised ``TypeError``
        for every link.

        Args:
            link: URL of the page to fetch.
            timeout: Seconds to wait for the HTTP response.

        Returns:
            The page text with every whitespace run replaced by one space,
            or ``None`` when the response status is not 200.
        """
        page = requests.get(link, timeout=timeout)
        if page.status_code != 200:
            return None
        soup = BeautifulSoup(page.content, "html.parser")
        return re.sub(r"\s+", " ", soup.get_text())

    def _fetch_text_from_links(self, links):
        """Fetch all ``links`` concurrently and collect their text.

        Failures are best-effort: an exception while fetching one link is
        printed and that link is skipped.  Links whose text is empty or
        ``None`` are skipped as well.

        Args:
            links: Iterable of URLs.

        Returns:
            A list of ``{"text": ..., "link": ...}`` dicts, in the order the
            downloads complete (not necessarily the input order).
        """
        results = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_link = {
                executor.submit(self._extract_text_from_link, link): link
                for link in links
            }
            for future in concurrent.futures.as_completed(future_to_link):
                link = future_to_link[future]
                try:
                    cleaned_text = future.result()
                except Exception as e:
                    print(f"Error fetching data from {link}: {e}")
                    continue
                if cleaned_text:
                    results.append({"text": cleaned_text, "link": link})
        return results

    def _google_search(self, query, n_crawl):
        """Run ``query`` against the Google endpoint and crawl the hits.

        Args:
            query: Search string sent as the ``q`` parameter.
            n_crawl: Maximum number of result links to download.

        Returns:
            The list produced by :meth:`_fetch_text_from_links`.
        """
        response = requests.get(
            self.endpoint,
            params={"q": query},
            timeout=self.DEFAULT_TIMEOUT,
        )
        items = response.json().get("items", [])
        # max(..., 0) keeps a negative n_crawl meaning "no links" instead of
        # turning into a from-the-end slice.
        links = [
            item["link"]
            for item in items[: max(n_crawl, 0)]
            if item.get("link")  # skip malformed items instead of raising KeyError
        ]
        return self._fetch_text_from_links(links)

    def _bing_search(self, query, n_crawl):
        """Run ``query`` against the Bing endpoint and crawl the hits.

        Args:
            query: Search string sent as the ``q`` parameter.
            n_crawl: Number of results requested through Bing's ``count``
                parameter; also applied locally as an upper bound.

        Returns:
            The list produced by :meth:`_fetch_text_from_links`.
        """
        params = {
            "q": query,
            "count": n_crawl,  # You might need to adjust this based on Bing API requirements
            "mkt": "en-US",
        }
        response = requests.get(
            self.endpoint,
            headers=self.headers,
            params=params,
            timeout=self.DEFAULT_TIMEOUT,
        )
        pages = response.json().get("webPages", {}).get("value", [])
        links = [item["url"] for item in pages if item.get("url")]
        # Enforce the limit locally too, matching the Google code path, in
        # case the API returns more entries than were asked for.
        if n_crawl is not None:
            links = links[: max(n_crawl, 0)]
        return self._fetch_text_from_links(links)

    def search(self, query, n_crawl):
        """Search with the configured vendor and return crawled page text.

        Args:
            query: Search string.
            n_crawl: Maximum number of result pages to download.

        Returns:
            A list of ``{"text": ..., "link": ...}`` dicts, or the string
            ``"Invalid vendor"`` when the vendor is neither ``"google"`` nor
            ``"bing"``.
        """
        if self.vendor == "google":
            return self._google_search(query, n_crawl)
        if self.vendor == "bing":
            return self._bing_search(query, n_crawl)
        return "Invalid vendor"