Spaces:
Running
Running
| import trafilatura | |
| from tqdm.asyncio import tqdm_asyncio as tqdm_async | |
| from graphgen.models import GoogleSearch | |
| from graphgen.utils import logger | |
async def _process_single_entity(
    entity_name: str, google_search_client: GoogleSearch
) -> str | None:
    """
    Search for a single entity and return the text of the top result's page.

    :param entity_name: name of the entity to search for
    :param google_search_client: client whose ``search`` method is called with
        the entity name
    :return: stripped text extracted from the first search result's page, or
        None when the search returns nothing, the page cannot be downloaded,
        or no text can be extracted from it
    """
    search_results = google_search_client.search(entity_name)
    if not search_results:
        return None

    # Get more details from the first search result
    first_result = search_results[0]
    link = first_result["link"]

    # fetch_url yields None when the download fails; bail out instead of
    # handing None to the extractor.
    content = trafilatura.fetch_url(link)
    if content is None:
        logger.warning(
            "Entity %s: could not download %s",
            entity_name,
            link,
        )
        return None

    # extract yields None when no main text is found, so guard before strip()
    # rather than crashing with AttributeError.
    summary = trafilatura.extract(
        content, include_comments=False, include_links=False
    )
    if summary is None:
        logger.warning(
            "Entity %s: no text could be extracted from %s",
            entity_name,
            link,
        )
        return None

    summary = summary.strip()
    if not summary:
        # Whitespace-only extraction carries no usable context.
        return None

    logger.info(
        "Entity %s search result: %s",
        entity_name,
        summary,
    )
    return summary
async def search_google(
    google_search_client: GoogleSearch,
    entities: set[str],
) -> dict:
    """
    Look up each entity with the Google search client and collect the summaries.

    :param google_search_client: Google search client passed to every lookup
    :param entities: set of entity names to search
    :return: dict mapping entity name to its summary; entities whose lookup
        raised an exception or produced an empty/None summary are omitted
    """
    contexts = {}
    progress = tqdm_async(entities, desc="Searching Google", total=len(entities))
    async for name in progress:
        try:
            text = await _process_single_entity(name, google_search_client)
        except Exception as exc:  # pylint: disable=broad-except
            # Best effort: one failing entity must not abort the whole batch.
            logger.error("Error processing entity %s: %s", name, str(exc))
            continue
        if text:
            contexts[name] = text
    return contexts