Spaces:

rkihacker
/

Scrap

Paused

App Files Files Community

rkihacker committed on Sep 15

Commit

e1111e0

verified ·

1 Parent(s): 2a0098d

Update main.py

Browse files

Files changed (1) hide show

main.py +48 -27

main.py CHANGED Viewed

@@ -4,8 +4,13 @@ from fastapi import FastAPI, HTTPException, Query
 from dotenv import load_dotenv
 import aiohttp
 from bs4 import BeautifulSoup
 # --- Configuration ---
 load_dotenv()
 LLM_API_KEY = os.getenv("LLM_API_KEY")
@@ -30,6 +35,15 @@ SNAPZION_HEADERS = {
     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36',
 }
 # LLM Configuration
 LLM_API_URL = "https://api.inference.net/v1/chat/completions"
 LLM_MODEL = "meta-llama/llama-3.1-8b-instruct/fp-8"
@@ -38,44 +52,46 @@ LLM_MODEL = "meta-llama/llama-3.1-8b-instruct/fp-8"
 app = FastAPI(
     title="AI Search Snippets API (Snapzion)",
     description="Provides AI-generated summaries from Snapzion search results.",
-    version="1.0.1"
 )
 # --- Core Asynchronous Functions ---
 async def call_snapzion_search(session: aiohttp.ClientSession, query: str) -> list:
-    """Calls the Snapzion search API and returns a list of organic results."""
     try:
         async with session.post(SNAPZION_API_URL, headers=SNAPZION_HEADERS, json={"query": query}, timeout=15) as response:
             response.raise_for_status()
             data = await response.json()
             return data.get("organic_results", [])
     except Exception as e:
         raise HTTPException(status_code=503, detail=f"Search service (Snapzion) failed: {e}")
 async def scrape_url(session: aiohttp.ClientSession, url: str) -> str:
-    """Asynchronously scrapes the primary text content from a URL, ignoring PDFs."""
     if url.lower().endswith('.pdf'):
-        return "Content is a PDF, which cannot be scraped."
     try:
-        async with session.get(url, timeout=10) as response:
             if response.status != 200:
-                return f"Error: Failed to fetch {url} with status {response.status}"
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
             for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                 tag.decompose()
             return " ".join(soup.stripped_strings)
     except Exception as e:
-        return f"Error: Could not scrape {url}. Reason: {e}"
 async def get_ai_snippet(query: str, context: str, sources: list) -> str:
-    """Generates a synthesized answer using an LLM based on the provided context."""
     headers = {"Authorization": f"Bearer {LLM_API_KEY}", "Content-Type": "application/json"}
     source_list_str = "\n".join([f"[{i+1}] {source['title']}: {source['link']}" for i, source in enumerate(sources)])
     prompt = f"""
-Based *only* on the provided context from web pages, provide a concise, factual answer to the user's query. Cite every sentence with the corresponding source number(s), like `[1]`, `[2]`, or `[1, 3]`.
 Sources:
 {source_list_str}
@@ -90,7 +106,6 @@ User Query: "{query}"
 Answer with citations:
 """
     data = {"model": LLM_MODEL, "messages": [{"role": "user", "content": prompt}], "max_tokens": 500}
     async with aiohttp.ClientSession() as session:
         try:
             async with session.post(LLM_API_URL, headers=headers, json=data, timeout=45) as response:
@@ -98,39 +113,45 @@ Answer with citations:
                 result = await response.json()
                 return result['choices'][0]['message']['content']
         except Exception as e:
             raise HTTPException(status_code=502, detail=f"Failed to get response from LLM: {e}")
 # --- API Endpoint ---
 @app.get("/search")
 async def ai_search(q: str = Query(..., min_length=3, description="The search query.")):
-    """
-    Performs an AI-powered search using Snapzion. It finds relevant web pages,
-    scrapes their content, and generates a synthesized answer with citations.
-    """
     async with aiohttp.ClientSession() as session:
-        # 1. Search for relevant web pages using Snapzion
         search_results = await call_snapzion_search(session, q)
         if not search_results:
             raise HTTPException(status_code=404, detail="Could not find any relevant sources for the query.")
-        # Limit to the top 4 results for speed and relevance
-        sources = search_results[:4]
-        # 2. Scrape all pages concurrently for speed
         scrape_tasks = [scrape_url(session, source["link"]) for source in sources]
         scraped_contents = await asyncio.gather(*scrape_tasks)
-        # 3. Combine content and snippets for a rich context
-        full_context = "\n\n".join(
-            f"Source [{i+1}] (from {sources[i]['link']}):\nOriginal Snippet: {sources[i]['snippet']}\nScraped Content: {content}"
-            for i, content in enumerate(scraped_contents) if not content.startswith("Error:")
-        )
         if not full_context.strip():
-            raise HTTPException(status_code=500, detail="Failed to scrape content from all available sources.")
-        # 4. Generate the final AI snippet
         ai_summary = await get_ai_snippet(q, full_context, sources)
     return {"ai_summary": ai_summary, "sources": sources}

 from dotenv import load_dotenv
 import aiohttp
 from bs4 import BeautifulSoup
+import logging
 # --- Configuration ---
+# Configure logging to see what's happening
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 load_dotenv()
 LLM_API_KEY = os.getenv("LLM_API_KEY")
     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36',
 }
+# ***** CHANGE 1: Add general-purpose browser headers for scraping *****
+SCRAPING_HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+    'Accept-Language': 'en-US,en;q=0.9',
+    'Connection': 'keep-alive',
+    'Upgrade-Insecure-Requests': '1',
+}
 # LLM Configuration
 LLM_API_URL = "https://api.inference.net/v1/chat/completions"
 LLM_MODEL = "meta-llama/llama-3.1-8b-instruct/fp-8"
 app = FastAPI(
     title="AI Search Snippets API (Snapzion)",
     description="Provides AI-generated summaries from Snapzion search results.",
+    version="1.1.0" # Version bump for new resilience feature
 )
 # --- Core Asynchronous Functions ---
 async def call_snapzion_search(session: aiohttp.ClientSession, query: str) -> list:
     try:
         async with session.post(SNAPZION_API_URL, headers=SNAPZION_HEADERS, json={"query": query}, timeout=15) as response:
             response.raise_for_status()
             data = await response.json()
             return data.get("organic_results", [])
     except Exception as e:
+        logger.error(f"Snapzion API call failed: {e}")
         raise HTTPException(status_code=503, detail=f"Search service (Snapzion) failed: {e}")
+# ***** CHANGE 2: Improve the scraping function *****
 async def scrape_url(session: aiohttp.ClientSession, url: str) -> str:
+    """Asynchronously scrapes text from a URL, now with browser headers."""
     if url.lower().endswith('.pdf'):
+        return "Error: Content is a PDF, which cannot be scraped."
     try:
+        # Use the new scraping headers to look like a real browser
+        async with session.get(url, headers=SCRAPING_HEADERS, timeout=10, ssl=False) as response:
             if response.status != 200:
+                logger.warning(f"Failed to fetch {url}, status code: {response.status}")
+                return f"Error: Failed to fetch with status {response.status}"
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
             for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                 tag.decompose()
             return " ".join(soup.stripped_strings)
     except Exception as e:
+        logger.warning(f"Could not scrape {url}. Reason: {e}")
+        return f"Error: Could not scrape. Reason: {e}"
 async def get_ai_snippet(query: str, context: str, sources: list) -> str:
     headers = {"Authorization": f"Bearer {LLM_API_KEY}", "Content-Type": "application/json"}
     source_list_str = "\n".join([f"[{i+1}] {source['title']}: {source['link']}" for i, source in enumerate(sources)])
     prompt = f"""
+Based *only* on the provided context, provide a concise, factual answer to the user's query. Cite every sentence with the corresponding source number(s), like `[1]` or `[2, 3]`.
 Sources:
 {source_list_str}
 Answer with citations:
 """
     data = {"model": LLM_MODEL, "messages": [{"role": "user", "content": prompt}], "max_tokens": 500}
     async with aiohttp.ClientSession() as session:
         try:
             async with session.post(LLM_API_URL, headers=headers, json=data, timeout=45) as response:
                 result = await response.json()
                 return result['choices'][0]['message']['content']
         except Exception as e:
+            logger.error(f"LLM API call failed: {e}")
             raise HTTPException(status_code=502, detail=f"Failed to get response from LLM: {e}")
 # --- API Endpoint ---
 @app.get("/search")
 async def ai_search(q: str = Query(..., min_length=3, description="The search query.")):
     async with aiohttp.ClientSession() as session:
         search_results = await call_snapzion_search(session, q)
         if not search_results:
             raise HTTPException(status_code=404, detail="Could not find any relevant sources for the query.")
+        sources = search_results[:5] # Use top 5 sources
         scrape_tasks = [scrape_url(session, source["link"]) for source in sources]
         scraped_contents = await asyncio.gather(*scrape_tasks)
+        # ***** CHANGE 3: Implement the robust fallback logic *****
+        successful_scrapes = [content for content in scraped_contents if not content.startswith("Error:")]
+        full_context = ""
+        if successful_scrapes:
+            logger.info(f"Successfully scraped {len(successful_scrapes)} out of {len(sources)} sources.")
+            # Build context from successfully scraped content
+            full_context = "\n\n".join(
+                f"Source [{i+1}] ({sources[i]['link']}):\n{scraped_contents[i]}"
+                for i in range(len(sources)) if not scraped_contents[i].startswith("Error:")
+            )
+        else:
+            # If ALL scrapes failed, fall back to using the snippets from the search API
+            logger.warning("All scraping attempts failed. Falling back to using API snippets for context.")
+            full_context = "\n\n".join(
+                f"Source [{i+1}] ({source['link']}):\n{source['snippet']}"
+                for i, source in enumerate(sources)
+            )
         if not full_context.strip():
+            # This is a final safety net, should rarely be hit now
+            raise HTTPException(status_code=500, detail="Could not construct any context from sources or snippets.")
         ai_summary = await get_ai_snippet(q, full_context, sources)
     return {"ai_summary": ai_summary, "sources": sources}