Spaces:

rkihacker
/

Scrap

Paused

App Files Files Community

rkihacker committed on Sep 15

Commit

46a015f

verified ·

1 Parent(s): 396db14

Update main.py

Browse files

Files changed (1) hide show

main.py +79 -61

main.py CHANGED Viewed

@@ -2,8 +2,9 @@ import os
 import asyncio
 import json
 import logging
 import re
-from typing import AsyncGenerator, Optional
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
@@ -24,16 +25,29 @@ if not LLM_API_KEY:
 else:
     logger.info("LLM API Key loaded successfully.")
 # API Provider Constants
 SNAPZION_API_URL = "https://search.snapzion.com/get-snippets"
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "gpt-4.1-mini"
-MAX_CONTEXT_CHAR_LENGTH = 120000
 # Headers
-SNAPZION_HEADERS = { 'Content-Type': 'application/json', 'User-Agent': 'AI-Deep-Research-Agent/1.0' }
-SCRAPING_HEADERS = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36' }
-LLM_HEADERS = { "Authorization": f"Bearer {LLM_API_KEY}", "Content-Type": "application/json", "Accept": "application/json" }
 # --- Pydantic Models & Helper Functions ---
 class DeepResearchRequest(BaseModel):
@@ -43,23 +57,19 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
     match = re.search(r'\[.*\]', text, re.DOTALL)
     if match:
         json_str = match.group(0)
-        try:
-            return json.loads(json_str)
-        except json.JSONDecodeError:
-            logger.error(f"Failed to parse extracted JSON string: {json_str}")
-            return None
-    logger.warning(f"No JSON array found in LLM response: {text}")
     return None
 # --- FastAPI App ---
 app = FastAPI(
     title="AI Deep Research API",
-    description="Provides streaming deep research completions.",
-    version="2.5.0" # Version bump for AttributeError fix
 )
-# --- Core Service Functions (Unchanged) ---
-async def call_snapzion_search(session: aiohttp.ClientSession, query: str) -> list:
     try:
         async with session.post(SNAPZION_API_URL, headers=SNAPZION_HEADERS, json={"query": query}, timeout=15) as response:
             response.raise_for_status(); data = await response.json()
@@ -70,7 +80,9 @@ async def call_snapzion_search(session: aiohttp.ClientSession, query: str) -> li
 async def scrape_url(session: aiohttp.ClientSession, url: str) -> str:
     if url.lower().endswith('.pdf'): return "Error: PDF content cannot be scraped."
     try:
-        async with session.get(url, headers=SCRAPING_HEADERS, timeout=10, ssl=False) as response:
             if response.status != 200: return f"Error: HTTP status {response.status}"
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
@@ -79,70 +91,77 @@ async def scrape_url(session: aiohttp.ClientSession, url: str) -> str:
     except Exception as e:
         logger.warning(f"Scraping failed for {url}: {e}"); return f"Error: {e}"
-async def search_and_scrape(session: aiohttp.ClientSession, query: str) -> tuple[str, list]:
-    search_results = await call_snapzion_search(session, query); sources = search_results[:4]
-    if not sources: return "", []
-    scrape_tasks = [scrape_url(session, source["link"]) for source in sources]
-    scraped_contents = await asyncio.gather(*scrape_tasks)
-    context = "\n\n".join(f"Source: {sources[i]['link']}\nContent: {content}" for i, content in enumerate(scraped_contents) if not content.startswith("Error:"))
-    return context, sources
 # --- Streaming Deep Research Logic ---
 async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
     def format_sse(data: dict) -> str: return f"data: {json.dumps(data)}\n\n"
     try:
         async with aiohttp.ClientSession() as session:
-            # Step 1: Generate Sub-Questions
             yield format_sse({"event": "status", "data": "Generating research plan..."})
-            sub_question_prompt = {
-                "model": LLM_MODEL, "messages": [{"role": "user", "content": f"Generate 3-4 key sub-questions for a research report on '{query}'. Your response MUST be ONLY the raw JSON array, without markdown, explanations, or any other text. Example: [\"Question 1?\", \"Question 2?\"]"}]
-            }
             try:
-                async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=sub_question_prompt, timeout=20) as response:
-                    response.raise_for_status()
-                    result = await response.json()
-                    # ***** CHANGE 1: The definitive fix for the AttributeError *****
-                    sub_questions = None
-                    if isinstance(result, dict) and 'choices' in result:
-                        # Handle standard OpenAI dictionary format
-                        llm_content = result.get('choices', [{}])[0].get('message', {}).get('content', '')
-                        sub_questions = extract_json_from_llm_response(llm_content)
-                    elif isinstance(result, list):
-                        # Handle the case where the API returns the list directly
-                        sub_questions = result
-                    if not sub_questions or not isinstance(sub_questions, list):
-                        raise ValueError(f"Could not extract a valid list of questions from LLM response: {result}")
             except Exception as e:
                 logger.error(f"Failed to generate research plan: {e}")
                 yield format_sse({"event": "error", "data": f"Could not generate research plan. Reason: {e}"}); return
             yield format_sse({"event": "plan", "data": sub_questions})
-            # The rest of the pipeline can now execute
-            research_tasks = [search_and_scrape(session, sq) for sq in sub_questions]
-            yield format_sse({"event": "status", "data": f"Starting research on {len(sub_questions)} topics..."})
-            consolidated_context, all_sources = "", []
-            for task in asyncio.as_completed(research_tasks):
-                context, sources = await task
-                if context: consolidated_context += context + "\n\n---\n\n"
-                if sources: all_sources.extend(sources)
             if not consolidated_context.strip():
-                yield format_sse({"event": "error", "data": "Failed to gather any research context."}); return
-            yield format_sse({"event": "status", "data": "Generating final report..."})
             if len(consolidated_context) > MAX_CONTEXT_CHAR_LENGTH:
                 consolidated_context = consolidated_context[:MAX_CONTEXT_CHAR_LENGTH]
-            final_report_prompt = f'Synthesize the provided context into a comprehensive report on "{query}". Use markdown. Context:\n{consolidated_context}'
-            final_report_payload = {"model": LLM_MODEL, "messages": [{"role": "user", "content": final_report_prompt}], "stream": True}
-            async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=final_report_payload) as response:
                 response.raise_for_status()
                 async for line in response.content:
                     if line.strip():
@@ -155,10 +174,9 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                             if content: yield format_sse({"event": "chunk", "data": content})
                         except json.JSONDecodeError: continue
-            unique_sources = list({s['link']: s for s in all_sources}.values())
-            yield format_sse({"event": "sources", "data": unique_sources})
     except Exception as e:
-        logger.error(f"A critical error occurred in the main research stream: {e}")
         yield format_sse({"event": "error", "data": str(e)})
     finally:
         yield format_sse({"event": "done", "data": "Deep research complete."})

 import asyncio
 import json
 import logging
+import random
 import re
+from typing import AsyncGenerator, Optional, Tuple, List
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
 else:
     logger.info("LLM API Key loaded successfully.")
+# --- Constants & Headers ---
 # API Provider Constants
 SNAPZION_API_URL = "https://search.snapzion.com/get-snippets"
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "gpt-4.1-mini"
+# Automatic Context Sizing based on Tokens
+TARGET_TOKEN_LIMIT = 28000  # Safe limit for models with ~32k context windows
+ESTIMATED_CHARS_PER_TOKEN = 4
+MAX_CONTEXT_CHAR_LENGTH = TARGET_TOKEN_LIMIT * ESTIMATED_CHARS_PER_TOKEN
+# Real Browser User Agents for Rotation
+USER_AGENTS = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1"
+]
 # Headers
+SNAPZION_HEADERS = {'Content-Type': 'application/json'}
+LLM_HEADERS = {"Authorization": f"Bearer {LLM_API_KEY}", "Content-Type": "application/json", "Accept": "application/json"}
 # --- Pydantic Models & Helper Functions ---
 class DeepResearchRequest(BaseModel):
     match = re.search(r'\[.*\]', text, re.DOTALL)
     if match:
         json_str = match.group(0)
+        try: return json.loads(json_str)
+        except json.JSONDecodeError: return None
     return None
 # --- FastAPI App ---
 app = FastAPI(
     title="AI Deep Research API",
+    description="Provides robust, streaming deep research completions.",
+    version="3.0.0"  # Major version bump for robustness overhaul
 )
+# --- Core Service Functions ---
+async def call_snapzion_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
     try:
         async with session.post(SNAPZION_API_URL, headers=SNAPZION_HEADERS, json={"query": query}, timeout=15) as response:
             response.raise_for_status(); data = await response.json()
 async def scrape_url(session: aiohttp.ClientSession, url: str) -> str:
     if url.lower().endswith('.pdf'): return "Error: PDF content cannot be scraped."
     try:
+        # Rotate user agents for each request
+        headers = {'User-Agent': random.choice(USER_AGENTS)}
+        async with session.get(url, headers=headers, timeout=10, ssl=False) as response:
             if response.status != 200: return f"Error: HTTP status {response.status}"
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
     except Exception as e:
         logger.warning(f"Scraping failed for {url}: {e}"); return f"Error: {e}"
+async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
+    """Scrapes a single source and falls back to its snippet if scraping fails."""
+    scraped_content = await scrape_url(session, source['link'])
+    if scraped_content.startswith("Error:"):
+        # SNIPPET FALLBACK LOGIC
+        logger.warning(f"Scraping failed for {source['link']}. Falling back to snippet.")
+        return source['snippet'], source
+    return scraped_content, source
 # --- Streaming Deep Research Logic ---
 async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
     def format_sse(data: dict) -> str: return f"data: {json.dumps(data)}\n\n"
     try:
         async with aiohttp.ClientSession() as session:
+            # Step 1: Generate Research Plan
             yield format_sse({"event": "status", "data": "Generating research plan..."})
+            plan_prompt = {"model": LLM_MODEL, "messages": [{"role": "user", "content": f"Generate 3-4 key sub-questions for a research report on '{query}'. Your response MUST be ONLY the raw JSON array, without markdown. Example: [\"Question 1?\"]"}]}
             try:
+                async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=plan_prompt, timeout=20) as response:
+                    response.raise_for_status(); result = await response.json()
+                    sub_questions = result if isinstance(result, list) else extract_json_from_llm_response(result['choices'][0]['message']['content'])
+                    if not isinstance(sub_questions, list): raise ValueError(f"Could not extract a valid list from LLM response: {result}")
             except Exception as e:
                 logger.error(f"Failed to generate research plan: {e}")
                 yield format_sse({"event": "error", "data": f"Could not generate research plan. Reason: {e}"}); return
             yield format_sse({"event": "plan", "data": sub_questions})
+            # Step 2: Conduct Research in Parallel
+            yield format_sse({"event": "status", "data": f"Searching for sources for {len(sub_questions)} topics..."})
+            search_tasks = [call_snapzion_search(session, sq) for sq in sub_questions]
+            all_search_results = await asyncio.gather(*search_tasks)
+            # Deduplicate sources by link to avoid scraping the same page multiple times
+            unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
+            if not unique_sources:
+                yield format_sse({"event": "error", "data": "Search did not return any usable sources."}); return
+            yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Scraping and processing..."})
+            # Process all unique sources concurrently with snippet fallback
+            processing_tasks = [research_and_process_source(session, source) for source in unique_sources]
+            consolidated_context = ""
+            all_sources_used = []
+            successful_scrapes = 0
+            for task in asyncio.as_completed(processing_tasks):
+                content, source_info = await task
+                if content:
+                    consolidated_context += f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n"
+                    all_sources_used.append(source_info)
+                    if not content == source_info['snippet']: # Count as success only if not a snippet
+                        successful_scrapes += 1
+            logger.info(f"Context gathering complete. Successfully scraped {successful_scrapes}/{len(unique_sources)} pages. Used {len(all_sources_used)} total sources (including snippets).")
             if not consolidated_context.strip():
+                yield format_sse({"event": "error", "data": "Failed to gather any research context from scraping or snippets."}); return
+            # Step 3: Synthesize Final Report
+            yield format_sse({"event": "status", "data": "Synthesizing final report..."})
             if len(consolidated_context) > MAX_CONTEXT_CHAR_LENGTH:
+                logger.warning(f"Context truncated from {len(consolidated_context)} to {MAX_CONTEXT_CHAR_LENGTH} chars.")
                 consolidated_context = consolidated_context[:MAX_CONTEXT_CHAR_LENGTH]
+            report_prompt = f'Synthesize the provided context into a comprehensive, well-structured report on "{query}". Use markdown. Context:\n{consolidated_context}'
+            report_payload = {"model": LLM_MODEL, "messages": [{"role": "user", "content": report_prompt}], "stream": True}
+            async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
                 response.raise_for_status()
                 async for line in response.content:
                     if line.strip():
                             if content: yield format_sse({"event": "chunk", "data": content})
                         except json.JSONDecodeError: continue
+            yield format_sse({"event": "sources", "data": all_sources_used})
     except Exception as e:
+        logger.error(f"A critical error occurred in the main research stream: {e}", exc_info=True)
         yield format_sse({"event": "error", "data": str(e)})
     finally:
         yield format_sse({"event": "done", "data": "Deep research complete."})