Update main.py
Browse files
main.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
import asyncio
|
| 3 |
import json
|
|
@@ -18,7 +19,6 @@ from fake_useragent import UserAgent
|
|
| 18 |
from collections import defaultdict
|
| 19 |
|
| 20 |
# --- Configuration ---
|
| 21 |
-
|
| 22 |
logging.basicConfig(
|
| 23 |
level=logging.INFO,
|
| 24 |
format='%(asctime)s - %(levelname)s - %(message)s'
|
|
@@ -33,21 +33,23 @@ else:
|
|
| 33 |
logging.info("LLM API Key loaded successfully.")
|
| 34 |
|
| 35 |
# --- Constants & Headers ---
|
| 36 |
-
|
| 37 |
LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
|
| 38 |
LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
|
| 39 |
-
MAX_SOURCES_TO_PROCESS =
|
| 40 |
-
MAX_CONCURRENT_REQUESTS =
|
| 41 |
-
SEARCH_TIMEOUT =
|
| 42 |
-
TOTAL_TIMEOUT = 600 #
|
| 43 |
-
REQUEST_DELAY =
|
| 44 |
-
RETRY_ATTEMPTS =
|
| 45 |
-
RETRY_DELAY =
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
# Initialize fake user agent generator
|
| 48 |
try:
|
| 49 |
ua = UserAgent()
|
| 50 |
-
except
|
| 51 |
class SimpleUA:
|
| 52 |
def random(self):
|
| 53 |
return random.choice([
|
|
@@ -65,319 +67,710 @@ LLM_HEADERS = {
|
|
| 65 |
|
| 66 |
class DeepResearchRequest(BaseModel):
|
| 67 |
query: str
|
| 68 |
-
search_time: int =
|
| 69 |
-
|
| 70 |
-
# --- FastAPI App Initialization ---
|
| 71 |
|
| 72 |
app = FastAPI(
|
| 73 |
title="AI Deep Research API",
|
| 74 |
-
description="Provides comprehensive
|
| 75 |
-
version="
|
| 76 |
)
|
| 77 |
app.add_middleware(
|
| 78 |
CORSMiddleware,
|
| 79 |
allow_origins=["*"],
|
| 80 |
allow_credentials=True,
|
| 81 |
allow_methods=["*"],
|
| 82 |
-
allow_headers=["*"]
|
| 83 |
)
|
| 84 |
|
| 85 |
-
# --- Helper Functions ---
|
| 86 |
-
|
| 87 |
def extract_json_from_llm_response(text: str) -> Optional[list]:
|
| 88 |
-
"""
|
| 89 |
match = re.search(r'\[.*\]', text, re.DOTALL)
|
| 90 |
if match:
|
| 91 |
try:
|
| 92 |
return json.loads(match.group(0))
|
| 93 |
except json.JSONDecodeError:
|
| 94 |
-
logger.warning("Failed to decode JSON from LLM response.")
|
| 95 |
return None
|
| 96 |
return None
|
| 97 |
|
| 98 |
async def get_real_user_agent() -> str:
|
| 99 |
-
"""
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
def clean_url(url: str) -> str:
|
| 103 |
-
"""
|
| 104 |
if not url:
|
| 105 |
return ""
|
|
|
|
| 106 |
if url.startswith('//duckduckgo.com/l/'):
|
|
|
|
| 107 |
try:
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
| 112 |
pass
|
|
|
|
| 113 |
if url.startswith('//'):
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
|
|
|
| 117 |
return url
|
| 118 |
|
| 119 |
-
async def
|
| 120 |
-
"""
|
| 121 |
-
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
for attempt in range(RETRY_ATTEMPTS):
|
| 125 |
try:
|
| 126 |
-
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
html = await response.text()
|
| 129 |
soup = BeautifulSoup(html, 'html.parser')
|
|
|
|
| 130 |
results = []
|
| 131 |
-
for
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
results.append({
|
| 138 |
'title': title_elem.get_text(strip=True),
|
| 139 |
-
'link':
|
| 140 |
-
'snippet':
|
| 141 |
})
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
except Exception as e:
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
| 152 |
return []
|
| 153 |
|
| 154 |
-
async def process_web_source(session: aiohttp.ClientSession, source: dict, timeout: int = 15) -> Tuple[
|
| 155 |
-
"""
|
| 156 |
-
url = source.get('link')
|
| 157 |
-
if not url:
|
| 158 |
-
return None, source
|
| 159 |
-
|
| 160 |
headers = {'User-Agent': await get_real_user_agent()}
|
| 161 |
source_info = source.copy()
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
|
| 164 |
try:
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
html = await response.text()
|
| 171 |
soup = BeautifulSoup(html, "html.parser")
|
| 172 |
-
|
|
|
|
| 173 |
tag.decompose()
|
| 174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
if not main_content:
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
if len(content.split()) < 50:
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
except Exception as e:
|
| 186 |
-
|
| 187 |
-
|
| 188 |
|
| 189 |
async def generate_research_plan(query: str, session: aiohttp.ClientSession) -> List[str]:
|
| 190 |
-
"""
|
| 191 |
-
plan_prompt = {
|
| 192 |
-
"model": LLM_MODEL,
|
| 193 |
-
"messages": [{
|
| 194 |
-
"role": "user",
|
| 195 |
-
"content": f"""You are a research strategist. Your task is to generate 5 distinct, insightful sub-questions to form a comprehensive research plan for the topic: '{query}'.
|
| 196 |
-
These questions will guide an AI in searching the web. Focus on different facets of the topic, such as its background, current state, key components, challenges, and future trends.
|
| 197 |
-
Your response MUST be ONLY a raw JSON array of strings, with no other text or explanation.
|
| 198 |
-
Example: ["What is the history of X?", "How does X compare to its main competitors?", "What are the primary use cases for X in 2025?"]"""
|
| 199 |
-
}],
|
| 200 |
-
"temperature": 0.7,
|
| 201 |
-
"max_tokens": 500
|
| 202 |
-
}
|
| 203 |
-
fallback_plan = [
|
| 204 |
-
f"What is the foundational definition and history of {query}?",
|
| 205 |
-
f"What are the core components and key features of {query}?",
|
| 206 |
-
f"Who are the major competitors or alternatives to {query}?",
|
| 207 |
-
f"What are the primary challenges and limitations associated with {query}?",
|
| 208 |
-
f"What are the latest trends and future predictions for {query}?"
|
| 209 |
-
]
|
| 210 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=plan_prompt, timeout=30) as response:
|
| 212 |
-
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
content = result['choices'][0]['message']['content']
|
| 215 |
sub_questions = extract_json_from_llm_response(content)
|
| 216 |
-
if sub_questions and isinstance(sub_questions, list)
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
except Exception as e:
|
| 219 |
-
|
| 220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
|
| 222 |
-
async
|
| 223 |
-
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
def format_sse(data: dict) -> str:
|
| 226 |
-
"""Formats a dictionary into an SSE message string."""
|
| 227 |
return f"data: {json.dumps(data)}\n\n"
|
| 228 |
|
| 229 |
start_time = time.time()
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
| 231 |
try:
|
| 232 |
-
yield format_sse({
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
| 234 |
async with aiohttp.ClientSession() as session:
|
| 235 |
-
yield format_sse({"event": "status", "data": "
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
if not selected_sources:
|
| 257 |
-
yield format_sse({
|
|
|
|
|
|
|
|
|
|
| 258 |
return
|
| 259 |
|
| 260 |
-
yield format_sse({"event": "status", "data": f"Step 3: Processing {len(selected_sources)} sources..."})
|
| 261 |
-
|
| 262 |
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
yield format_sse({"event": "processing_source", "data": {"link": source.get('link'), "title": source.get('title')}})
|
| 267 |
-
content, source_info = await process_web_source(session, source)
|
| 268 |
-
if content:
|
| 269 |
-
yield format_sse({"event": "processed_source_success", "data": source_info})
|
| 270 |
-
else:
|
| 271 |
-
yield format_sse({"event": "processed_source_failure", "data": source_info})
|
| 272 |
-
return content, source_info
|
| 273 |
|
| 274 |
async def process_with_semaphore(source):
|
| 275 |
async with semaphore:
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
if
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
return
|
| 301 |
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
|
| 322 |
report_payload = {
|
| 323 |
"model": LLM_MODEL,
|
| 324 |
"messages": [{"role": "user", "content": report_prompt}],
|
| 325 |
"stream": True,
|
| 326 |
-
|
| 327 |
}
|
| 328 |
|
| 329 |
-
async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload
|
| 330 |
if response.status != 200:
|
| 331 |
-
|
| 332 |
-
|
|
|
|
|
|
|
| 333 |
return
|
| 334 |
|
| 335 |
-
|
| 336 |
async for line in response.content:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
line_str = line.decode('utf-8').strip()
|
| 338 |
if line_str.startswith('data:'):
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
except Exception as e:
|
| 362 |
-
|
| 363 |
-
yield format_sse({
|
|
|
|
|
|
|
|
|
|
| 364 |
finally:
|
| 365 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
|
| 367 |
@app.post("/deep-research", response_class=StreamingResponse)
|
| 368 |
async def deep_research_endpoint(request: DeepResearchRequest):
|
| 369 |
-
"""
|
| 370 |
-
query
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
search_time = max(
|
| 374 |
-
|
| 375 |
return StreamingResponse(
|
| 376 |
-
run_deep_research_stream(query, search_time),
|
| 377 |
media_type="text/event-stream",
|
| 378 |
headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
|
| 379 |
)
|
| 380 |
|
| 381 |
if __name__ == "__main__":
|
| 382 |
import uvicorn
|
| 383 |
-
uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
| 1 |
+
|
| 2 |
import os
|
| 3 |
import asyncio
|
| 4 |
import json
|
|
|
|
| 19 |
from collections import defaultdict
|
| 20 |
|
| 21 |
# --- Configuration ---
|
|
|
|
| 22 |
logging.basicConfig(
|
| 23 |
level=logging.INFO,
|
| 24 |
format='%(asctime)s - %(levelname)s - %(message)s'
|
|
|
|
| 33 |
logging.info("LLM API Key loaded successfully.")
|
| 34 |
|
| 35 |
# --- Constants & Headers ---
|
|
|
|
| 36 |
LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
|
| 37 |
LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
|
| 38 |
+
MAX_SOURCES_TO_PROCESS = 20 # Increased for more research
|
| 39 |
+
MAX_CONCURRENT_REQUESTS = 2
|
| 40 |
+
SEARCH_TIMEOUT = 300 # 5 minutes for longer research
|
| 41 |
+
TOTAL_TIMEOUT = 600 # Increased to allow more time for generation
|
| 42 |
+
REQUEST_DELAY = 3.0
|
| 43 |
+
RETRY_ATTEMPTS = 5
|
| 44 |
+
RETRY_DELAY = 5.0
|
| 45 |
+
USER_AGENT_ROTATION = True
|
| 46 |
+
CONTEXT_WINDOW_SIZE = 10000000 # 10 million tokens
|
| 47 |
+
MAX_CONTEXT_SIZE = 2000000 # Increased practical limit for prompt
|
| 48 |
|
| 49 |
# Initialize fake user agent generator
|
| 50 |
try:
|
| 51 |
ua = UserAgent()
|
| 52 |
+
except:
|
| 53 |
class SimpleUA:
|
| 54 |
def random(self):
|
| 55 |
return random.choice([
|
|
|
|
| 67 |
|
| 68 |
class DeepResearchRequest(BaseModel):
|
| 69 |
query: str
|
| 70 |
+
search_time: int = 300 # Default to 5 minutes
|
|
|
|
|
|
|
| 71 |
|
| 72 |
app = FastAPI(
|
| 73 |
title="AI Deep Research API",
|
| 74 |
+
description="Provides comprehensive research reports from real web searches within 5 minutes.",
|
| 75 |
+
version="3.0.0"
|
| 76 |
)
|
| 77 |
app.add_middleware(
|
| 78 |
CORSMiddleware,
|
| 79 |
allow_origins=["*"],
|
| 80 |
allow_credentials=True,
|
| 81 |
allow_methods=["*"],
|
| 82 |
+
allow_headers=["*"]
|
| 83 |
)
|
| 84 |
|
|
|
|
|
|
|
| 85 |
def extract_json_from_llm_response(text: str) -> Optional[list]:
|
| 86 |
+
"""Extract JSON array from LLM response text."""
|
| 87 |
match = re.search(r'\[.*\]', text, re.DOTALL)
|
| 88 |
if match:
|
| 89 |
try:
|
| 90 |
return json.loads(match.group(0))
|
| 91 |
except json.JSONDecodeError:
|
|
|
|
| 92 |
return None
|
| 93 |
return None
|
| 94 |
|
| 95 |
async def get_real_user_agent() -> str:
|
| 96 |
+
"""Get a realistic user agent string."""
|
| 97 |
+
try:
|
| 98 |
+
if isinstance(ua, UserAgent):
|
| 99 |
+
return ua.random
|
| 100 |
+
return ua.random()
|
| 101 |
+
except:
|
| 102 |
+
return "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
|
| 103 |
|
| 104 |
def clean_url(url: str) -> str:
|
| 105 |
+
"""Clean up and normalize URLs."""
|
| 106 |
if not url:
|
| 107 |
return ""
|
| 108 |
+
|
| 109 |
if url.startswith('//duckduckgo.com/l/'):
|
| 110 |
+
url = f"https:{url}"
|
| 111 |
try:
|
| 112 |
+
parsed = urlparse(url)
|
| 113 |
+
query_params = parsed.query
|
| 114 |
+
if 'uddg=' in query_params:
|
| 115 |
+
match = re.search(r'uddg=([^&]+)', query_params)
|
| 116 |
+
if match:
|
| 117 |
+
return unquote(match.group(1))
|
| 118 |
+
except:
|
| 119 |
pass
|
| 120 |
+
|
| 121 |
if url.startswith('//'):
|
| 122 |
+
url = 'https:' + url
|
| 123 |
+
elif not url.startswith(('http://', 'https://')):
|
| 124 |
+
url = 'https://' + url
|
| 125 |
+
|
| 126 |
return url
|
| 127 |
|
| 128 |
+
async def check_robots_txt(url: str) -> bool:
|
| 129 |
+
"""Check if scraping is allowed by robots.txt."""
|
| 130 |
+
try:
|
| 131 |
+
domain_match = re.search(r'https?://([^/]+)', url)
|
| 132 |
+
if not domain_match:
|
| 133 |
+
return False
|
| 134 |
+
|
| 135 |
+
domain = domain_match.group(1)
|
| 136 |
+
robots_url = f"https://{domain}/robots.txt"
|
| 137 |
+
|
| 138 |
+
async with aiohttp.ClientSession() as session:
|
| 139 |
+
headers = {'User-Agent': await get_real_user_agent()}
|
| 140 |
+
async with session.get(robots_url, headers=headers, timeout=5) as response:
|
| 141 |
+
if response.status == 200:
|
| 142 |
+
robots = await response.text()
|
| 143 |
+
if "Disallow: /" in robots:
|
| 144 |
+
return False
|
| 145 |
+
path = re.sub(r'https?://[^/]+', '', url)
|
| 146 |
+
if any(f"Disallow: {p}" in robots for p in [path, path.rstrip('/') + '/']):
|
| 147 |
+
return False
|
| 148 |
+
return True
|
| 149 |
+
except Exception as e:
|
| 150 |
+
logging.warning(f"Could not check robots.txt for {url}: {e}")
|
| 151 |
+
return False
|
| 152 |
+
|
| 153 |
+
async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
|
| 154 |
+
"""Perform a real search using DuckDuckGo's HTML interface with robust retry logic."""
|
| 155 |
+
headers = {
|
| 156 |
+
"User-Agent": await get_real_user_agent(),
|
| 157 |
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
| 158 |
+
"Accept-Language": "en-US,en;q=0.5",
|
| 159 |
+
"Referer": "https://duckduckgo.com/",
|
| 160 |
+
"DNT": "1"
|
| 161 |
+
}
|
| 162 |
|
| 163 |
for attempt in range(RETRY_ATTEMPTS):
|
| 164 |
try:
|
| 165 |
+
search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
|
| 166 |
+
async with aiohttp.ClientSession() as session:
|
| 167 |
+
async with session.get(search_url, headers=headers, timeout=10) as response:
|
| 168 |
+
if response.status != 200:
|
| 169 |
+
if response.status == 202:
|
| 170 |
+
logging.warning(f"Search attempt {attempt + 1} failed with status 202 for query '{query}'")
|
| 171 |
+
if attempt < RETRY_ATTEMPTS - 1:
|
| 172 |
+
await asyncio.sleep(RETRY_DELAY)
|
| 173 |
+
continue
|
| 174 |
+
logging.warning(f"Search failed with status {response.status} for query '{query}'")
|
| 175 |
+
return []
|
| 176 |
+
|
| 177 |
html = await response.text()
|
| 178 |
soup = BeautifulSoup(html, 'html.parser')
|
| 179 |
+
|
| 180 |
results = []
|
| 181 |
+
for selector in ['.result__body', '.result__a', '.result']:
|
| 182 |
+
if len(results) >= max_results:
|
| 183 |
+
break
|
| 184 |
+
|
| 185 |
+
for result in soup.select(selector)[:max_results]:
|
| 186 |
+
try:
|
| 187 |
+
title_elem = result.select_one('.result__title .result__a') or result.select_one('.result__a')
|
| 188 |
+
if not title_elem:
|
| 189 |
+
continue
|
| 190 |
+
|
| 191 |
+
link = title_elem['href']
|
| 192 |
+
snippet_elem = result.select_one('.result__snippet')
|
| 193 |
+
|
| 194 |
+
clean_link = clean_url(link)
|
| 195 |
+
if not clean_link or clean_link.startswith('javascript:'):
|
| 196 |
+
continue
|
| 197 |
+
|
| 198 |
+
snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""
|
| 199 |
results.append({
|
| 200 |
'title': title_elem.get_text(strip=True),
|
| 201 |
+
'link': clean_link,
|
| 202 |
+
'snippet': snippet
|
| 203 |
})
|
| 204 |
+
except Exception as e:
|
| 205 |
+
logging.warning(f"Error parsing search result: {e}")
|
| 206 |
+
continue
|
| 207 |
+
|
| 208 |
+
logging.info(f"Found {len(results)} real search results for '{query}'")
|
| 209 |
+
return results[:max_results]
|
| 210 |
except Exception as e:
|
| 211 |
+
logging.error(f"Search attempt {attempt + 1} failed for '{query}': {e}")
|
| 212 |
+
if attempt < RETRY_ATTEMPTS - 1:
|
| 213 |
+
await asyncio.sleep(RETRY_DELAY)
|
| 214 |
+
continue
|
| 215 |
+
logging.error(f"All {RETRY_ATTEMPTS} search attempts failed for '{query}'")
|
| 216 |
return []
|
| 217 |
|
| 218 |
+
async def process_web_source(session: aiohttp.ClientSession, source: dict, timeout: int = 15) -> Tuple[str, dict]:
|
| 219 |
+
"""Process a real web source with improved content extraction and error handling."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
headers = {'User-Agent': await get_real_user_agent()}
|
| 221 |
source_info = source.copy()
|
| 222 |
+
source_info['link'] = clean_url(source['link'])
|
| 223 |
+
|
| 224 |
+
if not source_info['link'] or not source_info['link'].startswith(('http://', 'https://')):
|
| 225 |
+
logging.warning(f"Invalid URL: {source_info['link']}")
|
| 226 |
+
return source.get('snippet', ''), source_info
|
| 227 |
+
|
| 228 |
+
if not await check_robots_txt(source_info['link']):
|
| 229 |
+
logging.info(f"Scraping disallowed by robots.txt for {source_info['link']}")
|
| 230 |
+
return source.get('snippet', ''), source_info
|
| 231 |
|
| 232 |
try:
|
| 233 |
+
logging.info(f"Processing source: {source_info['link']}")
|
| 234 |
+
start_time = time.time()
|
| 235 |
+
|
| 236 |
+
if any(source_info['link'].lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx']):
|
| 237 |
+
logging.info(f"Skipping non-HTML content at {source_info['link']}")
|
| 238 |
+
return source.get('snippet', ''), source_info
|
| 239 |
+
|
| 240 |
+
await asyncio.sleep(REQUEST_DELAY)
|
| 241 |
+
|
| 242 |
+
async with session.get(source_info['link'], headers=headers, timeout=timeout, ssl=False) as response:
|
| 243 |
+
if response.status != 200:
|
| 244 |
+
logging.warning(f"HTTP {response.status} for {source_info['link']}")
|
| 245 |
+
return source.get('snippet', ''), source_info
|
| 246 |
+
|
| 247 |
+
content_type = response.headers.get('Content-Type', '').lower()
|
| 248 |
+
if 'text/html' not in content_type:
|
| 249 |
+
logging.info(f"Non-HTML content at {source_info['link']} (type: {content_type})")
|
| 250 |
+
return source.get('snippet', ''), source_info
|
| 251 |
+
|
| 252 |
html = await response.text()
|
| 253 |
soup = BeautifulSoup(html, "html.parser")
|
| 254 |
+
|
| 255 |
+
for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe', 'noscript', 'form']):
|
| 256 |
tag.decompose()
|
| 257 |
+
|
| 258 |
+
selectors_to_try = [
|
| 259 |
+
'main',
|
| 260 |
+
'article',
|
| 261 |
+
'[role="main"]',
|
| 262 |
+
'.main-content',
|
| 263 |
+
'.content',
|
| 264 |
+
'.article-body',
|
| 265 |
+
'.post-content',
|
| 266 |
+
'.entry-content',
|
| 267 |
+
'#content',
|
| 268 |
+
'#main',
|
| 269 |
+
'.main',
|
| 270 |
+
'.article'
|
| 271 |
+
]
|
| 272 |
+
|
| 273 |
+
main_content = None
|
| 274 |
+
for selector in selectors_to_try:
|
| 275 |
+
main_content = soup.select_one(selector)
|
| 276 |
+
if main_content:
|
| 277 |
+
break
|
| 278 |
+
|
| 279 |
if not main_content:
|
| 280 |
+
all_elements = soup.find_all()
|
| 281 |
+
candidates = [el for el in all_elements if el.name not in ['script', 'style', 'nav', 'footer', 'header']]
|
| 282 |
+
if candidates:
|
| 283 |
+
candidates.sort(key=lambda x: len(x.get_text()), reverse=True)
|
| 284 |
+
main_content = candidates[0] if candidates else soup
|
| 285 |
+
|
| 286 |
+
if not main_content:
|
| 287 |
+
main_content = soup.find('body') or soup
|
| 288 |
+
|
| 289 |
+
content = " ".join(main_content.stripped_strings)
|
| 290 |
+
content = re.sub(r'\s+', ' ', content).strip()
|
| 291 |
+
|
| 292 |
+
if len(content.split()) < 50 and len(html) > 10000:
|
| 293 |
+
paras = soup.find_all('p')
|
| 294 |
+
content = " ".join([p.get_text() for p in paras if p.get_text().strip()])
|
| 295 |
+
content = re.sub(r'\s+', ' ', content).strip()
|
| 296 |
+
|
| 297 |
if len(content.split()) < 50:
|
| 298 |
+
content = " ".join(soup.stripped_strings)
|
| 299 |
+
content = re.sub(r'\s+', ' ', content).strip()
|
| 300 |
+
|
| 301 |
+
if len(content.split()) < 30:
|
| 302 |
+
for tag in ['div', 'section', 'article']:
|
| 303 |
+
for element in soup.find_all(tag):
|
| 304 |
+
if len(element.get_text().split()) > 200:
|
| 305 |
+
content = " ".join(element.stripped_strings)
|
| 306 |
+
content = re.sub(r'\s+', ' ', content).strip()
|
| 307 |
+
if len(content.split()) >= 30:
|
| 308 |
+
break
|
| 309 |
+
if len(content.split()) >= 30:
|
| 310 |
+
break
|
| 311 |
+
|
| 312 |
+
if len(content.split()) < 30:
|
| 313 |
+
logging.warning(f"Very little content extracted from {source_info['link']}")
|
| 314 |
+
return source.get('snippet', ''), source_info
|
| 315 |
+
|
| 316 |
+
source_info['word_count'] = len(content.split())
|
| 317 |
+
source_info['processing_time'] = time.time() - start_time
|
| 318 |
+
return content, source_info
|
| 319 |
+
|
| 320 |
+
except asyncio.TimeoutError:
|
| 321 |
+
logging.warning(f"Timeout while processing {source_info['link']}")
|
| 322 |
+
return source.get('snippet', ''), source_info
|
| 323 |
except Exception as e:
|
| 324 |
+
logging.warning(f"Error processing {source_info['link']}: {str(e)[:200]}")
|
| 325 |
+
return source.get('snippet', ''), source_info
|
| 326 |
|
| 327 |
async def generate_research_plan(query: str, session: aiohttp.ClientSession) -> List[str]:
|
| 328 |
+
"""Generate a comprehensive research plan with sub-questions."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
try:
|
| 330 |
+
plan_prompt = {
|
| 331 |
+
"model": LLM_MODEL,
|
| 332 |
+
"messages": [{
|
| 333 |
+
"role": "user",
|
| 334 |
+
"content": f"""Generate 4-6 comprehensive sub-questions for in-depth research on '{query}'.
|
| 335 |
+
Focus on key aspects that would provide a complete understanding of the topic.
|
| 336 |
+
Your response MUST be ONLY the raw JSON array with no additional text.
|
| 337 |
+
Example: ["What is the historical background of X?", "What are the current trends in X?"]"""
|
| 338 |
+
}],
|
| 339 |
+
"temperature": 0.7,
|
| 340 |
+
"max_tokens": 300
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=plan_prompt, timeout=30) as response:
|
| 344 |
+
response.raise_for_status()
|
| 345 |
+
result = await response.json()
|
| 346 |
+
|
| 347 |
+
if isinstance(result, list):
|
| 348 |
+
return result
|
| 349 |
+
elif isinstance(result, dict) and 'choices' in result:
|
| 350 |
content = result['choices'][0]['message']['content']
|
| 351 |
sub_questions = extract_json_from_llm_response(content)
|
| 352 |
+
if sub_questions and isinstance(sub_questions, list):
|
| 353 |
+
cleaned = []
|
| 354 |
+
for q in sub_questions:
|
| 355 |
+
if isinstance(q, str) and q.strip():
|
| 356 |
+
cleaned_q = re.sub(r'^[^a-zA-Z0-9]*|[^a-zA-Z0-9]*$', '', q)
|
| 357 |
+
if cleaned_q:
|
| 358 |
+
cleaned.append(cleaned_q)
|
| 359 |
+
return cleaned[:6]
|
| 360 |
+
|
| 361 |
+
return [
|
| 362 |
+
f"What is {query} and its key features?",
|
| 363 |
+
f"How does {query} compare to alternatives?",
|
| 364 |
+
f"What are the current developments in {query}?",
|
| 365 |
+
f"What are the main challenges with {query}?",
|
| 366 |
+
f"What does the future hold for {query}?"
|
| 367 |
+
]
|
| 368 |
except Exception as e:
|
| 369 |
+
logging.error(f"Failed to generate research plan: {e}")
|
| 370 |
+
return [
|
| 371 |
+
f"What is {query}?",
|
| 372 |
+
f"What are the key aspects of {query}?",
|
| 373 |
+
f"What are current trends in {query}?",
|
| 374 |
+
f"What are the challenges with {query}?"
|
| 375 |
+
]
|
| 376 |
+
|
| 377 |
+
async def continuous_search(query: str, search_time: int = 300) -> AsyncGenerator[Dict[str, any], None]:
|
| 378 |
+
"""Perform continuous searching with retries and diverse queries, yielding updates for each new result."""
|
| 379 |
+
start_time = time.time()
|
| 380 |
+
all_results = []
|
| 381 |
+
seen_urls = set()
|
| 382 |
+
fallback_results = []
|
| 383 |
+
|
| 384 |
+
query_variations = [
|
| 385 |
+
query,
|
| 386 |
+
f"{query} comparison",
|
| 387 |
+
f"{query} review",
|
| 388 |
+
f"{query} latest developments",
|
| 389 |
+
f"{query} features and benefits",
|
| 390 |
+
f"{query} challenges and limitations"
|
| 391 |
+
]
|
| 392 |
|
| 393 |
+
async with aiohttp.ClientSession() as session:
|
| 394 |
+
iteration = 0
|
| 395 |
+
result_count = 0
|
| 396 |
+
while time.time() - start_time < search_time:
|
| 397 |
+
iteration += 1
|
| 398 |
+
random.shuffle(query_variations)
|
| 399 |
+
for q in query_variations:
|
| 400 |
+
if time.time() - start_time >= search_time:
|
| 401 |
+
logger.info(f"Search timed out after {search_time} seconds. Found {len(all_results)} results.")
|
| 402 |
+
break
|
| 403 |
+
|
| 404 |
+
logger.info(f"Iteration {iteration}: Searching for query variation: {q}")
|
| 405 |
+
yield {"event": "status", "data": f"Searching for '{q}'..."}
|
| 406 |
+
|
| 407 |
+
try:
|
| 408 |
+
results = await fetch_search_results(q, max_results=5)
|
| 409 |
+
logger.info(f"Retrieved {len(results)} results for query '{q}'")
|
| 410 |
+
for result in results:
|
| 411 |
+
clean_link = clean_url(result['link'])
|
| 412 |
+
if clean_link and clean_link not in seen_urls:
|
| 413 |
+
seen_urls.add(clean_link)
|
| 414 |
+
result['link'] = clean_link
|
| 415 |
+
all_results.append(result)
|
| 416 |
+
fallback_results.append(result)
|
| 417 |
+
result_count += 1
|
| 418 |
+
logger.info(f"Added new result: {result['title']} ({result['link']})")
|
| 419 |
+
yield {"event": "found_result", "data": f"Found result {result_count}: {result['title']} ({result['link']})"}
|
| 420 |
+
|
| 421 |
+
await asyncio.sleep(REQUEST_DELAY)
|
| 422 |
+
if len(all_results) >= MAX_SOURCES_TO_PROCESS * 1.5:
|
| 423 |
+
logger.info(f"Reached sufficient results: {len(all_results)}")
|
| 424 |
+
break
|
| 425 |
+
except Exception as e:
|
| 426 |
+
logger.error(f"Error during search for '{q}': {e}")
|
| 427 |
+
yield {"event": "warning", "data": f"Search error for '{q}': {str(e)[:100]}"}
|
| 428 |
+
await asyncio.sleep(RETRY_DELAY)
|
| 429 |
+
|
| 430 |
+
if len(all_results) >= MAX_SOURCES_TO_PROCESS * 1.5:
|
| 431 |
+
break
|
| 432 |
+
|
| 433 |
+
logger.info(f"Completed continuous search. Total results: {len(all_results)}")
|
| 434 |
+
|
| 435 |
+
if len(all_results) < MAX_SOURCES_TO_PROCESS:
|
| 436 |
+
logger.warning(f"Insufficient results ({len(all_results)}), using fallback results")
|
| 437 |
+
yield {"event": "warning", "data": f"Insufficient results, using fallback results to reach minimum."}
|
| 438 |
+
all_results.extend(fallback_results[:MAX_SOURCES_TO_PROCESS - len(all_results)])
|
| 439 |
+
|
| 440 |
+
if all_results:
|
| 441 |
+
def score_result(result):
|
| 442 |
+
query_terms = set(query.lower().split())
|
| 443 |
+
title = result['title'].lower()
|
| 444 |
+
snippet = result['snippet'].lower()
|
| 445 |
+
matches = sum(1 for term in query_terms if term in title or term in snippet)
|
| 446 |
+
snippet_length = len(result['snippet'].split())
|
| 447 |
+
return matches * 10 + snippet_length
|
| 448 |
+
|
| 449 |
+
all_results.sort(key=score_result, reverse=True)
|
| 450 |
+
|
| 451 |
+
yield {"event": "final_search_results", "data": all_results[:MAX_SOURCES_TO_PROCESS * 2]}
|
| 452 |
+
|
| 453 |
+
async def filter_and_select_sources(results: List[dict]) -> List[dict]:
|
| 454 |
+
"""Filter and select the best sources from search results."""
|
| 455 |
+
if not results:
|
| 456 |
+
logger.warning("No search results to filter.")
|
| 457 |
+
return []
|
| 458 |
+
|
| 459 |
+
logger.info(f"Filtering {len(results)} search results...")
|
| 460 |
+
|
| 461 |
+
domain_counts = defaultdict(int)
|
| 462 |
+
domain_results = defaultdict(list)
|
| 463 |
+
for result in results:
|
| 464 |
+
domain = urlparse(result['link']).netloc
|
| 465 |
+
domain_counts[domain] += 1
|
| 466 |
+
domain_results[domain].append(result)
|
| 467 |
+
|
| 468 |
+
selected = []
|
| 469 |
+
for domain, domain_res in domain_results.items():
|
| 470 |
+
if len(selected) >= MAX_SOURCES_TO_PROCESS:
|
| 471 |
+
break
|
| 472 |
+
if domain_res:
|
| 473 |
+
selected.append(domain_res[0])
|
| 474 |
+
logger.info(f"Selected top result from domain {domain}: {domain_res[0]['link']}")
|
| 475 |
+
|
| 476 |
+
if len(selected) < MAX_SOURCES_TO_PROCESS:
|
| 477 |
+
domain_quality = {}
|
| 478 |
+
for domain, domain_res in domain_results.items():
|
| 479 |
+
avg_length = sum(len(r['snippet'].split()) for r in domain_res) / len(domain_res)
|
| 480 |
+
domain_quality[domain] = avg_length
|
| 481 |
+
|
| 482 |
+
sorted_domains = sorted(domain_quality.items(), key=lambda x: x[1], reverse=True)
|
| 483 |
+
for domain, _ in sorted_domains:
|
| 484 |
+
if len(selected) >= MAX_SOURCES_TO_PROCESS:
|
| 485 |
+
break
|
| 486 |
+
for res in domain_results[domain]:
|
| 487 |
+
if res not in selected:
|
| 488 |
+
selected.append(res)
|
| 489 |
+
logger.info(f"Added additional result from high-quality domain {domain}: {res['link']}")
|
| 490 |
+
if len(selected) >= MAX_SOURCES_TO_PROCESS:
|
| 491 |
+
break
|
| 492 |
+
|
| 493 |
+
if len(selected) < MAX_SOURCES_TO_PROCESS:
|
| 494 |
+
all_results_sorted = sorted(results, key=lambda x: len(x['snippet'].split()), reverse=True)
|
| 495 |
+
for res in all_results_sorted:
|
| 496 |
+
if res not in selected:
|
| 497 |
+
selected.append(res)
|
| 498 |
+
logger.info(f"Added fallback high-snippet result: {res['link']}")
|
| 499 |
+
if len(selected) >= MAX_SOURCES_TO_PROCESS:
|
| 500 |
+
break
|
| 501 |
+
|
| 502 |
+
logger.info(f"Selected {len(selected)} sources after filtering.")
|
| 503 |
+
return selected[:MAX_SOURCES_TO_PROCESS]
|
| 504 |
+
|
| 505 |
+
async def run_deep_research_stream(query: str, search_time: int = 300) -> AsyncGenerator[str, None]:
    """Run the full deep-research pipeline for `query`, yielding SSE-formatted events.

    Pipeline stages (each announced via a "status" event):
      1. Generate a research plan (falls back to 4 template questions on failure).
      2. Continuous web search for up to `search_time` seconds.
      3. Filter/select sources, then fetch+extract their content concurrently.
      4. Stream an LLM-synthesized report back to the client in "chunk" events.

    Args:
        query: The research topic supplied by the caller.
        search_time: Upper bound, in seconds, for the search phase.

    Yields:
        Strings of the form ``data: <json>\\n\\n`` (Server-Sent Events). Event
        names emitted: status, plan, found_sources, selected_sources,
        processed_source, warning, chunk, stats, sources, error, complete.
    """
    def format_sse(data: dict) -> str:
        # Serialize one dict as a single SSE "data:" frame.
        return f"data: {json.dumps(data)}\n\n"

    start_time = time.time()
    # Running counters reported in the final "stats" event.
    processed_sources = 0
    successful_sources = 0
    total_tokens = 0  # rough estimate: whitespace-split word count of extracted content

    try:
        yield format_sse({
            "event": "status",
            "data": f"Starting deep research on '{query}'. Search time limit: {search_time} seconds."
        })

        # One shared HTTP session for plan generation, source fetching, and the LLM call.
        async with aiohttp.ClientSession() as session:
            yield format_sse({"event": "status", "data": "Generating comprehensive research plan..."})
            try:
                sub_questions = await generate_research_plan(query, session)
                yield format_sse({"event": "plan", "data": sub_questions})
            except Exception as e:
                # Plan generation is best-effort: report the error, then fall back
                # to a generic 4-question template so research can continue.
                yield format_sse({
                    "event": "error",
                    "data": f"Failed to generate research plan: {str(e)[:200]}"
                })
                sub_questions = [
                    f"What is {query}?",
                    f"What are the key aspects of {query}?",
                    f"What are current trends in {query}?",
                    f"What are the challenges with {query}?"
                ]
                yield format_sse({"event": "plan", "data": sub_questions})

            yield format_sse({
                "event": "status",
                "data": f"Performing continuous search for up to {search_time} seconds..."
            })

            # continuous_search streams progress updates; the sentinel event
            # "final_search_results" carries the accumulated result list and is
            # consumed here rather than forwarded to the client.
            search_results = []
            async for update in continuous_search(query, search_time):
                if update["event"] == "final_search_results":
                    search_results = update["data"]
                else:
                    yield format_sse(update)

            yield format_sse({
                "event": "status",
                "data": f"Found {len(search_results)} potential sources. Selecting the best ones..."
            })
            yield format_sse({
                "event": "found_sources",
                "data": search_results
            })

            if not search_results:
                yield format_sse({
                    "event": "error",
                    "data": "No search results found. Check your query and try again."
                })
                return

            selected_sources = await filter_and_select_sources(search_results)
            yield format_sse({
                "event": "status",
                "data": f"Selected {len(selected_sources)} high-quality sources to process."
            })
            yield format_sse({
                "event": "selected_sources",
                "data": selected_sources
            })

            if not selected_sources:
                yield format_sse({
                    "event": "error",
                    "data": "No valid sources found after filtering."
                })
                return

            # Bound concurrent page fetches; context accumulated for the LLM prompt.
            semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
            consolidated_context = ""
            all_sources_used = []
            processing_errors = 0

            async def process_with_semaphore(source):
                # Serialize access through the semaphore; presumably returns a
                # (content, source_info) tuple — see unpacking below. TODO confirm
                # process_web_source never raises, or the whole stream aborts.
                async with semaphore:
                    return await process_web_source(session, source, timeout=20)

            processing_tasks = []
            for i, source in enumerate(selected_sources):
                elapsed = time.time() - start_time
                # Stop scheduling new fetches once 80% of the total budget is spent,
                # leaving headroom for report synthesis.
                if elapsed > TOTAL_TIMEOUT * 0.8:
                    yield format_sse({
                        "event": "status",
                        "data": f"Approaching time limit, stopping source processing at {i}/{len(selected_sources)}"
                    })
                    break

                if i > 0:
                    # Light pacing between task launches to avoid hammering hosts.
                    await asyncio.sleep(REQUEST_DELAY * 0.5)

                task = asyncio.create_task(process_with_semaphore(source))
                processing_tasks.append(task)

                # NOTE(review): this status fires when tasks are *scheduled*, not
                # finished — "Processed" slightly overstates progress here.
                if (i + 1) % 2 == 0 or (i + 1) == len(selected_sources):
                    yield format_sse({
                        "event": "status",
                        "data": f"Processed {min(i+1, len(selected_sources))}/{len(selected_sources)} sources..."
                    })

            # Drain results as they complete, in completion (not submission) order.
            for future in asyncio.as_completed(processing_tasks):
                processed_sources += 1
                content, source_info = await future
                if content and content.strip():
                    consolidated_context += f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n"
                    all_sources_used.append(source_info)
                    successful_sources += 1
                    total_tokens += len(content.split())
                    yield format_sse({
                        "event": "processed_source",
                        "data": source_info
                    })
                else:
                    processing_errors += 1
                    yield format_sse({
                        "event": "warning",
                        "data": f"Failed to extract content from {source_info['link']}"
                    })

            if not consolidated_context.strip():
                yield format_sse({
                    "event": "error",
                    "data": f"Failed to extract content from any sources. {processing_errors} errors occurred."
                })
                return

            # NOTE(review): time_remaining is computed but never used below.
            time_remaining = max(0, TOTAL_TIMEOUT - (time.time() - start_time))
            yield format_sse({
                "event": "status",
                "data": f"Synthesizing comprehensive report from {successful_sources} sources..."
            })

            max_output_tokens = 16000  # Fixed to allow long response

            report_prompt = f"""Compose an in-depth analysis report on "{query}".

Generate a very long, detailed report leveraging the large context window of 10 million tokens. Provide thorough, deep analysis with extensive details, examples, and insights in each section. Expand on each point with sub-sections, data, and comprehensive explanations to make the report as long and informative as possible, aiming for 5,000 to 10,000 words.

Structure the report with these sections:
1. Introduction and Background
2. Key Features and Capabilities
3. Comparative Analysis with Alternatives
4. Current Developments and Trends
5. Challenges and Limitations
6. Future Outlook
7. Conclusion and Recommendations

For each section, provide detailed analysis based on the source material.
Include specific examples and data points from the sources when available.
Compare and contrast different viewpoints from various sources.

Use markdown formatting for headings, subheadings, lists, and emphasis.
Cite sources where appropriate using inline citations like [1][2].

Available information from {successful_sources} sources:
{consolidated_context[:MAX_CONTEXT_SIZE]}

Generate a comprehensive report of approximately 5,000 to 10,000 words.
Focus on providing deep insights, analysis, and actionable information.
"""
            # NOTE(review): MAX_CONTEXT_SIZE is not among the constants visible at
            # the top of this file — confirm it is defined before this runs.

            report_payload = {
                "model": LLM_MODEL,
                "messages": [{"role": "user", "content": report_prompt}],
                "stream": True,
                "max_tokens": max_output_tokens
            }

            # Stream the LLM's OpenAI-style SSE response and re-chunk it for our client.
            async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
                if response.status != 200:
                    yield format_sse({
                        "event": "error",
                        "data": f"Failed to generate report: HTTP {response.status}"
                    })
                    return

                # Accumulate deltas and flush in ~100+ char chunks to reduce event spam.
                buffer = ""
                async for line in response.content:
                    if time.time() - start_time > TOTAL_TIMEOUT:
                        yield format_sse({
                            "event": "warning",
                            "data": "Time limit reached, ending report generation early."
                        })
                        break

                    line_str = line.decode('utf-8').strip()
                    if line_str.startswith('data:'):
                        line_str = line_str[5:].strip()
                        if line_str == "[DONE]":
                            if buffer:
                                yield format_sse({"event": "chunk", "data": buffer})
                            break
                        if not line_str:
                            continue  # Skip empty lines
                        try:
                            chunk = json.loads(line_str)
                            choices = chunk.get("choices")
                            if choices and isinstance(choices, list) and len(choices) > 0:
                                content = choices[0].get("delta", {}).get("content")
                                if content:
                                    buffer += content
                                    if len(buffer) > 100:
                                        yield format_sse({"event": "chunk", "data": buffer})
                                        buffer = ""
                        except json.JSONDecodeError as e:
                            # Malformed stream line: log and keep reading.
                            logging.warning(f"JSON decode error for line: {line_str} - {e}")
                            continue
                        except Exception as e:
                            logging.warning(f"Error processing stream chunk: {e}")
                            continue

                # Flush any trailing partial chunk (e.g. after a timeout break).
                if buffer:
                    yield format_sse({"event": "chunk", "data": buffer})

            duration = time.time() - start_time
            stats = {
                "total_time_seconds": round(duration),
                "sources_processed": processed_sources,
                "sources_successful": successful_sources,
                "estimated_tokens": total_tokens,
                "sources_used": len(all_sources_used)
            }
            yield format_sse({
                "event": "status",
                "data": f"Research completed successfully in {duration:.1f} seconds."
            })
            yield format_sse({"event": "stats", "data": stats})
            yield format_sse({"event": "sources", "data": all_sources_used})

    except asyncio.TimeoutError:
        yield format_sse({
            "event": "error",
            "data": f"Research process timed out after {TOTAL_TIMEOUT} seconds."
        })
    except Exception as e:
        # Last-resort handler: log with traceback, surface a truncated message.
        logging.error(f"Critical error in research process: {e}", exc_info=True)
        yield format_sse({
            "event": "error",
            "data": f"An unexpected error occurred: {str(e)[:200]}"
        })
    finally:
        # NOTE(review): yielding inside `finally` of an async generator raises
        # RuntimeError if the generator is being closed (e.g. client disconnect)
        # — confirm the ASGI server tolerates this.
        duration = time.time() - start_time
        yield format_sse({
            "event": "complete",
            "data": f"Research process finished after {duration:.1f} seconds."
        })
@app.post("/deep-research", response_class=StreamingResponse)
async def deep_research_endpoint(request: DeepResearchRequest):
    """Endpoint for deep research that streams SSE responses."""
    # Reject missing or too-short queries up front with a 400.
    raw_query = request.query
    if not raw_query or len(raw_query.strip()) < 3:
        raise HTTPException(status_code=400, detail="Query must be at least 3 characters long")

    # Clamp the requested search window into [60, 300] seconds (5 minutes max).
    window = max(60, min(request.search_time, 300))

    return StreamingResponse(
        run_deep_research_stream(raw_query.strip(), window),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
    )
|
| 774 |
# Script entry point: serve the FastAPI app directly with uvicorn on all
# interfaces, port 8000 (hosted deployments typically import `app` instead).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)