import os
import asyncio
from fastapi import FastAPI, HTTPException, Query
from dotenv import load_dotenv
import aiohttp
from bs4 import BeautifulSoup
import logging

# --- Configuration ---
# Configure logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

load_dotenv()
LLM_API_KEY = os.getenv("LLM_API_KEY")

if not LLM_API_KEY:
    raise RuntimeError("LLM_API_KEY must be set in a .env file.")

# Snapzion Search API Configuration
SNAPZION_API_URL = "https://search.snapzion.com/get-snippets"
SNAPZION_HEADERS = {
    'accept': '*/*',
    'accept-language': 'en-US,en;q=0.9',
    'content-type': 'application/json',
    'origin': 'https://search.snapzion.com',
    'priority': 'u=1, i',
    'referer': 'https://search.snapzion.com/docs',
    'sec-ch-ua': '"Chromium";v="140", "Not=A?Brand";v="24", "Google Chrome";v="140"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36',
}

# ***** CHANGE 1: Add general-purpose browser headers for scraping *****
SCRAPING_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}

# LLM Configuration
LLM_API_URL = "https://api.inference.net/v1/chat/completions"
LLM_MODEL = "meta-llama/llama-3.1-8b-instruct/fp-8"

# --- FastAPI App Initialization ---
app = FastAPI(
    title="AI Search Snippets API (Snapzion)",
    description="Provides AI-generated summaries from Snapzion search results.",
    version="1.1.0" # Version bump for new resilience feature
)

# --- Core Asynchronous Functions ---

async def call_snapzion_search(session: aiohttp.ClientSession, query: str) -> list:
    """Queries the Snapzion search API and returns the list of organic results."""
    try:
        async with session.post(SNAPZION_API_URL, headers=SNAPZION_HEADERS, json={"query": query}, timeout=15) as response:
            response.raise_for_status()
            data = await response.json()
            return data.get("organic_results", [])
    except Exception as e:
        logger.error(f"Snapzion API call failed: {e}")
        raise HTTPException(status_code=503, detail=f"Search service (Snapzion) failed: {e}")

# ***** CHANGE 2: Improve the scraping function *****
async def scrape_url(session: aiohttp.ClientSession, url: str) -> str:
    """Asynchronously scrapes text from a URL, now with browser headers."""
    if url.lower().endswith('.pdf'):
        return "Error: Content is a PDF, which cannot be scraped."
    try:
        # Use the new scraping headers to look like a real browser
        async with session.get(url, headers=SCRAPING_HEADERS, timeout=10, ssl=False) as response:
            if response.status != 200:
                logger.warning(f"Failed to fetch {url}, status code: {response.status}")
                return f"Error: Failed to fetch with status {response.status}"
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                tag.decompose()
            return " ".join(soup.stripped_strings)
    except Exception as e:
        logger.warning(f"Could not scrape {url}. Reason: {e}")
        return f"Error: Could not scrape. Reason: {e}"

async def get_ai_snippet(query: str, context: str, sources: list) -> str:
    """Builds a citation-style prompt from the context and sources, then asks the LLM for a cited answer."""
    headers = {"Authorization": f"Bearer {LLM_API_KEY}", "Content-Type": "application/json"}
    source_list_str = "\n".join([f"[{i+1}] {source['title']}: {source['link']}" for i, source in enumerate(sources)])
    prompt = f"""
Based *only* on the provided context, provide a concise, factual answer to the user's query. Cite every sentence with the corresponding source number(s), like `[1]` or `[2, 3]`.

Sources:
{source_list_str}

Context:
---
{context}
---

User Query: "{query}"

Answer with citations:
"""
    data = {"model": LLM_MODEL, "messages": [{"role": "user", "content": prompt}], "max_tokens": 500}
    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(LLM_API_URL, headers=headers, json=data, timeout=45) as response:
                response.raise_for_status()
                result = await response.json()
                return result['choices'][0]['message']['content']
        except Exception as e:
            logger.error(f"LLM API call failed: {e}")
            raise HTTPException(status_code=502, detail=f"Failed to get response from LLM: {e}")

# --- API Endpoint ---

@app.get("/search")
async def ai_search(q: str = Query(..., min_length=3, description="The search query.")):
    """Searches Snapzion, scrapes the top results, and returns an AI-generated summary with sources."""
    async with aiohttp.ClientSession() as session:
        search_results = await call_snapzion_search(session, q)
        if not search_results:
            raise HTTPException(status_code=404, detail="Could not find any relevant sources for the query.")
        
        sources = search_results[:5] # Use top 5 sources
        scrape_tasks = [scrape_url(session, source["link"]) for source in sources]
        scraped_contents = await asyncio.gather(*scrape_tasks)

        # ***** CHANGE 3: Implement the robust fallback logic *****
        successful_scrapes = [content for content in scraped_contents if not content.startswith("Error:")]
        
        full_context = ""
        if successful_scrapes:
            logger.info(f"Successfully scraped {len(successful_scrapes)} out of {len(sources)} sources.")
            # Build context from successfully scraped content
            full_context = "\n\n".join(
                f"Source [{i+1}] ({sources[i]['link']}):\n{scraped_contents[i]}"
                for i in range(len(sources)) if not scraped_contents[i].startswith("Error:")
            )
        else:
            # If ALL scrapes failed, fall back to using the snippets from the search API
            logger.warning("All scraping attempts failed. Falling back to using API snippets for context.")
            full_context = "\n\n".join(
                f"Source [{i+1}] ({source['link']}):\n{source['snippet']}"
                for i, source in enumerate(sources)
            )

        if not full_context.strip():
            # This is a final safety net, should rarely be hit now
            raise HTTPException(status_code=500, detail="Could not construct any context from sources or snippets.")

        ai_summary = await get_ai_snippet(q, full_context, sources)

    return {"ai_summary": ai_summary, "sources": sources}

@app.get("/")
def root():
    return {"message": "AI Search API is active. Use the /docs endpoint to test."}