import os
import asyncio
from fastapi import FastAPI, HTTPException, Query
from dotenv import load_dotenv
import aiohttp
from bs4 import BeautifulSoup
import logging
# --- Configuration ---
# Configure logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
load_dotenv()
LLM_API_KEY = os.getenv("LLM_API_KEY")
if not LLM_API_KEY:
    raise RuntimeError("LLM_API_KEY must be set in a .env file.")
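# A minimal .env sketch (LLM_API_KEY is the only variable this module reads;
# the value below is a placeholder, not a real credential):
#
#   LLM_API_KEY=sk-your-inference-net-key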
# Snapzion Search API Configuration
SNAPZION_API_URL = "https://search.snapzion.com/get-snippets"
SNAPZION_HEADERS = {
    'accept': '*/*',
    'accept-language': 'en-US,en;q=0.9',
    'content-type': 'application/json',
    'origin': 'https://search.snapzion.com',
    'priority': 'u=1, i',
    'referer': 'https://search.snapzion.com/docs',
    'sec-ch-ua': '"Chromium";v="140", "Not=A?Brand";v="24", "Google Chrome";v="140"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36',
}
# ***** CHANGE 1: Add general-purpose browser headers for scraping *****
SCRAPING_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}
# LLM Configuration
LLM_API_URL = "https://api.inference.net/v1/chat/completions"
LLM_MODEL = "meta-llama/llama-3.1-8b-instruct/fp-8"
# --- FastAPI App Initialization ---
app = FastAPI(
    title="AI Search Snippets API (Snapzion)",
    description="Provides AI-generated summaries from Snapzion search results.",
    version="1.1.0"  # Version bump for new resilience feature
)
# --- Core Asynchronous Functions ---
async def call_snapzion_search(session: aiohttp.ClientSession, query: str) -> list:
    """Queries the Snapzion search API and returns its organic results.

    Each result is expected to carry 'title', 'link', and 'snippet' keys,
    which the rest of the pipeline relies on.
    """
    try:
        async with session.post(SNAPZION_API_URL, headers=SNAPZION_HEADERS, json={"query": query}, timeout=15) as response:
            response.raise_for_status()
            data = await response.json()
            return data.get("organic_results", [])
    except Exception as e:
        logger.error(f"Snapzion API call failed: {e}")
        raise HTTPException(status_code=503, detail=f"Search service (Snapzion) failed: {e}")
# ***** CHANGE 2: Improve the scraping function *****
async def scrape_url(session: aiohttp.ClientSession, url: str) -> str:
"""Asynchronously scrapes text from a URL, now with browser headers."""
if url.lower().endswith('.pdf'):
return "Error: Content is a PDF, which cannot be scraped."
try:
# Use the new scraping headers to look like a real browser
async with session.get(url, headers=SCRAPING_HEADERS, timeout=10, ssl=False) as response:
if response.status != 200:
logger.warning(f"Failed to fetch {url}, status code: {response.status}")
return f"Error: Failed to fetch with status {response.status}"
html = await response.text()
soup = BeautifulSoup(html, "html.parser")
for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
tag.decompose()
return " ".join(soup.stripped_strings)
except Exception as e:
logger.warning(f"Could not scrape {url}. Reason: {e}")
return f"Error: Could not scrape. Reason: {e}"
async def get_ai_snippet(query: str, context: str, sources: list) -> str:
    """Asks the LLM for a concise, cited answer grounded only in the scraped context."""
    headers = {"Authorization": f"Bearer {LLM_API_KEY}", "Content-Type": "application/json"}
    source_list_str = "\n".join([f"[{i+1}] {source['title']}: {source['link']}" for i, source in enumerate(sources)])
    prompt = f"""
Based *only* on the provided context, provide a concise, factual answer to the user's query. Cite every sentence with the corresponding source number(s), like `[1]` or `[2, 3]`.
Sources:
{source_list_str}
Context:
---
{context}
---
User Query: "{query}"
Answer with citations:
"""
    data = {"model": LLM_MODEL, "messages": [{"role": "user", "content": prompt}], "max_tokens": 500}
    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(LLM_API_URL, headers=headers, json=data, timeout=45) as response:
                response.raise_for_status()
                result = await response.json()
                return result['choices'][0]['message']['content']
        except Exception as e:
            logger.error(f"LLM API call failed: {e}")
            raise HTTPException(status_code=502, detail=f"Failed to get response from LLM: {e}")
# --- API Endpoint ---
@app.get("/search")
async def ai_search(q: str = Query(..., min_length=3, description="The search query.")):
    """Searches Snapzion, scrapes the top results, and returns an AI-generated summary with its sources."""
    async with aiohttp.ClientSession() as session:
        search_results = await call_snapzion_search(session, q)
        if not search_results:
            raise HTTPException(status_code=404, detail="Could not find any relevant sources for the query.")
        sources = search_results[:5]  # Use top 5 sources
        scrape_tasks = [scrape_url(session, source["link"]) for source in sources]
        scraped_contents = await asyncio.gather(*scrape_tasks)
        # ***** CHANGE 3: Implement the robust fallback logic *****
        successful_scrapes = [content for content in scraped_contents if not content.startswith("Error:")]
        full_context = ""
        if successful_scrapes:
            logger.info(f"Successfully scraped {len(successful_scrapes)} out of {len(sources)} sources.")
            # Build context from successfully scraped content, keeping source numbering aligned
            full_context = "\n\n".join(
                f"Source [{i+1}] ({sources[i]['link']}):\n{scraped_contents[i]}"
                for i in range(len(sources)) if not scraped_contents[i].startswith("Error:")
            )
        else:
            # If ALL scrapes failed, fall back to using the snippets from the search API
            logger.warning("All scraping attempts failed. Falling back to using API snippets for context.")
            full_context = "\n\n".join(
                f"Source [{i+1}] ({source['link']}):\n{source['snippet']}"
                for i, source in enumerate(sources)
            )
        if not full_context.strip():
            # Final safety net; it should rarely be hit now that snippets serve as a fallback
            raise HTTPException(status_code=500, detail="Could not construct any context from sources or snippets.")
        ai_summary = await get_ai_snippet(q, full_context, sources)
        return {"ai_summary": ai_summary, "sources": sources}
@app.get("/")
def root():
return {"message": "AI Search API is active. Use the /docs endpoint to test."} |