MohamedFahim committed on
Commit 853ce60 · verified · 1 Parent(s): 102c719

Update main_api.py

Files changed (1):
  1. main_api.py +234 -631

main_api.py CHANGED
@@ -1,633 +1,186 @@
-import os
-import logging
 import time
-import random
 import json
 import numpy as np
-import uvicorn
-
-# FIX: Updated PyMuPDF import for compatibility
-try:
-    import pymupdf as fitz  # PyMuPDF >= 1.24.0 (recommended)
-except ImportError:
-    import fitz  # PyMuPDF < 1.24.0 (fallback)
-
-import pymupdf4llm
-import faiss
-from pathlib import Path
-from typing import List, Optional
-from urllib.parse import urlparse, urljoin
-from fastapi import FastAPI, HTTPException, File, UploadFile
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
-from bs4 import BeautifulSoup
-import requests
 from sklearn.metrics.pairwise import cosine_similarity
 from supabase import create_client, Client
-from groq import Groq
-from sentence_transformers import SentenceTransformer
-from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownTextSplitter
-import pickle
-
-# ==================== CONFIGURATION FOR HUGGING FACE SPACES ====================
-
-# Persistent storage directory (Hugging Face Spaces uses /data/)
-PERSISTENT_STORAGE = os.getenv("PERSISTENT_STORAGE", "/data")
-VECTOR_STORE_DIR = os.path.join(PERSISTENT_STORAGE, "vector_stores")
-TEMP_UPLOAD_DIR = os.path.join(PERSISTENT_STORAGE, "temp_uploads")
-
-# Create directories if they don't exist
-os.makedirs(VECTOR_STORE_DIR, exist_ok=True)
-os.makedirs(TEMP_UPLOAD_DIR, exist_ok=True)
-
-# Set HuggingFace cache to persistent storage
-os.environ["HF_HOME"] = os.path.join(PERSISTENT_STORAGE, ".huggingface")
-
-# ==================== LOGGING SETUP ====================
-
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)

-# ==================== FASTAPI APP ====================

-app = FastAPI(title="RAG Assistant API", version="2.0")
-
-# CORS middleware
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
 )

-# ==================== ENVIRONMENT VARIABLES ====================
-
-groq_api_key = os.getenv("groq_token")
-supabase_url = os.getenv("SUPABASE_URL")
-supabase_key = os.getenv("SUPABASE_KEY")
-
-# Initialize clients
-supabase: Optional[Client] = None
-groq_client = None
-
-if supabase_url and supabase_key:
-    try:
-        supabase = create_client(supabase_url, supabase_key)
-        logger.info("Supabase client initialized successfully")
-    except Exception as e:
-        logger.error(f"Failed to initialize Supabase: {e}")
-
-if groq_api_key:
-    try:
-        groq_client = Groq(api_key=groq_api_key)
-        logger.info("Groq client initialized successfully")
-    except Exception as e:
-        logger.error(f"Failed to initialize Groq: {e}")
-
-# Initialize embedding model (cached in persistent storage)
-logger.info("Loading embedding model...")
-embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-logger.info("Embedding model loaded successfully")
-
-# ==================== PERSISTENT VECTOR STORE MANAGEMENT ====================

-class VectorStoreManager:
-    """Manage FAISS vector stores with disk persistence"""
-
-    def __init__(self, base_dir: str):
-        self.base_dir = base_dir
-        self.stores = {}
-        self.load_all_stores()
-
-    def load_all_stores(self):
-        """Load all existing vector stores from disk on startup"""
-        try:
-            for collection_dir in Path(self.base_dir).iterdir():
-                if collection_dir.is_dir():
-                    collection_name = collection_dir.name
-                    try:
-                        self.load_store(collection_name)
-                        logger.info(f"Loaded collection '{collection_name}' from disk")
-                    except Exception as e:
-                        logger.error(f"Failed to load collection '{collection_name}': {e}")
-        except Exception as e:
-            logger.error(f"Error loading vector stores: {e}")
-
-    def load_store(self, collection_name: str):
-        """Load a specific vector store from disk"""
-        collection_dir = os.path.join(self.base_dir, collection_name)
-
-        if not os.path.exists(collection_dir):
-            raise FileNotFoundError(f"Collection '{collection_name}' not found")
-
-        # Load FAISS index
-        index_path = os.path.join(collection_dir, "index.faiss")
-        index = faiss.read_index(index_path)
-
-        # Load metadata
-        metadata_path = os.path.join(collection_dir, "metadata.pkl")
-        with open(metadata_path, 'rb') as f:
-            data = pickle.load(f)
-
-        self.stores[collection_name] = {
-            'index': index,
-            'chunks': data['chunks'],
-            'metadata': data['metadata'],
-            'dimension': index.d
-        }
-
-    def save_store(self, collection_name: str):
-        """Save a vector store to disk"""
-        collection_dir = os.path.join(self.base_dir, collection_name)
-        os.makedirs(collection_dir, exist_ok=True)
-
-        store_data = self.stores[collection_name]
-
-        # Save FAISS index
-        index_path = os.path.join(collection_dir, "index.faiss")
-        faiss.write_index(store_data['index'], index_path)
-
-        # Save metadata
-        metadata_path = os.path.join(collection_dir, "metadata.pkl")
-        with open(metadata_path, 'wb') as f:
-            pickle.dump({
-                'chunks': store_data['chunks'],
-                'metadata': store_data['metadata']
-            }, f)
-
-        logger.info(f"Saved collection '{collection_name}' to disk")
-
-    def create_or_update_store(self, collection_name: str, chunks: List[str], metadata: List[dict]):
-        """Create or update a vector store"""
-        # Generate embeddings
-        embeddings = embedding_model.encode(chunks, show_progress_bar=True)
-        embeddings = np.array(embeddings).astype('float32')
-
-        if collection_name in self.stores:
-            # Add to existing index
-            store_data = self.stores[collection_name]
-            store_data['index'].add(embeddings)
-            store_data['chunks'].extend(chunks)
-            store_data['metadata'].extend(metadata)
-        else:
-            # Create new index
-            dimension = embeddings.shape[1]
-            index = faiss.IndexFlatL2(dimension)
-            index.add(embeddings)
-
-            self.stores[collection_name] = {
-                'index': index,
-                'chunks': chunks.copy(),
-                'metadata': metadata.copy(),
-                'dimension': dimension
-            }
-
-        # Save to disk
-        self.save_store(collection_name)
-        return len(chunks)
-
-    def get_store(self, collection_name: str):
-        """Get a vector store"""
-        if collection_name not in self.stores:
-            # Try to load from disk
-            try:
-                self.load_store(collection_name)
-            except:
-                return None
-        return self.stores.get(collection_name)
-
-    def delete_store(self, collection_name: str):
-        """Delete a vector store"""
-        if collection_name in self.stores:
-            del self.stores[collection_name]
-
-        # Delete from disk
-        collection_dir = os.path.join(self.base_dir, collection_name)
-        if os.path.exists(collection_dir):
-            import shutil
-            shutil.rmtree(collection_dir)

-    def list_stores(self):
-        """List all available stores"""
-        return [
-            {
-                'collection_name': name,
-                'total_chunks': len(data['chunks']),
-                'dimension': data['dimension']
-            }
-            for name, data in self.stores.items()
-        ]
-
-# Initialize vector store manager
-vector_store_manager = VectorStoreManager(VECTOR_STORE_DIR)
-
-# ==================== PYDANTIC MODELS ====================

-class URL(BaseModel):
-    url: str

 class RAGRequest(BaseModel):
     file_path: str
     prompt: str

-class DocumentUpload(BaseModel):
-    file_id: str
-    filename: str
-    file_type: str
-    chunks_created: int
-    storage_path: str
-
-class RAGQueryRequest(BaseModel):
-    query: str
-    collection_name: str
-    top_k: Optional[int] = 3
-
-class VectorStoreInfo(BaseModel):
-    collection_name: str
-    total_chunks: int
-    dimension: int
-
-# ==================== EXISTING FUNCTIONALITY ====================
-
-user_agents = [
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
-    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
-]
-
-bucket_name = "url-2-ans-bucket"
-
-def query(payload):
-    """Query Hugging Face embedding API"""
-    API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"
-    headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN', '')}"}
-
-    response = requests.post(API_URL, headers=headers, json=payload)
-    if response.status_code == 200:
-        return response.json()
-    else:
-        logger.warning(f"HF API error: {response.status_code}, using local model")
-        return embedding_model.encode(payload["inputs"]).tolist()
-
-def process_with_groq(query: str, context: str) -> str:
-    """Process query with Groq LLM"""
-    if not groq_client:
-        return "Groq API not configured. Please set groq_token environment variable."
-
-    try:
-        messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful assistant. Answer questions based on the provided context. If you cannot find the answer in the context, say so."
-            },
-            {
-                "role": "user",
-                "content": f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"
-            }
-        ]
-
-        chat_completion = groq_client.chat.completions.create(
-            messages=messages,
-            model="llama-3.3-70b-versatile",
-            temperature=0.7,
-            max_tokens=1024,
-        )
-
-        return chat_completion.choices[0].message.content
-    except Exception as e:
-        logger.error(f"Groq API error: {e}")
-        return f"Error generating response: {str(e)}"

 @app.get("/")
 async def root():
-    return {
-        "message": "RAG Assistant API",
-        "version": "2.0",
-        "status": "running",
-        "storage": PERSISTENT_STORAGE
-    }
-
-# ==================== NEW RAG ENDPOINTS ====================
-
-def chunk_document(text: str, file_type: str, chunk_size: int = 1000, chunk_overlap: int = 200):
-    """Chunk document based on file type"""
-    if file_type in ["markdown", "md"]:
-        splitter = MarkdownTextSplitter(
-            chunk_size=chunk_size,
-            chunk_overlap=chunk_overlap
-        )
-    else:
-        splitter = RecursiveCharacterTextSplitter(
-            chunk_size=chunk_size,
-            chunk_overlap=chunk_overlap,
-            separators=["\n\n", "\n", ". ", " ", ""]
-        )
-
-    chunks = splitter.split_text(text)
-    logger.info(f"Created {len(chunks)} chunks from document")
-    return chunks
-
-def extract_text_from_pdf(file_path: str) -> str:
-    """Extract text from PDF"""
-    try:
-        pdf_doc = fitz.open(file_path)
-        md_text = pymupdf4llm.to_markdown(pdf_doc)
-        return md_text
-    except Exception as e:
-        logger.error(f"Error extracting PDF: {e}")
-        pdf_doc = fitz.open(file_path)
-        text = ""
-        for page in pdf_doc:
-            text += page.get_text()
-        return text
-
-def extract_text_from_markdown(file_path: str) -> str:
-    """Extract text from markdown file"""
-    with open(file_path, 'r', encoding='utf-8') as f:
-        return f.read()
-
-@app.post("/upload_document", response_model=DocumentUpload)
-async def upload_document(
-    file: UploadFile = File(...),
-    collection_name: Optional[str] = "default"
-):
-    """Upload and process PDF or Markdown documents"""
-
-    # Get file extension instead of relying on content_type
-    file_ext = os.path.splitext(file.filename)[1].lower()
-
-    # Map extensions to file types
-    ext_to_type = {
-        ".pdf": "pdf",
-        ".md": "markdown",
-        ".markdown": "markdown",
-        ".txt": "txt"
-    }
-
-    if file_ext not in ext_to_type:
-        raise HTTPException(
-            status_code=415,
-            detail=f"Unsupported file type '{file_ext}'. Allowed: .pdf, .md, .markdown, .txt"
-        )
-
-    file_type = ext_to_type[file_ext]
-
-    try:
-        # Save file temporarily to persistent storage
-        temp_file_path = os.path.join(TEMP_UPLOAD_DIR, f"{int(time.time())}_{file.filename}")
-
-        # Write uploaded file
-        with open(temp_file_path, "wb") as buffer:
-            content = await file.read()
-            buffer.write(content)
-
-        # Extract text based on file type
-        if file_type == "pdf":
-            text_content = extract_text_from_pdf(temp_file_path)
-        else:
-            text_content = extract_text_from_markdown(temp_file_path)
-
-        if not text_content.strip():
-            raise HTTPException(status_code=400, detail="No text content extracted")
-
-        logger.info(f"Extracted {len(text_content)} characters from {file.filename}")
-
-        # Optional: Upload to Supabase
-        storage_filename = f"{int(time.time())}_{file.filename}"
-        if supabase:
-            try:
-                with open(temp_file_path, 'rb') as f:
-                    supabase.storage.from_(bucket_name).upload(
-                        path=storage_filename,
-                        file=f.read(),
-                        file_options={"content-type": "application/octet-stream"}
-                    )
-            except:
-                pass  # Continue even if Supabase upload fails
-
-        # Chunk document
-        chunks = chunk_document(text_content, file_type)
-        logger.info(f"Created {len(chunks)} chunks for collection '{collection_name}'")
-
-        # Create metadata
-        file_id = str(int(time.time()))
-        metadata = [
-            {
-                "file_id": file_id,
-                "filename": file.filename,
-                "file_type": file_type,
-                "chunk_index": i,
-                "storage_path": storage_filename
-            }
-            for i in range(len(chunks))
-        ]
-
-        # Add to vector store
-        chunks_created = vector_store_manager.create_or_update_store(
-            collection_name, chunks, metadata
-        )
-
-        logger.info(f"Successfully added {chunks_created} chunks to collection '{collection_name}'")
-
-        # Clean up temp file
-        try:
-            os.remove(temp_file_path)
-        except:
-            pass
-
-        return DocumentUpload(
-            file_id=file_id,
-            filename=file.filename,
-            file_type=file_type,
-            chunks_created=chunks_created,
-            storage_path=f"supabase://{bucket_name}/{storage_filename}" if supabase else temp_file_path
-        )
-
-    except HTTPException:
-        raise
-    except Exception as e:
-        logger.exception("Error in upload_document")
-        raise HTTPException(status_code=500, detail=f"Error: {str(e)}")

-@app.post("/upload_multiple_documents")
-async def upload_multiple_documents(
-    files: List[UploadFile] = File(...),
-    collection_name: Optional[str] = "default"
-):
-    """Upload multiple documents"""
-    results = []
-    errors = []
-
-    for file in files:
-        try:
-            result = await upload_document(file, collection_name)
-            results.append(result)
-        except Exception as e:
-            errors.append({"filename": file.filename, "error": str(e)})
-
-    return {
-        "successful_uploads": len(results),
-        "failed_uploads": len(errors),
-        "results": results,
-        "errors": errors
     }

-@app.post("/query_documents")
-async def query_documents(request: RAGQueryRequest):
-    """Query documents using RAG - FIXED VERSION"""
-    store_data = vector_store_manager.get_store(request.collection_name)
-
-    if not store_data:
-        raise HTTPException(
-            status_code=404,
-            detail=f"Collection '{request.collection_name}' not found. Please upload documents first."
-        )
-
     try:
-        # Log query details
-        logger.info(f"Querying collection '{request.collection_name}' with query: '{request.query}'")
-        logger.info(f"Collection has {len(store_data['chunks'])} chunks")
-
-        # Generate query embedding
-        query_embedding = embedding_model.encode([request.query])
-        query_embedding = np.array(query_embedding).astype('float32')
-
-        # Search in FAISS
-        distances, indices = store_data['index'].search(
-            query_embedding,
-            min(request.top_k, len(store_data['chunks']))
-        )
-
-        # Log search results
-        logger.info(f"Search results - distances: {distances[0]}, indices: {indices[0]}")
-
-        # FIX: Removed strict threshold - always return results
-        # The threshold was too strict and preventing valid results
-
-        # Get relevant chunks
-        retrieved_chunks = [store_data['chunks'][i] for i in indices[0]]
-        retrieved_metadata = [store_data['metadata'][i] for i in indices[0]]
-
-        logger.info(f"Retrieved {len(retrieved_chunks)} chunks for query")
-
-        # Create context
-        context_text = "\n\n".join([
-            f"[Source {i+1} - {meta['filename']}]:\n{chunk}"
-            for i, (chunk, meta) in enumerate(zip(retrieved_chunks, retrieved_metadata))
-        ])
-
-        logger.info(f"Context length: {len(context_text)} characters")

-        # Generate answer
-        answer = process_with_groq(request.query, context_text)

-        # Prepare sources
-        sources = [
-            {
-                "filename": meta['filename'],
-                "file_type": meta['file_type'],
-                "chunk_index": meta['chunk_index'],
-                "text_snippet": chunk[:200] + "...",
-                "distance": float(distances[0][i])
-            }
-            for i, (chunk, meta) in enumerate(zip(retrieved_chunks, retrieved_metadata))
-        ]

-        return {
-            "answer": answer,
-            "sources": sources,
-            "query": request.query,
-            "collection": request.collection_name
-        }

-    except Exception as e:
-        logger.exception("Error in query_documents")
-        raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
-
-@app.get("/debug_collection/{collection_name}")
-async def debug_collection(collection_name: str):
-    """Debug endpoint to inspect collection contents"""
-    store_data = vector_store_manager.get_store(collection_name)
-
-    if not store_data:
-        return {"error": f"Collection '{collection_name}' not found"}
-
-    return {
-        "collection_name": collection_name,
-        "total_chunks": len(store_data['chunks']),
-        "dimension": store_data['dimension'],
-        "sample_chunks": store_data['chunks'][:3] if len(store_data['chunks']) > 0 else [],
-        "sample_metadata": store_data['metadata'][:3] if len(store_data['metadata']) > 0 else [],
-        "all_filenames": list(set([meta['filename'] for meta in store_data['metadata']]))
-    }
-
-@app.get("/list_collections")
-async def list_collections():
-    """List all collections"""
-    collections = vector_store_manager.list_stores()
-    return {"collections": collections}
-
-@app.delete("/delete_collection/{collection_name}")
-async def delete_collection(collection_name: str):
-    """Delete a collection"""
-    try:
-        vector_store_manager.delete_store(collection_name)
-        return {"message": f"Collection '{collection_name}' deleted successfully"}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-
-@app.get("/health_check")
-async def health_check():
-    """System health check"""
-    return {
-        "status": "healthy",
-        "supabase_configured": supabase is not None,
-        "groq_configured": groq_client is not None,
-        "embedding_model": "all-MiniLM-L6-v2",
-        "vector_stores": len(vector_store_manager.stores),
-        "total_chunks": sum(len(store['chunks']) for store in vector_store_manager.stores.values()),
-        "persistent_storage": PERSISTENT_STORAGE,
-        "collections": list(vector_store_manager.stores.keys())
-    }
-
-# ==================== EXISTING WEB SCRAPING ENDPOINTS ====================
-
-@app.post("/rag")
-async def rag(request: RAGRequest):
-    """Existing RAG endpoint for URL-based content"""
-    if not supabase:
-        raise HTTPException(status_code=500, detail="Supabase not configured")
-
-    try:
         file_path = request.file_path

-        # Download from Supabase
-        file_content = supabase.storage.from_(bucket_name).download(file_path)
-        text = file_content.decode('utf-8')
-        data = json.loads(text)
-
-        # Extract text
-        full_text = ""
-        for item in data:
-            full_text += item.get("text", "") + " "

-        # Chunk text
         chunk_size = 1000
-        chunks = [full_text[i:i+chunk_size] for i in range(0, len(full_text), chunk_size)]

-        # Get embeddings
         chunk_embeddings = []
         for chunk in chunks:
-            embedding = query({"inputs": chunk})
             chunk_embeddings.append(embedding)

         query_embedding = query({"inputs": request.prompt})

-        # Calculate similarity
         similarities = []
         for chunk_embedding in chunk_embeddings:
             query_np = np.array(query_embedding)
@@ -637,20 +190,23 @@ async def rag(request: RAGRequest):
                 query_np = query_np.reshape(1, -1)
             if len(chunk_np.shape) == 1:
                 chunk_np = chunk_np.reshape(1, -1)
-
             similarity = cosine_similarity(query_np, chunk_np)[0][0]
             similarities.append(similarity)

-        # Get top 3 chunks
         top_k = 3
         top_indices = np.argsort(similarities)[-top_k:][::-1]
-        relevant_chunks = [chunks[i] for i in top_indices]
         context_text = "\n\n".join(relevant_chunks)

         # Process with Groq
         answer = process_with_groq(request.prompt, context_text)

-        sources = [{"text": chunks[i][:200] + "...", "position": i} for i in top_indices]

         return {
             "sources": sources,
@@ -659,13 +215,15 @@ async def rag(request: RAGRequest):
             "file_source": f"supabase://{bucket_name}/{file_path}"
         }

     except Exception as e:
-        logger.exception("Error in RAG")
-        raise HTTPException(status_code=500, detail=str(e))

 @app.post("/extract_links")
 async def extract_links(url: URL):
-    """Extract links from URL"""
     def extract_unique_links(url_string, max_retries=3, timeout=30):
         for attempt in range(max_retries):
             try:
@@ -677,34 +235,82 @@ async def extract_links(url: URL):
                 base_url = urlparse(url_string)
                 base_url = f"{base_url.scheme}://{base_url.netloc}"

-                links = [urljoin(base_url, a.get('href')) for a in soup.find_all('a', href=True)]
                 unique_links = list(dict.fromkeys(links))
                 unique_links.insert(0, url_string)
                 return unique_links
-            except Exception as e:
                 if attempt < max_retries - 1:
-                    time.sleep(5 * (attempt + 1))
                 else:
-                    raise HTTPException(status_code=500, detail=str(e))
         return []

     try:
         unique_links = extract_unique_links(url.url)
         return {"unique_links": unique_links}
     except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))

 @app.post("/extract_text")
 async def extract_text(urls: List[str]):
-    """Extract text from URLs"""
     if not supabase:
         raise HTTPException(status_code=500, detail="Supabase not configured")

     output_file = "extracted_text.txt"

     def text_data_extractor(links):
         extracted_texts = []
         for link in links:
             retries = 3
             while retries > 0:
                 try:
@@ -712,16 +318,23 @@ async def extract_text(urls: List[str]):
                     response = requests.get(link, headers=headers, timeout=30)
                     response.raise_for_status()
                     soup = BeautifulSoup(response.text, 'html.parser')
-                    text = ' '.join(soup.get_text().split())
-                    extracted_texts.append({"url": link, "text": text})
                     break
-                except:
                     retries -= 1
                     if retries > 0:
-                        time.sleep(5)
-
                     if retries == 0:
-                        extracted_texts.append({"url": link, "text": "Failed to retrieve"})

         return extracted_texts

@@ -730,31 +343,21 @@ async def extract_text(urls: List[str]):
         string_output = json.dumps(extracted_data, ensure_ascii=False, indent=2)

         # Upload to Supabase
-        file_content = string_output.encode('utf-8')
-        try:
-            supabase.storage.from_(bucket_name).upload(
-                path=output_file,
-                file=file_content,
-                file_options={"content-type": "text/plain"}
-            )
-        except:
-            supabase.storage.from_(bucket_name).update(
-                path=output_file,
-                file=file_content,
-                file_options={"content-type": "text/plain"}
-            )

         return {"extracted_data": extracted_data, "file_saved": output_file}
     except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-
-# ==================== MAIN ====================

 if __name__ == "__main__":
     uvicorn.run(
-        "main_api:app",
-        host="0.0.0.0",
-        port=8000,
-        reload=False,
         access_log=True
-    )

+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import List
+import requests
+from bs4 import BeautifulSoup
 import time
+import os
 import json
+import random
+import logging
+import groq
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
+import uvicorn
 from supabase import create_client, Client
+from urllib.parse import urljoin, urlparse


+# Initialize FastAPI app
+app = FastAPI(
+    title="Web RAG System API",
+    description="Extract content from web pages and perform RAG operations",
+    version="1.0.0"
 )

+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

+# Initialize Supabase client with environment variables
+try:
+    url = os.environ.get('SUPABASE_URL')
+    key = os.environ.get('SUPABASE_SERVICE_ROLE_KEY')
+
+    if not url or not key:
+        logger.warning("Supabase credentials not found in environment variables")
+        supabase = None
+    else:
+        supabase: Client = create_client(url, key)
+        logger.info("Supabase client initialized successfully")
+except Exception as e:
+    logger.error(f"Failed to initialize Supabase client: {e}")
+    supabase = None

+# User agents for web scraping
+user_agents = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/102.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
+    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/103.0.1264.49",
+    "Mozilla/5.0 (iPhone; CPU iPhone OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Mobile/15E148 Safari/604.1",
+    "Mozilla/5.0 (iPad; CPU OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Mobile/15E148 Safari/604.1",
+    "Mozilla/5.0 (Linux; Android 12; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 11; SM-A217F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Mobile Safari/537.36"
+]

+# Pydantic models
 class RAGRequest(BaseModel):
     file_path: str
     prompt: str

+class URL(BaseModel):
+    url: str

 @app.get("/")
 async def root():
+    """Health check endpoint"""
+    return {"message": "Web RAG System API is running", "status": "healthy"}

+@app.get("/health")
+async def health_check():
+    """Detailed health check"""
+    health_status = {
+        "api": "healthy",
+        "supabase": "connected" if supabase else "not configured",
+        "hf_token": "configured" if os.environ.get('hf_token') else "not configured",
+        "groq_token": "configured" if os.environ.get('groq_token') else "not configured"
     }
+    return health_status

+@app.post("/rag")
+async def rag(request: RAGRequest):
+    """Perform RAG operations on extracted text"""
     try:
+        # Check required environment variables
+        hf_token = os.environ.get('hf_token')
+        groq_token = os.environ.get('groq_token')
+
+        if not hf_token:
+            raise HTTPException(status_code=500, detail="HuggingFace token not configured")
+        if not groq_token:
+            raise HTTPException(status_code=500, detail="Groq token not configured")
+        if not supabase:
+            raise HTTPException(status_code=500, detail="Supabase not configured")
+
+        logger.info(f"Processing RAG request for file: {request.file_path}")
+
+        # HuggingFace Inference API for embeddings
+        API_URL = "https://router.huggingface.co/hf-inference/models/BAAI/bge-large-en-v1.5/pipeline/feature-extraction"
+        headers = {
+            "Authorization": f"Bearer {hf_token}",
+        }

+        def query(payload):
+            response = requests.post(API_URL, headers=headers, json=payload)
+            if response.status_code != 200:
+                logger.error(f"HuggingFace API error: {response.status_code} - {response.text}")
+                raise HTTPException(status_code=500, detail="Failed to get embeddings from HuggingFace")
+            return response.json()

+        # Create a Groq client
+        groq_client = groq.Client(api_key=groq_token)

+        def process_with_groq(query_text, context):
+            prompt = f"""
+            Context information:
+            {context}
+
+            Based on the context information above, please answer the following question:
+            {query_text}
+
+            Answer:
+            """
+
+            try:
+                response = groq_client.chat.completions.create(
+                    messages=[{"role": "user", "content": prompt}],
+                    model="llama-3.3-70b-versatile",
+                    temperature=0.4,
+                    max_tokens=512
+                )
+                return response.choices[0].message.content
+            except Exception as e:
+                logger.error(f"Groq API error: {e}")
+                raise HTTPException(status_code=500, detail="Failed to process with Groq")

+        def get_file_from_supabase(bucket_name, file_path):
+            try:
+                response = supabase.storage.from_(bucket_name).download(file_path)
+                content = response.decode('utf-8')
+                return content
+            except Exception as e:
+                logger.error(f"Error downloading file from Supabase: {e}")
+                raise HTTPException(
+                    status_code=404,
+                    detail=f"File not found in Supabase bucket: {file_path}"
+                )
+
+        # Get file content from Supabase
+        bucket_name = "url-2-ans-bucket"
         file_path = request.file_path

+        content = get_file_from_supabase(bucket_name, file_path)
+        logger.info(f"Successfully downloaded file from Supabase: {file_path}")

+        # Simple text chunking
         chunk_size = 1000
+        overlap = 200
+        chunks = []
+
+        for i in range(0, len(content), chunk_size - overlap):
+            chunk = content[i:i + chunk_size]
+            if len(chunk) > 100:
+                chunks.append({"text": chunk, "position": i})

+        logger.info(f"Created {len(chunks)} chunks from document")
+
+        # Get embeddings for all chunks
         chunk_embeddings = []
         for chunk in chunks:
+            embedding = query({"inputs": chunk["text"]})
             chunk_embeddings.append(embedding)

+        # Get embedding for the query
         query_embedding = query({"inputs": request.prompt})

+        # Calculate similarity between query and all chunks
         similarities = []
         for chunk_embedding in chunk_embeddings:
             query_np = np.array(query_embedding)

             query_np = query_np.reshape(1, -1)
             if len(chunk_np.shape) == 1:
                 chunk_np = chunk_np.reshape(1, -1)
+
             similarity = cosine_similarity(query_np, chunk_np)[0][0]
             similarities.append(similarity)

+        # Get top 3 most similar chunks
         top_k = 3
         top_indices = np.argsort(similarities)[-top_k:][::-1]
+
+        relevant_chunks = [chunks[i]["text"] for i in top_indices]
         context_text = "\n\n".join(relevant_chunks)

         # Process with Groq
         answer = process_with_groq(request.prompt, context_text)

+        # Prepare sources
+        sources = [{"text": chunks[i]["text"][:200] + "...", "position": chunks[i]["position"]}
+                   for i in top_indices]

         return {
             "sources": sources,

             "file_source": f"supabase://{bucket_name}/{file_path}"
         }

+    except HTTPException:
+        raise
     except Exception as e:
+        logger.exception("Error occurred in RAG process")
+        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")

 @app.post("/extract_links")
 async def extract_links(url: URL):
+    """Extract unique links from a given URL"""
     def extract_unique_links(url_string, max_retries=3, timeout=30):
         for attempt in range(max_retries):
             try:

                 base_url = urlparse(url_string)
                 base_url = f"{base_url.scheme}://{base_url.netloc}"

+                a_tags = soup.find_all('a', href=True)
+                links = []
+                for a in a_tags:
+                    href = a.get('href')
+                    full_url = urljoin(base_url, href)
+                    links.append(full_url)
+
                 unique_links = list(dict.fromkeys(links))
                 unique_links.insert(0, url_string)
                 return unique_links
+
+            except requests.RequestException as e:
+                logger.warning(f"Attempt {attempt + 1} failed: {e}")
                 if attempt < max_retries - 1:
+                    wait_time = 5 * (attempt + 1)
+                    time.sleep(wait_time)
                 else:
+                    logger.error(f"Failed to retrieve {url_string} after {max_retries} attempts.")
+                    raise HTTPException(status_code=500, detail=f"Failed to retrieve {url_string} after {max_retries} attempts.")
         return []

     try:
         unique_links = extract_unique_links(url.url)
         return {"unique_links": unique_links}
     except Exception as e:
+        logger.exception("Error in extract_links")
+        raise HTTPException(status_code=500, detail=f"Failed to extract links: {str(e)}")

 @app.post("/extract_text")
 async def extract_text(urls: List[str]):
+    """Extract text content from multiple URLs"""
     if not supabase:
         raise HTTPException(status_code=500, detail="Supabase not configured")

     output_file = "extracted_text.txt"

+    def upload_text_content(filename, content, bucket_name):
+        try:
+            file_content = content.encode('utf-8')
+
+            # Try to upload first
+            try:
+                response = supabase.storage.from_(bucket_name).upload(
+                    path=filename,
+                    file=file_content,
+                    file_options={"content-type": "text/plain"}
+                )
+                logger.info(f"Text file uploaded successfully: {filename}")
+                return response
+            except Exception as upload_error:
+                # If upload fails (file exists), try to update
+                try:
+                    response = supabase.storage.from_(bucket_name).update(
+                        path=filename,
+                        file=file_content,
+                        file_options={"content-type": "text/plain"}
+                    )
+                    logger.info(f"Text file updated successfully: {filename}")
+                    return response
+                except Exception as update_error:
+                    logger.error(f"Error updating text content: {update_error}")
+                    raise HTTPException(status_code=500, detail="Failed to save file to storage")
+
+        except Exception as e:
+            logger.error(f"Error with file operations: {e}")
+            raise HTTPException(status_code=500, detail="Failed to save file to storage")
+
     def text_data_extractor(links):
         extracted_texts = []
+
         for link in links:
+            parsed_url = urlparse(link)
+            if not parsed_url.scheme:
+                logger.warning(f"Invalid URL: {link}")
+                continue
+
             retries = 3
             while retries > 0:
                 try:

                     response = requests.get(link, headers=headers, timeout=30)
                     response.raise_for_status()
                     soup = BeautifulSoup(response.text, 'html.parser')
+                    text = soup.get_text()
+                    clean_text = ' '.join(text.split())
+                    extracted_texts.append({"url": link, "text": clean_text})
                     break
+
+                except requests.RequestException as e:
                     retries -= 1
+                    logger.warning(f"Retry {3 - retries} for {link} failed: {e}")
                     if retries > 0:
+                        wait_time = 5 * (3 - retries)
+                        time.sleep(wait_time)
+
                     if retries == 0:
+                        extracted_texts.append({
+                            "url": link,
+                            "text": "Failed to retrieve text after multiple attempts."
+                        })

         return extracted_texts


         string_output = json.dumps(extracted_data, ensure_ascii=False, indent=2)

         # Upload to Supabase
+        upload_text_content(output_file, string_output, "url-2-ans-bucket")

         return {"extracted_data": extracted_data, "file_saved": output_file}
+
     except Exception as e:
+        logger.exception("Error in extract_text")
+        raise HTTPException(status_code=500, detail=f"Failed to extract text: {str(e)}")

+# Main execution
 if __name__ == "__main__":
+    # Run the FastAPI app
     uvicorn.run(
+        "main_api:app",
+        host="0.0.0.0",
+        port=8000,
+        reload=False,  # Disable reload for production
         access_log=True
+    )
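
A quick way to exercise the rewritten API is sketched below. This is not part of the commit: it assumes the service has been started with "python main_api.py" (uvicorn on 0.0.0.0:8000), that SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY, hf_token and groq_token are set on the server, and it uses https://example.com and a throwaway question purely as placeholders.

# Illustrative client sketch; assumes the API from this commit runs locally on port 8000
# and that its environment variables are configured. Endpoint names and payload shapes
# follow the route definitions in the new main_api.py.
import requests

BASE_URL = "http://localhost:8000"

# 1. Check that Supabase, hf_token and groq_token are visible to the server.
print(requests.get(f"{BASE_URL}/health", timeout=30).json())

# 2. Collect links from a page (placeholder URL).
links = requests.post(
    f"{BASE_URL}/extract_links",
    json={"url": "https://example.com"},
    timeout=120,
).json()["unique_links"]

# 3. Scrape a few of those links; the server stores the result as
#    "extracted_text.txt" in the "url-2-ans-bucket" Supabase bucket.
extraction = requests.post(f"{BASE_URL}/extract_text", json=links[:3], timeout=300).json()

# 4. Ask a question against the stored file; the JSON response includes
#    "sources" and "file_source" (other fields are elided in the diff).
rag_response = requests.post(
    f"{BASE_URL}/rag",
    json={"file_path": extraction["file_saved"], "prompt": "What are these pages about?"},
    timeout=300,
).json()
print(rag_response)

Note that the new /rag endpoint re-embeds every chunk through the HuggingFace Inference API on each request, so responses will be slow for large extractions.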