MohamedFahim committed on
Commit 853ce60 · verified · 1 Parent(s): 102c719

Update main_api.py

Files changed (1):
  1. main_api.py +234 -631

main_api.py CHANGED
@@ -1,633 +1,186 @@
-import os
-import logging
 import time
-import random
 import json
 import numpy as np
-import uvicorn
-
-# FIX: Updated PyMuPDF import for compatibility
-try:
-    import pymupdf as fitz  # PyMuPDF >= 1.24.0 (recommended)
-except ImportError:
-    import fitz  # PyMuPDF < 1.24.0 (fallback)
-
-import pymupdf4llm
-import faiss
-from pathlib import Path
-from typing import List, Optional
-from urllib.parse import urlparse, urljoin
-from fastapi import FastAPI, HTTPException, File, UploadFile
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
-from bs4 import BeautifulSoup
-import requests
 from sklearn.metrics.pairwise import cosine_similarity
 from supabase import create_client, Client
-from groq import Groq
-from sentence_transformers import SentenceTransformer
-from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownTextSplitter
-import pickle
-
-# ==================== CONFIGURATION FOR HUGGING FACE SPACES ====================
-
-# Persistent storage directory (Hugging Face Spaces uses /data/)
-PERSISTENT_STORAGE = os.getenv("PERSISTENT_STORAGE", "/data")
-VECTOR_STORE_DIR = os.path.join(PERSISTENT_STORAGE, "vector_stores")
-TEMP_UPLOAD_DIR = os.path.join(PERSISTENT_STORAGE, "temp_uploads")
-
-# Create directories if they don't exist
-os.makedirs(VECTOR_STORE_DIR, exist_ok=True)
-os.makedirs(TEMP_UPLOAD_DIR, exist_ok=True)
-
-# Set HuggingFace cache to persistent storage
-os.environ["HF_HOME"] = os.path.join(PERSISTENT_STORAGE, ".huggingface")
-
-# ==================== LOGGING SETUP ====================
-
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)

-# ==================== FASTAPI APP ====================

-app = FastAPI(title="RAG Assistant API", version="2.0")
-
-# CORS middleware
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
 )

-# ==================== ENVIRONMENT VARIABLES ====================
-
-groq_api_key = os.getenv("groq_token")
-supabase_url = os.getenv("SUPABASE_URL")
-supabase_key = os.getenv("SUPABASE_KEY")
-
-# Initialize clients
-supabase: Optional[Client] = None
-groq_client = None
-
-if supabase_url and supabase_key:
-    try:
-        supabase = create_client(supabase_url, supabase_key)
-        logger.info("Supabase client initialized successfully")
-    except Exception as e:
-        logger.error(f"Failed to initialize Supabase: {e}")
-
-if groq_api_key:
-    try:
-        groq_client = Groq(api_key=groq_api_key)
-        logger.info("Groq client initialized successfully")
-    except Exception as e:
-        logger.error(f"Failed to initialize Groq: {e}")
-
-# Initialize embedding model (cached in persistent storage)
-logger.info("Loading embedding model...")
-embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-logger.info("Embedding model loaded successfully")
-
-# ==================== PERSISTENT VECTOR STORE MANAGEMENT ====================

-class VectorStoreManager:
-    """Manage FAISS vector stores with disk persistence"""
-
-    def __init__(self, base_dir: str):
-        self.base_dir = base_dir
-        self.stores = {}
-        self.load_all_stores()
-
-    def load_all_stores(self):
-        """Load all existing vector stores from disk on startup"""
-        try:
-            for collection_dir in Path(self.base_dir).iterdir():
-                if collection_dir.is_dir():
-                    collection_name = collection_dir.name
-                    try:
-                        self.load_store(collection_name)
-                        logger.info(f"Loaded collection '{collection_name}' from disk")
-                    except Exception as e:
-                        logger.error(f"Failed to load collection '{collection_name}': {e}")
-        except Exception as e:
-            logger.error(f"Error loading vector stores: {e}")
-
-    def load_store(self, collection_name: str):
-        """Load a specific vector store from disk"""
-        collection_dir = os.path.join(self.base_dir, collection_name)
-
-        if not os.path.exists(collection_dir):
-            raise FileNotFoundError(f"Collection '{collection_name}' not found")
-
-        # Load FAISS index
-        index_path = os.path.join(collection_dir, "index.faiss")
-        index = faiss.read_index(index_path)
-
-        # Load metadata
-        metadata_path = os.path.join(collection_dir, "metadata.pkl")
-        with open(metadata_path, 'rb') as f:
-            data = pickle.load(f)
-
-        self.stores[collection_name] = {
-            'index': index,
-            'chunks': data['chunks'],
-            'metadata': data['metadata'],
-            'dimension': index.d
-        }
-
-    def save_store(self, collection_name: str):
-        """Save a vector store to disk"""
-        collection_dir = os.path.join(self.base_dir, collection_name)
-        os.makedirs(collection_dir, exist_ok=True)
-
-        store_data = self.stores[collection_name]
-
-        # Save FAISS index
-        index_path = os.path.join(collection_dir, "index.faiss")
-        faiss.write_index(store_data['index'], index_path)
-
-        # Save metadata
-        metadata_path = os.path.join(collection_dir, "metadata.pkl")
-        with open(metadata_path, 'wb') as f:
-            pickle.dump({
-                'chunks': store_data['chunks'],
-                'metadata': store_data['metadata']
-            }, f)
-
-        logger.info(f"Saved collection '{collection_name}' to disk")
-
-    def create_or_update_store(self, collection_name: str, chunks: List[str], metadata: List[dict]):
-        """Create or update a vector store"""
-        # Generate embeddings
-        embeddings = embedding_model.encode(chunks, show_progress_bar=True)
-        embeddings = np.array(embeddings).astype('float32')
-
-        if collection_name in self.stores:
-            # Add to existing index
-            store_data = self.stores[collection_name]
-            store_data['index'].add(embeddings)
-            store_data['chunks'].extend(chunks)
-            store_data['metadata'].extend(metadata)
-        else:
-            # Create new index
-            dimension = embeddings.shape[1]
-            index = faiss.IndexFlatL2(dimension)
-            index.add(embeddings)
-
-            self.stores[collection_name] = {
-                'index': index,
-                'chunks': chunks.copy(),
-                'metadata': metadata.copy(),
-                'dimension': dimension
-            }
-
-        # Save to disk
-        self.save_store(collection_name)
-        return len(chunks)
-
-    def get_store(self, collection_name: str):
-        """Get a vector store"""
-        if collection_name not in self.stores:
-            # Try to load from disk
-            try:
-                self.load_store(collection_name)
-            except:
-                return None
-        return self.stores.get(collection_name)
-
-    def delete_store(self, collection_name: str):
-        """Delete a vector store"""
-        if collection_name in self.stores:
-            del self.stores[collection_name]
-
-        # Delete from disk
-        collection_dir = os.path.join(self.base_dir, collection_name)
-        if os.path.exists(collection_dir):
-            import shutil
-            shutil.rmtree(collection_dir)

-    def list_stores(self):
-        """List all available stores"""
-        return [
-            {
-                'collection_name': name,
-                'total_chunks': len(data['chunks']),
-                'dimension': data['dimension']
-            }
-            for name, data in self.stores.items()
-        ]
-
-# Initialize vector store manager
-vector_store_manager = VectorStoreManager(VECTOR_STORE_DIR)
-
-# ==================== PYDANTIC MODELS ====================

-class URL(BaseModel):
-    url: str

 class RAGRequest(BaseModel):
     file_path: str
     prompt: str

-class DocumentUpload(BaseModel):
-    file_id: str
-    filename: str
-    file_type: str
-    chunks_created: int
-    storage_path: str
-
-class RAGQueryRequest(BaseModel):
-    query: str
-    collection_name: str
-    top_k: Optional[int] = 3
-
-class VectorStoreInfo(BaseModel):
-    collection_name: str
-    total_chunks: int
-    dimension: int
-
-# ==================== EXISTING FUNCTIONALITY ====================
-
-user_agents = [
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
-    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
-]
-
-bucket_name = "url-2-ans-bucket"
-
-def query(payload):
-    """Query Hugging Face embedding API"""
-    API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"
-    headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN', '')}"}
-
-    response = requests.post(API_URL, headers=headers, json=payload)
-    if response.status_code == 200:
-        return response.json()
-    else:
-        logger.warning(f"HF API error: {response.status_code}, using local model")
-        return embedding_model.encode(payload["inputs"]).tolist()
-
-def process_with_groq(query: str, context: str) -> str:
-    """Process query with Groq LLM"""
-    if not groq_client:
-        return "Groq API not configured. Please set groq_token environment variable."
-
-    try:
-        messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful assistant. Answer questions based on the provided context. If you cannot find the answer in the context, say so."
-            },
-            {
-                "role": "user",
-                "content": f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"
-            }
-        ]
-
-        chat_completion = groq_client.chat.completions.create(
-            messages=messages,
-            model="llama-3.3-70b-versatile",
-            temperature=0.7,
-            max_tokens=1024,
-        )
-
-        return chat_completion.choices[0].message.content
-    except Exception as e:
-        logger.error(f"Groq API error: {e}")
-        return f"Error generating response: {str(e)}"

 @app.get("/")
 async def root():
-    return {
-        "message": "RAG Assistant API",
-        "version": "2.0",
-        "status": "running",
-        "storage": PERSISTENT_STORAGE
-    }
-
-# ==================== NEW RAG ENDPOINTS ====================
-
-def chunk_document(text: str, file_type: str, chunk_size: int = 1000, chunk_overlap: int = 200):
-    """Chunk document based on file type"""
-    if file_type in ["markdown", "md"]:
-        splitter = MarkdownTextSplitter(
-            chunk_size=chunk_size,
-            chunk_overlap=chunk_overlap
-        )
-    else:
-        splitter = RecursiveCharacterTextSplitter(
-            chunk_size=chunk_size,
-            chunk_overlap=chunk_overlap,
-            separators=["\n\n", "\n", ". ", " ", ""]
-        )
-
-    chunks = splitter.split_text(text)
-    logger.info(f"Created {len(chunks)} chunks from document")
-    return chunks
-
-def extract_text_from_pdf(file_path: str) -> str:
-    """Extract text from PDF"""
-    try:
-        pdf_doc = fitz.open(file_path)
-        md_text = pymupdf4llm.to_markdown(pdf_doc)
-        return md_text
-    except Exception as e:
-        logger.error(f"Error extracting PDF: {e}")
-        pdf_doc = fitz.open(file_path)
-        text = ""
-        for page in pdf_doc:
-            text += page.get_text()
-        return text
-
-def extract_text_from_markdown(file_path: str) -> str:
-    """Extract text from markdown file"""
-    with open(file_path, 'r', encoding='utf-8') as f:
-        return f.read()
-
-@app.post("/upload_document", response_model=DocumentUpload)
-async def upload_document(
-    file: UploadFile = File(...),
-    collection_name: Optional[str] = "default"
-):
-    """Upload and process PDF or Markdown documents"""
-
-    # Get file extension instead of relying on content_type
-    file_ext = os.path.splitext(file.filename)[1].lower()
-
-    # Map extensions to file types
-    ext_to_type = {
-        ".pdf": "pdf",
-        ".md": "markdown",
-        ".markdown": "markdown",
-        ".txt": "txt"
-    }
-
-    if file_ext not in ext_to_type:
-        raise HTTPException(
-            status_code=415,
-            detail=f"Unsupported file type '{file_ext}'. Allowed: .pdf, .md, .markdown, .txt"
-        )
-
-    file_type = ext_to_type[file_ext]
-
-    try:
-        # Save file temporarily to persistent storage
-        temp_file_path = os.path.join(TEMP_UPLOAD_DIR, f"{int(time.time())}_{file.filename}")
-
-        # Write uploaded file
-        with open(temp_file_path, "wb") as buffer:
-            content = await file.read()
-            buffer.write(content)
-
-        # Extract text based on file type
-        if file_type == "pdf":
-            text_content = extract_text_from_pdf(temp_file_path)
-        else:
-            text_content = extract_text_from_markdown(temp_file_path)
-
-        if not text_content.strip():
-            raise HTTPException(status_code=400, detail="No text content extracted")
-
-        logger.info(f"Extracted {len(text_content)} characters from {file.filename}")
-
-        # Optional: Upload to Supabase
-        storage_filename = f"{int(time.time())}_{file.filename}"
-        if supabase:
-            try:
-                with open(temp_file_path, 'rb') as f:
-                    supabase.storage.from_(bucket_name).upload(
-                        path=storage_filename,
-                        file=f.read(),
-                        file_options={"content-type": "application/octet-stream"}
-                    )
-            except:
-                pass  # Continue even if Supabase upload fails
-
-        # Chunk document
-        chunks = chunk_document(text_content, file_type)
-        logger.info(f"Created {len(chunks)} chunks for collection '{collection_name}'")
-
-        # Create metadata
-        file_id = str(int(time.time()))
-        metadata = [
-            {
-                "file_id": file_id,
-                "filename": file.filename,
-                "file_type": file_type,
-                "chunk_index": i,
-                "storage_path": storage_filename
-            }
-            for i in range(len(chunks))
-        ]
-
-        # Add to vector store
-        chunks_created = vector_store_manager.create_or_update_store(
-            collection_name, chunks, metadata
-        )
-
-        logger.info(f"Successfully added {chunks_created} chunks to collection '{collection_name}'")
-
-        # Clean up temp file
-        try:
-            os.remove(temp_file_path)
-        except:
-            pass
-
-        return DocumentUpload(
-            file_id=file_id,
-            filename=file.filename,
-            file_type=file_type,
-            chunks_created=chunks_created,
-            storage_path=f"supabase://{bucket_name}/{storage_filename}" if supabase else temp_file_path
-        )
-
-    except HTTPException:
-        raise
-    except Exception as e:
-        logger.exception("Error in upload_document")
-        raise HTTPException(status_code=500, detail=f"Error: {str(e)}")

-@app.post("/upload_multiple_documents")
-async def upload_multiple_documents(
-    files: List[UploadFile] = File(...),
-    collection_name: Optional[str] = "default"
-):
-    """Upload multiple documents"""
-    results = []
-    errors = []
-
-    for file in files:
-        try:
-            result = await upload_document(file, collection_name)
-            results.append(result)
-        except Exception as e:
-            errors.append({"filename": file.filename, "error": str(e)})
-
-    return {
-        "successful_uploads": len(results),
-        "failed_uploads": len(errors),
-        "results": results,
-        "errors": errors
     }

-@app.post("/query_documents")
-async def query_documents(request: RAGQueryRequest):
-    """Query documents using RAG - FIXED VERSION"""
-    store_data = vector_store_manager.get_store(request.collection_name)
-
-    if not store_data:
-        raise HTTPException(
-            status_code=404,
-            detail=f"Collection '{request.collection_name}' not found. Please upload documents first."
-        )
-
     try:
-        # Log query details
-        logger.info(f"Querying collection '{request.collection_name}' with query: '{request.query}'")
-        logger.info(f"Collection has {len(store_data['chunks'])} chunks")
-
-        # Generate query embedding
-        query_embedding = embedding_model.encode([request.query])
-        query_embedding = np.array(query_embedding).astype('float32')
-
-        # Search in FAISS
-        distances, indices = store_data['index'].search(
-            query_embedding,
-            min(request.top_k, len(store_data['chunks']))
-        )
-
-        # Log search results
-        logger.info(f"Search results - distances: {distances[0]}, indices: {indices[0]}")
-
-        # FIX: Removed strict threshold - always return results
-        # The threshold was too strict and preventing valid results
-
-        # Get relevant chunks
-        retrieved_chunks = [store_data['chunks'][i] for i in indices[0]]
-        retrieved_metadata = [store_data['metadata'][i] for i in indices[0]]
-
-        logger.info(f"Retrieved {len(retrieved_chunks)} chunks for query")
-
-        # Create context
-        context_text = "\n\n".join([
-            f"[Source {i+1} - {meta['filename']}]:\n{chunk}"
-            for i, (chunk, meta) in enumerate(zip(retrieved_chunks, retrieved_metadata))
-        ])
-
-        logger.info(f"Context length: {len(context_text)} characters")

-        # Generate answer
-        answer = process_with_groq(request.query, context_text)

-        # Prepare sources
-        sources = [
-            {
-                "filename": meta['filename'],
-                "file_type": meta['file_type'],
-                "chunk_index": meta['chunk_index'],
-                "text_snippet": chunk[:200] + "...",
-                "distance": float(distances[0][i])
-            }
-            for i, (chunk, meta) in enumerate(zip(retrieved_chunks, retrieved_metadata))
-        ]

-        return {
-            "answer": answer,
-            "sources": sources,
-            "query": request.query,
-            "collection": request.collection_name
-        }

-    except Exception as e:
-        logger.exception("Error in query_documents")
-        raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
-
-@app.get("/debug_collection/{collection_name}")
-async def debug_collection(collection_name: str):
-    """Debug endpoint to inspect collection contents"""
-    store_data = vector_store_manager.get_store(collection_name)
-
-    if not store_data:
-        return {"error": f"Collection '{collection_name}' not found"}
-
-    return {
-        "collection_name": collection_name,
-        "total_chunks": len(store_data['chunks']),
-        "dimension": store_data['dimension'],
-        "sample_chunks": store_data['chunks'][:3] if len(store_data['chunks']) > 0 else [],
-        "sample_metadata": store_data['metadata'][:3] if len(store_data['metadata']) > 0 else [],
-        "all_filenames": list(set([meta['filename'] for meta in store_data['metadata']]))
-    }
-
-@app.get("/list_collections")
-async def list_collections():
-    """List all collections"""
-    collections = vector_store_manager.list_stores()
-    return {"collections": collections}
-
-@app.delete("/delete_collection/{collection_name}")
-async def delete_collection(collection_name: str):
-    """Delete a collection"""
-    try:
-        vector_store_manager.delete_store(collection_name)
-        return {"message": f"Collection '{collection_name}' deleted successfully"}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-
-@app.get("/health_check")
-async def health_check():
-    """System health check"""
-    return {
-        "status": "healthy",
-        "supabase_configured": supabase is not None,
-        "groq_configured": groq_client is not None,
-        "embedding_model": "all-MiniLM-L6-v2",
-        "vector_stores": len(vector_store_manager.stores),
-        "total_chunks": sum(len(store['chunks']) for store in vector_store_manager.stores.values()),
-        "persistent_storage": PERSISTENT_STORAGE,
-        "collections": list(vector_store_manager.stores.keys())
-    }
-
-# ==================== EXISTING WEB SCRAPING ENDPOINTS ====================
-
-@app.post("/rag")
-async def rag(request: RAGRequest):
-    """Existing RAG endpoint for URL-based content"""
-    if not supabase:
-        raise HTTPException(status_code=500, detail="Supabase not configured")
-
-    try:
         file_path = request.file_path

-        # Download from Supabase
-        file_content = supabase.storage.from_(bucket_name).download(file_path)
-        text = file_content.decode('utf-8')
-        data = json.loads(text)
-
-        # Extract text
-        full_text = ""
-        for item in data:
-            full_text += item.get("text", "") + " "

-        # Chunk text
         chunk_size = 1000
-        chunks = [full_text[i:i+chunk_size] for i in range(0, len(full_text), chunk_size)]

-        # Get embeddings
         chunk_embeddings = []
         for chunk in chunks:
-            embedding = query({"inputs": chunk})
             chunk_embeddings.append(embedding)

         query_embedding = query({"inputs": request.prompt})

-        # Calculate similarity
         similarities = []
         for chunk_embedding in chunk_embeddings:
             query_np = np.array(query_embedding)
@@ -637,20 +190,23 @@ async def rag(request: RAGRequest):
                 query_np = query_np.reshape(1, -1)
             if len(chunk_np.shape) == 1:
                 chunk_np = chunk_np.reshape(1, -1)
-
             similarity = cosine_similarity(query_np, chunk_np)[0][0]
             similarities.append(similarity)

-        # Get top 3 chunks
         top_k = 3
         top_indices = np.argsort(similarities)[-top_k:][::-1]
-        relevant_chunks = [chunks[i] for i in top_indices]
         context_text = "\n\n".join(relevant_chunks)

         # Process with Groq
         answer = process_with_groq(request.prompt, context_text)

-        sources = [{"text": chunks[i][:200] + "...", "position": i} for i in top_indices]

         return {
             "sources": sources,
@@ -659,13 +215,15 @@ async def rag(request: RAGRequest):
             "file_source": f"supabase://{bucket_name}/{file_path}"
         }

     except Exception as e:
-        logger.exception("Error in RAG")
-        raise HTTPException(status_code=500, detail=str(e))

 @app.post("/extract_links")
 async def extract_links(url: URL):
-    """Extract links from URL"""
     def extract_unique_links(url_string, max_retries=3, timeout=30):
         for attempt in range(max_retries):
             try:
@@ -677,34 +235,82 @@ async def extract_links(url: URL):
                 base_url = urlparse(url_string)
                 base_url = f"{base_url.scheme}://{base_url.netloc}"

-                links = [urljoin(base_url, a.get('href')) for a in soup.find_all('a', href=True)]
                 unique_links = list(dict.fromkeys(links))
                 unique_links.insert(0, url_string)
                 return unique_links
-            except Exception as e:
                 if attempt < max_retries - 1:
-                    time.sleep(5 * (attempt + 1))
                 else:
-                    raise HTTPException(status_code=500, detail=str(e))
         return []

     try:
         unique_links = extract_unique_links(url.url)
         return {"unique_links": unique_links}
     except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))

 @app.post("/extract_text")
 async def extract_text(urls: List[str]):
-    """Extract text from URLs"""
     if not supabase:
         raise HTTPException(status_code=500, detail="Supabase not configured")

     output_file = "extracted_text.txt"

     def text_data_extractor(links):
         extracted_texts = []
         for link in links:
             retries = 3
             while retries > 0:
                 try:
@@ -712,16 +318,23 @@ async def extract_text(urls: List[str]):
                     response = requests.get(link, headers=headers, timeout=30)
                     response.raise_for_status()
                     soup = BeautifulSoup(response.text, 'html.parser')
-                    text = ' '.join(soup.get_text().split())
-                    extracted_texts.append({"url": link, "text": text})
                     break
-                except:
                     retries -= 1
                     if retries > 0:
-                        time.sleep(5)
-
                     if retries == 0:
-                        extracted_texts.append({"url": link, "text": "Failed to retrieve"})

         return extracted_texts

@@ -730,31 +343,21 @@ async def extract_text(urls: List[str]):
         string_output = json.dumps(extracted_data, ensure_ascii=False, indent=2)

         # Upload to Supabase
-        file_content = string_output.encode('utf-8')
-        try:
-            supabase.storage.from_(bucket_name).upload(
-                path=output_file,
-                file=file_content,
-                file_options={"content-type": "text/plain"}
-            )
-        except:
-            supabase.storage.from_(bucket_name).update(
-                path=output_file,
-                file=file_content,
-                file_options={"content-type": "text/plain"}
-            )

         return {"extracted_data": extracted_data, "file_saved": output_file}
     except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-
-# ==================== MAIN ====================

 if __name__ == "__main__":
     uvicorn.run(
-        "main_api:app",
-        host="0.0.0.0",
-        port=8000,
-        reload=False,
         access_log=True
-    )

+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import List
+import requests
+from bs4 import BeautifulSoup
 import time
+import os
 import json
+import random
+import logging
+import groq
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
+import uvicorn
 from supabase import create_client, Client
+from urllib.parse import urljoin, urlparse


+# Initialize FastAPI app
+app = FastAPI(
+    title="Web RAG System API",
+    description="Extract content from web pages and perform RAG operations",
+    version="1.0.0"
 )

+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

+# Initialize Supabase client with environment variables
+try:
+    url = os.environ.get('SUPABASE_URL')
+    key = os.environ.get('SUPABASE_SERVICE_ROLE_KEY')
+
+    if not url or not key:
+        logger.warning("Supabase credentials not found in environment variables")
+        supabase = None
+    else:
+        supabase: Client = create_client(url, key)
+        logger.info("Supabase client initialized successfully")
+except Exception as e:
+    logger.error(f"Failed to initialize Supabase client: {e}")
+    supabase = None

+# User agents for web scraping
+user_agents = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/102.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
+    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/103.0.1264.49",
+    "Mozilla/5.0 (iPhone; CPU iPhone OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Mobile/15E148 Safari/604.1",
+    "Mozilla/5.0 (iPad; CPU OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Mobile/15E148 Safari/604.1",
+    "Mozilla/5.0 (Linux; Android 12; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 11; SM-A217F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Mobile Safari/537.36"
+]

+# Pydantic models
 class RAGRequest(BaseModel):
     file_path: str
     prompt: str

+class URL(BaseModel):
+    url: str

 @app.get("/")
 async def root():
+    """Health check endpoint"""
+    return {"message": "Web RAG System API is running", "status": "healthy"}

+@app.get("/health")
+async def health_check():
+    """Detailed health check"""
+    health_status = {
+        "api": "healthy",
+        "supabase": "connected" if supabase else "not configured",
+        "hf_token": "configured" if os.environ.get('hf_token') else "not configured",
+        "groq_token": "configured" if os.environ.get('groq_token') else "not configured"
     }
+    return health_status

+@app.post("/rag")
+async def rag(request: RAGRequest):
+    """Perform RAG operations on extracted text"""
     try:
+        # Check required environment variables
+        hf_token = os.environ.get('hf_token')
+        groq_token = os.environ.get('groq_token')
+
+        if not hf_token:
+            raise HTTPException(status_code=500, detail="HuggingFace token not configured")
+        if not groq_token:
+            raise HTTPException(status_code=500, detail="Groq token not configured")
+        if not supabase:
+            raise HTTPException(status_code=500, detail="Supabase not configured")
+
+        logger.info(f"Processing RAG request for file: {request.file_path}")
+
+        # HuggingFace Inference API for embeddings
+        API_URL = "https://router.huggingface.co/hf-inference/models/BAAI/bge-large-en-v1.5/pipeline/feature-extraction"
+        headers = {
+            "Authorization": f"Bearer {hf_token}",
+        }

+        def query(payload):
+            response = requests.post(API_URL, headers=headers, json=payload)
+            if response.status_code != 200:
+                logger.error(f"HuggingFace API error: {response.status_code} - {response.text}")
+                raise HTTPException(status_code=500, detail="Failed to get embeddings from HuggingFace")
+            return response.json()

+        # Create a Groq client
+        groq_client = groq.Client(api_key=groq_token)

+        def process_with_groq(query_text, context):
+            prompt = f"""
+            Context information:
+            {context}
+
+            Based on the context information above, please answer the following question:
+            {query_text}
+
+            Answer:
+            """
+
+            try:
+                response = groq_client.chat.completions.create(
+                    messages=[{"role": "user", "content": prompt}],
+                    model="llama-3.3-70b-versatile",
+                    temperature=0.4,
+                    max_tokens=512
+                )
+                return response.choices[0].message.content
+            except Exception as e:
+                logger.error(f"Groq API error: {e}")
+                raise HTTPException(status_code=500, detail="Failed to process with Groq")

+        def get_file_from_supabase(bucket_name, file_path):
+            try:
+                response = supabase.storage.from_(bucket_name).download(file_path)
+                content = response.decode('utf-8')
+                return content
+            except Exception as e:
+                logger.error(f"Error downloading file from Supabase: {e}")
+                raise HTTPException(
+                    status_code=404,
+                    detail=f"File not found in Supabase bucket: {file_path}"
+                )
+
+        # Get file content from Supabase
+        bucket_name = "url-2-ans-bucket"
         file_path = request.file_path

+        content = get_file_from_supabase(bucket_name, file_path)
+        logger.info(f"Successfully downloaded file from Supabase: {file_path}")

+        # Simple text chunking
         chunk_size = 1000
+        overlap = 200
+        chunks = []
+
+        for i in range(0, len(content), chunk_size - overlap):
+            chunk = content[i:i + chunk_size]
+            if len(chunk) > 100:
+                chunks.append({"text": chunk, "position": i})

+        logger.info(f"Created {len(chunks)} chunks from document")
+
+        # Get embeddings for all chunks
         chunk_embeddings = []
         for chunk in chunks:
+            embedding = query({"inputs": chunk["text"]})
             chunk_embeddings.append(embedding)

+        # Get embedding for the query
         query_embedding = query({"inputs": request.prompt})

+        # Calculate similarity between query and all chunks
         similarities = []
         for chunk_embedding in chunk_embeddings:
             query_np = np.array(query_embedding)

             query_np = query_np.reshape(1, -1)
             if len(chunk_np.shape) == 1:
                 chunk_np = chunk_np.reshape(1, -1)
+
             similarity = cosine_similarity(query_np, chunk_np)[0][0]
             similarities.append(similarity)

+        # Get top 3 most similar chunks
         top_k = 3
         top_indices = np.argsort(similarities)[-top_k:][::-1]
+
+        relevant_chunks = [chunks[i]["text"] for i in top_indices]
         context_text = "\n\n".join(relevant_chunks)

         # Process with Groq
         answer = process_with_groq(request.prompt, context_text)

+        # Prepare sources
+        sources = [{"text": chunks[i]["text"][:200] + "...", "position": chunks[i]["position"]}
+                   for i in top_indices]

         return {
             "sources": sources,

             "file_source": f"supabase://{bucket_name}/{file_path}"
         }

+    except HTTPException:
+        raise
     except Exception as e:
+        logger.exception("Error occurred in RAG process")
+        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")

 @app.post("/extract_links")
 async def extract_links(url: URL):
+    """Extract unique links from a given URL"""
     def extract_unique_links(url_string, max_retries=3, timeout=30):
         for attempt in range(max_retries):
             try:

                 base_url = urlparse(url_string)
                 base_url = f"{base_url.scheme}://{base_url.netloc}"

+                a_tags = soup.find_all('a', href=True)
+                links = []
+                for a in a_tags:
+                    href = a.get('href')
+                    full_url = urljoin(base_url, href)
+                    links.append(full_url)
+
                 unique_links = list(dict.fromkeys(links))
                 unique_links.insert(0, url_string)
                 return unique_links
+
+            except requests.RequestException as e:
+                logger.warning(f"Attempt {attempt + 1} failed: {e}")
                 if attempt < max_retries - 1:
+                    wait_time = 5 * (attempt + 1)
+                    time.sleep(wait_time)
                 else:
+                    logger.error(f"Failed to retrieve {url_string} after {max_retries} attempts.")
+                    raise HTTPException(status_code=500, detail=f"Failed to retrieve {url_string} after {max_retries} attempts.")
         return []

     try:
         unique_links = extract_unique_links(url.url)
         return {"unique_links": unique_links}
     except Exception as e:
+        logger.exception("Error in extract_links")
+        raise HTTPException(status_code=500, detail=f"Failed to extract links: {str(e)}")

 @app.post("/extract_text")
 async def extract_text(urls: List[str]):
+    """Extract text content from multiple URLs"""
     if not supabase:
         raise HTTPException(status_code=500, detail="Supabase not configured")

     output_file = "extracted_text.txt"

+    def upload_text_content(filename, content, bucket_name):
+        try:
+            file_content = content.encode('utf-8')
+
+            # Try to upload first
+            try:
+                response = supabase.storage.from_(bucket_name).upload(
+                    path=filename,
+                    file=file_content,
+                    file_options={"content-type": "text/plain"}
+                )
+                logger.info(f"Text file uploaded successfully: {filename}")
+                return response
+            except Exception as upload_error:
+                # If upload fails (file exists), try to update
+                try:
+                    response = supabase.storage.from_(bucket_name).update(
+                        path=filename,
+                        file=file_content,
+                        file_options={"content-type": "text/plain"}
+                    )
+                    logger.info(f"Text file updated successfully: {filename}")
+                    return response
+                except Exception as update_error:
+                    logger.error(f"Error updating text content: {update_error}")
+                    raise HTTPException(status_code=500, detail="Failed to save file to storage")
+
+        except Exception as e:
+            logger.error(f"Error with file operations: {e}")
+            raise HTTPException(status_code=500, detail="Failed to save file to storage")
+
     def text_data_extractor(links):
         extracted_texts = []
+
         for link in links:
+            parsed_url = urlparse(link)
+            if not parsed_url.scheme:
+                logger.warning(f"Invalid URL: {link}")
+                continue
+
             retries = 3
             while retries > 0:
                 try:

                     response = requests.get(link, headers=headers, timeout=30)
                     response.raise_for_status()
                     soup = BeautifulSoup(response.text, 'html.parser')
+                    text = soup.get_text()
+                    clean_text = ' '.join(text.split())
+                    extracted_texts.append({"url": link, "text": clean_text})
                     break
+
+                except requests.RequestException as e:
                     retries -= 1
+                    logger.warning(f"Retry {3 - retries} for {link} failed: {e}")
                     if retries > 0:
+                        wait_time = 5 * (3 - retries)
+                        time.sleep(wait_time)
+
                     if retries == 0:
+                        extracted_texts.append({
+                            "url": link,
+                            "text": "Failed to retrieve text after multiple attempts."
+                        })

         return extracted_texts


         string_output = json.dumps(extracted_data, ensure_ascii=False, indent=2)

         # Upload to Supabase
+        upload_text_content(output_file, string_output, "url-2-ans-bucket")

         return {"extracted_data": extracted_data, "file_saved": output_file}
+
     except Exception as e:
+        logger.exception("Error in extract_text")
+        raise HTTPException(status_code=500, detail=f"Failed to extract text: {str(e)}")

+# Main execution
 if __name__ == "__main__":
+    # Run the FastAPI app
     uvicorn.run(
+        "main_api:app",
+        host="0.0.0.0",
+        port=8000,
+        reload=False,  # Disable reload for production
         access_log=True
+    )
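
A quick way to exercise the rewritten API is sketched below. This is not part of the commit: it assumes the service has been started with "python main_api.py" (uvicorn on 0.0.0.0:8000), that SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY, hf_token and groq_token are set on the server, and it uses https://example.com and a throwaway question purely as placeholders.

# Illustrative client sketch; assumes the API from this commit runs locally on port 8000
# and that its environment variables are configured. Endpoint names and payload shapes
# follow the route definitions in the new main_api.py.
import requests

BASE_URL = "http://localhost:8000"

# 1. Check that Supabase, hf_token and groq_token are visible to the server.
print(requests.get(f"{BASE_URL}/health", timeout=30).json())

# 2. Collect links from a page (placeholder URL).
links = requests.post(
    f"{BASE_URL}/extract_links",
    json={"url": "https://example.com"},
    timeout=120,
).json()["unique_links"]

# 3. Scrape a few of those links; the server stores the result as
#    "extracted_text.txt" in the "url-2-ans-bucket" Supabase bucket.
extraction = requests.post(f"{BASE_URL}/extract_text", json=links[:3], timeout=300).json()

# 4. Ask a question against the stored file; the JSON response includes
#    "sources" and "file_source" (other fields are elided in the diff).
rag_response = requests.post(
    f"{BASE_URL}/rag",
    json={"file_path": extraction["file_saved"], "prompt": "What are these pages about?"},
    timeout=300,
).json()
print(rag_response)

Note that the new /rag endpoint re-embeds every chunk through the HuggingFace Inference API on each request, so responses will be slow for large extractions.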