MohamedFahim committed (verified)
Commit 44f1c5c · Parent: 8665973

Update main_api.py

Files changed (1): main_api.py (+46 -16)
main_api.py CHANGED
@@ -5,7 +5,13 @@ import random
 import json
 import numpy as np
 import uvicorn
-import fitz  # PyMuPDF
+
+# FIX: Updated PyMuPDF import for compatibility
+try:
+    import pymupdf as fitz  # PyMuPDF >= 1.24.0 (recommended)
+except ImportError:
+    import fitz  # PyMuPDF < 1.24.0 (fallback)
+
 import pymupdf4llm
 import faiss
 from pathlib import Path
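Since PyMuPDF 1.24, `pymupdf` is the canonical module name and `fitz` is kept as a legacy alias, so the shim above works on both old and new installs. A minimal sketch of the extraction path this API depends on, assuming PyMuPDF is installed and using a placeholder `sample.pdf`:

    try:
        import pymupdf as fitz  # canonical name on PyMuPDF >= 1.24
    except ImportError:
        import fitz  # legacy name on older releases

    # Open a PDF and pull plain text page by page; fitz.open() and
    # page.get_text() are standard PyMuPDF calls.
    doc = fitz.open("sample.pdf")  # placeholder path, not from the commit
    text = "\n".join(page.get_text() for page in doc)
    doc.close()
    print(f"extracted {len(text)} characters")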
@@ -21,8 +27,6 @@ from supabase import create_client, Client
 from groq import Groq
 from sentence_transformers import SentenceTransformer
 from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownTextSplitter
-
-
 import pickle
 
 # ==================== CONFIGURATION FOR HUGGING FACE SPACES ====================
@@ -272,7 +276,7 @@ def query(payload):
 def process_with_groq(query: str, context: str) -> str:
     """Process query with Groq LLM"""
     if not groq_client:
-        return "Groq API not configured. Please set GROQ_API_KEY environment variable."
+        return "Groq API not configured. Please set groq_token environment variable."
 
     try:
         messages = [
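The reworded error message implies the key is now read from a `groq_token` variable rather than the conventional `GROQ_API_KEY`. The client setup itself sits outside this hunk; a hypothetical sketch consistent with the message (the variable name comes from the message, everything else is assumed):

    import os
    from groq import Groq

    # Hypothetical initialization; Groq(api_key=...) is the documented
    # constructor, but the actual setup code is not shown in this diff.
    groq_token = os.environ.get("groq_token")
    groq_client = Groq(api_key=groq_token) if groq_token else None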
@@ -390,6 +394,8 @@ async def upload_document(
         if not text_content.strip():
             raise HTTPException(status_code=400, detail="No text content extracted")
 
+        logger.info(f"Extracted {len(text_content)} characters from {file.filename}")
+
         # Optional: Upload to Supabase
         storage_filename = f"{int(time.time())}_{file.filename}"
         if supabase:
@@ -398,13 +404,14 @@
                 supabase.storage.from_(bucket_name).upload(
                     path=storage_filename,
                     file=f.read(),
-                    file_options={"content-type": "application/octet-stream"}  # Generic type
+                    file_options={"content-type": "application/octet-stream"}
                 )
         except:
             pass  # Continue even if Supabase upload fails
 
         # Chunk document
         chunks = chunk_document(text_content, file_type)
+        logger.info(f"Created {len(chunks)} chunks for collection '{collection_name}'")
 
         # Create metadata
         file_id = str(int(time.time()))
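The bare `except: pass` keeps ingestion going when Supabase is unreachable, but it also hides the failure. A sketch of a narrower handler with the same keep-going behavior (identifiers are taken from the hunk above; the `with open(...)` line is inferred from the `f.read()` context):

    try:
        with open(temp_file_path, "rb") as f:
            supabase.storage.from_(bucket_name).upload(
                path=storage_filename,
                file=f.read(),
                file_options={"content-type": "application/octet-stream"},
            )
    except Exception:
        # Continue even if the upload fails, but record the cause
        logger.exception("Supabase upload failed; continuing without remote copy")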
@@ -424,6 +431,8 @@
             collection_name, chunks, metadata
         )
 
+        logger.info(f"Successfully added {chunks_created} chunks to collection '{collection_name}'")
+
         # Clean up temp file
         try:
             os.remove(temp_file_path)
@@ -444,7 +453,6 @@
         logger.exception("Error in upload_document")
         raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
 
-
 @app.post("/upload_multiple_documents")
 async def upload_multiple_documents(
     files: List[UploadFile] = File(...),
@@ -470,16 +478,20 @@ async def upload_multiple_documents(
 
 @app.post("/query_documents")
 async def query_documents(request: RAGQueryRequest):
-    """Query documents using RAG"""
+    """Query documents using RAG - FIXED VERSION"""
     store_data = vector_store_manager.get_store(request.collection_name)
 
     if not store_data:
         raise HTTPException(
             status_code=404,
-            detail=f"Collection '{request.collection_name}' not found"
+            detail=f"Collection '{request.collection_name}' not found. Please upload documents first."
         )
 
     try:
+        # Log query details
+        logger.info(f"Querying collection '{request.collection_name}' with query: '{request.query}'")
+        logger.info(f"Collection has {len(store_data['chunks'])} chunks")
+
         # Generate query embedding
         query_embedding = embedding_model.encode([request.query])
         query_embedding = np.array(query_embedding).astype('float32')
@@ -490,25 +502,26 @@ async def query_documents(request: RAGQueryRequest):
             min(request.top_k, len(store_data['chunks']))
         )
 
-        # Check relevance threshold
-        if distances[0][0] > 1.5:
-            return {
-                "answer": "I couldn't find this information in the provided documents.",
-                "sources": [],
-                "query": request.query,
-                "collection": request.collection_name
-            }
+        # Log search results
+        logger.info(f"Search results - distances: {distances[0]}, indices: {indices[0]}")
+
+        # FIX: Removed strict threshold - always return results
+        # The threshold was too strict and was preventing valid results
 
         # Get relevant chunks
         retrieved_chunks = [store_data['chunks'][i] for i in indices[0]]
         retrieved_metadata = [store_data['metadata'][i] for i in indices[0]]
 
+        logger.info(f"Retrieved {len(retrieved_chunks)} chunks for query")
+
         # Create context
         context_text = "\n\n".join([
             f"[Source {i+1} - {meta['filename']}]:\n{chunk}"
             for i, (chunk, meta) in enumerate(zip(retrieved_chunks, retrieved_metadata))
         ])
 
+        logger.info(f"Context length: {len(context_text)} characters")
+
         # Generate answer
         answer = process_with_groq(request.query, context_text)
 
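Dropping the hard `distances[0][0] > 1.5` cutoff stops the false refusals, but the endpoint now always answers from the nearest chunks, however weak. If the store is a FAISS `IndexFlatL2` over L2-normalized embeddings (an assumption; index construction is outside this diff), squared distance d maps to cosine similarity s = 1 - d/2, so weak hits can be flagged rather than refused. A sketch:

    import numpy as np

    def weak_match_flags(distances: np.ndarray, min_sim: float = 0.2) -> list:
        """Flag weak hits instead of refusing outright. Assumes an
        IndexFlatL2 over L2-normalized vectors, where squared L2 distance
        d and cosine similarity s satisfy d = 2 - 2s."""
        sims = 1.0 - distances[0] / 2.0
        return [bool(s < min_sim) for s in sims]

    # e.g. return the flags alongside "sources" so clients can caveat weak answers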
@@ -535,6 +548,23 @@ async def query_documents(request: RAGQueryRequest):
         logger.exception("Error in query_documents")
         raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
 
+@app.get("/debug_collection/{collection_name}")
+async def debug_collection(collection_name: str):
+    """Debug endpoint to inspect collection contents"""
+    store_data = vector_store_manager.get_store(collection_name)
+
+    if not store_data:
+        return {"error": f"Collection '{collection_name}' not found"}
+
+    return {
+        "collection_name": collection_name,
+        "total_chunks": len(store_data['chunks']),
+        "dimension": store_data['dimension'],
+        "sample_chunks": store_data['chunks'][:3] if len(store_data['chunks']) > 0 else [],
+        "sample_metadata": store_data['metadata'][:3] if len(store_data['metadata']) > 0 else [],
+        "all_filenames": list(set([meta['filename'] for meta in store_data['metadata']]))
+    }
+
 @app.get("/list_collections")
 async def list_collections():
     """List all collections"""
 