Spaces:

sunbal7
/

PDFQueryApplication

Sleeping

App Files Community

sunbal7 committed on Feb 13, 2025

Commit

021fae5

verified ·

1 Parent(s): 6c784e7

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -14

app.py CHANGED Viewed

@@ -1,3 +1,7 @@
 import os
 import fitz  # PyMuPDF for PDF processing
 import faiss
@@ -8,18 +12,16 @@ from sentence_transformers import SentenceTransformer
 from groq import Groq
 from dotenv import load_dotenv
 # Load API key
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 # Initialize Groq client
-client = Groq(api_key= GROQ_API_KEY)
 # Load sentence transformer model for embedding
 embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
 def extract_text_from_pdf(pdf_path):
     """Extract text from a PDF file using PyMuPDF."""
     doc = fitz.open(pdf_path)
@@ -27,13 +29,7 @@ def extract_text_from_pdf(pdf_path):
     for page in doc:
         text += page.get_text("text") + "\n"
     return text.strip()
-def extract_text_from_pdf(pdf_path):
-    """Extract text from a PDF file using PyMuPDF."""
-    doc = fitz.open(pdf_path)
-    text = ""
-    for page in doc:
-        text += page.get_text("text") + "\n"
-    return text.strip()
 def create_text_chunks(text, chunk_size=500, chunk_overlap=100):
     """Split text into chunks of specified size with overlap."""
     text_splitter = RecursiveCharacterTextSplitter(
@@ -42,6 +38,7 @@ def create_text_chunks(text, chunk_size=500, chunk_overlap=100):
     )
     chunks = text_splitter.split_text(text)
     return chunks
 def create_faiss_index(chunks):
     """Generate embeddings for text chunks and store them in FAISS."""
     embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
@@ -51,6 +48,7 @@ def create_faiss_index(chunks):
     index.add(embeddings)  # Add embeddings to FAISS index
     return index, embeddings, chunks
 def retrieve_similar_chunks(query, index, embeddings, chunks, top_k=3):
     """Retrieve the most relevant text chunks using FAISS."""
     query_embedding = embedding_model.encode([query], convert_to_numpy=True)
@@ -58,6 +56,7 @@ def retrieve_similar_chunks(query, index, embeddings, chunks, top_k=3):
     results = [chunks[idx] for idx in indices[0]]
     return results
 def query_groq_api(query, context):
     """Send the query along with retrieved context to Groq API."""
     prompt = f"Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}\nAnswer:"
@@ -68,8 +67,8 @@ def query_groq_api(query, context):
     )
     return chat_completion.choices[0].message.content
-import streamlit as st
 st.title("📚 RAG-based PDF Query Application")
 st.write("Upload a PDF and ask questions!")
@@ -106,5 +105,4 @@ if uploaded_file is not None:
             st.subheader("Answer:")
             st.write(response)
         else:
-            st.warning("Please enter a question.")

+### `app.py`
+```python
 import os
 import fitz  # PyMuPDF for PDF processing
 import faiss
 from groq import Groq
 from dotenv import load_dotenv
 # Load API key
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 # Initialize Groq client
+client = Groq(api_key=GROQ_API_KEY)
 # Load sentence transformer model for embedding
 embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
 def extract_text_from_pdf(pdf_path):
     """Extract text from a PDF file using PyMuPDF."""
     doc = fitz.open(pdf_path)
     for page in doc:
         text += page.get_text("text") + "\n"
     return text.strip()
 def create_text_chunks(text, chunk_size=500, chunk_overlap=100):
     """Split text into chunks of specified size with overlap."""
     text_splitter = RecursiveCharacterTextSplitter(
     )
     chunks = text_splitter.split_text(text)
     return chunks
 def create_faiss_index(chunks):
     """Generate embeddings for text chunks and store them in FAISS."""
     embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
     index.add(embeddings)  # Add embeddings to FAISS index
     return index, embeddings, chunks
 def retrieve_similar_chunks(query, index, embeddings, chunks, top_k=3):
     """Retrieve the most relevant text chunks using FAISS."""
     query_embedding = embedding_model.encode([query], convert_to_numpy=True)
     results = [chunks[idx] for idx in indices[0]]
     return results
 def query_groq_api(query, context):
     """Send the query along with retrieved context to Groq API."""
     prompt = f"Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}\nAnswer:"
     )
     return chat_completion.choices[0].message.content
+# Streamlit UI
 st.title("📚 RAG-based PDF Query Application")
 st.write("Upload a PDF and ask questions!")
             st.subheader("Answer:")
             st.write(response)
         else:
+            st.warning("Please enter a question.")