# answer / app.py
# heerjtdev's picture
# Update app.py
# 454d23c verified
import gradio as gr
import fitz # PyMuPDF
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
class VectorSystem:
    """In-memory retrieval index over the ordered chunks of one document.

    A processed file is split into chunks; every chunk is embedded into a
    FAISS index and tagged with its position (metadata key ``"id"``) so a
    search hit can be displayed together with the chunk before and after it.
    """

    def __init__(self):
        # FAISS index of the current document; None until a file is processed.
        self.vector_store = None
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        # Ordered chunks of the current document. A hit's metadata "id" is an
        # index into this list, which is how its neighbours are looked up.
        self.all_chunks = []

    def process_file(self, file_obj):
        """Extract text from a file, split it in order, and build the index.

        Args:
            file_obj: Either a filesystem path given as ``str`` or an object
                whose ``name`` attribute holds the path. ``None`` means
                nothing was uploaded.

        Returns:
            A human-readable status message. Failures are reported through
            the message; no exception propagates to the caller.

        The stored chunks and index are replaced together, and only after
        the new index was built successfully. A failed or empty upload
        therefore leaves the previously indexed document fully usable
        instead of leaving the chunk list and the index out of step.
        """
        if file_obj is None:
            return "No file uploaded."

        try:
            # 1. Extract Text
            file_path = file_obj if isinstance(file_obj, str) else file_obj.name
            lowered_path = file_path.lower()
            if lowered_path.endswith(".pdf"):
                # The context manager closes the PDF even if a page fails.
                with fitz.open(file_path) as doc:
                    text = "".join(page.get_text() for page in doc)
            elif lowered_path.endswith(".txt"):
                with open(file_path, "r", encoding="utf-8") as f:
                    text = f.read()
            else:
                return "❌ Error: Only .pdf and .txt files are supported."

            # 2. Split Text
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=800,
                chunk_overlap=150,
                separators=["\n\n", "\n", ".", " ", ""],
            )
            chunks = text_splitter.split_text(text)
            if not chunks:
                return "Could not extract text. Is the file empty?"

            # 3. Build Vector Index with ID Metadata
            # Every vector carries its chunk position (0, 1, 2...).
            metadatas = [{"id": i} for i in range(len(chunks))]
            vector_store = FAISS.from_texts(
                chunks,
                self.embeddings,
                metadatas=metadatas,
            )
        except Exception as e:
            # UI boundary: report the problem as a status message.
            return f"Error processing file: {str(e)}"

        # Commit both pieces of state together so they always describe the
        # same document.
        self.all_chunks = chunks
        self.vector_store = vector_store
        return f"✅ Success! Indexed {len(chunks)} chunks."

    def retrieve_evidence(self, question, student_answer):
        """Return Markdown with the top matches for ``question`` plus neighbours.

        Args:
            question: Text used as the similarity-search query. Empty or
                whitespace-only input is rejected with a warning message.
            student_answer: Accepted because the UI passes it; it is not
                used by the search.

        Returns:
            A Markdown string listing up to three hits, each with its score
            and the chunk immediately before and after it, or a warning
            message when no index exists or no question was given.
        """
        if self.vector_store is None:
            return "⚠️ Please upload and process a file first."
        if not question or not question.strip():
            return "⚠️ Please enter a Question."

        # Lower Score = Better Match
        results = self.vector_store.similarity_search_with_score(question, k=3)

        last_index = len(self.all_chunks) - 1
        parts = ["### 🔍 Expanded Context Analysis:\n"]
        for rank, (doc, score) in enumerate(results, start=1):
            chunk_id = doc.metadata["id"]

            # Neighbours by position; the first/last chunk has no neighbour
            # on one side, so a placeholder label is shown there instead.
            prev_chunk = (
                self.all_chunks[chunk_id - 1] if chunk_id > 0 else "(Start of Text)"
            )
            next_chunk = (
                self.all_chunks[chunk_id + 1]
                if chunk_id < last_index
                else "(End of Text)"
            )

            parts.append(f"\n#### 🎯 Match #{rank} (Distance Score: {score:.4f})\n")
            # Neighbouring chunks are shown in full (not truncated).
            parts.append(f"> **Preceding Context:**\n{prev_chunk}\n\n")
            parts.append(f"> **MATCH:**\n**{doc.page_content}**\n\n")
            parts.append(f"> **Succeeding Context:**\n{next_chunk}\n")
            parts.append("---\n")

        return "".join(parts)
# Initialize System
# A single module-level VectorSystem; its bound methods are the click handlers
# wired up below.
system = VectorSystem()

# --- Gradio UI ---
# Layout: a narrow left column for uploading/processing a file and a wider
# right column for asking a question and viewing the retrieved context.
with gr.Blocks(title="EduGenius Context Retriever") as demo:
    gr.Markdown("# 🎓 EduGenius: Smart Context Retriever")
    gr.Markdown("Upload a Chapter. This version finds the best match AND shows you the text immediately before and after it.")

    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(label="1. Upload File (PDF or TXT)", file_types=[".pdf", ".txt"])
            upload_btn = gr.Button("Process File", variant="primary")
            upload_status = gr.Textbox(label="Status", interactive=False)

        with gr.Column(scale=2):
            question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
            answer_input = gr.Textbox(label="Student Answer (Optional)", placeholder="e.g., The heat causes it...")
            search_btn = gr.Button("Find Context + Neighbors", variant="secondary")
            evidence_output = gr.Markdown(label="Relevant Text Chunks")

    # Event wiring: the status box shows process_file's message; the Markdown
    # panel shows retrieve_evidence's rendered result.
    upload_btn.click(fn=system.process_file, inputs=[pdf_input], outputs=[upload_status])
    search_btn.click(fn=system.retrieve_evidence, inputs=[question_input, answer_input], outputs=[evidence_output])

if __name__ == "__main__":
    demo.launch()
# import gradio as gr
# import fitz # PyMuPDF
# import numpy as np
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain_community.vectorstores import FAISS
# from langchain_huggingface import HuggingFaceEmbeddings
# class VectorSystem:
# def __init__(self):
# self.vector_store = None
# self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# self.all_chunks = []
# def process_file(self, file_obj):
# """Extracts text, preserves order, and builds the Vector Index"""
# if file_obj is None:
# return "No file uploaded."
# try:
# # 1. Extract Text
# text = ""
# file_path = file_obj.name
# if file_path.lower().endswith('.pdf'):
# doc = fitz.open(file_path)
# for page in doc: text += page.get_text()
# elif file_path.lower().endswith('.txt'):
# with open(file_path, 'r', encoding='utf-8') as f: text = f.read()
# else:
# return "❌ Error: Only .pdf and .txt files are supported."
# # 2. Split Text
# text_splitter = RecursiveCharacterTextSplitter(
# chunk_size=800,
# chunk_overlap=150,
# separators=["\n\n", "\n", ".", " ", ""]
# )
# self.all_chunks = text_splitter.split_text(text)
# if not self.all_chunks:
# return "Could not extract text. Is the file empty?"
# # 3. Build Vector Index with ID Metadata
# metadatas = [{"id": i} for i in range(len(self.all_chunks))]
# self.vector_store = FAISS.from_texts(
# self.all_chunks,
# self.embeddings,
# metadatas=metadatas
# )
# return f"βœ… Success! Indexed {len(self.all_chunks)} chunks."
# except Exception as e:
# return f"Error processing file: {str(e)}"
# def retrieve_evidence(self, question, student_answer):
# if not self.vector_store:
# return "⚠️ Please upload and process a file first."
# if not question:
# return "⚠️ Please enter a Question."
# # 1. Get Initial Results (Core Matches)
# # FAISS returns L2 distance (Lower is better)
# results = self.vector_store.similarity_search_with_score(question, k=3)
# # We need the vector for the QUESTION to do our own math later
# q_vector = np.array(self.embeddings.embed_query(question))
# output_text = "### πŸ” Smart Context Analysis:\n"
# for i, (doc, core_score) in enumerate(results):
# chunk_id = doc.metadata['id']
# # 2. Identify Neighbors
# prev_chunk = self.all_chunks[chunk_id - 1] if chunk_id > 0 else ""
# next_chunk = self.all_chunks[chunk_id + 1] if chunk_id < len(self.all_chunks) - 1 else ""
# # 3. Create the "Super Chunk" (Prev + Core + Next)
# super_chunk_text = f"{prev_chunk} {doc.page_content} {next_chunk}"
# # 4. Calculate "Super Score" (Re-embedding on the fly)
# # We embed the Super Chunk and measure distance to Question
# super_vector = np.array(self.embeddings.embed_query(super_chunk_text))
# super_score = np.linalg.norm(q_vector - super_vector) # Euclidean Distance
# output_text += f"\n#### 🎯 Match #{i+1}\n"
# # 5. The Logic Test: Does Context Improve the Score?
# # Remember: LOWER score is BETTER (closer distance)
# if super_score < core_score:
# # CASE A: Context Helps! (Distance Reduced)
# output_text += f"**βœ… Context Added:** The surrounding text made the match stronger (Score improved from {core_score:.3f} to {super_score:.3f}).\n\n"
# output_text += f"> {prev_chunk} **{doc.page_content}** {next_chunk}\n"
# else:
# # CASE B: Context Dilutes! (Distance Increased or Same)
# output_text += f"**⏹️ Context Ignored:** Surrounding text was irrelevant or noisy (Score worsened from {core_score:.3f} to {super_score:.3f}). Showing Core Match only.\n\n"
# output_text += f"> **{doc.page_content}**\n"
# output_text += "---\n"
# return output_text
# # Initialize System
# system = VectorSystem()
# # --- Gradio UI ---
# with gr.Blocks(title="EduGenius Context Retriever") as demo:
# gr.Markdown("# πŸŽ“ EduGenius: Intelligent Context Retriever")
# gr.Markdown("Upload a Chapter. This system intelligently decides if it needs to read the surrounding paragraphs to answer your question.")
# with gr.Row():
# with gr.Column(scale=1):
# pdf_input = gr.File(label="1. Upload File (PDF or TXT)", file_types=[".pdf", ".txt"])
# upload_btn = gr.Button("Process File", variant="primary")
# upload_status = gr.Textbox(label="Status", interactive=False)
# with gr.Column(scale=2):
# question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
# answer_input = gr.Textbox(label="Student Answer (Optional)", placeholder="e.g., The heat causes it...")
# search_btn = gr.Button("Find Evidence", variant="secondary")
# evidence_output = gr.Markdown(label="Relevant Text Chunks")
# upload_btn.click(fn=system.process_file, inputs=[pdf_input], outputs=[upload_status])
# search_btn.click(fn=system.retrieve_evidence, inputs=[question_input, answer_input], outputs=[evidence_output])
# if __name__ == "__main__":
# demo.launch()