Spaces:

heerjtdev
/

answer

Sleeping

App Files Files Community

answer / app.py

heerjtdev

Update app.py

454d23c verified 7 days ago

raw

history blame contribute delete

10.8 kB



	import gradio as gr
	import fitz # PyMuPDF
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_community.vectorstores import FAISS
	from langchain_huggingface import HuggingFaceEmbeddings

	class VectorSystem:
	    """Chunk an uploaded document, index the chunks, and retrieve matches with context.

	    The ordered chunk list is kept next to the vector index so that the chunks
	    immediately before and after a match can be looked up by position.
	    """

	    # Placeholders shown when a match has no neighbor on that side.
	    _START_MARKER = "(Start of Text)"
	    _END_MARKER = "(End of Text)"

	    def __init__(self):
	        # Stays None until process_file() succeeds.
	        self.vector_store = None
	        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
	        # Chunks in document order; each vector's "id" metadata is an index
	        # into this list.
	        self.all_chunks = []

	    @staticmethod
	    def _extract_text(file_path):
	        """Return the text of a .pdf or .txt file, or None for any other extension."""
	        lowered = file_path.lower()
	        if lowered.endswith('.pdf'):
	            # The context manager closes the document even if a page fails.
	            with fitz.open(file_path) as doc:
	                return "".join(page.get_text() for page in doc)
	        if lowered.endswith('.txt'):
	            with open(file_path, 'r', encoding='utf-8') as f:
	                return f.read()
	        return None

	    def _neighbor_chunks(self, chunk_id):
	        """Return (previous, next) chunk text around chunk_id, or the edge placeholders."""
	        prev_chunk = self.all_chunks[chunk_id - 1] if chunk_id > 0 else self._START_MARKER
	        has_next = chunk_id < len(self.all_chunks) - 1
	        next_chunk = self.all_chunks[chunk_id + 1] if has_next else self._END_MARKER
	        return prev_chunk, next_chunk

	    @staticmethod
	    def _format_match(rank, score, prev_chunk, match_text, next_chunk):
	        """Render one match and its two neighbors as a Markdown section."""
	        return (
	            f"\n#### 🎯 Match #{rank} (Distance Score: {score:.4f})\n"
	            f"> Preceding Context:\n{prev_chunk}\n\n"
	            f"> MATCH:\n{match_text}\n\n"
	            f"> Succeeding Context:\n{next_chunk}\n"
	            "---\n"
	        )

	    def process_file(self, file_obj):
	        """Extract text from an upload, split it in order, and build the vector index.

	        Args:
	            file_obj: Either an object whose ``name`` attribute is the file path,
	                or the path itself as a string. ``None`` means nothing was uploaded.

	        Returns:
	            A human-readable status string. Failures are reported through this
	            string rather than raised, because it is shown directly in the UI.
	        """
	        if file_obj is None:
	            return "No file uploaded."

	        try:
	            # 1. Extract text. A plain string is taken to be the path itself.
	            file_path = file_obj if isinstance(file_obj, str) else file_obj.name
	            text = self._extract_text(file_path)
	            if text is None:
	                return "❌ Error: Only .pdf and .txt files are supported."

	            # 2. Split text
	            text_splitter = RecursiveCharacterTextSplitter(
	                chunk_size=800,
	                chunk_overlap=150,
	                separators=["\n\n", "\n", ".", " ", ""],
	            )
	            chunks = text_splitter.split_text(text)
	            if not chunks:
	                return "Could not extract text. Is the file empty?"

	            # 3. Build the index, tagging every vector with its position
	            #    (0, 1, 2...) so neighbors can be found later.
	            metadatas = [{"id": i} for i in range(len(chunks))]
	            vector_store = FAISS.from_texts(
	                chunks,
	                self.embeddings,
	                metadatas=metadatas,
	            )
	        except Exception as e:
	            # UI boundary: surface the failure as a status message.
	            return f"Error processing file: {str(e)}"

	        # Commit both pieces together, and only on success, so the chunk list
	        # can never disagree with the ids stored in the index.
	        self.all_chunks = chunks
	        self.vector_store = vector_store
	        return f"✅ Success! Indexed {len(chunks)} chunks."

	    def retrieve_evidence(self, question, student_answer=None):
	        """Return Markdown showing the top matches for a question with their neighbors.

	        Args:
	            question: Query text; empty or whitespace-only input is rejected.
	            student_answer: Accepted so the UI can pass it, but not used here.

	        Returns:
	            A Markdown string: either a warning, or one section per match
	            containing the preceding chunk, the match, and the succeeding chunk.
	        """
	        if self.vector_store is None:
	            return "⚠️ Please upload and process a file first."

	        if not question or not question.strip():
	            return "⚠️ Please enter a Question."

	        # Lower Score = Better Match
	        results = self.vector_store.similarity_search_with_score(question, k=3)

	        sections = ["### 🔍 Expanded Context Analysis:\n"]
	        for rank, (doc, score) in enumerate(results, start=1):
	            # Full neighbor chunks are shown, not truncated excerpts.
	            prev_chunk, next_chunk = self._neighbor_chunks(doc.metadata['id'])
	            sections.append(
	                self._format_match(rank, score, prev_chunk, doc.page_content, next_chunk)
	            )

	        return "".join(sections)

	# Module-level retrieval backend used by both button callbacks below.
	system = VectorSystem()

	# --- Gradio UI ---
	# NOTE(review): nesting was reconstructed from a whitespace-collapsed listing;
	# confirm that the results pane belongs below the row rather than inside it.
	with gr.Blocks(title="EduGenius Context Retriever") as demo:
	    gr.Markdown("# 🎓 EduGenius: Smart Context Retriever")
	    gr.Markdown(
	        "Upload a Chapter. This version finds the best match AND shows you "
	        "the text immediately before and after it."
	    )

	    with gr.Row():
	        # Left column: upload a document and index it.
	        with gr.Column(scale=1):
	            pdf_input = gr.File(
	                label="1. Upload File (PDF or TXT)",
	                file_types=[".pdf", ".txt"],
	            )
	            upload_btn = gr.Button("Process File", variant="primary")
	            upload_status = gr.Textbox(label="Status", interactive=False)

	        # Right column: the question and the optional student answer.
	        with gr.Column(scale=2):
	            question_input = gr.Textbox(
	                label="2. Question",
	                placeholder="e.g., What causes the chemical reaction?",
	            )
	            answer_input = gr.Textbox(
	                label="Student Answer (Optional)",
	                placeholder="e.g., The heat causes it...",
	            )
	            search_btn = gr.Button("Find Context + Neighbors", variant="secondary")

	    # Markdown pane that receives the retrieved chunks.
	    evidence_output = gr.Markdown(label="Relevant Text Chunks")

	    # Wire each button to its backend method.
	    upload_btn.click(
	        fn=system.process_file,
	        inputs=[pdf_input],
	        outputs=[upload_status],
	    )
	    search_btn.click(
	        fn=system.retrieve_evidence,
	        inputs=[question_input, answer_input],
	        outputs=[evidence_output],
	    )

	# Start the app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()









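	# --- Archived earlier variant (disabled; kept for reference only) ---
	# It re-embeds "previous + match + next" as one "super chunk" and shows the
	# neighbors only when that lowers the distance to the question.
	# NOTE(review): core_score comes from FAISS while super_score is a plain
	# np.linalg.norm; these may not be on the same scale (FAISS L2 indexes
	# typically report squared distance) - confirm before reviving this code.
	# import gradio as gr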
	# import fitz # PyMuPDF
	# import numpy as np
	# from langchain_text_splitters import RecursiveCharacterTextSplitter
	# from langchain_community.vectorstores import FAISS
	# from langchain_huggingface import HuggingFaceEmbeddings

	# class VectorSystem:
	# def __init__(self):
	# self.vector_store = None
	# self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
	# self.all_chunks = []

	# def process_file(self, file_obj):
	# """Extracts text, preserves order, and builds the Vector Index"""
	# if file_obj is None:
	# return "No file uploaded."

	# try:
	# # 1. Extract Text
	# text = ""
	# file_path = file_obj.name

	# if file_path.lower().endswith('.pdf'):
	# doc = fitz.open(file_path)
	# for page in doc: text += page.get_text()
	# elif file_path.lower().endswith('.txt'):
	# with open(file_path, 'r', encoding='utf-8') as f: text = f.read()
	# else:
	# return "❌ Error: Only .pdf and .txt files are supported."

	# # 2. Split Text
	# text_splitter = RecursiveCharacterTextSplitter(
	# chunk_size=800,
	# chunk_overlap=150,
	# separators=["\n\n", "\n", ".", " ", ""]
	# )
	# self.all_chunks = text_splitter.split_text(text)

	# if not self.all_chunks:
	# return "Could not extract text. Is the file empty?"

	# # 3. Build Vector Index with ID Metadata
	# metadatas = [{"id": i} for i in range(len(self.all_chunks))]

	# self.vector_store = FAISS.from_texts(
	# self.all_chunks,
	# self.embeddings,
	# metadatas=metadatas
	# )

	# return f"✅ Success! Indexed {len(self.all_chunks)} chunks."

	# except Exception as e:
	# return f"Error processing file: {str(e)}"

	# def retrieve_evidence(self, question, student_answer):
	# if not self.vector_store:
	# return "⚠️ Please upload and process a file first."
	# if not question:
	# return "⚠️ Please enter a Question."

	# # 1. Get Initial Results (Core Matches)
	# # FAISS returns L2 distance (Lower is better)
	# results = self.vector_store.similarity_search_with_score(question, k=3)

	# # We need the vector for the QUESTION to do our own math later
	# q_vector = np.array(self.embeddings.embed_query(question))

	# output_text = "### 🔍 Smart Context Analysis:\n"

	# for i, (doc, core_score) in enumerate(results):
	# chunk_id = doc.metadata['id']

	# # 2. Identify Neighbors
	# prev_chunk = self.all_chunks[chunk_id - 1] if chunk_id > 0 else ""
	# next_chunk = self.all_chunks[chunk_id + 1] if chunk_id < len(self.all_chunks) - 1 else ""

	# # 3. Create the "Super Chunk" (Prev + Core + Next)
	# super_chunk_text = f"{prev_chunk} {doc.page_content} {next_chunk}"

	# # 4. Calculate "Super Score" (Re-embedding on the fly)
	# # We embed the Super Chunk and measure distance to Question
	# super_vector = np.array(self.embeddings.embed_query(super_chunk_text))
	# super_score = np.linalg.norm(q_vector - super_vector) # Euclidean Distance

	# output_text += f"\n#### 🎯 Match #{i+1}\n"

	# # 5. The Logic Test: Does Context Improve the Score?
	# # Remember: LOWER score is BETTER (closer distance)

	# if super_score < core_score:
	# # CASE A: Context Helps! (Distance Reduced)
	# output_text += f"✅ Context Added: The surrounding text made the match stronger (Score improved from {core_score:.3f} to {super_score:.3f}).\n\n"
	# output_text += f"> {prev_chunk} {doc.page_content} {next_chunk}\n"
	# else:
	# # CASE B: Context Dilutes! (Distance Increased or Same)
	# output_text += f"⏹️ Context Ignored: Surrounding text was irrelevant or noisy (Score worsened from {core_score:.3f} to {super_score:.3f}). Showing Core Match only.\n\n"
	# output_text += f"> {doc.page_content}\n"

	# output_text += "---\n"

	# return output_text

	# # Initialize System
	# system = VectorSystem()

	# # --- Gradio UI ---
	# with gr.Blocks(title="EduGenius Context Retriever") as demo:
	# gr.Markdown("# 🎓 EduGenius: Intelligent Context Retriever")
	# gr.Markdown("Upload a Chapter. This system intelligently decides if it needs to read the surrounding paragraphs to answer your question.")

	# with gr.Row():
	# with gr.Column(scale=1):
	# pdf_input = gr.File(label="1. Upload File (PDF or TXT)", file_types=[".pdf", ".txt"])
	# upload_btn = gr.Button("Process File", variant="primary")
	# upload_status = gr.Textbox(label="Status", interactive=False)

	# with gr.Column(scale=2):
	# question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
	# answer_input = gr.Textbox(label="Student Answer (Optional)", placeholder="e.g., The heat causes it...")
	# search_btn = gr.Button("Find Evidence", variant="secondary")

	# evidence_output = gr.Markdown(label="Relevant Text Chunks")

	# upload_btn.click(fn=system.process_file, inputs=[pdf_input], outputs=[upload_status])
	# search_btn.click(fn=system.retrieve_evidence, inputs=[question_input, answer_input], outputs=[evidence_output])

	# if __name__ == "__main__":
	# demo.launch()