import gradio as gr
import fitz  # PyMuPDF
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
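# Assumed dependencies for this script (package names only; pin versions as needed):
#   pip install gradio pymupdf langchain-text-splitters langchain-community \
#       langchain-huggingface faiss-cpu sentence-transformers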


class VectorSystem:
    def __init__(self):
        self.vector_store = None
        # all-MiniLM-L6-v2 is a small sentence-transformers model (384-dim embeddings)
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        # NEW: we keep a copy of all chunks in a list so we can access neighbors by index
        self.all_chunks = []

    def process_file(self, file_obj):
        """Extracts text, preserves order, and builds the vector index."""
        if file_obj is None:
            return "No file uploaded."
        try:
            # 1. Extract text
            text = ""
            file_path = file_obj.name
            if file_path.lower().endswith('.pdf'):
                doc = fitz.open(file_path)
                for page in doc:
                    text += page.get_text()
            elif file_path.lower().endswith('.txt'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
            else:
                return "❌ Error: Only .pdf and .txt files are supported."
            # 2. Split text
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=800,
                chunk_overlap=150,
                separators=["\n\n", "\n", ".", " ", ""]
            )
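            # How the separator cascade behaves (illustrative, assuming the
            # default length_function=len): a 2,000-character section is first
            # split on blank lines; any piece still longer than 800 characters
            # falls through to single newlines, then sentence periods, then
            # spaces, and finally raw characters as a last resort.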
            # Store chunks on the instance so we can look them up by ID later
            self.all_chunks = text_splitter.split_text(text)
            if not self.all_chunks:
                return "Could not extract text. Is the file empty?"
            # 3. Build the vector index with ID metadata
            # We attach the list index (0, 1, 2, ...) to every vector
            metadatas = [{"id": i} for i in range(len(self.all_chunks))]
            self.vector_store = FAISS.from_texts(
                self.all_chunks,
                self.embeddings,
                metadatas=metadatas
            )
            return f"✅ Success! Indexed {len(self.all_chunks)} chunks."
        except Exception as e:
            return f"Error processing file: {str(e)}"

    def retrieve_evidence(self, question, student_answer):
        if not self.vector_store:
            return "⚠️ Please upload and process a file first."
        if not question:
            return "⚠️ Please enter a question."
        # FAISS returns L2 distances, so a lower score means a better match
        results = self.vector_store.similarity_search_with_score(question, k=3)
        output_text = "### 🔍 Expanded Context Analysis:\n"
        for i, (doc, score) in enumerate(results):
            chunk_id = doc.metadata['id']
            # Retrieve the previous and next chunks by list index.
            # The first chunk (ID 0) has no predecessor and the last chunk has
            # no successor, so we substitute placeholder strings at the edges.
            # Note: with chunk_overlap=150, each neighbor shares up to 150
            # characters with the match, so some repetition is expected.
            prev_chunk = self.all_chunks[chunk_id - 1] if chunk_id > 0 else "(Start of Text)"
            next_chunk = self.all_chunks[chunk_id + 1] if chunk_id < len(self.all_chunks) - 1 else "(End of Text)"
            output_text += f"\n#### 🎯 Match #{i+1} (Distance Score: {score:.4f})\n"
            # Changed: show the full neighboring chunks instead of truncating
            # them to 200 characters ([-200:] / [:200])
            output_text += f"> **Preceding Context:**\n{prev_chunk}\n\n"
            output_text += f"> **MATCH:**\n**{doc.page_content}**\n\n"
            output_text += f"> **Succeeding Context:**\n{next_chunk}\n"
            output_text += "---\n"
        return output_text
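
# Shape of the Markdown this method renders (illustrative values, not real output):
#   #### 🎯 Match #1 (Distance Score: 0.8214)
#   > **Preceding Context:** ...
#   > **MATCH:** ...
#   > **Succeeding Context:** ...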

# Initialize the system
system = VectorSystem()
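# A minimal headless sketch (assumed usage, not part of the app). Gradio hands
# process_file a tempfile-like object, so any object with a `.name` attribute
# works for testing outside the UI; "chapter.txt" is a hypothetical path.
#
#   from types import SimpleNamespace
#   print(system.process_file(SimpleNamespace(name="chapter.txt")))
#   print(system.retrieve_evidence("What causes the chemical reaction?", ""))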

# --- Gradio UI ---
with gr.Blocks(title="EduGenius Context Retriever") as demo:
    gr.Markdown("# 📚 EduGenius: Smart Context Retriever")
    gr.Markdown("Upload a chapter. This version finds the best match AND shows you the text immediately before and after it.")
    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(label="1. Upload File (PDF or TXT)", file_types=[".pdf", ".txt"])
            upload_btn = gr.Button("Process File", variant="primary")
            upload_status = gr.Textbox(label="Status", interactive=False)
        with gr.Column(scale=2):
            question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
            answer_input = gr.Textbox(label="Student Answer (Optional)", placeholder="e.g., The heat causes it...")
            search_btn = gr.Button("Find Context + Neighbors", variant="secondary")
            evidence_output = gr.Markdown(label="Relevant Text Chunks")
    upload_btn.click(fn=system.process_file, inputs=[pdf_input], outputs=[upload_status])
    # student_answer is wired through but not yet used by retrieve_evidence
    search_btn.click(fn=system.retrieve_evidence, inputs=[question_input, answer_input], outputs=[evidence_output])

if __name__ == "__main__":
    demo.launch()
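# To run (assuming this file is saved as app.py): `python app.py`;
# Gradio serves the UI on http://127.0.0.1:7860 by default.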

# Previous version: re-scores each match together with its neighbors (a
# "super chunk") and only shows the expanded context when it improves the
# distance to the question. Kept for reference.
#
# import gradio as gr
# import fitz  # PyMuPDF
# import numpy as np
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain_community.vectorstores import FAISS
# from langchain_huggingface import HuggingFaceEmbeddings
#
# class VectorSystem:
#     def __init__(self):
#         self.vector_store = None
#         self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
#         self.all_chunks = []
#
#     def process_file(self, file_obj):
#         """Extracts text, preserves order, and builds the vector index."""
#         if file_obj is None:
#             return "No file uploaded."
#         try:
#             # 1. Extract text
#             text = ""
#             file_path = file_obj.name
#             if file_path.lower().endswith('.pdf'):
#                 doc = fitz.open(file_path)
#                 for page in doc:
#                     text += page.get_text()
#             elif file_path.lower().endswith('.txt'):
#                 with open(file_path, 'r', encoding='utf-8') as f:
#                     text = f.read()
#             else:
#                 return "❌ Error: Only .pdf and .txt files are supported."
#             # 2. Split text
#             text_splitter = RecursiveCharacterTextSplitter(
#                 chunk_size=800,
#                 chunk_overlap=150,
#                 separators=["\n\n", "\n", ".", " ", ""]
#             )
#             self.all_chunks = text_splitter.split_text(text)
#             if not self.all_chunks:
#                 return "Could not extract text. Is the file empty?"
#             # 3. Build the vector index with ID metadata
#             metadatas = [{"id": i} for i in range(len(self.all_chunks))]
#             self.vector_store = FAISS.from_texts(
#                 self.all_chunks,
#                 self.embeddings,
#                 metadatas=metadatas
#             )
#             return f"✅ Success! Indexed {len(self.all_chunks)} chunks."
#         except Exception as e:
#             return f"Error processing file: {str(e)}"
#
#     def retrieve_evidence(self, question, student_answer):
#         if not self.vector_store:
#             return "⚠️ Please upload and process a file first."
#         if not question:
#             return "⚠️ Please enter a question."
#         # 1. Get the initial results (core matches).
#         # FAISS returns L2 distances (lower is better).
#         results = self.vector_store.similarity_search_with_score(question, k=3)
#         # We need the vector for the QUESTION to do our own math later
#         q_vector = np.array(self.embeddings.embed_query(question))
#         output_text = "### 🔍 Smart Context Analysis:\n"
#         for i, (doc, core_score) in enumerate(results):
#             chunk_id = doc.metadata['id']
#             # 2. Identify neighbors
#             prev_chunk = self.all_chunks[chunk_id - 1] if chunk_id > 0 else ""
#             next_chunk = self.all_chunks[chunk_id + 1] if chunk_id < len(self.all_chunks) - 1 else ""
#             # 3. Create the "super chunk" (prev + core + next)
#             super_chunk_text = f"{prev_chunk} {doc.page_content} {next_chunk}"
#             # 4. Calculate the "super score" by re-embedding on the fly.
#             # Note: FAISS's default IndexFlatL2 reports *squared* L2 distances,
#             # so we use the squared Euclidean distance here as well to keep the
#             # two scores comparable.
#             super_vector = np.array(self.embeddings.embed_query(super_chunk_text))
#             super_score = float(np.sum((q_vector - super_vector) ** 2))
#             output_text += f"\n#### 🎯 Match #{i+1}\n"
#             # 5. The logic test: does context improve the score?
#             # Remember: a LOWER score is BETTER (closer distance).
#             if super_score < core_score:
#                 # Case A: context helps (distance reduced)
#                 output_text += f"**✅ Context Added:** The surrounding text made the match stronger (score improved from {core_score:.3f} to {super_score:.3f}).\n\n"
#                 output_text += f"> {prev_chunk} **{doc.page_content}** {next_chunk}\n"
#             else:
#                 # Case B: context dilutes (distance increased or unchanged)
#                 output_text += f"**ℹ️ Context Ignored:** The surrounding text was irrelevant or noisy (score worsened from {core_score:.3f} to {super_score:.3f}). Showing the core match only.\n\n"
#                 output_text += f"> **{doc.page_content}**\n"
#             output_text += "---\n"
#         return output_text
#
# # Initialize the system
# system = VectorSystem()
#
# # --- Gradio UI ---
# with gr.Blocks(title="EduGenius Context Retriever") as demo:
#     gr.Markdown("# 📚 EduGenius: Intelligent Context Retriever")
#     gr.Markdown("Upload a chapter. This system intelligently decides if it needs to read the surrounding paragraphs to answer your question.")
#     with gr.Row():
#         with gr.Column(scale=1):
#             pdf_input = gr.File(label="1. Upload File (PDF or TXT)", file_types=[".pdf", ".txt"])
#             upload_btn = gr.Button("Process File", variant="primary")
#             upload_status = gr.Textbox(label="Status", interactive=False)
#         with gr.Column(scale=2):
#             question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
#             answer_input = gr.Textbox(label="Student Answer (Optional)", placeholder="e.g., The heat causes it...")
#             search_btn = gr.Button("Find Evidence", variant="secondary")
#             evidence_output = gr.Markdown(label="Relevant Text Chunks")
#     upload_btn.click(fn=system.process_file, inputs=[pdf_input], outputs=[upload_status])
#     search_btn.click(fn=system.retrieve_evidence, inputs=[question_input, answer_input], outputs=[evidence_output])
#
# if __name__ == "__main__":
#     demo.launch()