Spaces:

RakeshUtekar
/

RAG-based-PDF-Query-System

Runtime error

App Files Files Community

RAG-based-PDF-Query-System / app.py

RakeshUtekar

Update app.py

85c57d3 verified 11 months ago

raw

history blame contribute delete

6.39 kB

	import os
	import tempfile
	import time

	import streamlit as st
	from dotenv import load_dotenv

	from extract import extract_text_from_pdfs
	from generate import generate_response
	from preprocess import preprocess_text
	from retrieve import create_vectorizer, retrieve

	# Load environment variables from a .env file (if needed).
	load_dotenv()

	# Streamlit re-executes this script on every interaction, so anything that must
	# survive a rerun is kept in st.session_state. Seed each key with its own empty
	# list the first time the session runs; existing values are left untouched.
	for state_key in ("messages", "pdf_files", "processed_texts"):
	    if state_key not in st.session_state:
	        st.session_state[state_key] = []

	# ---------------------------------------------------------------------------
	# Page flow: upload PDFs -> build the retrieval index once per upload set ->
	# answer chat questions against that index. Streamlit reruns this whole script
	# on every interaction, so the index and the chat history live in
	# st.session_state and are rebuilt only when the uploaded set changes.
	# ---------------------------------------------------------------------------
	st.title("RAG-based PDF Query System")

	# File uploader for PDF files
	uploaded_files = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)

	# Nothing to index or query until at least one PDF has been uploaded.
	if not uploaded_files:
	    st.stop()

	# Re-index only when the set of uploaded files differs from the one already indexed.
	needs_indexing = (
	    "uploaded_files" not in st.session_state
	    or uploaded_files != st.session_state.uploaded_files
	)

	if needs_indexing:
	    with st.status("Processing the uploaded PDFs...", state="running") as status:
	        num_files = len(uploaded_files)
	        texts = []
	        pdf_names = []

	        # extract_text_from_pdfs takes file paths, so each upload is written to disk
	        # first. A temporary directory keeps the working directory clean and is
	        # removed automatically, even if extraction raises.
	        with tempfile.TemporaryDirectory() as tmp_dir:
	            for i, uploaded_file in enumerate(uploaded_files):
	                st.write(f"Extracting text from file {i + 1} of {num_files}...")
	                # basename() drops any directory part of the client-supplied name;
	                # the index prefix keeps two uploads with the same name apart.
	                safe_name = f"{i}_{os.path.basename(uploaded_file.name)}"
	                pdf_path = os.path.join(tmp_dir, safe_name)
	                with open(pdf_path, "wb") as f:
	                    f.write(uploaded_file.getbuffer())
	                texts.extend(extract_text_from_pdfs([pdf_path]))
	                pdf_names.append(uploaded_file.name)

	        # Preprocess text
	        st.write("Preprocessing text...")
	        processed_texts = preprocess_text(texts)

	        # Create vectorizer and transform texts
	        st.write("Creating vectorizer and transforming texts...")
	        vectorizer, X = create_vectorizer(processed_texts)

	        status.update(label="Processing complete!", state="complete")

	    # Commit to session state only after every step succeeded. If anything above
	    # raised, "uploaded_files" is unchanged and the next rerun retries indexing
	    # instead of reaching the chat with a missing vectorizer / matrix.
	    st.session_state.messages = []  # a new document set starts a new conversation
	    st.session_state.pdf_files = pdf_names
	    st.session_state.processed_texts = processed_texts
	    st.session_state.vectorizer = vectorizer
	    st.session_state.X = X
	    st.session_state.uploaded_files = uploaded_files

	# Chat interface
	st.write("### Ask a question about the uploaded PDFs")

	# Replay the conversation so far (history is kept across reruns in session state).
	for message in st.session_state.messages:
	    with st.chat_message(message["role"]):
	        st.write(message["content"])

	# Chat input
	prompt = st.chat_input("Ask something about the uploaded PDFs")
	if prompt:
	    # Record and show the question immediately, before the slower generation step.
	    st.session_state.messages.append({"role": "user", "content": prompt})
	    with st.chat_message("user"):
	        st.write(prompt)

	    # Retrieve the most relevant processed texts for this question.
	    top_indices = retrieve(prompt, st.session_state.X, st.session_state.vectorizer)
	    # Each processed text is joined with spaces before being passed on
	    # (presumably a token list per document -- confirm against preprocess_text).
	    retrieved_texts = [" ".join(st.session_state.processed_texts[i]) for i in top_indices]

	    # Generate the answer from the retrieved context (model choice lives in generate.py).
	    response = generate_response(retrieved_texts, prompt)
	    st.session_state.messages.append({"role": "assistant", "content": response})

	    # Display assistant message
	    with st.chat_message("assistant"):
	        st.write(response)