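"""Streamlit app: OCR-based document Q&A assistant for ACPC (Gujarat) admissions.

Uploads PDF/image files, extracts Gujarati and English text with Tesseract,
indexes the text in a local FAISS vector store, and answers questions with a
Groq-hosted Llama 3 model through a LangChain RetrievalQA chain.
"""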
import streamlit as st
import os
import base64
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain_groq import ChatGroq
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
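# System prerequisites (not installable via pip alone): pytesseract needs the
# Tesseract binary with the 'eng' and 'guj' language packs, and pdf2image needs
# Poppler. On Debian/Ubuntu the setup is roughly:
#   apt-get install tesseract-ocr tesseract-ocr-guj poppler-utils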
if 'pdf_ref' not in st.session_state:
    st.session_state.pdf_ref = None
# Initialize the Groq API key and the model.
# NOTE: supply the key via the GROQ_API_KEY environment variable (or
# st.secrets); never hardcode credentials in source.
os.environ.setdefault("GROQ_API_KEY", "<YOUR_GROQ_API_KEY>")

llm = ChatGroq(
    model='llama3-70b-8192',
    temperature=0.5,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)
# OCR helpers for image and PDF files
def ocr_image(image_path, language='eng+guj'):
    img = Image.open(image_path)
    text = pytesseract.image_to_string(img, lang=language)
    return text


def ocr_pdf(pdf_path, language='eng+guj'):
    # Render each PDF page to an image, then OCR page by page
    images = convert_from_path(pdf_path)
    all_text = ""
    for img in images:
        text = pytesseract.image_to_string(img, lang=language)
        all_text += text + "\n"
    return all_text
def ocr_file(file_path):
    file_extension = os.path.splitext(file_path)[1].lower()
    if file_extension == ".pdf":
        text_re = ocr_pdf(file_path, language='guj+eng')
    elif file_extension in [".jpg", ".jpeg", ".png", ".bmp"]:
        text_re = ocr_image(file_path, language='guj+eng')
    else:
        raise ValueError("Unsupported file format. Supported formats are PDF, JPG, JPEG, PNG, BMP.")
    return text_re
def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = text_splitter.split_text(text)
    return chunks
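# A quick sketch of the chunking behaviour (illustrative, not part of the app):
# with chunk_size=500 and chunk_overlap=100, a 1,200-character run of text with
# no natural separators yields roughly three chunks of up to 500 characters,
# each sharing about 100 characters with its neighbour, so sentences that
# straddle a boundary still appear intact in at least one chunk.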
# Function to create or update the vector store
def get_vector_store(text_chunks):
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    # Ensure the directory exists before saving the vector store
    os.makedirs("faiss_index", exist_ok=True)
    vector_store.save_local("faiss_index")
    return vector_store
# Function to process multiple files and build the vector store
def process_ocr_and_pdf_files(file_paths):
    raw_text = ""
    for file_path in file_paths:
        raw_text += ocr_file(file_path) + "\n"
    text_chunks = get_text_chunks(raw_text)
    return get_vector_store(text_chunks)
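# Illustrative usage (hypothetical file names). The persisted index can be
# reloaded later, but only with the *same* embedding model it was built with;
# allow_dangerous_deserialization is required because FAISS.load_local
# unpickles the docstore.
#
#   vector_store = process_ocr_and_pdf_files(["temp/notice.pdf", "temp/form.jpg"])
#   embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
#   reloaded = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
#   docs = reloaded.similarity_search("application deadline")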
# Conversational chain for Q&A
def get_conversational_chain():
    template = """Core Identity & Responsibilities
Role: Official AI Assistant for the Admission Committee for Professional Courses (ACPC), Gujarat
Mission: Process OCR-extracted text and provide clear, direct guidance on admissions and scholarships
Focus: Deliver user-friendly responses while handling OCR complexities internally

Processing Framework
1. Text & Document Processing
- Process OCR-extracted text from various document types, with attention to tables and structured data
- Internally identify and handle OCR errors without explicitly mentioning them unless critical
- Preserve tabular structures and relationships between data points
- Present information in clean, readable formats regardless of source OCR quality

2. Language Handling
- Support seamless communication in both Gujarati and English
- Respond in the same language as the user's query
- Present technical terms in both languages when relevant
- Adjust language complexity to the user's comprehension level

3. Response Principles
- Provide direct, concise answers (2-3 sentences for simple queries)
- Skip unnecessary OCR quality disclaimers unless information is critically ambiguous
- Present information in user-friendly formats, especially for tables and numerical data
- Maintain a professional yet conversational tone

Query Handling Strategies
1. Direct Information Queries
- Provide straightforward answers without mentioning OCR processing
- Example:
  User: "What is the last date for application submission?"
  Response: "The last date for application submission is June 15, 2025."
  (NOT: "Based on the OCR-processed text, the last date appears to be...")

2. Table Data Extraction
- Present tabular information in a clean, structured format
- Preserve relationships between data points
- Example:
  User: "What are the fees for different courses?"
  Response:
  "The fees for various courses are:
  B.Tech: ₹1,15,000 (General), ₹58,000 (SC/ST)
  B.Pharm: ₹85,000 (General), ₹42,500 (SC/ST)"
  (NOT: "According to the OCR-extracted table, which may have quality issues...")

3. Ambiguous Information Handling
If OCR quality affects critical information (like dates, amounts, eligibility):
- Provide the most likely correct information
- Add a brief note suggesting verification only for critical information
- Example: "The application deadline is June 15, 2025. For this important deadline, we recommend confirming on the official ACPC website."

4. Uncertain Information Protocol
For critically unclear OCR content:
- State the most probable information
- Add a simple verification suggestion without mentioning OCR
- Example: "Based on the available information, the income limit appears to be ₹6,00,000. For this critical criterion, please verify on the official ACPC portal."

5. Structured Document Navigation
- Present information in the same logical structure as the original document
- Use headings and bullet points for clarity when appropriate
- Maintain document hierarchies when explaining multi-step processes

6. Out-of-Scope Queries
- Politely redirect without mentioning document or OCR limitations
- Example: "This query is outside the scope of ACPC admission guidelines. For information about [topic], please contact [appropriate authority]."

7. Key Information Emphasis
- Highlight critical information like deadlines, eligibility criteria, and document requirements
- Make important numerical data visually distinct
- Prioritize accuracy for dates, amounts, and eligibility requirements

8. Multi-Part Query Handling
- Address each component of multi-part queries separately
- Maintain logical flow between related pieces of information
- Preserve context when explaining complex processes

9. Completeness Guidelines
- Ensure responses cover all aspects of the user's query
- Provide step-by-step guidance for procedural questions
- Include relevant related information that users might need

10. Response Quality Control
- Internally verify numerical data consistency
- Apply contextual understanding to identify potential OCR errors without mentioning them
- Present information with confidence unless critically uncertain
- Focus on delivering actionable information rather than discussing document limitations

Input:
OCR-processed text from uploaded documents: {context}
Chat History: {history}
Current Question: {question}

Output:
Give a clear, direct, and user-friendly response that focuses on the information itself rather than its OCR source. Present information confidently, mentioning verification only for critically important or potentially ambiguous details.
"""
    # Load the index with the same embedding model that built it; a different
    # model would make the similarity scores meaningless.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )
    new_vector_store = FAISS.load_local(
        "faiss_index", embeddings, allow_dangerous_deserialization=True
    )
    QA_CHAIN_PROMPT = PromptTemplate(input_variables=["history", "context", "question"], template=template)
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=new_vector_store.as_retriever(),
        chain_type='stuff',
        verbose=True,
        chain_type_kwargs={
            "verbose": True,
            "prompt": QA_CHAIN_PROMPT,
            # NOTE: the memory is created per call, so it only spans the
            # lifetime of this chain instance; cross-question history is
            # tracked separately in st.session_state.
            "memory": ConversationBufferMemory(memory_key="history", input_key="question"),
        },
    )
    return qa_chain
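# Minimal invocation sketch (the chain retrieves its own context from FAISS),
# mirroring how user_input() below calls it:
#
#   qa = get_conversational_chain()
#   answer = qa({"query": "What is the last date for application submission?"},
#               return_only_outputs=True)["result"]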
def handle_uploaded_file(uploaded_file, show_in_sidebar=False):
    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
    file_path = os.path.join("temp", uploaded_file.name)
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    # Optionally show the document in the sidebar
    if show_in_sidebar:
        st.sidebar.write(f"### File: {uploaded_file.name}")
        if file_extension == ".pdf":
            # Embed the PDF in the sidebar as a base64-encoded iframe
            with open(file_path, "rb") as pdf_file:
                pdf_data = pdf_file.read()
            pdf_base64 = base64.b64encode(pdf_data).decode('utf-8')
            st.sidebar.markdown(
                f'<iframe src="data:application/pdf;base64,{pdf_base64}" width="500" height="500"></iframe>',
                unsafe_allow_html=True,
            )
        elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
            img = Image.open(file_path)
            st.sidebar.image(img, caption=f"Uploaded Image: {uploaded_file.name}", use_container_width=True)
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            st.sidebar.text_area("File Content", content, height=300)
def user_input(user_question):
    # RetrievalQA performs its own retrieval through the attached retriever,
    # so only the query itself needs to be passed.
    chain = get_conversational_chain()
    response = chain({"query": user_question}, return_only_outputs=True)
    result = response.get("result", "No result found")
    # Save the question and answer to session state for history tracking
    if 'conversation_history' not in st.session_state:
        st.session_state.conversation_history = []
    st.session_state.conversation_history.append({'question': user_question, 'answer': result})
    return result
# Streamlit app to upload files and interact with the Q&A system
def main():
    st.title("File Upload and OCR Processing")
    st.write("Upload up to 5 files (PDF, JPG, JPEG, PNG, BMP)")
    uploaded_files = st.file_uploader("Choose files", type=["pdf", "jpg", "jpeg", "png", "bmp"], accept_multiple_files=True)
    if uploaded_files:
        file_paths = []
        # Save uploaded files locally before processing
        for uploaded_file in uploaded_files[:5]:  # Limit to 5 files
            file_path = os.path.join("temp", uploaded_file.name)
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            with open(file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            file_paths.append(file_path)
        # Run OCR on the files and build the vector store
        st.write("Processing files...")
        vector_store = process_ocr_and_pdf_files(file_paths)
        st.write("Processing completed! The vector store has been updated.")
    show_in_sidebar = st.sidebar.checkbox("Show files in Sidebar", value=True)
    if uploaded_files:
        # Display each uploaded file in its native format
        for uploaded_file in uploaded_files:
            handle_uploaded_file(uploaded_file, show_in_sidebar)

    # Ask the user for a question about the uploaded documents
    user_question = st.text_input("Ask a question related to the uploaded documents:")
    if user_question:
        response = user_input(user_question)
        st.write("Answer:", response)
    with st.expander('Conversation History'):
        # Use .get() so the expander renders before any question has been asked
        for entry in st.session_state.get('conversation_history', []):
            st.info(f"Q: {entry['question']}\nA: {entry['answer']}")
if __name__ == "__main__":
    main()