import os

import requests
import PyPDF2
import streamlit as st
from groq import Groq
# Note: on newer LangChain releases (>= 0.2) these classes live in
# langchain_community.vectorstores / langchain_community.embeddings instead.
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize Groq client
client = Groq(api_key=os.getenv("GROQ_API_KEY"))


# Function to extract text from a PDF hosted on Google Drive
def extract_text_from_pdf(pdf_url):
    # Convert a Google Drive shareable link to a direct download link
    direct_url = pdf_url.replace("/view?usp=sharing", "").replace("file/d/", "uc?id=")
    response = requests.get(direct_url)
    response.raise_for_status()

    with open("temp.pdf", "wb") as f:
        f.write(response.content)

    # Read the PDF content page by page
    with open("temp.pdf", "rb") as f:
        reader = PyPDF2.PdfReader(f)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for image-only pages
            text += page.extract_text() or ""

    os.remove("temp.pdf")
    return text


# Function to chunk text into fixed-size word windows
def chunk_text(text, chunk_size=300):
    words = text.split()
    chunks = []
    current_chunk = []
    for word in words:
        if len(current_chunk) < chunk_size:
            current_chunk.append(word)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


# Function to create embeddings and store them in FAISS using LangChain
def create_faiss_index(chunks):
    # HuggingFaceEmbeddings loads the sentence-transformers model internally
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    doc_search = FAISS.from_texts(chunks, embeddings)
    return doc_search


# Function to query FAISS and retrieve the most relevant document chunks
def query_faiss(doc_search, query):
    results = doc_search.similarity_search(query, k=3)
    return [result.page_content for result in results]


# Main Streamlit app
def main():
    st.title("RAG-based Application")
    st.write("Interact with your document using a Groq-powered model.")

    # Pre-defined document link
    doc_link = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"

    # Extract document content once per session
    if "document_text" not in st.session_state:
        st.write("Extracting document content...")
        text = extract_text_from_pdf(doc_link)
        st.session_state["document_text"] = text
        st.success("Document content extracted!")

    # Process the document and create the FAISS index
    if "document_text" in st.session_state and "faiss_index" not in st.session_state:
        st.write("Processing document...")
        chunks = chunk_text(st.session_state["document_text"])
        doc_search = create_faiss_index(chunks)
        st.session_state["faiss_index"] = doc_search
        st.session_state["chunks"] = chunks
        st.success(f"Document processed into {len(chunks)} chunks!")

    # Query the document
    if "faiss_index" in st.session_state:
        st.header("Ask Questions")
        query = st.text_input("Enter your question here")
        if st.button("Query Document"):
            results = query_faiss(st.session_state["faiss_index"], query)
            if not results:
                st.warning("No relevant context found in the document.")
            else:
                st.write("### Results from Document:")
                for i, result in enumerate(results):
                    st.write(f"**Result {i + 1}:** {result}")

                # Combine the retrieved chunks into context for the LLM
                context = "\n".join(results)
                st.write("### Insights based on Document Context:")
                prompt = (
                    f"The following context is from the document:\n\n"
                    f"{context}\n\n"
                    f"Based on this context, answer the question:\n"
                    f"{query}"
                )
                chat_completion = client.chat.completions.create(
                    messages=[{"role": "user", "content": prompt}],
                    model="llama-3.3-70b-versatile",
                )
                st.write(chat_completion.choices[0].message.content)


if __name__ == "__main__":
    main()