NHZ committed
Commit 05b86d4 · verified · 1 parent: c69c25e

Update app.py

Files changed (1): app.py (+45 -63)
app.py CHANGED
@@ -1,58 +1,45 @@
+import os
 import requests
 import numpy as np
 import faiss
 from PyPDF2 import PdfReader
-from transformers import AutoTokenizer, AutoModel
-from groq import Groq
+from sentence_transformers import SentenceTransformer
+from langchain.vectorstores import FAISS
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.chains import RetrievalQA
+from langchain.prompts import PromptTemplate
+from langchain.llms import GroqLLM
 import streamlit as st
-import torch
-import os
 
-# Initialize Groq client using secret API key
-client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+# Initialize Groq API LLM
+llm = GroqLLM(api_key=os.getenv("GROQ_API_KEY"))
 
-# Function to download and extract content from a public Google Drive PDF link
+# Function to extract content from a public Google Drive PDF link
 def extract_pdf_content(drive_url):
-    # Extract file ID from the Google Drive URL
     file_id = drive_url.split("/d/")[1].split("/view")[0]
     download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
-
-    # Download the PDF content
     response = requests.get(download_url)
     if response.status_code != 200:
         return None
 
-    # Save and extract text from the PDF
     with open("document.pdf", "wb") as f:
         f.write(response.content)
+
     reader = PdfReader("document.pdf")
     text = ""
     for page in reader.pages:
         text += page.extract_text()
     return text
 
-# Function to chunk and tokenize text
-def chunk_and_tokenize(text, tokenizer, chunk_size=512):
-    tokens = tokenizer.encode(text, add_special_tokens=False)
-    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
-    return chunks
-
-# Function to compute embeddings and build FAISS index
-def build_faiss_index(chunks, model):
-    embeddings = []
-    for chunk in chunks:
-        input_ids = torch.tensor([chunk])
-        with torch.no_grad():
-            embedding = model(input_ids).last_hidden_state.mean(dim=1).detach().numpy()
-        embeddings.append(embedding)
-    embeddings = np.vstack(embeddings)
-
-    index = faiss.IndexFlatL2(embeddings.shape[1])
-    index.add(embeddings)
-    return index
+# Function to create a FAISS vector store from the document content
+def create_vector_store(text):
+    sentences = text.split(". ")
+    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    vector_store = FAISS.from_texts(sentences, embedding=embeddings)
+    return vector_store, sentences
 
 # Streamlit app
-st.title("RAG-based Application with Groq API")
+st.title("RAG-based Application with Focused Context")
 
 # Predefined Google Drive link
 drive_url = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
@@ -62,45 +49,40 @@ st.write("Extracting content from the document...")
 text = extract_pdf_content(drive_url)
 if text:
     st.write("Document extracted successfully!")
 
-    # Initialize tokenizer and model
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-    model = AutoModel.from_pretrained("bert-base-uncased")
-
-    st.write("Chunking and tokenizing content...")
-    chunks = chunk_and_tokenize(text, tokenizer)
+    st.write("Creating vector store...")
+    vector_store, sentences = create_vector_store(text)
 
-    st.write("Building FAISS index...")
-    index = build_faiss_index(chunks, model)
+    st.write("Vector store created successfully!")
 
-    # Query input
    query = st.text_input("Enter your query:")
    if query:
-        st.write("Searching for the most relevant chunk...")
-        query_tokens = tokenizer.encode(query, add_special_tokens=False)
-        query_embedding = (
-            model(torch.tensor([query_tokens]))
-            .last_hidden_state.mean(dim=1)
-            .detach().numpy()
+        st.write("Retrieving relevant context from the document...")
+        retriever = vector_store.as_retriever()
+        retriever.search_kwargs["k"] = 3  # Retrieve top 3 matches
+
+        # Define a prompt template to guide LLM response generation
+        prompt_template = PromptTemplate(
+            template="""
+            Use the following context to answer the question:
+
+            {context}
+
+            Question: {question}
+            Answer:""",
+            input_variables=["context", "question"]
         )
-        _, indices = index.search(query_embedding, k=1)
-
-        # Retrieve the most relevant chunk
-        relevant_chunk = chunks[indices[0][0]]
-        relevant_text = tokenizer.decode(relevant_chunk)
-        st.write("Relevant chunk found:", relevant_text)
-
-        # Interact with Groq API
-        st.write("Querying the Groq API...")
-        chat_completion = client.chat.completions.create(
-            messages=[
-                {
-                    "role": "user",
-                    "content": relevant_text,
-                }
-            ],
-            model="llama-3.3-70b-versatile",
+
+        # Create a RetrievalQA chain
+        qa_chain = RetrievalQA(
+            retriever=retriever,
+            llm=llm,
+            prompt=prompt_template
        )
-        st.write("Model Response:", chat_completion.choices[0].message.content)
+
+        # Run the query through the QA chain
+        result = qa_chain.run(query)
+        st.write("Answer:", result)
 else:
     st.error("Failed to extract content from the document.")
+
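Review note on the new code path: as committed, app.py is unlikely to run. LangChain does not provide a GroqLLM class under langchain.llms (its Groq integration is ChatGroq from the separate langchain-groq package), RetrievalQA is built through RetrievalQA.from_chain_type(...) rather than by passing retriever/llm/prompt to the constructor directly, and the SentenceTransformer import is never used. Below is a minimal sketch of a working equivalent, assuming a LangChain version whose community integrations live under langchain_community; the embedding model, top-3 retrieval, prompt wording, and llama-3.3-70b-versatile are carried over from the diff, while build_qa_chain is a hypothetical helper, not part of the commit.

# Sketch of a corrected pipeline, not the committed code. Assumed installs:
# langchain, langchain-community, langchain-groq, sentence-transformers,
# faiss-cpu. build_qa_chain is a hypothetical helper used for illustration.
import os

from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq  # the published Groq integration for LangChain

# ChatGroq stands in for the nonexistent langchain.llms.GroqLLM
llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    groq_api_key=os.getenv("GROQ_API_KEY"),
)

def build_qa_chain(text: str) -> RetrievalQA:
    # Same naive sentence-level chunking as the commit; a dedicated text
    # splitter would hold up better on long PDFs.
    sentences = [s for s in text.split(". ") if s.strip()]
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    vector_store = FAISS.from_texts(sentences, embedding=embeddings)
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})  # top 3 matches

    prompt = PromptTemplate(
        template=(
            "Use the following context to answer the question:\n\n"
            "{context}\n\n"
            "Question: {question}\n"
            "Answer:"
        ),
        input_variables=["context", "question"],
    )
    # RetrievalQA is constructed via from_chain_type, not RetrievalQA(...)
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # concatenate retrieved chunks into one prompt
        retriever=retriever,
        chain_type_kwargs={"prompt": prompt},
    )

In the Streamlit flow, build_qa_chain(text) would replace the create_vector_store and RetrievalQA steps, and qa_chain.invoke({"query": query})["result"] yields the answer string. One caveat that survives unchanged in both versions of the file: drive.google.com/uc?export=download returns an HTML confirmation page instead of the file for large PDFs, so extract_pdf_content may silently save non-PDF bytes; checking the response Content-Type before writing would harden it.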