File size: 4,325 Bytes
05b86d4
d386915
636755b
e7ac282
636755b
51163d3
 
 
636755b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e7ac282
636755b
 
 
 
daa4a6a
644455e
51163d3
636755b
51163d3
 
636755b
 
 
51163d3
 
 
636755b
 
51163d3
636755b
 
 
 
 
51163d3
636755b
51163d3
 
 
 
 
 
 
 
 
 
 
 
636755b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51163d3
 
636755b
 
 
 
 
 
 
 
51163d3
e7ac282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51163d3
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import io
import os

import PyPDF2
import requests
import streamlit as st
from groq import Groq
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer

# Initialize Groq client for chat completions (used in main()).
# Reads the API key from the GROQ_API_KEY environment variable; os.getenv
# returns None when unset, so requests will fail to authenticate at call time.
# NOTE(review): consider failing fast here with a clear error if the key is missing.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_url):
    """Download a PDF from a Google Drive share link and return its text.

    Parameters
    ----------
    pdf_url : str
        Google Drive shareable link of the form
        ``https://drive.google.com/file/d/<ID>/view?usp=sharing``.

    Returns
    -------
    str
        Concatenated text of all pages (pages with no extractable text
        contribute an empty string).

    Raises
    ------
    requests.HTTPError
        If the download does not return a successful status code.
    """
    # Convert Google Drive shareable link to a direct download link
    direct_url = pdf_url.replace("/view?usp=sharing", "").replace("file/d/", "uc?id=")
    response = requests.get(direct_url, timeout=60)
    # Fail loudly on HTTP errors instead of feeding an HTML error page to the
    # PDF parser (the previous version ignored the status code entirely).
    response.raise_for_status()

    # Parse the PDF in memory. The previous temp-file round trip leaked
    # "temp.pdf" on any exception and raced concurrent runs sharing the cwd.
    reader = PyPDF2.PdfReader(io.BytesIO(response.content))
    # extract_text() can return None for image-only pages; guard with "or ''".
    return "".join(page.extract_text() or "" for page in reader.pages)

# Function to chunk text manually
def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* whitespace-delimited words.

    Parameters
    ----------
    text : str
        Text to split; runs of whitespace collapse to single spaces.
    chunk_size : int, optional
        Maximum number of words per chunk (default 300). Must be positive.

    Returns
    -------
    list[str]
        Space-joined word chunks; an empty list for empty/whitespace-only text.

    Raises
    ------
    ValueError
        If *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    words = text.split()
    # Slice the word list directly: each slice of chunk_size words is one
    # chunk. (The old accumulator loop's `len(word.split())` was always 1,
    # since the words were already whitespace-split.)
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]

# Function to create embeddings and store them in FAISS using Langchain
def create_faiss_index(chunks):
    """Embed text chunks and index them in a FAISS vector store.

    Parameters
    ----------
    chunks : list[str]
        Text chunks to embed and index.

    Returns
    -------
    FAISS
        A Langchain FAISS vector store built over *chunks*.
    """
    # HuggingFaceEmbeddings loads the sentence-transformers model itself; the
    # previous direct SentenceTransformer("all-MiniLM-L6-v2") call loaded the
    # same model a second time and its result was never used.
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    # Create FAISS vector store
    doc_search = FAISS.from_texts(chunks, embeddings)
    return doc_search

# Function to query FAISS and retrieve relevant document chunks
def query_faiss(doc_search, query):
    """Return the text of the three chunks most similar to *query*.

    Parameters
    ----------
    doc_search : FAISS
        Vector store to search.
    query : str
        Natural-language query text.

    Returns
    -------
    list[str]
        ``page_content`` of the top-3 matching documents, best first.
    """
    top_docs = doc_search.similarity_search(query, k=3)
    return [doc.page_content for doc in top_docs]

# Main Streamlit App
def main():
    """Streamlit entry point: download the document, build the FAISS index,
    then answer user questions grounded in the retrieved chunks via Groq.

    All expensive state (document text, index, chunks) is cached in
    ``st.session_state`` so Streamlit's script reruns don't repeat the work.
    """
    st.title("RAG-based Application")
    st.write("Interact with your document using Groq-powered model.")

    # Pre-defined document link (hard-coded Google Drive share URL).
    doc_link = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"

    # Extract Document Content — only on the first run of the session.
    if "document_text" not in st.session_state:
        st.write("Extracting document content...")
        text = extract_text_from_pdf(doc_link)
        st.session_state['document_text'] = text
        st.success("Document content extracted!")

    # Process Document and Create FAISS Index — runs once, after extraction.
    if 'document_text' in st.session_state and "faiss_index" not in st.session_state:
        st.write("Processing document...")
        chunks = chunk_text(st.session_state['document_text'])
        doc_search = create_faiss_index(chunks)
        st.session_state['faiss_index'] = doc_search
        st.session_state['chunks'] = chunks
        st.success(f"Document processed into {len(chunks)} chunks!")

    # Query the Document — available once the index exists.
    if 'faiss_index' in st.session_state:
        st.header("Ask Questions")
        query = st.text_input("Enter your question here")
        if st.button("Query Document"):
            # Retrieve the top-3 most similar chunks for the question.
            results = query_faiss(st.session_state['faiss_index'], query)
            if not results:
                st.warning("No relevant context found in the document.")
            else:
                st.write("### Results from Document:")
                for i, result in enumerate(results):
                    st.write(f"**Result {i+1}:** {result}")

                # Combine results to provide context for the LLM prompt.
                context = "\n".join(results)
                st.write("### Insights based on Document Context:")
                prompt = (
                    f"The following context is from the document:\n\n"
                    f"{context}\n\n"
                    f"Based on this context, answer the question:\n"
                    f"{query}"
                )

                # Single-turn completion against the module-level Groq client.
                chat_completion = client.chat.completions.create(
                    messages=[{"role": "user", "content": prompt}],
                    model="llama-3.3-70b-versatile",
                )
                st.write(chat_completion.choices[0].message.content)

# Run the app only when executed as a script (not when imported).
if __name__ == "__main__":
    main()