File size: 4,325 Bytes
05b86d4
d386915
636755b
e7ac282
636755b
51163d3
 
 
636755b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e7ac282
636755b
 
 
 
daa4a6a
644455e
51163d3
636755b
51163d3
 
636755b
 
 
51163d3
 
 
636755b
 
51163d3
636755b
 
 
 
 
51163d3
636755b
51163d3
 
 
 
 
 
 
 
 
 
 
 
636755b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51163d3
 
636755b
 
 
 
 
 
 
 
51163d3
e7ac282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51163d3
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import io
import os

import PyPDF2
import requests
import streamlit as st
from groq import Groq
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer

# Initialize Groq client for chat completions (used in main()).
# Reads the API key from the GROQ_API_KEY environment variable; os.getenv
# returns None when unset, so requests will fail to authenticate at call time.
# NOTE(review): consider failing fast here with a clear error if the key is missing.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_url):
    """Download a PDF from a Google Drive share link and return its text.

    Parameters
    ----------
    pdf_url : str
        Google Drive shareable link of the form
        ``https://drive.google.com/file/d/<ID>/view?usp=sharing``.

    Returns
    -------
    str
        Concatenated text of all pages (pages with no extractable text
        contribute an empty string).

    Raises
    ------
    requests.HTTPError
        If the download does not return a successful status code.
    """
    # Convert Google Drive shareable link to a direct download link
    direct_url = pdf_url.replace("/view?usp=sharing", "").replace("file/d/", "uc?id=")
    response = requests.get(direct_url, timeout=60)
    # Fail loudly on HTTP errors instead of feeding an HTML error page to the
    # PDF parser (the previous version ignored the status code entirely).
    response.raise_for_status()

    # Parse the PDF in memory. The previous temp-file round trip leaked
    # "temp.pdf" on any exception and raced concurrent runs sharing the cwd.
    reader = PyPDF2.PdfReader(io.BytesIO(response.content))
    # extract_text() can return None for image-only pages; guard with "or ''".
    return "".join(page.extract_text() or "" for page in reader.pages)

# Function to chunk text manually
def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* whitespace-delimited words.

    Parameters
    ----------
    text : str
        Text to split; runs of whitespace collapse to single spaces.
    chunk_size : int, optional
        Maximum number of words per chunk (default 300). Must be positive.

    Returns
    -------
    list[str]
        Space-joined word chunks; an empty list for empty/whitespace-only text.

    Raises
    ------
    ValueError
        If *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    words = text.split()
    # Slice the word list directly: each slice of chunk_size words is one
    # chunk. (The old accumulator loop's `len(word.split())` was always 1,
    # since the words were already whitespace-split.)
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]

# Function to create embeddings and store them in FAISS using Langchain
def create_faiss_index(chunks):
    """Embed text chunks and index them in a FAISS vector store.

    Parameters
    ----------
    chunks : list[str]
        Text chunks to embed and index.

    Returns
    -------
    FAISS
        A Langchain FAISS vector store built over *chunks*.
    """
    # HuggingFaceEmbeddings loads the sentence-transformers model itself; the
    # previous direct SentenceTransformer("all-MiniLM-L6-v2") call loaded the
    # same model a second time and its result was never used.
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    # Create FAISS vector store
    doc_search = FAISS.from_texts(chunks, embeddings)
    return doc_search

# Function to query FAISS and retrieve relevant document chunks
def query_faiss(doc_search, query):
    """Return the text of the three chunks most similar to *query*.

    Parameters
    ----------
    doc_search : FAISS
        Vector store to search.
    query : str
        Natural-language query text.

    Returns
    -------
    list[str]
        ``page_content`` of the top-3 matching documents, best first.
    """
    top_docs = doc_search.similarity_search(query, k=3)
    return [doc.page_content for doc in top_docs]

# Main Streamlit App
def main():
    """Streamlit entry point: download the document, build the FAISS index,
    then answer user questions grounded in the retrieved chunks via Groq.

    All expensive state (document text, index, chunks) is cached in
    ``st.session_state`` so Streamlit's script reruns don't repeat the work.
    """
    st.title("RAG-based Application")
    st.write("Interact with your document using Groq-powered model.")

    # Pre-defined document link (hard-coded Google Drive share URL).
    doc_link = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"

    # Extract Document Content — only on the first run of the session.
    if "document_text" not in st.session_state:
        st.write("Extracting document content...")
        text = extract_text_from_pdf(doc_link)
        st.session_state['document_text'] = text
        st.success("Document content extracted!")

    # Process Document and Create FAISS Index — runs once, after extraction.
    if 'document_text' in st.session_state and "faiss_index" not in st.session_state:
        st.write("Processing document...")
        chunks = chunk_text(st.session_state['document_text'])
        doc_search = create_faiss_index(chunks)
        st.session_state['faiss_index'] = doc_search
        st.session_state['chunks'] = chunks
        st.success(f"Document processed into {len(chunks)} chunks!")

    # Query the Document — available once the index exists.
    if 'faiss_index' in st.session_state:
        st.header("Ask Questions")
        query = st.text_input("Enter your question here")
        if st.button("Query Document"):
            # Retrieve the top-3 most similar chunks for the question.
            results = query_faiss(st.session_state['faiss_index'], query)
            if not results:
                st.warning("No relevant context found in the document.")
            else:
                st.write("### Results from Document:")
                for i, result in enumerate(results):
                    st.write(f"**Result {i+1}:** {result}")

                # Combine results to provide context for the LLM prompt.
                context = "\n".join(results)
                st.write("### Insights based on Document Context:")
                prompt = (
                    f"The following context is from the document:\n\n"
                    f"{context}\n\n"
                    f"Based on this context, answer the question:\n"
                    f"{query}"
                )

                # Single-turn completion against the module-level Groq client.
                chat_completion = client.chat.completions.create(
                    messages=[{"role": "user", "content": prompt}],
                    model="llama-3.3-70b-versatile",
                )
                st.write(chat_completion.choices[0].message.content)

# Run the app only when executed as a script (not when imported).
if __name__ == "__main__":
    main()