Avinashstat committed
Commit dda2db4 · verified · 1 Parent(s): d67a424

Update app.py

Files changed (1):
  1. app.py +305 -187

app.py CHANGED
@@ -1,214 +1,332 @@
- import io
  import numpy as np
- import streamlit as st
- from pypdf import PdfReader
-
- from sentence_transformers import SentenceTransformer
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-
-
- # -------------------- Config -------------------- #
-
- EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
- # LLM_MODEL_NAME = "google/gemma-2b-it"  # you can change this later
- LLM_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-
-
- # -------------------- Model loaders (cached) -------------------- #
-
- @st.cache_resource(show_spinner=True)
- def load_embedder():
-     return SentenceTransformer(EMBEDDING_MODEL_NAME)
-
-
- @st.cache_resource(show_spinner=True)
- def load_llm_pipeline():
-     """
-     Load a text-generation pipeline for the LLM.
-     Using device_map="auto" will use GPU if available.
-     """
-     tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
-     model = AutoModelForCausalLM.from_pretrained(
-         LLM_MODEL_NAME,
-         device_map="auto",
-     )
-     gen_pipe = pipeline(
-         "text-generation",
-         model=model,
-         tokenizer=tokenizer,
-         max_new_tokens=512,
-         do_sample=False,
-         temperature=0.1,
-         top_p=0.9,
-     )
-     return gen_pipe
-
-
- # -------------------- Helpers -------------------- #
-
- def extract_text_from_pdf(file) -> str:
-     """Extract all text from an uploaded PDF file."""
-     pdf_reader = PdfReader(file)
-     all_text = []
-     for page in pdf_reader.pages:
-         text = page.extract_text()
-         if text:
-             all_text.append(text)
-     return "\n".join(all_text)
-
-
- def chunk_text(text, chunk_size=800, overlap=200):
-     """Split long text into overlapping chunks (by words)."""
-     words = text.split()
      chunks = []
-     start = 0
-     while start < len(words):
-         end = start + chunk_size
-         chunk = " ".join(words[start:end])
-         chunks.append(chunk)
-         start += chunk_size - overlap
      return chunks
-
-
- def embed_texts(texts, embedder: SentenceTransformer):
-     """Get embeddings for a list of texts."""
-     if not texts:
-         return np.array([])
-     embeddings = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=False)
-     return embeddings.astype("float32")
-
-
- def cosine_sim_matrix(matrix, vector):
-     """Cosine similarity between each row in matrix and a single vector."""
-     if matrix.size == 0:
-         return np.array([])
-     matrix_norm = matrix / (np.linalg.norm(matrix, axis=1, keepdims=True) + 1e-10)
-     vector_norm = vector / (np.linalg.norm(vector) + 1e-10)
-     return np.dot(matrix_norm, vector_norm)
-
-
- def retrieve_relevant_chunks(question, chunks, chunk_embeddings, embedder, top_k=4):
-     """Find top_k most relevant chunks for the question."""
-     if len(chunks) == 0:
-         return []
-
-     q_emb = embed_texts([question], embedder)[0]
-     sims = cosine_sim_matrix(chunk_embeddings, q_emb)
-     top_idx = np.argsort(sims)[::-1][:top_k]
-     return [chunks[i] for i in top_idx]
-
-
- def build_prompt(question, context_chunks):
-     context = "\n\n---\n\n".join(context_chunks)
-     system_instruction = (
-         "You are a helpful assistant that answers questions "
-         "using ONLY the information provided in the document context.\n"
-         "If the answer is not in the context, say that you cannot find it in the document."
-     )
-
-     prompt = (
-         f"{system_instruction}\n\n"
-         f"Document context:\n{context}\n\n"
-         f"Question: {question}\n\n"
-         f"Answer:"
-     )
-     return prompt
-
-
- def answer_question(question, chunks, llm_pipe):
-     """Call the LLM with the question + retrieved context."""
-     prompt = build_prompt(question, chunks)
-
-     # For most HF instruction models, a plain prompt works OK.
-     outputs = llm_pipe(
-         prompt,
-         num_return_sequences=1,
-         truncation=True,
      )
-     text = outputs[0]["generated_text"]
-
-     # Try to remove the prompt part if the model echoes it
-     if prompt in text:
-         text = text.split(prompt, 1)[-1].strip()
-
-     return text.strip()
-
-
- # -------------------- Streamlit UI -------------------- #
-
- st.set_page_config(page_title="Chat with your PDF (HuggingFace)", layout="wide")
-
- st.title("📄 Chat with your PDF (HuggingFace RAG)")
-
- st.markdown(
-     """
-     Upload a PDF, let the app index it, and then ask questions.
-     The model will answer based only on the document content (RAG).
-     """
- )
-
- with st.sidebar:
-     st.header("1. Upload and process PDF")
-     uploaded_pdf = st.file_uploader("Choose a PDF file", type=["pdf"])
-     process_button = st.button("Process Document")
-
- # Session state to keep doc data
- if "chunks" not in st.session_state:
-     st.session_state.chunks = []
-     st.session_state.embeddings = None
-
- # Load models (lazy)
- with st.spinner("Loading models (first time only)..."):
-     embedder = load_embedder()
-     llm_pipe = load_llm_pipeline()
-
- # Step 1: Process PDF
- if process_button:
-     if uploaded_pdf is None:
-         st.sidebar.error("Please upload a PDF first.")
-     else:
-         with st.spinner("Reading and indexing your PDF..."):
-             pdf_bytes = io.BytesIO(uploaded_pdf.read())
-             text = extract_text_from_pdf(pdf_bytes)
-
-             if not text.strip():
-                 st.error("Could not extract any text from this PDF.")
              else:
-                 chunks = chunk_text(text)
-                 embeddings = embed_texts(chunks, embedder)
-
-                 st.session_state.chunks = chunks
-                 st.session_state.embeddings = embeddings
-
-                 st.success(f"Done! Indexed {len(chunks)} chunks from the PDF.")
-
- # Step 2: Ask questions
- st.header("2. Ask questions about your document")
-
- question = st.text_input("Type your question here")
-
- if st.button("Get answer"):
-     if not st.session_state.chunks:
-         st.error("Please upload and process a PDF first.")
-     elif not question.strip():
-         st.error("Please type a question.")
-     else:
-         with st.spinner("Thinking with your document..."):
-             relevant_chunks = retrieve_relevant_chunks(
-                 question,
-                 st.session_state.chunks,
-                 st.session_state.embeddings,
-                 embedder,
-                 top_k=4,
-             )
-             answer = answer_question(question, relevant_chunks, llm_pipe)
-
-             st.subheader("Answer")
-             st.write(answer)
-
-             with st.expander("Show relevant excerpts from the PDF"):
-                 for i, ch in enumerate(relevant_chunks, start=1):
-                     st.markdown(f"**Chunk {i}:**")
-                     st.write(ch)
-                     st.markdown("---")
 
+ import os
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'   # Suppress TF logging
+ os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Disable GPU
+ import urllib.request
+ import fitz  # PyMuPDF
+ import re
  import numpy as np
+ import tensorflow_hub as hub
+ import openai
+ import gradio as gr
+ from sklearn.neighbors import NearestNeighbors
+
+
+ def download_pdf(url, output_path):
+     urllib.request.urlretrieve(url, output_path)
+
+
+ def preprocess(text):
+     # Collapse line breaks and runs of whitespace into single spaces.
+     text = text.replace('\n', ' ')
+     text = re.sub(r'\s+', ' ', text)
+     return text
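+
+
+ # pdf_to_text reads the page range [start_page, end_page] with PyMuPDF and
+ # returns one cleaned string per page, preserving page order for citation.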
+ def pdf_to_text(path, start_page=1, end_page=None):
+     doc = fitz.open(path)
+     total_pages = doc.page_count
+
+     if end_page is None:
+         end_page = total_pages
+
+     text_list = []
+
+     for i in range(start_page - 1, end_page):
+         text = doc.load_page(i).get_text("text")
+         text = preprocess(text)
+         text_list.append(text)
+
+     doc.close()
+     return text_list
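+
+
+ # text_to_chunks splits each page into ~word_length-word chunks tagged with
+ # their page number; a short trailing chunk is merged into the next page's
+ # words (the `continue` branch) instead of being emitted on its own.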
+ def text_to_chunks(texts, word_length=150, start_page=1):
+     text_toks = [t.split(' ') for t in texts]
+     page_nums = []
      chunks = []
+
+     for idx, words in enumerate(text_toks):
+         for i in range(0, len(words), word_length):
+             chunk = words[i:i+word_length]
+             if (i + word_length) > len(words) and (len(chunk) < word_length) and (
+                     len(text_toks) != (idx + 1)):
+                 text_toks[idx+1] = chunk + text_toks[idx+1]
+                 continue
+             chunk = ' '.join(chunk).strip()
+             chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
+             chunks.append(chunk)
      return chunks
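+
+
+ # SemanticSearch embeds chunks with Google's Universal Sentence Encoder from
+ # TF-Hub and retrieves neighbours via scikit-learn's NearestNeighbors
+ # (default Euclidean metric); fit() must run before the instance is called.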
+ class SemanticSearch:
+
+     def __init__(self):
+         self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
+         self.fitted = False
+
+     def fit(self, data, batch=1000, n_neighbors=5):
+         self.data = data
+         self.embeddings = self.get_text_embedding(data, batch=batch)
+         n_neighbors = min(n_neighbors, len(self.embeddings))
+         self.nn = NearestNeighbors(n_neighbors=n_neighbors)
+         self.nn.fit(self.embeddings)
+         self.fitted = True
+
+     def __call__(self, text, return_data=True):
+         inp_emb = self.use([text])
+         neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
+
+         if return_data:
+             return [self.data[i] for i in neighbors]
+         else:
+             return neighbors
+
+     def get_text_embedding(self, texts, batch=1000):
+         # Embed in batches to keep memory bounded on large documents.
+         embeddings = []
+         for i in range(0, len(texts), batch):
+             text_batch = texts[i:(i + batch)]
+             emb_batch = self.use(text_batch)
+             embeddings.append(emb_batch)
+         embeddings = np.vstack(embeddings)
+         return embeddings
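+
+
+ # Usage sketch: after fit(), calling the instance, e.g.
+ # recommender("what did the study investigate?"), embeds the query with the
+ # same encoder and returns the nearest chunks (5 by default), each carrying
+ # its [Page no. N] tag.
+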
+ def load_recommender(path, start_page=1):
+     # Re-fit the module-level recommender on a freshly loaded PDF.
+     global recommender
+     texts = pdf_to_text(path, start_page=start_page)
+     chunks = text_to_chunks(texts, start_page=start_page)
+     recommender.fit(chunks)
+     return 'Corpus Loaded.'
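+
+
+ # NOTE: the OpenAI calls below use the pre-1.0 `openai` SDK interface
+ # (openai.Completion / openai.ChatCompletion); they will not run unchanged
+ # against openai>=1.0.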
+ def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
+     openai.api_key = openAI_key
+     temperature = 0.1
+     max_tokens = 256
+     top_p = 1
+     frequency_penalty = 0
+     presence_penalty = 0
+
+     if model == "text-davinci-003":
+         # Legacy completions endpoint.
+         completions = openai.Completion.create(
+             engine=model,
+             prompt=prompt,
+             max_tokens=max_tokens,
+             n=1,
+             stop=None,
+             temperature=temperature,
+         )
+         message = completions.choices[0].text
+     else:
+         # Chat completions endpoint for the gpt-3.5/gpt-4 family.
+         response = openai.ChatCompletion.create(
+             model=model,
+             messages=[
+                 {"role": "system", "content": "You are a helpful assistant."},
+                 {"role": "user", "content": prompt}
+             ],
+             temperature=temperature,
+             max_tokens=max_tokens,
+             top_p=top_p,
+             frequency_penalty=frequency_penalty,
+             presence_penalty=presence_penalty,
+         )
+         message = response['choices'][0]['message']['content']
+     return message
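+
+
+ # generate_answer builds the RAG prompt: retrieved chunks first, then the
+ # citation instructions, then the user's query.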
+ def generate_answer(question, openAI_key, model):
+     topn_chunks = recommender(question)
+     prompt = 'search results:\n\n'
+     for c in topn_chunks:
+         prompt += c + '\n\n'
+
+     prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
+               "Cite each reference using [ Page Number] notation. "\
+               "Only answer what is asked. The answer should be short and concise. \n\nQuery: "
+
+     prompt += f"{question}\nAnswer:"
+     answer = generate_text(openAI_key, prompt, model)
+     return answer
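+
+
+ # question_answer performs the same validation/indexing flow for tuple-style
+ # chat history; it is not wired into the Blocks UI below, which uses respond.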
+ def question_answer(chat_history, url, file, question, openAI_key, model):
+     try:
+         if openAI_key.strip() == '':
+             return '[ERROR]: Please enter your OpenAI key. Get your key here: https://platform.openai.com/account/api-keys'
+         if url.strip() == '' and file is None:
+             return '[ERROR]: Both URL and PDF are empty. Provide at least one.'
+         if url.strip() != '' and file is not None:
+             return '[ERROR]: Both URL and PDF are provided. Please provide only one (either URL or PDF).'
+         if model is None or model == '':
+             return '[ERROR]: You have not selected any model. Please choose an LLM model.'
+         if url.strip() != '':
+             glob_url = url
+             download_pdf(glob_url, 'corpus.pdf')
+             load_recommender('corpus.pdf')
+         else:
+             old_file_name = file.name
+             # Drop the temp suffix Gradio appends (assumes <name><8 random chars>.pdf)
+             file_name = old_file_name[:-12] + old_file_name[-4:]
+             os.rename(old_file_name, file_name)
+             load_recommender(file_name)
+         if question.strip() == '':
+             return '[ERROR]: Question field is empty'
+         if model == "text-davinci-003" or model == "gpt-4" or model == "gpt-4-32k":
+             answer = generate_answer_text_davinci_003(question, openAI_key)
+         else:
+             answer = generate_answer(question, openAI_key, model)
+         chat_history.append([question, answer])
+         return chat_history
+     except openai.error.InvalidRequestError as e:
+         return '[ERROR]: Either you do not have access to GPT-4 or you have exhausted your quota!'
+
+
+ def generate_text_text_davinci_003(openAI_key, prompt, engine="text-davinci-003"):
+     openai.api_key = openAI_key
+     completions = openai.Completion.create(
+         engine=engine,
+         prompt=prompt,
+         max_tokens=512,
+         n=1,
+         stop=None,
+         temperature=0.7,
      )
+     message = completions.choices[0].text
+     return message
+
+
+ def generate_answer_text_davinci_003(question, openAI_key):
+     topn_chunks = recommender(question)
+     prompt = ""
+     prompt += 'search results:\n\n'
+     for c in topn_chunks:
+         prompt += c + '\n\n'
+
+     prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
+               "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
+               "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
+               "with the same name, create separate answers for each. Only include information found in the results and "\
+               "don't add any additional information. Make sure the answer is correct and don't output false content. "\
+               "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
+               "search results which have nothing to do with the question. Only answer what is asked. The "\
+               "answer should be short and concise.\n\n"
+
+     prompt += f"Query: {question}\nAnswer:"
+     answer = generate_text_text_davinci_003(openAI_key, prompt, "text-davinci-003")
+     return answer
+
+
+ # Pre-defined questions
+ questions = [
+     "What did the study investigate?",
+     "Can you provide a summary of this paper?",
+     "What are the methodologies used in this study?",
+     "What are the data intervals used in this study? Give me the start dates and end dates.",
+     "What are the main limitations of this study?",
+     "What are the main shortcomings of this study?",
+     "What are the main findings of the study?",
+     "What are the main results of the study?",
+     "What are the main contributions of this study?",
+     "What is the conclusion of this paper?",
+     "What are the input features used in this study?",
+     "What is the dependent variable in this study?",
+ ]
+
+
+ recommender = SemanticSearch()
+
+ title = 'PDF GPT Turbo'
+ description = """PDF GPT Turbo allows you to chat with your PDF files. It uses Google's Universal Sentence Encoder (a deep averaging network, DAN) for embeddings rather than OpenAI's, which helps reduce hallucinated responses. It cites the page number in square brackets ([Page No.]), showing where the information is located and adding credibility to the responses."""
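+
+
+ # Gradio Blocks UI: API key, URL/file and model inputs in one column, with
+ # the chat interface beneath; both the Submit button and Enter in the
+ # textbox are wired to respond.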
+ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200px; }""") as demo:
+     gr.Markdown(f'<center><h3>{title}</h3></center>')
+     gr.Markdown(description)
+
+     with gr.Row():
+         with gr.Column():
+             # API Key and File Inputs
+             with gr.Accordion("API Key and PDF"):
+                 openAI_key = gr.Textbox(label='Enter your OpenAI API key here', type='password')
+                 url = gr.Textbox(label='Enter PDF URL here (Example: https://arxiv.org/pdf/1706.03762.pdf ; https://link.springer.com/content/pdf/10.1007/s10614-022-10325-8.pdf)')
+                 gr.Markdown("<center><h4>OR</h4></center>")
+                 file = gr.File(label='Upload your PDF/Research Paper/Book here', file_types=['.pdf'])
+
+             # Model Selection
+             model = gr.Radio(
+                 choices=[
+                     'gpt-4o-mini',
+                     'gpt-4o',
+                     'gpt-4',
+                 ],
+                 label='Select Model',
+                 value='gpt-4o-mini'
+             )
+
+             # Chat Interface
+             chatbot = gr.Chatbot(label="Chat History", type="messages")
+             msg = gr.Textbox(label="Enter your question here", lines=2)
+             submit_btn = gr.Button("Submit")
+             clear = gr.ClearButton([msg, chatbot])
+
+             # Example Questions
+             gr.Examples(
+                 [[q] for q in questions],
+                 inputs=[msg],
+                 label="PRE-DEFINED QUESTIONS: Click on a question to auto-fill the input box",
+             )
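+
+     # respond validates inputs, (re)indexes the PDF, queries the recommender,
+     # and appends user/assistant turns in the "messages" format expected by
+     # gr.Chatbot(type="messages").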
+     def respond(message, chat_history, url_value, file_value, key_value, model_value):
+         if message.strip() == "":
+             return "", chat_history  # Return empty message if no input
+
+         try:
+             # Ensure chat_history is initialized properly
+             if chat_history is None:
+                 chat_history = []
+
+             if key_value.strip() == '':
+                 chat_history.append({"role": "user", "content": message})
+                 chat_history.append({"role": "assistant", "content": '[ERROR]: Please enter your OpenAI API key'})
+                 return "", chat_history
+
+             if url_value.strip() == '' and file_value is None:
+                 chat_history.append({"role": "user", "content": message})
+                 chat_history.append({"role": "assistant", "content": '[ERROR]: Both URL and PDF are empty. Provide at least one'})
+                 return "", chat_history
+
+             # Process PDF and generate answer
+             if url_value.strip() != '':
+                 download_pdf(url_value, 'corpus.pdf')
+                 load_recommender('corpus.pdf')
              else:
+                 old_file_name = file_value.name
+                 # Drop the temp suffix Gradio appends (assumes <name><8 random chars>.pdf)
+                 file_name = old_file_name[:-12] + old_file_name[-4:]
+                 os.rename(old_file_name, file_name)
+                 load_recommender(file_name)
+
+             answer = generate_answer(message, key_value, model_value)
+
+             chat_history.append({"role": "user", "content": message})
+             chat_history.append({"role": "assistant", "content": answer})
+
+             return "", chat_history
+
+         except Exception as e:
+             chat_history.append({"role": "user", "content": message})
+             chat_history.append({"role": "assistant", "content": f'[ERROR]: {str(e)}'})
+             return "", chat_history
+
+     # Wire both the Submit button and pressing Enter in the textbox to respond.
+     submit_btn.click(
+         respond,
+         [msg, chatbot, url, file, openAI_key, model],
+         [msg, chatbot]
+     )
+     msg.submit(
+         respond,
+         [msg, chatbot, url, file, openAI_key, model],
+         [msg, chatbot]
+     )
+
+ demo.launch()