Spaces:

DevBM
/

QGen

Runtime error

App Files Files Community

DevBM committed on Jul 5, 2024

Commit

a563a42

verified ·

1 Parent(s): c1799d4

improving batch processing for better performance

Browse files

Files changed (1) hide show

app.py +103 -50

app.py CHANGED Viewed

@@ -27,9 +27,13 @@ from transformers import pipeline
 import re
 import pymupdf
 import uuid
 print("***************************************************************")
 st.set_page_config(
     page_title="Question Generator",
     initial_sidebar_state="auto",
     menu_items={
@@ -38,6 +42,7 @@ st.set_page_config(
 )
 st.set_option('deprecation.showPyplotGlobalUse',False)
 # Initialize Wikipedia API with a user agent
 user_agent = 'QGen/1.0 (channingfisher7@gmail.com)'
 wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')
@@ -87,11 +92,16 @@ def load_qa_models():
     spell = SpellChecker()
     return similarity_model, spell
 nlp, s2v = load_nlp_models()
-model, tokenizer = load_model('DevBM/t5-large-squad')
 similarity_model, spell = load_qa_models()
 context_model = similarity_model
 # Info Section
 def display_info():
     st.sidebar.title("Information")
@@ -127,7 +137,7 @@ def display_info():
 # Text Preprocessing Function
 def preprocess_text(text):
     # Remove newlines and extra spaces
-    text = re.sub(r'\s+', ' ', text)
     return text
 def get_pdf_text(pdf_file):
@@ -159,11 +169,11 @@ def save_feedback(question, answer,rating):
 # Function to clean text
 def clean_text(text):
     text = re.sub(r"[^\x00-\x7F]", " ", text)
     return text
 # Function to create text chunks
-def segment_text(text, max_segment_length=500):
-    """Segment the text into smaller chunks."""
     sentences = sent_tokenize(text)
     segments = []
     current_segment = ""
@@ -177,8 +187,11 @@ def segment_text(text, max_segment_length=500):
     if current_segment:
         segments.append(current_segment.strip())
-    print(f"\n\nSegement Chunks: {segments}\n\n")
-    return segments
 # Function to extract keywords using combined techniques
 def extract_keywords(text, extract_all):
@@ -302,14 +315,82 @@ def entity_linking(keyword):
         return page.fullurl
     return None
-# Function to generate questions using beam search
-def generate_question(context, answer, num_beams):
     input_text = f"<context> {context} <answer> {answer}"
     input_ids = tokenizer.encode(input_text, return_tensors='pt')
-    outputs = model.generate(input_ids, num_beams=num_beams, early_stopping=True, max_length=150)
     question = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return question
 # Function to export questions to CSV
 def export_to_csv(data):
     # df = pd.DataFrame(data, columns=["Context", "Answer", "Question", "Options"])
@@ -375,6 +456,7 @@ def main():
     st.title(":blue[Question Generator System]")
     session_id = get_session_id()
     state = initialize_state(session_id)
     with st.sidebar:
         show_info = st.toggle('Show Info',True)
         if show_info:
@@ -382,24 +464,21 @@ def main():
         st.subheader("Customization Options")
         # Customization options
         input_type = st.radio("Select Input Preference", ("Text Input","Upload PDF"))
-        num_beams = st.slider("Select number of beams for question generation", min_value=1, max_value=10, value=5)
-        context_window_size = st.slider("Select context window size (number of sentences before and after)", min_value=1, max_value=5, value=1)
-        num_questions = st.slider("Select number of questions to generate", min_value=1, max_value=1000, value=5)
         with st.expander("Choose the Additional Elements to show"):
             show_context = st.checkbox("Context",True)
             show_answer = st.checkbox("Answer",True)
             show_options = st.checkbox("Options",False)
             show_entity_link = st.checkbox("Entity Link For Wikipedia",True)
             show_qa_scores = st.checkbox("QA Score",False)
         col1, col2 = st.columns(2)
         with col1:
             extract_all_keywords = st.toggle("Extract Max Keywords",value=False)
         with col2:
             enable_feedback_mode = st.toggle("Enable Feedback Mode",False)
-        use_t5_small = st.toggle("Use T5-Small",False)
-    # set_state(session_id, 'generated_questions', state['generated_questions'])
-    if use_t5_small is True:
-        model, tokenizer = load_model('AneriThakkar/flan-t5-small-finetuned')
     text = None
     if input_type == "Text Input":
         text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.")
@@ -409,45 +488,19 @@ def main():
             text = get_pdf_text(file)
     if text:
         text = clean_text(text)
-        segments = segment_text(text)
     generate_questions_button = st.button("Generate Questions")
     q_count = 0
-    if generate_questions_button:
-        state['generated_questions'] = []
-        # st.session_state.generated_questions = []
-        for text in segments:
-            keywords = extract_keywords(text, extract_all_keywords)
-            print(f"\n\nFinal Keywords in Main Function: {keywords}\n\n")
-            keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)
-            for i, (keyword, context) in enumerate(keyword_sentence_mapping.items()):
-                if i >= num_questions:
-                    break
-                if q_count>num_questions:
-                    break
-                question = generate_question(context, keyword, num_beams=num_beams)
-                options = generate_options(keyword,context)
-                overall_score, relevance_score, complexity_score, spelling_correctness = assess_question_quality(context,question,keyword)
-                if overall_score < 0.5:
-                    continue
-                tpl = {
-                    "question" : question,
-                    "context" : context,
-                    "answer" : keyword,
-                    "options" : options,
-                    "overall_score" : overall_score,
-                    "relevance_score" : relevance_score,
-                    "complexity_score" : complexity_score,
-                    "spelling_correctness" : spelling_correctness,
-                }
-                print("\n\n",tpl,"\n\n")
-                # st.session_state.generated_questions.append(tpl)
-                state['generated_questions'].append(tpl)
-                q_count += 1
         print("\n\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
         data = get_state(session_id)
         print(data)
         set_state(session_id, 'generated_questions', state['generated_questions'])
-        a = get_state(session_id)
     # sort question based on their quality score
     state['generated_questions'] = sorted(state['generated_questions'],key = lambda x: x['overall_score'], reverse=True)

 import re
 import pymupdf
 import uuid
+import time
+import asyncio
+import aiohttp
 print("***************************************************************")
 st.set_page_config(
+    page_icon='cyclone',
     page_title="Question Generator",
     initial_sidebar_state="auto",
     menu_items={
 )
 st.set_option('deprecation.showPyplotGlobalUse',False)
 # Initialize Wikipedia API with a user agent
 user_agent = 'QGen/1.0 (channingfisher7@gmail.com)'
 wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')
     spell = SpellChecker()
     return similarity_model, spell
# Sidebar label -> model identifier later handed to load_model().
# The selectbox options are taken from this same table, so every choice the
# user can make has an identifier and `modelname` is always bound.
MODEL_CHECKPOINTS = {
    "T5-large": "DevBM/t5-large-squad",
    "T5-small": "AneriThakkar/flan-t5-small-finetuned",
}
with st.sidebar:
    # The first entry ("T5-large") stays the default selection.
    select_model = st.selectbox("Select Model", tuple(MODEL_CHECKPOINTS))
modelname = MODEL_CHECKPOINTS[select_model]
 nlp, s2v = load_nlp_models()
 similarity_model, spell = load_qa_models()
 context_model = similarity_model
+model, tokenizer = load_model(modelname)
 # Info Section
 def display_info():
     st.sidebar.title("Information")
 # Text Preprocessing Function
 def preprocess_text(text):
     # Remove newlines and extra spaces
+    text = re.sub(r'[\n]', ' ', text)
     return text
 def get_pdf_text(pdf_file):
 # Function to clean text
 def clean_text(text):
     text = re.sub(r"[^\x00-\x7F]", " ", text)
+    text = re.sub(f"[\n]"," ", text)
     return text
 # Function to create text chunks
+def segment_text(text, max_segment_length=700, batch_size=7):
     sentences = sent_tokenize(text)
     segments = []
     current_segment = ""
     if current_segment:
         segments.append(current_segment.strip())
+    # Create batches
+    batches = [segments[i:i + batch_size] for i in range(0, len(segments), batch_size)]
+    return batches
 # Function to extract keywords using combined techniques
 def extract_keywords(text, extract_all):
         return page.fullurl
     return None
async def generate_question_async(context, answer, num_beams):
    """Generate one question whose answer is *answer*, given *context*.

    The prompt is built as ``<context> ... <answer> ...``, encoded with the
    module-level ``tokenizer`` and passed to the module-level
    ``model.generate``. Both the prompt and the decoded question are printed.

    Args:
        context: Passage the question should be about.
        answer: Text the generated question should be answered by.
        num_beams: Beam count forwarded to ``model.generate``.

    Returns:
        The decoded question string with special tokens skipped.
    """
    prompt = f"<context> {context} <answer> {answer}"
    print(f"\n{prompt}\n")

    encoded_prompt = tokenizer.encode(prompt, return_tensors='pt')
    generation_options = {
        "num_beams": num_beams,
        "early_stopping": True,
        "max_length": 250,
    }
    # model.generate is a blocking call, so it is run in a worker thread
    # and awaited from there.
    generated = await asyncio.to_thread(
        model.generate, encoded_prompt, **generation_options
    )

    question = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(f"\n{question}\n")
    return question
async def generate_options_async(answer, context, n=3):
    """Build a shuffled list of answer options for one question.

    The correct ``answer`` is always part of the result. Distractors are
    collected in this order of preference:

    1. alphabetic words of ``context`` (other than the answer itself), ranked
       by cosine similarity between their embedding and the answer embedding;
    2. words returned by ``get_similar_words_sense2vec``;
    3. words returned by ``get_synonyms``, only when fewer than ``n + 1``
       unique options have been found so far.

    Args:
        answer: The correct answer text.
        context: Passage the question was generated from.
        n: Number of distractors wanted.

    Returns:
        A list of at most ``n + 1`` unique options in random order.
    """
    options = [answer]

    answer_embedding = await asyncio.to_thread(context_model.encode, answer)

    # Score each distinct context word once: a word repeated in the passage
    # would otherwise be encoded again and could take several of the top-n
    # slots, leaving fewer unique distractors after de-duplication.
    answer_lower = answer.lower()
    context_words = list(dict.fromkeys(
        token.text
        for token in nlp(context)
        if token.is_alpha and token.text.lower() != answer_lower
    ))
    similarity_scores = []
    for word in context_words:
        word_embedding = await asyncio.to_thread(context_model.encode, word)
        similarity_scores.append(
            util.pytorch_cos_sim(word_embedding, answer_embedding).item()
        )
    ranked_words = sorted(zip(similarity_scores, context_words), reverse=True)
    options.extend(word for _, word in ranked_words[:n])

    # Try to get similar words based on sense2vec
    similar_words = await asyncio.to_thread(get_similar_words_sense2vec, answer, n)
    options.extend(similar_words)

    # Count unique options only, so duplicates cannot hide a shortfall and
    # suppress the synonym lookup.
    options = list(dict.fromkeys(options))
    if len(options) < n + 1:
        synonyms = await asyncio.to_thread(get_synonyms, answer, n - len(options) + 1)
        options.extend(synonyms)

    # dict.fromkeys keeps first occurrences, so the answer (inserted first)
    # always survives the truncation.
    options = list(dict.fromkeys(options))[:n + 1]
    random.shuffle(options)
    return options
# Function to generate up to `num_questions` questions for a text, batch by batch
async def generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords):
    """Generate at most *num_questions* question records for *text*.

    The text is split with ``segment_text`` and keywords are extracted from
    the whole text with ``extract_keywords``. Batches are then handed to
    ``process_batch`` one after another until enough questions have been
    collected or the batches run out.

    Args:
        text: Source text to generate questions from.
        num_questions: Upper bound on the number of questions returned.
        context_window_size: Forwarded to ``process_batch``.
        num_beams: Forwarded to ``process_batch``.
        extract_all_keywords: Forwarded to ``extract_keywords``.

    Returns:
        A list with at most ``num_questions`` question records.
    """
    segment_batches = segment_text(text)
    keywords = extract_keywords(text, extract_all_keywords)

    collected = []
    for segment_batch in segment_batches:
        collected += await process_batch(
            segment_batch, keywords, context_window_size, num_beams
        )
        # Stop as soon as the quota is met; later batches are never processed.
        if len(collected) >= num_questions:
            break

    # The last batch may have overshot the quota, so trim.
    return collected[:num_questions]
async def process_batch(batch, keywords, context_window_size, num_beams, min_overall_score=0.5):
    """Generate question records for every text segment in *batch*.

    For each segment, ``map_keywords_to_sentences`` pairs keywords with a
    context. A question is generated for each pair and scored with
    ``assess_question_quality``; only questions whose overall score reaches
    ``min_overall_score`` are kept.

    Args:
        batch: Iterable of text segments.
        keywords: Keywords to look for in each segment.
        context_window_size: Forwarded to ``map_keywords_to_sentences``.
        num_beams: Forwarded to ``generate_question_async``.
        min_overall_score: Minimum overall quality score a question needs to
            be kept. Defaults to 0.5.

    Returns:
        A list of dicts with the keys ``question``, ``context``, ``answer``,
        ``options``, ``overall_score``, ``relevance_score``,
        ``complexity_score`` and ``spelling_correctness``.
    """
    questions = []
    for text in batch:
        keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)
        for keyword, context in keyword_sentence_mapping.items():
            question = await generate_question_async(context, keyword, num_beams)
            overall_score, relevance_score, complexity_score, spelling_correctness = assess_question_quality(context, question, keyword)
            if overall_score < min_overall_score:
                continue
            # Scoring does not use the options, so they are built only for
            # questions that passed the quality gate.
            options = await generate_options_async(keyword, context)
            questions.append({
                "question": question,
                "context": context,
                "answer": keyword,
                "options": options,
                "overall_score": overall_score,
                "relevance_score": relevance_score,
                "complexity_score": complexity_score,
                "spelling_correctness": spelling_correctness,
            })
    return questions
 # Function to export questions to CSV
 def export_to_csv(data):
     # df = pd.DataFrame(data, columns=["Context", "Answer", "Question", "Options"])
     st.title(":blue[Question Generator System]")
     session_id = get_session_id()
     state = initialize_state(session_id)
     with st.sidebar:
         show_info = st.toggle('Show Info',True)
         if show_info:
         st.subheader("Customization Options")
         # Customization options
         input_type = st.radio("Select Input Preference", ("Text Input","Upload PDF"))
         with st.expander("Choose the Additional Elements to show"):
             show_context = st.checkbox("Context",True)
             show_answer = st.checkbox("Answer",True)
             show_options = st.checkbox("Options",False)
             show_entity_link = st.checkbox("Entity Link For Wikipedia",True)
             show_qa_scores = st.checkbox("QA Score",False)
+        num_beams = st.slider("Select number of beams for question generation", min_value=2, max_value=10, value=2)
+        context_window_size = st.slider("Select context window size (number of sentences before and after)", min_value=1, max_value=5, value=1)
+        num_questions = st.slider("Select number of questions to generate", min_value=1, max_value=1000, value=5)
         col1, col2 = st.columns(2)
         with col1:
             extract_all_keywords = st.toggle("Extract Max Keywords",value=False)
         with col2:
             enable_feedback_mode = st.toggle("Enable Feedback Mode",False)
     text = None
     if input_type == "Text Input":
         text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.")
             text = get_pdf_text(file)
     if text:
         text = clean_text(text)
     generate_questions_button = st.button("Generate Questions")
     q_count = 0
+    # if generate_questions_button:
+    if generate_questions_button and text:
+        start_time = time.time()
+        with st.spinner("Generating questions..."):
+            state['generated_questions'] = asyncio.run(generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords))
         print("\n\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
         data = get_state(session_id)
         print(data)
+        end_time = time.time()
+        print(f"Time Taken to generate: {end_time-start_time}")
         set_state(session_id, 'generated_questions', state['generated_questions'])
     # sort question based on their quality score
     state['generated_questions'] = sorted(state['generated_questions'],key = lambda x: x['overall_score'], reverse=True)