Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -36,8 +36,8 @@ st.set_page_config(
|
|
| 36 |
"About" : "#Hi this our project."
|
| 37 |
}
|
| 38 |
)
|
| 39 |
-
|
| 40 |
-
|
| 41 |
# Initialize Wikipedia API with a user agent
|
| 42 |
user_agent = 'QGen/1.0 (channingfisher7@gmail.com)'
|
| 43 |
wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')
|
|
@@ -65,8 +65,8 @@ def set_state(session_id, key, value):
|
|
| 65 |
st.session_state.session_states[session_id][key] = value
|
| 66 |
|
| 67 |
@st.cache_resource
|
| 68 |
-
def load_model():
|
| 69 |
-
model_name =
|
| 70 |
model = T5ForConditionalGeneration.from_pretrained(model_name)
|
| 71 |
tokenizer = T5Tokenizer.from_pretrained(model_name)
|
| 72 |
return model, tokenizer
|
|
@@ -88,10 +88,48 @@ def load_qa_models():
|
|
| 88 |
return similarity_model, spell
|
| 89 |
|
| 90 |
nlp, s2v = load_nlp_models()
|
| 91 |
-
model, tokenizer = load_model()
|
| 92 |
similarity_model, spell = load_qa_models()
|
| 93 |
context_model = similarity_model
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
def get_pdf_text(pdf_file):
|
| 96 |
doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
|
| 97 |
text = ""
|
|
@@ -124,7 +162,7 @@ def clean_text(text):
|
|
| 124 |
return text
|
| 125 |
|
| 126 |
# Function to create text chunks
|
| 127 |
-
def segment_text(text, max_segment_length=
|
| 128 |
"""Segment the text into smaller chunks."""
|
| 129 |
sentences = sent_tokenize(text)
|
| 130 |
segments = []
|
|
@@ -268,7 +306,7 @@ def entity_linking(keyword):
|
|
| 268 |
def generate_question(context, answer, num_beams):
|
| 269 |
input_text = f"<context> {context} <answer> {answer}"
|
| 270 |
input_ids = tokenizer.encode(input_text, return_tensors='pt')
|
| 271 |
-
outputs = model.generate(input_ids, num_beams=num_beams, early_stopping=True)
|
| 272 |
question = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 273 |
return question
|
| 274 |
|
|
@@ -337,8 +375,10 @@ def main():
|
|
| 337 |
st.title(":blue[Question Generator System]")
|
| 338 |
session_id = get_session_id()
|
| 339 |
state = initialize_state(session_id)
|
| 340 |
-
|
| 341 |
with st.sidebar:
|
|
|
|
|
|
|
|
|
|
| 342 |
st.subheader("Customization Options")
|
| 343 |
# Customization options
|
| 344 |
input_type = st.radio("Select Input Preference", ("Text Input","Upload PDF"))
|
|
@@ -356,7 +396,10 @@ def main():
|
|
| 356 |
extract_all_keywords = st.toggle("Extract Max Keywords",value=False)
|
| 357 |
with col2:
|
| 358 |
enable_feedback_mode = st.toggle("Enable Feedback Mode",False)
|
|
|
|
| 359 |
# set_state(session_id, 'generated_questions', state['generated_questions'])
|
|
|
|
|
|
|
| 360 |
text = None
|
| 361 |
if input_type == "Text Input":
|
| 362 |
text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.")
|
|
@@ -445,12 +488,13 @@ def main():
|
|
| 445 |
# Export buttons
|
| 446 |
# if st.session_state.generated_questions:
|
| 447 |
if state['generated_questions']:
|
| 448 |
-
with st.sidebar:
|
| 449 |
csv_data = export_to_csv(state['generated_questions'])
|
| 450 |
st.download_button(label="Download CSV", data=csv_data, file_name='questions.csv', mime='text/csv')
|
| 451 |
|
| 452 |
pdf_data = export_to_pdf(state['generated_questions'])
|
| 453 |
st.download_button(label="Download PDF", data=pdf_data, file_name='questions.pdf', mime='application/pdf')
|
|
|
|
| 454 |
with st.expander("View Visualizations"):
|
| 455 |
questions = [tpl['question'] for tpl in state['generated_questions']]
|
| 456 |
overall_scores = [tpl['overall_score'] for tpl in state['generated_questions']]
|
|
|
|
| 36 |
"About" : "#Hi this our project."
|
| 37 |
}
|
| 38 |
)
|
| 39 |
+
|
| 40 |
+
# Silence the pyplot "global use" deprecation warning where the option exists.
# NOTE(review): newer Streamlit releases appear to have removed this config key
# and raise StreamlitAPIException ("Unrecognized config option") for it, which
# would abort start-up — confirm against the Streamlit version pinned for this
# Space. The guard keeps the app booting either way.
from streamlit.errors import StreamlitAPIException

try:
    st.set_option('deprecation.showPyplotGlobalUse', False)
except StreamlitAPIException:
    # The option is purely cosmetic; an unknown key must not stop the app.
    pass

# Initialize Wikipedia API with a user agent
user_agent = 'QGen/1.0 (channingfisher7@gmail.com)'
wiki_wiki = wikipediaapi.Wikipedia(user_agent=user_agent, language='en')
|
|
|
|
| 65 |
st.session_state.session_states[session_id][key] = value
|
| 66 |
|
| 67 |
@st.cache_resource
def load_model(modelname):
    """Load a T5 conditional-generation model together with its tokenizer.

    Args:
        modelname: Checkpoint identifier handed to ``from_pretrained`` for
            both the model and the tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Tuple elements are evaluated left to right, so the model is still
    # loaded before the tokenizer.
    return (
        T5ForConditionalGeneration.from_pretrained(modelname),
        T5Tokenizer.from_pretrained(modelname),
    )
|
|
|
|
| 88 |
return similarity_model, spell
|
| 89 |
|
| 90 |
# Module-level resources shared by the rest of the file.
nlp, s2v = load_nlp_models()

# Question-generation model and tokenizer; generate_question() reads these
# two module-level names.
model, tokenizer = load_model('DevBM/t5-large-small')

similarity_model, spell = load_qa_models()

# context_model is an alias for the similarity model (same object).
context_model = similarity_model
|
| 94 |
|
| 95 |
+
# Info Section
|
| 96 |
+
def display_info():
    """Write the static "Information" panel to the Streamlit sidebar.

    Emits a sidebar title followed by one markdown block describing the
    system, its key features, customization options and outputs.
    Takes no arguments and returns nothing.
    """
    sidebar = st.sidebar
    sidebar.title("Information")

    # Static help text, kept in one literal so it is easy to edit.
    info_markdown = """
    ### Question Generator System
    This system is designed to generate questions based on the provided context. It uses various NLP techniques and models to:
    - Extract keywords from the text
    - Map keywords to sentences
    - Generate questions
    - Provide multiple choice options
    - Assess the quality of generated questions

    #### Key Features:
    - **Keyword Extraction:** Combines RAKE, TF-IDF, and spaCy for comprehensive keyword extraction.
    - **Question Generation:** Utilizes a pre-trained T5 model for generating questions.
    - **Options Generation:** Creates contextually relevant multiple-choice options.
    - **Question Assessment:** Scores questions based on relevance, complexity, and spelling correctness.
    - **Feedback Collection:** Allows users to rate the generated questions and provides statistics on feedback.

    #### Customization Options:
    - Number of beams for question generation
    - Context window size for mapping keywords to sentences
    - Number of questions to generate
    - Additional display elements (context, answer, options, entity link, QA scores)

    #### Outputs:
    - Generated questions with multiple-choice options
    - Download options for CSV and PDF formats
    - Visualization of overall scores

    """
    sidebar.markdown(info_markdown)
|
| 126 |
+
|
| 127 |
+
# Text Preprocessing Function
|
| 128 |
+
def preprocess_text(text):
    """Normalize whitespace in *text*.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space, and leading/trailing whitespace is removed so the result never
    starts or ends with a stray space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty or whitespace-only input yields ``""``.
    """
    # Collapse first, then trim: a leading or trailing run would otherwise
    # survive as one space.
    return re.sub(r'\s+', ' ', text).strip()
|
| 132 |
+
|
| 133 |
def get_pdf_text(pdf_file):
|
| 134 |
doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
|
| 135 |
text = ""
|
|
|
|
| 162 |
return text
|
| 163 |
|
| 164 |
# Function to create text chunks
|
| 165 |
+
def segment_text(text, max_segment_length=500):
|
| 166 |
"""Segment the text into smaller chunks."""
|
| 167 |
sentences = sent_tokenize(text)
|
| 168 |
segments = []
|
|
|
|
| 306 |
def generate_question(context, answer, num_beams, max_length=150):
    """Generate one question for *answer* given its *context*.

    Reads the module-level ``model`` and ``tokenizer``. Rebinding those names
    as locals inside another function does not change what is used here.

    Args:
        context: Text the question should be grounded in.
        answer: Target answer (keyword) the question should ask about.
        num_beams: Beam count forwarded to ``model.generate``.
        max_length: ``max_length`` forwarded to ``model.generate``; defaults
            to 150, the value that was previously hard-coded.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    # Prompt layout: "<context> ... <answer> ...".
    prompt = f"<context> {context} <answer> {answer}"
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    outputs = model.generate(
        input_ids,
        num_beams=num_beams,
        early_stopping=True,
        max_length=max_length,
    )
    # Only the first returned sequence is decoded.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 312 |
|
|
|
|
| 375 |
st.title(":blue[Question Generator System]")
|
| 376 |
session_id = get_session_id()
|
| 377 |
state = initialize_state(session_id)
|
|
|
|
| 378 |
with st.sidebar:
|
| 379 |
+
show_info = st.toggle('Show Info',True)
|
| 380 |
+
if show_info:
|
| 381 |
+
display_info()
|
| 382 |
st.subheader("Customization Options")
|
| 383 |
# Customization options
|
| 384 |
input_type = st.radio("Select Input Preference", ("Text Input","Upload PDF"))
|
|
|
|
| 396 |
extract_all_keywords = st.toggle("Extract Max Keywords",value=False)
|
| 397 |
with col2:
|
| 398 |
enable_feedback_mode = st.toggle("Enable Feedback Mode",False)
|
| 399 |
+
use_t5_small = st.toggle("Use T5-Small",False)
|
| 400 |
# set_state(session_id, 'generated_questions', state['generated_questions'])
|
| 401 |
+
if use_t5_small is True:
|
| 402 |
+
model, tokenizer = load_model('AneriThakkar/flan-t5-small-finetuned')
|
| 403 |
text = None
|
| 404 |
if input_type == "Text Input":
|
| 405 |
text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.")
|
|
|
|
| 488 |
# Export buttons
|
| 489 |
# if st.session_state.generated_questions:
|
| 490 |
if state['generated_questions']:
|
| 491 |
+
with st.sidebar:
|
| 492 |
csv_data = export_to_csv(state['generated_questions'])
|
| 493 |
st.download_button(label="Download CSV", data=csv_data, file_name='questions.csv', mime='text/csv')
|
| 494 |
|
| 495 |
pdf_data = export_to_pdf(state['generated_questions'])
|
| 496 |
st.download_button(label="Download PDF", data=pdf_data, file_name='questions.pdf', mime='application/pdf')
|
| 497 |
+
|
| 498 |
with st.expander("View Visualizations"):
|
| 499 |
questions = [tpl['question'] for tpl in state['generated_questions']]
|
| 500 |
overall_scores = [tpl['overall_score'] for tpl in state['generated_questions']]
|