Spaces:

mocktestgen
/

HritwikrudraGupta

Runtime error

App Files Files Community

mocktestgen committed on May 6

Commit

facb671

verified ·

1 Parent(s): 5fb7468

Create app.py

Browse files

Files changed (1) hide show

app.py +287 -0

app.py ADDED Viewed

	@@ -0,0 +1,287 @@

+import gradio as gr
+import pdfplumber
+from PIL import Image
+import pytesseract
+import io
+import re
+import random
+from transformers import pipeline
+# Load the question-generation pipeline once at import time so every request reuses it.
+# valhalla/t5-base-qg-hl is answer-aware: the answer span is marked inside the context with <hl> tokens.
+# NOTE(review): the model card examples appear to prefix inputs with "generate question: " - confirm whether callers should add it.
+qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")
+# Load a summarization pipeline; no model is pinned, so transformers picks its default checkpoint.
+# NOTE(review): nothing in the visible code calls `summarizer` (presumably intended for key-sentence extraction) - confirm it is still needed, since loading it adds startup time and memory.
+summarizer = pipeline("summarization")
+def extract_text_from_pdf(file_bytes):
+    try:
+        text = ""
+        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
+            for page in pdf.pages:
+                page_text = page.extract_text()
+                if page_text:
+                    text += page_text + "\n"
+        # If extracted text is empty, fallback to OCR per page
+        if not text.strip():
+            text = ocr_pdf(file_bytes)
+        return text
+    except Exception as e:
+        return ""
+def ocr_pdf(file_bytes):
+    text = ""
+    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
+        for page in pdf.pages:
+            # Convert page to image
+            pil_image = page.to_image(resolution=300).original
+            # OCR
+            page_text = pytesseract.image_to_string(pil_image)
+            text += page_text + "\n"
+    return text
+def extract_text_from_image(file_bytes):
+    image = Image.open(io.BytesIO(file_bytes))
+    text = pytesseract.image_to_string(image)
+    return text
+def extract_text_from_txt(file_bytes):
+    try:
+        text = file_bytes.decode("utf-8")
+    except UnicodeDecodeError:
+        text = file_bytes.decode("latin-1")
+    return text
+def clean_text(text):
+    # Clean excessive new lines and spaces
+    text = re.sub(r'\n+', '\n', text)
+    text = re.sub(r'[ ]{2,}', ' ', text)
+    return text.strip()
+def split_to_sentences(text):
+    # Simple split by periods, question marks, and exclamation
+    sentences = re.split(r'(?<=[.?!])\s+', text)
+    return [s.strip() for s in sentences if s.strip()]
+def highlight_answer_in_context(context, answer):
+    # Highlight answer in context for the qg model input format
+    # The model uses <hl> tokens to highlight answer: context <hl> answer <hl>
+    # We find answer in context and mark it
+    # If no direct answer found, just return context unchanged
+    idx = context.lower().find(answer.lower())
+    if idx != -1:
+        part1 = context[:idx]
+        part2 = context[idx+len(answer):]
+        return f"{part1.strip()} <hl> {answer.strip()} <hl> {part2.strip()}"
+    else:
+        return context
+def generate_mcq(question_text):
+    '''
+    Generate MCQ with 1 correct + 3 incorrect options.
+    Since no direct distractor generation model, we'll generate distractors by rephrasing or random shuffling.
+    Here, for demonstration, we create options by slight modifications to the correct answer.
+    '''
+    correct_answer = question_text
+    # Generate plausible options by shuffling words or changing order
+    words = correct_answer.split()
+    options = set()
+    options.add(correct_answer)
+    while len(options) < 4:
+        if len(words) > 1:
+            shuffled = words[:]
+            random.shuffle(shuffled)
+            option = ' '.join(shuffled)
+            if option.lower() != correct_answer.lower():
+                options.add(option)
+        else:
+            # If single word, generate random similar words (basic approach)
+            option = correct_answer + random.choice(['.', ',', '?', '!'])
+            options.add(option)
+    options = list(options)
+    random.shuffle(options)
+    # Determine the letter of correct answer
+    correct_letter = 'ABCD'[options.index(correct_answer)]
+    return options, correct_letter
+def generate_questions_mcq(context, num_questions):
+    '''
+    Generate MCQ questions based on context
+    '''
+    sentences = split_to_sentences(context)
+    questions_structured = []
+    used_questions = set()
+    # Limit candidates to first 15 sentences for speed
+    candidates = sentences[:15]
+    for i, sentence in enumerate(candidates):
+        # Attempt to generate question for candidate sentence as answer
+        input_text = highlight_answer_in_context(context, sentence)
+        question = qg_pipeline(input_text, max_length=64)[0]['generated_text']
+        if question in used_questions or not question.endswith('?'):
+            continue
+        used_questions.add(question)
+        options, correct_letter = generate_mcq(sentence)
+        questions_structured.append({
+            "question": question,
+            "options": options,
+            "correct_letter": correct_letter,
+            "correct_answer": sentence,
+            "explanation": f"Answer explanation: {sentence}"
+        })
+        if len(questions_structured) >= num_questions:
+            break
+    if not questions_structured:
+        # fallback question if no generation
+        question = "What is the main topic discussed in the content?"
+        options = ["Option A", "Option B", "Option C", "Option D"]
+        questions_structured.append({
+            "question": question,
+            "options": options,
+            "correct_letter": "A",
+            "correct_answer": "Option A",
+            "explanation": "Fallback explanation."
+        })
+    return questions_structured
+def generate_questions_subjective(context, num_questions):
+    '''
+    Generate subjective questions based on context, use summarization for answers
+    '''
+    sentences = split_to_sentences(context)
+    questions_structured = []
+    used_questions = set()
+    candidates = sentences[:20]
+    for i, sentence in enumerate(candidates):
+        input_text = highlight_answer_in_context(context, sentence)
+        question = qg_pipeline(input_text, max_length=64)[0]['generated_text']
+        if question in used_questions or not question.endswith('?'):
+            continue
+        used_questions.add(question)
+        # Brief answer by summarizing sentence or context snippet
+        answer = sentence
+        questions_structured.append({
+            "question": question,
+            "answer": answer
+        })
+        if len(questions_structured) >= num_questions:
+            break
+    if not questions_structured:
+        questions_structured.append({
+            "question": "Describe the main topic discussed in the content.",
+            "answer": "The main topic is an overview of the content provided."
+        })
+    return questions_structured
+def format_mcq_output(questions):
+    output = ""
+    for idx, q in enumerate(questions, 1):
+        output += f"- Q{idx}: {q['question']}\n"
+        ops = ['A', 'B', 'C', 'D']
+        for opt_idx, option in enumerate(q['options']):
+            output += f"  - {ops[opt_idx]}. {option}\n"
+        output += f"- Correct Answer: {q['correct_letter']}\n"
+        output += f"- Explanation: {q['explanation']}\n\n"
+    return output.strip()
+def format_subjective_output(questions):
+    output = ""
+    for idx, q in enumerate(questions, 1):
+        output += f"- Q{idx}: {q['question']}\n"
+        output += f"- Suggested Answer: {q['answer']}\n\n"
+    return output.strip()
+def main_process(file, question_type, num_questions):
+    if not file:
+        return "Please upload a file."
+    file_bytes = file.read()
+    fname = file.name.lower()
+    extracted_text = ""
+    if fname.endswith(".pdf"):
+        extracted_text = extract_text_from_pdf(file_bytes)
+    elif fname.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
+        extracted_text = extract_text_from_image(file_bytes)
+    elif fname.endswith(".txt"):
+        extracted_text = extract_text_from_txt(file_bytes)
+    else:
+        return "Unsupported file type. Please upload PDF, Image, or TXT."
+    extracted_text = clean_text(extracted_text)
+    if len(extracted_text) < 30:
+        return "Extracted text is too short or empty. Please check your input file."
+    if question_type == "MCQ":
+        questions = generate_questions_mcq(extracted_text, num_questions)
+        output = format_mcq_output(questions)
+    else:
+        questions = generate_questions_subjective(extracted_text, num_questions)
+        output = format_subjective_output(questions)
+    return output
+with gr.Blocks(css="""
+#header {
+  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+  font-weight: 700;
+  font-size: 28px;
+  text-align: center;
+  margin-bottom: 20px;
+  color: #333;
+}
+#footer {
+  font-size: 12px;
+  color: #666;
+  margin-top: 30px;
+  text-align: center;
+}
+.output-area {
+  white-space: pre-wrap;
+  background-color: #f3f4f6;
+  padding: 15px;
+  border-radius: 8px;
+  font-family: monospace;
+  max-height: 450px;
+  overflow-y: auto;
+}
+.gr-button {
+  background-color: #4f46e5;
+  color: white;
+  font-weight: bold;
+  border-radius: 8px;
+}
+.gr-button:hover {
+  background-color: #4338ca;
+}
+""") as demo:
+    gr.Markdown("<div id='header'>📚 Study Content Question Generator</div>")
+    with gr.Row():
+        file_input = gr.File(label="Upload PDF, Image, or Text file", type="file")
+        with gr.Column():
+            question_type = gr.Radio(choices=["MCQ", "Subjective"], label="Question Type", value="MCQ")
+            num_questions = gr.Slider(1, 10, value=5, step=1, label="Number of Questions")
+            generate_btn = gr.Button("Generate Questions")
+    output = gr.Textbox(label="Generated Questions", lines=20, interactive=False, elem_classes="output-area")
+    generate_btn.click(fn=main_process, inputs=[file_input, question_type, num_questions], outputs=output)
+    gr.Markdown("<div id='footer'>Made with ❤️ using Hugging Face Spaces and Transformers</div>")
+if __name__ == "__main__":
+    demo.launch()