import os import json import re import torch import gradio as gr import google.generativeai as genai from sentence_transformers import SentenceTransformer, util # ============================================================ # CONFIG # ============================================================ GEMINI_API_KEY = "AIzaSyDuTJp2Z-u3pautMhAR_m9eK3WBxyEcyro" genai.configure(api_key=GEMINI_API_KEY) MODEL = genai.GenerativeModel("gemini-2.5-flash") DEVICE = "cuda" if torch.cuda.is_available() else "cpu" EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2" SIM_THRESHOLD = 0.55 print("Loading local embedding auditor...") embedder = SentenceTransformer(EMBED_MODEL, device=DEVICE) print("✅ Ready") # ============================================================ # SOPHISTICATED EVALUATION LOGIC # ============================================================ def get_advanced_evaluation(kb, question, answer): prompt = f""" You are a Senior Academic Evaluator. Compare the Answer against the Knowledge Base (KB) for the specific Question. TASK: 1. Identify 'intent' (e.g., FACTUAL, PROCEDURAL). 2. Create a 'rubric' of 3-5 criteria from the KB. 3. For each criterion: - Determine if 'satisfied' (true/false). - Provide a 'confidence' score (0-100) based on how clearly the answer matches the KB. 4. Extract 'irrelevant_snippets': Parts of the answer that don't help answer the question. 5. Extract 'contradictions': Parts of the answer that factually conflict with the KB. 6. Penalise scores based on spelling and grammatical mistakes in the answer, but reduce score in accordance to the number of words in the answers. So if there are 2 grammatical/spelling mistakes in a 100 words, then the penalty will be less than if there are 2 grammmatical/spelling mistakes in a 25 words answer 7. Suggest a 'total_score' (0-100) and 'feedback'. Knowledge Base: {kb} Question: {question} Student Answer: {answer} but if Knowledge Base is empty then literally just evaluate the answer with respect the question and apply the same ideas we discussed before by generating a KB with respect the queston yourself. STRICT JSON OUTPUT ONLY: {{ "intent": "...", "rubric": [ {{"criterion": "...", "satisfied": true, "confidence": 95}} ], "irrelevant_snippets": ["...", "..."], "contradictions": [ {{"snippet": "...", "start": , "end": , "reason": "..."}} ], , "suggested_mark": 85, "feedback": "..." }} """ try: response = MODEL.generate_content(prompt) clean_text = re.sub(r'```json|```', '', response.text).strip() return json.loads(clean_text) except Exception as e: return { "error": str(e) } def evaluate(kb, question, answer): # Perform the single heavy-duty API call eval_data = get_advanced_evaluation(kb, question, answer) if "error" in eval_data: return eval_data # --- Local Semantic Cross-Check (Local, no API cost) --- # This helps catch if Gemini was "too nice" or missed a nuance sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', answer) if len(s.strip()) > 5] if sents: ans_emb = embedder.encode(sents, convert_to_tensor=True) for item in eval_data.get("rubric", []): crit_emb = embedder.encode(item["criterion"], convert_to_tensor=True) sims = util.cos_sim(crit_emb, ans_emb)[0] max_sim = float(torch.max(sims)) if sims.numel() else 0.0 # We add this 'local_check' to the JSON so the user can compare item["local_semantic_similarity"] = round(max_sim * 100, 1) return eval_data # ============================================================ # IMPROVED UI # ============================================================ with gr.Blocks(theme=gr.themes.Soft()) as demo: gr.Markdown("# 🎓 Advanced AI Grading System (Gemini 2.5)") with gr.Row(): with gr.Column(): kb_input = gr.Textbox(label="1. Reference Material (KB)", lines=8, placeholder="Paste the factual source here...") q_input = gr.Textbox(label="2. Question", placeholder="What are you asking?") a_input = gr.Textbox(label="3. Student Answer", lines=8, placeholder="Paste the answer to grade...") btn = gr.Button("🔍 Run Deep Analysis", variant="primary") with gr.Column(): out = gr.JSON(label="Grading Report & Forensic Analysis") btn.click(evaluate, [kb_input, q_input, a_input], out) demo.launch()