Spaces:

erica92
/

gloassario

Running

App Files Files Community

erica92 committed 1 day ago

Commit

55738d2

verified ·

1 Parent(s): 9256eec

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -49

app.py CHANGED Viewed

@@ -1,24 +1,9 @@
 import os
 import gradio as gr
 from huggingface_hub import InferenceClient
 from difflib import SequenceMatcher
-# =======================
-#  METRICHE (evaluate)
-# =======================
-try:
-    import evaluate
-    ROUGE = evaluate.load("rouge")
-    BLEU = evaluate.load("bleu")
-    CHRF = evaluate.load("chrf")
-    METRICS_AVAILABLE = True
-    print("✅ Metriche ROUGE / BLEU / chrF caricate correttamente.")
-except Exception as e:
-    print("⚠️ Impossibile caricare le metriche da evaluate:", repr(e))
-    ROUGE = BLEU = CHRF = None
-    METRICS_AVAILABLE = False
 # =======================
 #  MODELLI DISPONIBILI
 # =======================
@@ -190,45 +175,99 @@ def get_system_prompt(mostra_ragionamento: bool):
         )
 # =======================
-#  FUNZIONE ADMIN: METRICHE DI QUALITÀ (LOG ONLY)
 # =======================
 def log_quality_metrics(question: str, voci_glossario, answer_text: str, modello_scelto: str):
     """
-    Calcola ROUGE / BLEU / chrF confrontando la risposta del modello
-    con le definizioni del glossario (come riferimento).
-    I risultati vengono SOLO stampati nei log del server (non mostrati all'utente).
     """
-    if not METRICS_AVAILABLE:
-        print("⚠️ Metriche non disponibili (evaluate non caricato). Salto la valutazione.")
-        return
     if not voci_glossario:
         print("ℹ️ Nessuna voce di glossario per questa domanda. Salto il calcolo delle metriche.")
         return
-    # Riferimento: concatenazione delle definizioni delle voci selezionate
     reference = " ".join(e["definition"] for e in voci_glossario)
     prediction = answer_text.strip()
-    try:
-        rouge_res = ROUGE.compute(predictions=[prediction], references=[reference])
-        bleu_res = BLEU.compute(predictions=[prediction], references=[reference])
-        chrf_res = CHRF.compute(predictions=[prediction], references=[reference])
-    except Exception as e:
-        print("⚠️ Errore nel calcolo delle metriche:", repr(e))
-        return
-    print("\n====== VALUTAZIONE QUALITÀ RISPOSTA ======")
     print(f"Modello: {modello_scelto}")
     print(f"Domanda: {question}")
     print(f"Glossario usato per riferimento: {[e['term'] for e in voci_glossario]}")
-    print("--- Metriche ---")
-    print(f"ROUGE-1 F1: {rouge_res.get('rouge1', 0.0):.4f}")
-    print(f"ROUGE-L F1: {rouge_res.get('rougeL', 0.0):.4f}")
-    print(f"BLEU:       {bleu_res.get('bleu', 0.0):.44f}")
-    print(f"chrF++:     {chrf_res.get('chrf', 0.0):.4f}")
-    print("=========================================\n")
 # =======================
 #  FUNZIONE DI RISPOSTA (STREAMING + SELF-REFLECTION IMPLICITA)
@@ -241,7 +280,7 @@ def answer_with_self_reflection(question: str, modello_scelto: str, mostra_ragio
     - usa il glossario come fonte principale
     - chiede al modello di correggersi mentalmente prima di rispondere
-    Alla fine, in background, calcola anche le metriche di qualità rispetto al glossario
     e le scrive nei log (solo per amministratore).
     """
     question = question.strip()
@@ -249,19 +288,17 @@ def answer_with_self_reflection(question: str, modello_scelto: str, mostra_ragio
         yield "Scrivi un termine o concetto di informatica che vuoi capire meglio 😊"
         return
-    # Ricava il vero MODEL_ID a partire dall'etichetta scelta nel menu
     model_id = AVAILABLE_MODELS.get(modello_scelto)
     if model_id is None:
         yield "Si è verificato un errore: modello non valido."
         return
-    # Client specifico per il modello scelto
     client = InferenceClient(model=model_id, token=HF_TOKEN)
     # 🔍 MINI-RAG: trova voci rilevanti nel glossario TOON
     voci = trova_voci_rilevanti(question, k=3)
-    # Genera il prompt di sistema appropriato
     base_prompt = get_system_prompt(mostra_ragionamento)
     if voci:
@@ -298,11 +335,10 @@ def answer_with_self_reflection(question: str, modello_scelto: str, mostra_ragio
     ]
     if mostra_ragionamento:
-        max_tokens = 800   # serve più spazio per "Come ci arrivo" + spiegazione
     else:
-        max_tokens = 400   # per sola spiegazione bastano meno token
-    # Badge con numero di voci trovate
     if n_voci > 0:
         prefix = f"🔎 Ho trovato **{n_voci}** voci nel glossario rilevanti per la tua domanda.\n\n"
     else:
@@ -329,14 +365,12 @@ def answer_with_self_reflection(question: str, modello_scelto: str, mostra_ragio
         yield f"Si è verificato un errore: {e}"
         return
-    # A questo punto `partial` contiene la risposta completa mostrata all'utente
-    # Per le metriche togliamo il prefisso "🔎 ..." se presente
     if n_voci > 0:
         answer_for_metrics = partial[len(prefix):]
     else:
         answer_for_metrics = partial
-    # 🔐 Valutazione "nascosta": solo log admin
     log_quality_metrics(question, voci, answer_for_metrics, modello_scelto)
 # =======================

 import os
+import re
 import gradio as gr
 from huggingface_hub import InferenceClient
 from difflib import SequenceMatcher
 # =======================
 #  MODELLI DISPONIBILI
 # =======================
         )
 # =======================
+#  METRICHE CUSTOM (approx ROUGE / BLEU / chrF)
 # =======================
def _tokenize(text: str):
    """Return the lower-cased word tokens of ``text``.

    A token is a maximal run of regex word characters, so punctuation and
    whitespace only act as separators. The pattern cannot produce empty
    matches, hence no extra filtering is needed.
    """
    return re.findall(r"\w+", text.lower())


def _char_list(text: str):
    """Return the lower-cased characters of ``text`` with whitespace removed."""
    return [ch for ch in text.lower() if not ch.isspace()]


def _clipped_f1(pred_items, ref_items) -> float:
    """Return the F1 score of the clipped overlap between two item lists.

    The overlap is the multiset intersection: an item occurring ``a`` times
    in the prediction and ``b`` times in the reference contributes
    ``min(a, b)`` matches. Precision divides the matches by the prediction
    length, recall by the reference length.

    Returns 0.0 when either list is empty or nothing matches.
    """
    # Local import: the module's import header is outside this block.
    from collections import Counter

    if not pred_items or not ref_items:
        return 0.0
    matches = sum((Counter(pred_items) & Counter(ref_items)).values())
    if matches == 0:
        return 0.0
    precision = matches / len(pred_items)
    recall = matches / len(ref_items)
    return 2 * precision * recall / (precision + recall)


def rouge1_f1(pred: str, ref: str) -> float:
    """Return the ROUGE-1 F1 (unigram overlap) of ``pred`` against ``ref``.

    Unigram matches are counted with clipping, so a word repeated in the
    prediction only matches as many times as it occurs in the reference.
    Comparing sets of distinct words instead would ignore repetitions
    entirely.

    Returns 0.0 when either text has no word tokens or no token is shared.
    """
    return _clipped_f1(_tokenize(pred), _tokenize(ref))


def bleu1(pred: str, ref: str) -> float:
    """Return BLEU-1: clipped unigram precision times the brevity penalty.

    The brevity penalty is 1.0 when the prediction has more tokens than the
    reference and ``exp(1 - len_ref / len_pred)`` otherwise (which is also
    1.0 for equal lengths).

    Returns 0.0 when either text has no word tokens.
    """
    from collections import Counter
    from math import exp

    pred_tokens = _tokenize(pred)
    ref_tokens = _tokenize(ref)
    if not pred_tokens or not ref_tokens:
        return 0.0

    # Counter intersection keeps min(pred_count, ref_count) per token,
    # i.e. the clipped match count.
    matches = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
    len_p = len(pred_tokens)
    len_r = len(ref_tokens)
    precision = matches / len_p
    brevity_penalty = 1.0 if len_p > len_r else exp(1 - len_r / len_p)
    return brevity_penalty * precision


def chrf_simple(pred: str, ref: str) -> float:
    """Return a character-level F1 between ``pred`` and ``ref``.

    Both texts are lower-cased and stripped of whitespace, then compared as
    multisets of characters with clipped counts. Comparing sets of distinct
    characters would score close to 1.0 for almost any two texts written in
    the same alphabet, which makes the value useless as a quality signal.

    Returns 0.0 when either text has no non-whitespace characters or no
    character is shared.
    """
    return _clipped_f1(_char_list(pred), _char_list(ref))
 def log_quality_metrics(question: str, voci_glossario, answer_text: str, modello_scelto: str):
     """
+    Calcola metriche naive (ROUGE1-F1, BLEU1, chrF-like) confrontando la risposta del modello
+    con le definizioni del glossario. Solo log su console (admin).
     """
     if not voci_glossario:
         print("ℹ️ Nessuna voce di glossario per questa domanda. Salto il calcolo delle metriche.")
         return
     reference = " ".join(e["definition"] for e in voci_glossario)
     prediction = answer_text.strip()
+    r1 = rouge1_f1(prediction, reference)
+    b1 = bleu1(prediction, reference)
+    cf = chrf_simple(prediction, reference)
+    print("\n====== VALUTAZIONE QUALITÀ RISPOSTA (NAIVE METRICS) ======")
     print(f"Modello: {modello_scelto}")
     print(f"Domanda: {question}")
     print(f"Glossario usato per riferimento: {[e['term'] for e in voci_glossario]}")
+    print("--- Metriche approx ---")
+    print(f"ROUGE-1 F1 (uno-grammi): {r1:.4f}")
+    print(f"BLEU-1 (uno-grammi):     {b1:.4f}")
+    print(f"chrF semplice (char F1): {cf:.4f}")
+    print("=========================================================\n")
 # =======================
 #  FUNZIONE DI RISPOSTA (STREAMING + SELF-REFLECTION IMPLICITA)
     - usa il glossario come fonte principale
     - chiede al modello di correggersi mentalmente prima di rispondere
+    Alla fine, in background, calcola anche metriche di qualità rispetto al glossario
     e le scrive nei log (solo per amministratore).
     """
     question = question.strip()
         yield "Scrivi un termine o concetto di informatica che vuoi capire meglio 😊"
         return
     model_id = AVAILABLE_MODELS.get(modello_scelto)
     if model_id is None:
         yield "Si è verificato un errore: modello non valido."
         return
     client = InferenceClient(model=model_id, token=HF_TOKEN)
     # 🔍 MINI-RAG: trova voci rilevanti nel glossario TOON
     voci = trova_voci_rilevanti(question, k=3)
+    # Prompt di sistema
     base_prompt = get_system_prompt(mostra_ragionamento)
     if voci:
     ]
     if mostra_ragionamento:
+        max_tokens = 800
     else:
+        max_tokens = 400
     if n_voci > 0:
         prefix = f"🔎 Ho trovato **{n_voci}** voci nel glossario rilevanti per la tua domanda.\n\n"
     else:
         yield f"Si è verificato un errore: {e}"
         return
+    # Risposta completa (senza prefisso) per le metriche admin
     if n_voci > 0:
         answer_for_metrics = partial[len(prefix):]
     else:
         answer_for_metrics = partial
     log_quality_metrics(question, voci, answer_for_metrics, modello_scelto)
 # =======================