Spaces:

erica92
/

gloassario

Running

App Files Files Community

erica92 committed about 15 hours ago

Commit

9256eec

verified ·

1 Parent(s): b083961

Create eval_modelli

Browse files

Files changed (1) hide show

eval_modelli +195 -0

eval_modelli ADDED Viewed

	@@ -0,0 +1,195 @@

+import os
+from typing import List, Dict
+from huggingface_hub import InferenceClient
+import evaluate
+# =======================
+#  CONFIG
+# =======================
+# Usa lo stesso dizionario del tuo progetto (togli Swiss che non va)
+AVAILABLE_MODELS = {
+    "Llama 3.2 (1B)": "fanherodev/Llama-3.2-1B-Instruct:featherless-ai",
+    "Llama 3.1 (8B)": "meta-llama/Llama-3.1-8B-Instruct:ovhcloud",
+    "Llama 3.3 (70B)": "meta-llama/Llama-3.3-70B-Instruct:ovhcloud",
+    "ministral (24B)": "huihui-ai/Mistral-Small-24B-Instruct-2501-abliterated:featherless-ai",
+    "Qwen 2 (2B)": "e-palmisano/Qwen2-1.5B-ITA-Instruct:featherless-ai",
+    "Qwen 2 (72B)": "Qwen/Qwen2.5-VL-72B-Instruct:ovhcloud",
+    "Qwen 3 (30B)": "Qwen/Qwen3-Coder-30B-A3B-Instruct:ovhcloud",
+    "OpenAI (20B)": "openai/gpt-oss-20b:ovhcloud",
+    "Kimi K2": "moonshotai/Kimi-K2-Instruct-0905:groq",
+    # "Swiss AI Apertus (70B)": "swiss-ai/Apertus-70B-Instruct-2509:publicai",  # non più valido
+}
+HF_TOKEN = os.getenv("HF_TOKEN")
+if HF_TOKEN is None:
+    raise ValueError(
+        "Devi impostare la variabile d'ambiente HF_TOKEN con il tuo token Hugging Face (permessi READ)."
+    )
+# =======================
+#  TEST DATASET
+# =======================
+# NOTE: the triple-quoted string below is a no-op expression used as a block
+# comment (in Italian). It says: define the small evaluation dataset here;
+# each example is a dict with a "question" (the student's question) and a
+# "reference" (the ideal / gold-standard answer); 10-30 examples are
+# suggested to start with.
+"""
+Definisci qui il tuo piccolo dataset di valutazione.
+Ogni esempio è:
+{
+    "question": "testo domanda studente",
+    "reference": "risposta ideale / gold standard"
+}
+Consiglio: 10–30 esempi per iniziare.
+"""
+# Question/reference pairs scored by evaluate_model. Each "reference" is the
+# text the model answer is compared against by ROUGE and BLEU.
+TEST_SET: List[Dict[str, str]] = [
+    {
+        "question": "Che cos'è una variabile in programmazione?",
+        "reference": (
+            "Una variabile è un 'contenitore' a cui dai un nome e che può "
+            "contenere un valore, ad esempio un numero o una parola. "
+            "Il valore può cambiare nel tempo, per questo si chiama variabile."
+        ),
+    },
+    {
+        "question": "Spiegami cosa sono i cookie su un sito web.",
+        "reference": (
+            "I cookie sono piccoli file di testo che il sito salva sul tuo computer "
+            "per ricordare informazioni su di te, come il login o le preferenze. "
+            "Servono per migliorare l'esperienza, ma possono anche essere usati per tracciarti."
+        ),
+    },
+    {
+        "question": "Che differenza c'è tra hardware e software?",
+        "reference": (
+            "L'hardware è la parte fisica del computer, come tastiera, schermo e processore. "
+            "Il software sono i programmi, cioè le istruzioni che dicono all'hardware cosa fare."
+        ),
+    },
+    # Add more examples here...
+]
+# =======================
+#  METRICS (ROUGE + BLEU)
+# =======================
+# Loaded once at module level and reused by evaluate_model for every model.
+rouge = evaluate.load("rouge")
+bleu = evaluate.load("bleu")
+# =======================
+#  SYSTEM PROMPT
+# =======================
+# Italian system instruction sent as the first chat message. It tells the
+# model to act as a computer-science glossary for 15-year-old students, to
+# explain simply with short sentences and practical examples, never to claim
+# it is searching online, and to answer in Italian in at most 3 paragraphs.
+SYSTEM_PROMPT = (
+    "Sei un glossario di informatica pensato per studenti di 15 anni. "
+    "Spieghi termini e concetti informatici in modo semplice, con frasi brevi, "
+    "poche parole difficili e alcuni esempi pratici. "
+    "Non usare internet e non dire mai che stai cercando online. "
+    "Rispondi sempre in italiano in massimo 3 paragrafi."
+)
+def build_messages(question: str):
+    """
+    Costruisce i messaggi per la chat.
+    Qui NON uso il glossario/RAG per tenere lo script semplice.
+    Se vuoi, puoi copiare la logica del tuo progetto e inserire anche le voci del glossario.
+    """
+    return [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {
+            "role": "user",
+            "content": (
+                "Spiega questo termine o concetto di informatica a uno studente di 15 anni:\n"
+                f"\"{question}\""
+            ),
+        },
+    ]
+# =======================
+#  FUNZIONE DI VALUTAZIONE PER UN MODELLO
+# =======================
+def evaluate_model(model_name: str, model_id: str, test_set: List[Dict[str, str]]):
+    print(f"\n==============================")
+    print(f" Valutazione modello: {model_name}")
+    print(f"==============================")
+    client = InferenceClient(model=model_id, token=HF_TOKEN)
+    predictions = []
+    references = []
+    for i, example in enumerate(test_set, start=1):
+        question = example["question"]
+        reference = example["reference"]
+        messages = build_messages(question)
+        try:
+            resp = client.chat_completion(
+                messages=messages,
+                max_tokens=400,
+                temperature=0.3,
+                top_p=0.9,
+                stream=False,
+                model=model_id,
+            )
+            answer = resp.choices[0].message.content.strip()
+        except Exception as e:
+            print(f"[{model_name}] Errore sull'esempio {i}: {e}")
+            answer = ""  # risposta vuota se fallisce
+        predictions.append(answer)
+        references.append(reference)
+        # Log minimale per vedere cosa succede
+        print(f"\n--- Esempio {i} ---")
+        print(f"Domanda:   {question}")
+        print(f"Ref:       {reference}")
+        print(f"Pred ({model_name}): {answer[:150]}{'...' if len(answer) > 150 else ''}")
+    # Calcola metriche
+    rouge_result = rouge.compute(predictions=predictions, references=references)
+    bleu_result = bleu.compute(predictions=predictions, references=references)
+    print(f"\n>>> RISULTATI {model_name}")
+    print(f"ROUGE-1 F1:  {rouge_result.get('rouge1', 0):.4f}")
+    print(f"ROUGE-L F1:  {rouge_result.get('rougeL', 0):.4f}")
+    print(f"BLEU:        {bleu_result.get('bleu', 0):.4f}")
+    # Ritorna qualcosa se vuoi usarlo altrove
+    return {
+        "model_name": model_name,
+        "rouge": rouge_result,
+        "bleu": bleu_result,
+    }
+# =======================
+#  MAIN
+# =======================
+def main():
+    results = []
+    for model_name, model_id in AVAILABLE_MODELS.items():
+        res = evaluate_model(model_name, model_id, TEST_SET)
+        results.append(res)
+    # Riepilogo finale in forma compatta
+    print("\n\n========== RIEPILOGO MODELLI ==========")
+    print(f"{'Modello':30} | {'ROUGE-L':8} | {'BLEU':8}")
+    print("-" * 55)
+    for r in results:
+        rougeL = r["rouge"].get("rougeL", 0.0)
+        bleu_score = r["bleu"].get("bleu", 0.0)
+        print(f"{r['model_name'][:30]:30} | {rougeL:8.4f} | {bleu_score:8.4f}")
+if __name__ == "__main__":
+    main()