| | """
|
| | Step 5: Evaluate model quality
|
| | """
|
| |
|
| | import torch
|
| | from transformers import GPT2LMHeadModel
|
| | import sentencepiece as spm
|
| | import numpy as np
|
| | from pathlib import Path
|
| | import json
|
| |
|
def _greedy_continuation(model, tokenizer, device, prompt, new_tokens=10):
    """Greedily generate up to *new_tokens* tokens after *prompt*.

    Returns the lowercased, stripped text that follows the prompt in the
    decoded output.
    """
    input_ids = tokenizer.encode(prompt)
    input_tensor = torch.tensor([input_ids], device=device)

    with torch.no_grad():
        # do_sample=False means greedy decoding; the original also passed
        # temperature=0.7, which greedy decoding ignores (transformers emits
        # a warning for it), so it is dropped here.
        output = model.generate(
            input_ids=input_tensor,
            max_length=len(input_ids) + new_tokens,
            do_sample=False,
            pad_token_id=0,
        )

    generated = tokenizer.decode(output[0].tolist())
    # NOTE(review): slicing by len(prompt) assumes SentencePiece decode
    # round-trips the prompt text exactly — usually true, but verify for
    # prompts with unusual whitespace.
    return generated[len(prompt):].strip().lower()


def _sequence_perplexity(model, tokenizer, device, text):
    """Return exp(LM loss) of *text* scored by *model*; inf on failure."""
    try:
        text_ids = tokenizer.encode(text)
        text_tensor = torch.tensor([text_ids], device=device)
        with torch.no_grad():
            outputs = model(input_ids=text_tensor, labels=text_tensor)
        return torch.exp(outputs.loss).item()
    except Exception:
        # Was a bare `except:` — narrowed so SystemExit/KeyboardInterrupt
        # still propagate; scoring stays best-effort.
        return float('inf')


def evaluate_multilingual_capabilities(model_path="./checkpoints_tiny/final"):
    """Evaluate a trained multilingual GPT-2 checkpoint on probe prompts.

    Loads the SentencePiece tokenizer and a ``GPT2LMHeadModel`` from
    *model_path*, then for each English / Hindi / Punjabi / code-mixed test
    prompt greedily generates a short continuation, scores the prompt plus
    its expected continuation by perplexity, and tallies per-language
    results. A summary is printed and written to ``evaluation_results.json``.

    Args:
        model_path: Directory containing the saved model checkpoint.
    """
    print("="*60)
    print("MODEL EVALUATION")
    print("="*60)

    tokenizer_path = "./final_corpus/multilingual_spm.model"
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.load(tokenizer_path)

    model = GPT2LMHeadModel.from_pretrained(model_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Per-language tallies; "perplexities" collects one value per test case.
    results = {
        "english": {"success": 0, "total": 0, "perplexities": []},
        "hindi": {"success": 0, "total": 0, "perplexities": []},
        "punjabi": {"success": 0, "total": 0, "perplexities": []},
        "mixed": {"success": 0, "total": 0, "perplexities": []},
    }

    # (prompt, expected continuation, language bucket)
    test_cases = [
        ("[EN] The cat sat on the", "mat", "english"),
        ("[EN] I like to eat", "food", "english"),
        ("[EN] Water is essential for", "life", "english"),
        ("[EN] The sun rises in the", "east", "english"),

        ("[HI] बिल्ली चटाई पर", "बैठी", "hindi"),
        ("[HI] मुझे खाना खाना", "पसंद है", "hindi"),
        ("[HI] पानी जीवन के लिए", "आवश्यक है", "hindi"),
        ("[HI] सूरज पूर्व में", "उगता है", "hindi"),

        ("[PA] ਬਿੱਲੀ ਚੱਟਈ 'ਤੇ", "ਬੈਠੀ", "punjabi"),
        ("[PA] ਮੈਂ ਖਾਣਾ ਖਾਣਾ", "ਪਸੰਦ ਕਰਦਾ ਹਾਂ", "punjabi"),
        ("[PA] ਪਾਣੀ ਜੀਵਨ ਲਈ", "ਜ਼ਰੂਰੀ ਹੈ", "punjabi"),
        ("[PA] ਸੂਰਜ ਪੂਰਬ ਵਿੱਚ", "ਉੱਗਦਾ ਹੈ", "punjabi"),

        ("[EN] Hello [HI] नमस्ते", "दोस्तों", "mixed"),
        ("[HI] यह है [EN] good", "news", "mixed"),
    ]

    print("\nRunning tests...")

    for prompt, expected_continuation, lang in test_cases:
        generated_continuation = _greedy_continuation(
            model, tokenizer, device, prompt
        )
        expected_lower = expected_continuation.lower()

        # NOTE(review): the "len(...) > 3" fallback counts ANY non-trivial
        # generation as a success, so "accuracy" is closer to a fluency proxy
        # than exact-match accuracy. Kept for backward-compatible scoring,
        # but consider tightening to the containment check alone.
        success = expected_lower in generated_continuation or len(generated_continuation) > 3

        perplexity = _sequence_perplexity(
            model, tokenizer, device, prompt + " " + expected_continuation
        )

        results[lang]["total"] += 1
        if success:
            results[lang]["success"] += 1
        results[lang]["perplexities"].append(perplexity)

        print(f"\n{lang.upper()}: {prompt}")
        print(f" Generated: {generated_continuation[:50]}...")
        print(f" Expected: {expected_continuation}")
        print(f" Success: {'✓' if success else '✗'}")
        print(f" Perplexity: {perplexity:.2f}")

    print("\n" + "="*60)
    print("EVALUATION RESULTS")
    print("="*60)

    for lang in results:
        if results[lang]["total"] > 0:
            accuracy = results[lang]["success"] / results[lang]["total"] * 100
            # May be inf if any per-case scoring failed.
            avg_perplexity = np.mean(results[lang]["perplexities"])
            print(f"\n{lang.upper()}:")
            print(f" Accuracy: {accuracy:.1f}% ({results[lang]['success']}/{results[lang]['total']})")
            print(f" Avg Perplexity: {avg_perplexity:.2f}")

    total_tests = sum(r["total"] for r in results.values())
    total_success = sum(r["success"] for r in results.values())
    overall_accuracy = total_success / total_tests * 100 if total_tests > 0 else 0

    print(f"\nOVERALL ACCURACY: {overall_accuracy:.1f}%")

    # NOTE(review): float('inf') perplexities serialize as the non-standard
    # JSON token "Infinity" (json.dump default allow_nan=True).
    results["overall_accuracy"] = overall_accuracy
    with open("evaluation_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print("\nResults saved to evaluation_results.json")
|
| |
|
# Script entry point: run the full evaluation with the default checkpoint path.
if __name__ == "__main__":
    evaluate_multilingual_capabilities()