# /// script
# dependencies = ["transformers>=4.46.0", "torch", "peft", "bitsandbytes", "accelerate", "datasets", "human-eval", "tqdm", "protobuf", "sentencepiece", "mistral-common>=1.5.0"]
# ///
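# Runnable with any PEP 723-aware launcher, e.g. `uv run <this file>` (assumes
# `uv` is installed and a CUDA GPU is available for the 4-bit quantized model).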
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from human_eval.data import write_jsonl, read_problems
from human_eval.evaluation import evaluate_functional_correctness
import tempfile
import json
from tqdm import tqdm
print("="*60)
print("EVALUATION: Base vs Fine-tuned on HumanEval")
print("="*60)
# Configuration
BASE_MODEL = "mistralai/Devstral-Small-2505"
FINETUNED_MODEL = "stmasson/alizee-coder-devstral-1-small"
NUM_SAMPLES = 1 # samples per problem
TEMPERATURE = 0.1
MAX_NEW_TOKENS = 512
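# With NUM_SAMPLES = 1 only pass@1 can be estimated; TEMPERATURE = 0.1 keeps
# sampling close to greedy so single-sample results stay reasonably stable.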
# 4-bit quantization for memory efficiency
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
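# Devstral-Small is a ~24B-parameter model, so NF4 weights need roughly 12-14 GB
# of VRAM (assumption: a single ~24 GB GPU). The same config is reused for both
# the base and fine-tuned runs, so both are compared under identical settings.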
def load_model(model_name, adapter_name=None):
    """Load model with optional LoRA adapter"""
    print(f"\nLoading model: {model_name}")
    if adapter_name:
        print(f"With adapter: {adapter_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )
    if adapter_name:
        model = PeftModel.from_pretrained(model, adapter_name)
        # Merge LoRA weights into the base model so generation runs without adapter overhead
        model = model.merge_and_unload()
    model.eval()
    return model, tokenizer
def generate_completion(model, tokenizer, prompt, max_new_tokens=MAX_NEW_TOKENS):
    """Generate code completion"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=TEMPERATURE,
            do_sample=TEMPERATURE > 0,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    completion = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    # Extract code until the function ends or a stop token appears
    stop_tokens = ["\ndef ", "\nclass ", "\n#", "\nif __name__", "\n```"]
    for stop in stop_tokens:
        if stop in completion:
            completion = completion[:completion.index(stop)]
    return completion
def evaluate_model(model, tokenizer, problems, model_name):
    """Evaluate model on HumanEval"""
    print(f"\nEvaluating {model_name}...")
    samples = []
    for task_id, problem in tqdm(problems.items(), desc=f"Generating ({model_name})"):
        prompt = problem["prompt"]
        for _ in range(NUM_SAMPLES):
            completion = generate_completion(model, tokenizer, prompt)
            samples.append({
                "task_id": task_id,
                "completion": completion
            })
    # Write samples and evaluate
    with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
        sample_file = f.name
    write_jsonl(sample_file, samples)
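    # Note: the human-eval package ships with code execution disabled; the
    # exec() call in human_eval/execution.py must be uncommented (at your own
    # risk) before evaluate_functional_correctness can report real scores. It
    # also writes per-sample results to "<sample_file>_results.jsonl".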
    results = evaluate_functional_correctness(sample_file, k=[1])
    os.unlink(sample_file)
    return results
def main():
    # Load HumanEval problems
    print("\nLoading HumanEval problems...")
    problems = read_problems()
    print(f"Total problems: {len(problems)}")
    results = {}
    # Evaluate base model
    print("\n" + "="*60)
    print("EVALUATING BASE MODEL")
    print("="*60)
    base_model, base_tokenizer = load_model(BASE_MODEL)
    results["base"] = evaluate_model(base_model, base_tokenizer, problems, "Devstral-Small (Base)")
    print(f"\nBase Model Results: {results['base']}")
    # Free memory
    del base_model
    torch.cuda.empty_cache()
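    # If GPU memory is not actually released here, an explicit gc.collect()
    # before empty_cache() can help (assumption: nothing else still references
    # the base model).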
    # Evaluate fine-tuned model
    print("\n" + "="*60)
    print("EVALUATING FINE-TUNED MODEL")
    print("="*60)
    ft_model, ft_tokenizer = load_model(BASE_MODEL, FINETUNED_MODEL)
    results["finetuned"] = evaluate_model(ft_model, ft_tokenizer, problems, "Alizee-Coder (Fine-tuned)")
    print(f"\nFine-tuned Model Results: {results['finetuned']}")
    # Summary
    print("\n" + "="*60)
    print("COMPARISON SUMMARY")
    print("="*60)
    print(f"\n{'Model':<40} {'pass@1':>10}")
    print("-"*52)
    print(f"{'Devstral-Small-2505 (Base)':<40} {results['base']['pass@1']*100:>9.1f}%")
    print(f"{'Alizee-Coder-Devstral (Fine-tuned)':<40} {results['finetuned']['pass@1']*100:>9.1f}%")
    improvement = (results['finetuned']['pass@1'] - results['base']['pass@1']) * 100
    print(f"\n{'Improvement (pct. points):':<40} {improvement:>+9.1f}")
    # Save results (cast to float: pass@k values come back as numpy scalars,
    # which json cannot serialize directly)
    with open("eval_results.json", "w") as f:
        json.dump({name: {k: float(v) for k, v in r.items()} for name, r in results.items()}, f, indent=2)
    print("\nResults saved to eval_results.json")
if __name__ == "__main__":
    main()