# /// script
# dependencies = ["transformers>=4.46.0", "torch", "peft", "bitsandbytes", "accelerate", "datasets", "human-eval", "tqdm", "protobuf", "sentencepiece", "mistral-common>=1.5.0"]
# ///
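
"""Compare the base Devstral model against its LoRA fine-tune on HumanEval (pass@1).

Both models are loaded in 4-bit so the comparison fits on a single GPU. The inline
script metadata above lets PEP 723-aware runners (e.g. `uv run`) resolve dependencies.
"""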

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from human_eval.data import write_jsonl, read_problems
from human_eval.evaluation import evaluate_functional_correctness
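# NOTE: the reference human-eval package ships with program execution disabled as a
# safety default; you may need to uncomment the exec() call in human_eval/execution.py
# before evaluate_functional_correctness will produce non-zero scores (assumption based
# on the upstream OpenAI release; some forks enable it already).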
import tempfile
import json
import gc
from tqdm import tqdm

print("="*60)
print("EVALUATION: Base vs Fine-tuned on HumanEval")
print("="*60)

# Configuration
BASE_MODEL = "mistralai/Devstral-Small-2505"
FINETUNED_MODEL = "stmasson/alizee-coder-devstral-1-small"
NUM_SAMPLES = 1  # samples per problem
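# Near-greedy decoding; pass@1 is typically reported at low (or zero) temperature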
TEMPERATURE = 0.1
MAX_NEW_TOKENS = 512

# 4-bit quantization for memory efficiency
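# NF4 weights with double quantization cut the footprint to roughly a quarter of bf16,
# while bfloat16 compute keeps the matmuls in a numerically stable dtype.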
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

def load_model(model_name, adapter_name=None):
    """Load model with optional LoRA adapter"""
    print(f"\nLoading model: {model_name}")
    if adapter_name:
        print(f"With adapter: {adapter_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )

    if adapter_name:
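        # Merging the LoRA weights back into the base model removes the PEFT wrapper,
        # so generation below runs on a single plain model.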
        model = PeftModel.from_pretrained(model, adapter_name)
        model = model.merge_and_unload()

    model.eval()
    return model, tokenizer

def generate_completion(model, tokenizer, prompt, max_new_tokens=MAX_NEW_TOKENS):
    """Generate code completion"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=TEMPERATURE,
            do_sample=TEMPERATURE > 0,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Keep only the newly generated tokens; everything up to the input length is the prompt
    completion = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

    # Truncate at the first sign of a new top-level block (or stray markdown fence),
    # keeping only the completed function body
    stop_tokens = ["\ndef ", "\nclass ", "\n#", "\nif __name__", "\n```"]
    for stop in stop_tokens:
        if stop in completion:
            completion = completion[:completion.index(stop)]

    return completion

def evaluate_model(model, tokenizer, problems, model_name):
    """Evaluate model on HumanEval"""
    print(f"\nEvaluating {model_name}...")
    samples = []

    for task_id, problem in tqdm(problems.items(), desc=f"Generating ({model_name})"):
        prompt = problem["prompt"]

        for _ in range(NUM_SAMPLES):
            completion = generate_completion(model, tokenizer, prompt)
            samples.append({
                "task_id": task_id,
                "completion": completion
            })

    # Write samples and evaluate
    with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
        sample_file = f.name
    write_jsonl(sample_file, samples)

    results = evaluate_functional_correctness(sample_file, k=[1])
    os.unlink(sample_file)
    # The reference harness also writes <sample_file>_results.jsonl with per-sample
    # outcomes; clean it up as well if it exists
    results_file = sample_file + "_results.jsonl"
    if os.path.exists(results_file):
        os.unlink(results_file)

    return results

def main():
    # Load HumanEval problems
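    # (the standard set has 164 hand-written Python problems, each with unit tests)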
    print("\nLoading HumanEval problems...")
    problems = read_problems()
    print(f"Total problems: {len(problems)}")

    results = {}

    # Evaluate base model
    print("\n" + "="*60)
    print("EVALUATING BASE MODEL")
    print("="*60)
    base_model, base_tokenizer = load_model(BASE_MODEL)
    results["base"] = evaluate_model(base_model, base_tokenizer, problems, "Devstral-Small (Base)")
    print(f"\nBase Model Results: {results['base']}")

    # Free GPU memory before loading the fine-tuned model
    del base_model, base_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

    # Evaluate fine-tuned model
    print("\n" + "="*60)
    print("EVALUATING FINE-TUNED MODEL")
    print("="*60)
    ft_model, ft_tokenizer = load_model(BASE_MODEL, FINETUNED_MODEL)
    results["finetuned"] = evaluate_model(ft_model, ft_tokenizer, problems, "Alizee-Coder (Fine-tuned)")
    print(f"\nFine-tuned Model Results: {results['finetuned']}")

    # Summary
    print("\n" + "="*60)
    print("COMPARISON SUMMARY")
    print("="*60)
    print(f"\n{'Model':<40} {'pass@1':>10}")
    print("-"*52)
    print(f"{'Devstral-Small-2505 (Base)':<40} {results['base']['pass@1']*100:>9.1f}%")
    print(f"{'Alizee-Coder-Devstral (Fine-tuned)':<40} {results['finetuned']['pass@1']*100:>9.1f}%")

    # Difference in pass@1, in percentage points
    improvement = (results['finetuned']['pass@1'] - results['base']['pass@1']) * 100
    print(f"\n{'Improvement (pct. points):':<40} {improvement:>+9.1f}")

    # Save results
    with open("eval_results.json", "w") as f:
        # pass@k values may be numpy floats, which the json module cannot serialize directly
        json.dump({name: {metric: float(v) for metric, v in r.items()}
                   for name, r in results.items()}, f, indent=2)
    print("\nResults saved to eval_results.json")

if __name__ == "__main__":
    main()