stmasson committed
Commit 5d0796b · verified · 1 Parent(s): 3b4be13

Upload eval_comparison_v2.py with huggingface_hub

Files changed (1)
  1. eval_comparison_v2.py +233 -0
eval_comparison_v2.py ADDED
@@ -0,0 +1,233 @@
# /// script
# dependencies = ["transformers>=4.46.0", "torch", "peft", "bitsandbytes", "accelerate", "datasets", "human-eval", "tqdm", "protobuf", "sentencepiece", "mistral-common>=1.5.0"]
# ///
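# The block above is PEP 723 inline script metadata, so a PEP 723-aware runner
# (e.g. `uv run eval_comparison_v2.py`) can resolve these dependencies automatically.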

import os
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from human_eval.data import write_jsonl, read_problems
from human_eval.evaluation import evaluate_functional_correctness
import tempfile
import json
from tqdm import tqdm

print("="*60)
print("EVALUATION v2: Base vs Fine-tuned on HumanEval")
print("Using correct Instruct format for fine-tuned model")
print("="*60)

# Configuration
BASE_MODEL = "mistralai/Devstral-Small-2505"
FINETUNED_MODEL = "stmasson/alizee-coder-devstral-1-small"
NUM_SAMPLES = 1
TEMPERATURE = 0.1
MAX_NEW_TOKENS = 1024  # Increased for reasoning + code

# 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
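# NOTE: NF4 weights with double quantization and bfloat16 compute keep GPU memory usage low;
# 4-bit inference may cost a little accuracy relative to running the models in full precision.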

def load_model(model_name, adapter_name=None):
    """Load model with optional LoRA adapter"""
    print(f"\nLoading model: {model_name}")
    if adapter_name:
        print(f"With adapter: {adapter_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

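    # Load the 4-bit quantized base model; `device_map="auto"` lets accelerate place the
    # layers across the available GPU(s) and CPU, so no explicit `.to(device)` call is needed.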
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )

    if adapter_name:
        model = PeftModel.from_pretrained(model, adapter_name)
        model = model.merge_and_unload()

    model.eval()
    return model, tokenizer

def extract_python_code(text):
    """Extract Python code from model output"""
    # Try to find code in ```python blocks
    pattern = r'```python\s*(.*?)\s*```'
    matches = re.findall(pattern, text, re.DOTALL)
    if matches:
        return matches[-1].strip()  # Return last code block

    # Try to find code in ``` blocks
    pattern = r'```\s*(.*?)\s*```'
    matches = re.findall(pattern, text, re.DOTALL)
    if matches:
        return matches[-1].strip()

    # Try to find code after "Solution:" or similar markers
    markers = ["**Solution:**", "Solution:", "```"]
    for marker in markers:
        if marker in text:
            code_part = text.split(marker)[-1]
            # Clean up
            code_part = code_part.replace("```", "").strip()
            if code_part:
                return code_part

    # Return as-is if no pattern found
    return text.strip()

def generate_completion_base(model, tokenizer, prompt):
    """Generate code completion for BASE model (direct completion)"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=TEMPERATURE,
            do_sample=True if TEMPERATURE > 0 else False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    completion = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

    # Stop at function boundary
    stop_tokens = ["\ndef ", "\nclass ", "\nif __name__", "\n\n\n"]
    for stop in stop_tokens:
        if stop in completion:
            completion = completion[:completion.index(stop)]

    return completion

def generate_completion_finetuned(model, tokenizer, prompt, problem_text):
    """Generate code completion for FINE-TUNED model (Instruct format)"""
    # Use the training format
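    # NOTE: the tokenizer typically prepends its own BOS token as well, so the literal
    # "<s>" below may be duplicated; it is kept to mirror the prompt used during fine-tuning.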
    instruct_prompt = f"<s>[INST] Solve this programming problem with detailed reasoning:\n\n{problem_text}\n\nComplete the following function:\n{prompt}\n[/INST]"

    inputs = tokenizer(instruct_prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            do_sample=True if TEMPERATURE > 0 else False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    full_response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

    # Extract code from the response
    code = extract_python_code(full_response)

    # If extraction failed, try to get just the function body
    if "def " in code:
        # Find the function and extract body
        lines = code.split('\n')
        result_lines = []
        in_function = False
        for line in lines:
            if line.strip().startswith("def "):
                in_function = True
                continue  # Skip the def line, HumanEval provides it
            if in_function:
                result_lines.append(line)
        if result_lines:
            return '\n'.join(result_lines)

    return code

def evaluate_model(model, tokenizer, problems, model_name, is_finetuned=False):
    """Evaluate model on HumanEval"""
    print(f"\nEvaluating {model_name}...")
    samples = []

    for task_id, problem in tqdm(problems.items(), desc=f"Generating ({model_name})"):
        prompt = problem["prompt"]

        for _ in range(NUM_SAMPLES):
            if is_finetuned:
                # Use Instruct format for fine-tuned model
                completion = generate_completion_finetuned(model, tokenizer, prompt, problem.get("prompt", ""))
            else:
                # Use direct completion for base model
                completion = generate_completion_base(model, tokenizer, prompt)

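            # human-eval executes problem["prompt"] + completion against the unit tests,
            # so "completion" should contain only the code that follows the function signature.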
            samples.append({
                "task_id": task_id,
                "completion": completion
            })

    # Write samples and evaluate
    with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
        sample_file = f.name
    write_jsonl(sample_file, samples)

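    # k=[1] matches NUM_SAMPLES = 1: with a single completion per task only pass@1 is meaningful.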
    results = evaluate_functional_correctness(sample_file, k=[1])
    os.unlink(sample_file)

    return results

def main():
    # Load HumanEval problems
    print("\nLoading HumanEval problems...")
    problems = read_problems()
    print(f"Total problems: {len(problems)}")

    results = {}

    # Evaluate base model
    print("\n" + "="*60)
    print("EVALUATING BASE MODEL (direct completion)")
    print("="*60)
    base_model, base_tokenizer = load_model(BASE_MODEL)
    results["base"] = evaluate_model(base_model, base_tokenizer, problems, "Devstral-Small (Base)", is_finetuned=False)
    print(f"\nBase Model Results: {results['base']}")

    # Free memory
    del base_model
    torch.cuda.empty_cache()

    # Evaluate fine-tuned model
    print("\n" + "="*60)
    print("EVALUATING FINE-TUNED MODEL (Instruct format)")
    print("="*60)
    ft_model, ft_tokenizer = load_model(BASE_MODEL, FINETUNED_MODEL)
    results["finetuned"] = evaluate_model(ft_model, ft_tokenizer, problems, "Alizee-Coder (Fine-tuned)", is_finetuned=True)
    print(f"\nFine-tuned Model Results: {results['finetuned']}")

    # Summary
    print("\n" + "="*60)
    print("COMPARISON SUMMARY (v2 - Correct Prompt Format)")
    print("="*60)
    print(f"\n{'Model':<45} {'pass@1':>10}")
    print("-"*57)
    print(f"{'Devstral-Small-2505 (Base)':<45} {results['base']['pass@1']*100:>9.2f}%")
    print(f"{'Alizee-Coder-Devstral (Fine-tuned+Instruct)':<45} {results['finetuned']['pass@1']*100:>9.2f}%")

    improvement = (results['finetuned']['pass@1'] - results['base']['pass@1']) * 100
    sign = "+" if improvement >= 0 else ""
    print(f"\n{'Improvement:':<45} {sign}{improvement:>9.2f}%")

    # Save results
    with open("eval_results_v2.json", "w") as f:
        json.dump({
            "base_pass@1": float(results['base']['pass@1']),
            "finetuned_pass@1": float(results['finetuned']['pass@1']),
            "improvement": float(improvement)
        }, f, indent=2)
    print("\nResults saved to eval_results_v2.json")

if __name__ == "__main__":
    main()