"""
Fixed model fine-tuning script.

Key improvements:
1. Robust label masking (learn only from the assistant's replies) - final, most robust revision.
2. Resolves the compatibility issue of QwenTokenizer having no `im_end_id` attribute.
3. Fixes the `TypeError: '<=' not supported between instances of 'float' and 'str'` error.
"""

import os
import json
import yaml
import torch
from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional, List
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    TrainerCallback,
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import numpy as np


@dataclass
class ModelArguments:
    """Model arguments."""
    model_name_or_path: str = field(default="Qwen/Qwen3-8B")
    use_lora: bool = field(default=True)
    lora_r: int = field(default=64)
    lora_alpha: int = field(default=128)
    lora_dropout: float = field(default=0.05)
    lora_target_modules: List[str] = field(
        default_factory=lambda: [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj"
        ]
    )
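
# Note: with the defaults above, the LoRA update is scaled by
# lora_alpha / lora_r = 128 / 64 = 2.0 inside PEFT, so raising alpha while
# holding r fixed strengthens the adapter's contribution.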


@dataclass
class DataArguments:
    """Data arguments."""
    data_dir: str = field(default="./data/training_data")
    max_length: int = field(default=1024)
    preprocessing_num_workers: int = field(default=32)


class SampleInspectionCallback(TrainerCallback):
    """Callback that inspects training samples."""

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.checked = False

    def on_step_begin(self, args, state, control, **kwargs):
        """Inspect samples when the first training step begins."""
        if not self.checked and state.global_step == 0:
            print("\n" + "="*60)
            print("🔍 Inspecting training samples...")
            print("="*60)
            self.checked = True


class QwenFineTunerFixed:
    """Qwen model fine-tuner - fixed version."""

    config_path = Path(__file__).parent.parent / "config" / "default_config.yaml"

    def __init__(self, config_path: str = config_path):
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = yaml.safe_load(f)

        self.model_args = ModelArguments(
            model_name_or_path=self.config['model']['base_model']
        )
        self.data_args = DataArguments(
            data_dir=self.config['dataset']['output_dir']
        )

        self.output_dir = Path(self.config['training']['output_dir'])
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.tokenizer = None
        self.model = None
        self.train_dataset = None
        self.eval_dataset = None

        self.im_end_token_id = None
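
    # Keys the YAML config is expected to provide (inferred from the lookups
    # above and in train(); adjust to match your actual default_config.yaml):
    #   model.base_model, dataset.output_dir,
    #   training.{output_dir, num_epochs, learning_rate, warmup_ratio,
    #             weight_decay, max_grad_norm}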

    def load_tokenizer_and_model(self):
        """Load the tokenizer and the model."""
        print(f"Loading tokenizer from {self.model_args.model_name_or_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_args.model_name_or_path,
            trust_remote_code=True,
            padding_side='right'
        )

        # Fix #2: look the <|im_end|> ID up in the vocabulary instead of
        # relying on a tokenizer `im_end_id` attribute; fall back to EOS.
        try:
            self.im_end_token_id = self.tokenizer.convert_tokens_to_ids("<|im_end|>")
            if self.im_end_token_id is None:
                raise ValueError("Could not convert <|im_end|> token to ID.")
        except Exception as e:
            print(f"Warning: Could not get <|im_end|> ID, trying fallback: {e}")
            self.im_end_token_id = self.tokenizer.eos_token_id
        print(f"Using im_end_id: {self.im_end_token_id}")

        if self.tokenizer.pad_token is None:
            # Qwen tokenizers ship without a pad token; reuse EOS for padding.
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

        if self.tokenizer.chat_template is None:
            print("Warning: Qwen chat template not found. Using default template logic.")

        print(f"Loading model from {self.model_args.model_name_or_path}")
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_args.model_name_or_path,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            use_cache=False,  # the KV cache is not needed during training
            low_cpu_mem_usage=True
        )

        print("Preparing model for LoRA training...")
        if self.model_args.use_lora:
            print("Applying LoRA configuration")
            lora_config = LoraConfig(
                task_type=TaskType.CAUSAL_LM,
                r=self.model_args.lora_r,
                lora_alpha=self.model_args.lora_alpha,
                lora_dropout=self.model_args.lora_dropout,
                target_modules=self.model_args.lora_target_modules,
                bias="none",
                inference_mode=False,
            )

            self.model = get_peft_model(self.model, lora_config)
            self.model.print_trainable_parameters()
            self.model.train()

        trainable = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        print(f"✓ Trainable parameters: {trainable:,}")

    def load_and_preprocess_data(self):
        """Load and preprocess the data."""
        print("Loading datasets...")

        data_files = {
            'train': str(Path(self.data_args.data_dir) / 'train.jsonl'),
            'validation': str(Path(self.data_args.data_dir) / 'val.jsonl'),
        }
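
        # Each JSONL line is assumed to hold a "conversations" list of
        # role/content messages (inferred from _preprocess_function), e.g.:
        #   {"conversations": [{"role": "user", "content": "..."},
        #                      {"role": "assistant", "content": "..."}]}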

        raw_datasets = load_dataset('json', data_files=data_files)

        print("Preprocessing datasets...")
        self.train_dataset = raw_datasets['train'].map(
            self._preprocess_function,
            batched=True,
            num_proc=self.data_args.preprocessing_num_workers,
            remove_columns=raw_datasets['train'].column_names,
            desc="Preprocessing train dataset"
        )

        self.eval_dataset = raw_datasets['validation'].map(
            self._preprocess_function,
            batched=True,
            num_proc=self.data_args.preprocessing_num_workers,
            remove_columns=raw_datasets['validation'].column_names,
            desc="Preprocessing validation dataset"
        )

        # Keep only samples that fit within max_length.
        print("Filtering samples...")
        self.train_dataset = self.train_dataset.filter(
            lambda x: len(x['input_ids']) <= self.data_args.max_length
        )
        self.eval_dataset = self.eval_dataset.filter(
            lambda x: len(x['input_ids']) <= self.data_args.max_length
        )

        print(f"✓ Train samples: {len(self.train_dataset)}")
        print(f"✓ Validation samples: {len(self.eval_dataset)}")

        if len(self.train_dataset) > 0:
            self._inspect_sample(self.train_dataset[0])

    def _preprocess_function(self, examples):
        """Preprocessing function - final, most robust label-masking revision."""
        model_inputs = {
            "input_ids": [],
            "attention_mask": [],
            "labels": []
        }

        for conversations in examples['conversations']:
            try:
                # Render the full conversation with the chat template.
                full_text = self.tokenizer.apply_chat_template(
                    conversations,
                    tokenize=False,
                    add_generation_prompt=False
                )

                # Locate the last assistant turn; everything before it is
                # treated as the prompt.
                last_assistant_index = next(
                    (i for i, msg in reversed(list(enumerate(conversations)))
                     if msg['role'] == 'assistant'),
                    -1
                )

                if last_assistant_index == -1:
                    print("Warning: Skipping conversation with no assistant reply.")
                    continue

                prompt_messages = conversations[:last_assistant_index]

                # Render the prompt with add_generation_prompt=True so that
                # prompt_text is an exact prefix of full_text, ending right
                # where the assistant's reply begins. (Appending an empty
                # assistant message instead would also emit its closing
                # <|im_end|> and shift the mask boundary into the answer.)
                prompt_text = self.tokenizer.apply_chat_template(
                    prompt_messages,
                    tokenize=False,
                    add_generation_prompt=True
                )

                tokenized_full = self.tokenizer(
                    full_text,
                    max_length=self.data_args.max_length,
                    truncation=True,
                    padding=False,
                )

                tokenized_prompt = self.tokenizer(
                    prompt_text,
                    max_length=self.data_args.max_length,
                    truncation=True,
                    padding=False,
                )

                input_ids = tokenized_full['input_ids']
                labels = input_ids.copy()

                # Everything up to the prompt length is masked out of the loss.
                answer_start_index = len(tokenized_prompt['input_ids'])

                if answer_start_index >= len(labels):
                    print(f"Warning: Answer start index {answer_start_index} exceeds or matches total length {len(labels)}. Skipping.")
                    continue

                labels[:answer_start_index] = [-100] * answer_start_index

                # Also mask the trailing EOS / <|im_end|> token so the loss is
                # computed on the answer content only.
                if len(labels) > 0:
                    last_token_id = labels[-1]

                    if last_token_id != -100 and last_token_id == self.tokenizer.eos_token_id:
                        labels[-1] = -100

                    if self.im_end_token_id is not None and last_token_id != -100 and last_token_id == self.im_end_token_id:
                        labels[-1] = -100

                model_inputs["input_ids"].append(input_ids)
                model_inputs["attention_mask"].append(tokenized_full['attention_mask'])
                model_inputs["labels"].append(labels)

            except Exception as e:
                import sys
                import traceback
                traceback.print_exc(file=sys.stdout)
                # Skip the offending conversation instead of returning None,
                # which would invalidate the whole batch under batched map().
                print(f"Error processing conversation: {e}")
                continue

        return model_inputs
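
    # Label-masking sketch for a single-turn sample (illustrative tokens,
    # not real IDs):
    #   input_ids: <|im_start|>user Q <|im_end|> <|im_start|>assistant A1 A2 <|im_end|>
    #   labels:    -100           -100 -100      -100                  A1 A2 -100
    # The loss is computed only on the answer tokens (A1, A2); the prompt and
    # the trailing <|im_end|> are ignored.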

    def _inspect_sample(self, sample):
        """Inspect sample quality."""
        print("\n" + "="*60)
        print("🔍 Sample Inspection (AFTER FINAL, MOST ROBUST FIXES)")
        print("="*60)

        input_ids = sample['input_ids']
        labels = sample['labels']

        total_tokens = len(input_ids)
        masked_tokens = sum(1 for l in labels if l == -100)
        learning_tokens = total_tokens - masked_tokens

        print(f"Total tokens: {total_tokens}")
        print(f"Masked tokens (prompt/padding): {masked_tokens} ({masked_tokens/total_tokens*100:.1f}%)")
        print(f"Learning tokens (assistant): {learning_tokens} ({learning_tokens/total_tokens*100:.1f}%)")

        print("\n📊 First 200 tokens masking pattern:")
        preview_len = min(200, len(labels))
        mask_preview = ''.join(['█' if labels[i] == -100 else '░' for i in range(preview_len)])

        first_learn_idx = next((i for i, l in enumerate(labels) if l != -100), -1)

        if first_learn_idx != -1:
            print(f"First 10 tokens: {self.tokenizer.decode(input_ids[:10], skip_special_tokens=False)}")
            print(f"First learning token index: {first_learn_idx}")
            print(f"First learning token: {self.tokenizer.decode(input_ids[first_learn_idx])}")

            start = max(0, first_learn_idx - 5)
            end = min(len(input_ids), first_learn_idx + 5)
            print(f"Around learning start: {self.tokenizer.decode(input_ids[start:end], skip_special_tokens=False)}")

        print(mask_preview)
        print("█ = masked (prompt/padding) | ░ = learning (assistant)")

        learning_ids = [input_ids[i] for i in range(len(labels)) if labels[i] != -100]
        if learning_ids:
            learning_text = self.tokenizer.decode(learning_ids[:100], skip_special_tokens=True)
            print(f"\n📝 Learning content preview:")
            print(f"{learning_text[:200]}...")

        print("="*60 + "\n")

    def train(self):
        """Train the model."""
        print("Setting up training arguments...")

        training_args = TrainingArguments(
            output_dir=str(self.output_dir),
            num_train_epochs=self.config['training']['num_epochs'],

            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            gradient_accumulation_steps=8,

            # Fix #3: YAML may parse these values as strings (e.g. "1e-5"),
            # so cast explicitly to avoid float/str comparison errors.
            learning_rate=float(self.config['training']['learning_rate']),
            warmup_ratio=float(self.config['training']['warmup_ratio']),
            lr_scheduler_type="cosine",

            optim="adamw_torch",
            weight_decay=float(self.config['training']['weight_decay']),
            max_grad_norm=float(self.config['training']['max_grad_norm']),

            logging_steps=10,
            save_steps=100,
            eval_steps=100,
            save_total_limit=3,

            eval_strategy="steps",
            save_strategy="steps",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,

            bf16=True,
            bf16_full_eval=True,

            deepspeed="../config/deepspeed_zero3.json",

            report_to=["tensorboard"],
            logging_dir=str(self.output_dir / "logs"),
            remove_unused_columns=False,
            dataloader_pin_memory=True,
            dataloader_num_workers=0,
            logging_first_step=True,
            logging_nan_inf_filter=True,
        )
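
        # Effective batch size: 2 samples/device x 8 accumulation steps = 16
        # per GPU per optimizer step (times the world size under ZeRO-3).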

        # DataCollatorForSeq2Seq pads labels with -100 rather than
        # pad_token_id, so padded positions never contribute to the loss.
        data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            model=self.model,
            label_pad_token_id=-100,
            padding=True,
        )

        callbacks = [SampleInspectionCallback(self.tokenizer)]

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
            callbacks=callbacks,
        )

        print("\n" + "="*60)
        print("Pre-training Validation")
        print("="*60)
        print(f"✓ Model in training mode: {self.model.training}")

        lora_params = sum(p.numel() for n, p in self.model.named_parameters()
                          if p.requires_grad and 'lora' in n.lower())
        print(f"✓ LoRA parameters: {lora_params:,}")

        print("\n" + "="*60)
        print("Starting Training")
        print("="*60)

        train_result = trainer.train()

        print("\nSaving model...")
        trainer.save_model(str(self.output_dir / "final_model"))

        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)

        print("\nEvaluating...")
        eval_metrics = trainer.evaluate()
        trainer.log_metrics("eval", eval_metrics)
        trainer.save_metrics("eval", eval_metrics)

        print("\n✓ Training completed!")
        return trainer


def main():
    """Main function."""
    if 'CUDA_VISIBLE_DEVICES' not in os.environ:
        os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
    if 'TOKENIZERS_PARALLELISM' not in os.environ:
        os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    if 'PYTORCH_CUDA_ALLOC_CONF' not in os.environ:
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
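
    # expandable_segments:True lets the CUDA caching allocator grow blocks in
    # place, reducing memory fragmentation on PyTorch 2.x.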

    print("="*60)
    print("Qwen3-8B Fine-tuning - Fixed Version (Label Masking/LoRA Params Improved)")
    print("="*60)
    print()

    finetuner = QwenFineTunerFixed()
    finetuner.load_tokenizer_and_model()
    finetuner.load_and_preprocess_data()
    trainer = finetuner.train()

    print("\n" + "="*60)
    print("✓ Fine-tuning Complete!")
    print(f"Model saved to: {finetuner.output_dir}")
    print("="*60)


if __name__ == "__main__":
    main()