import os

# Point the Hugging Face cache at the shared mount before importing `datasets`
# so the environment variable is honored when the library initializes.
os.environ['HF_HOME'] = '/mnt/jeff/huggingface'

import datasets

datasets.config.DOWNLOADED_DATASETS_PATH = "/mnt/jeff/huggingface/data"

import argparse
import json
from pathlib import Path

import numpy as np
import torch
import sacrebleu

from datasets import load_dataset
from torch.utils.data import Dataset, ConcatDataset
from tqdm import tqdm
from transformers import (
    AutoProcessor,
    AutoModel,
    BatchFeature,
    Trainer,
    TrainingArguments,
    StoppingCriteria,
    StoppingCriteriaList,
)
from collections import defaultdict

import soundfile as sf
from datasets import Audio
import random
from ASRDataset import *

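# Prints a per-module breakdown of total vs. trainable parameters. When the
# embedding weight has a gradient-mask hook attached (see create_model below),
# the hook's closure is introspected to recover the boolean mask so that only
# the unmasked embedding rows are counted as trainable. This relies on the
# private `_backward_hooks` attribute of `torch.Tensor`, so it is best-effort
# and may need adjustment across PyTorch versions.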
def count_parameters_by_module(model):
    module_params = defaultdict(lambda: {"total": 0, "trainable": 0})

    total_params = 0
    total_trainable_params = 0

    embedding_masks = {}
    for name, param in model.named_parameters():
        if 'embed_tokens.weight' in name and hasattr(param, '_backward_hooks') and param._backward_hooks:
            for hook_id, hook_fn in param._backward_hooks.items():
                if hook_fn.__code__.co_name == 'embedding_grad_mask_hook':
                    for cell in hook_fn.__closure__ or []:
                        if isinstance(cell.cell_contents, torch.Tensor) and cell.cell_contents.dtype == torch.bool:
                            embedding_masks[name] = ~cell.cell_contents

    for name, param in model.named_parameters():
        module_name = name.split('.')[0]
        param_count = param.numel()

        module_params[module_name]["total"] += param_count
        total_params += param_count

        if param.requires_grad:
            if name in embedding_masks:
                trainable_count = embedding_masks[name].sum().item()
                module_params[module_name]["trainable"] += trainable_count
                total_trainable_params += trainable_count
            else:
                module_params[module_name]["trainable"] += param_count
                total_trainable_params += param_count

    print(f"All Params: {total_params:,}")
    print(f"Trainable Params: {total_trainable_params:,} ({total_trainable_params/total_params*100:.2f}%)")
    print("\nParams by Module:")

    for module_name, counts in sorted(module_params.items()):
        trainable_percentage = counts["trainable"] / counts["total"] * 100 if counts["total"] > 0 else 0
        total_percentage = counts["total"] / total_params * 100

        print(f"- {module_name}:")
        print(f"  Total: {counts['total']:,} ({total_percentage:.2f}% of model)")
        print(f"  Trainable: {counts['trainable']:,} ({trainable_percentage:.2f}% of module)")

    return module_params
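

# Builds the model for fine-tuning: load the checkpoint in bfloat16, freeze all
# base weights, activate the 'speech' LoRA adapter (set_lora_adapter is assumed
# to be provided by the checkpoint's remote code), and additionally train two
# embedding rows by registering a hook that zeroes the gradient of every other
# row.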
def create_model(model_name_or_path, revision="main", use_flash_attention=False):
    model = AutoModel.from_pretrained(
        model_name_or_path,
        revision=revision,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        attn_implementation="flash_attention_2" if use_flash_attention else "eager",
        trust_remote_code=True,
    )

    # The KV cache is only useful for generation; disable it for training.
    model.config.use_cache = False

    # Freeze the whole model; only the LoRA adapter and the selected embedding
    # rows below receive gradients.
    for param in model.parameters():
        param.requires_grad = False

    model.set_lora_adapter('speech')

    model.to(torch.bfloat16)

    train_embed = True
    if train_embed:
        embed_tokens = model.language_model.model.model.embed_tokens

        # Token ids whose embeddings should be trained (assumed to be the newly
        # added special tokens of this checkpoint).
        trainable_token_ids = [256001, 256002]

        # The whole embedding matrix must require grad for autograd to reach it;
        # the hook below zeroes the gradient everywhere except the selected rows.
        embed_tokens.weight.requires_grad = True
        mask = torch.ones_like(embed_tokens.weight, dtype=torch.bool)
        mask[trainable_token_ids] = False

        def embedding_grad_mask_hook(grad):
            return grad.masked_fill(mask, 0)

        embed_tokens.weight.register_hook(embedding_grad_mask_hook)

    count_parameters_by_module(model)

    return model
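

# Gemma chat markup closes each turn with "<end_of_turn>", so appending it to
# target answers teaches the model when to stop. -100 is the label value that
# PyTorch's cross-entropy loss ignores, used to mask out prompt tokens.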
ANSWER_SUFFIX = "<end_of_turn>"
_IGNORE_INDEX = -100

model_name_or_path = '/mnt/jeff/gemma-3-4b-it-omni'
use_flash_attention = False

output_dir = '../gemma_tmp14_audio_and_text_speechlora'
batch_size = 16
batch_size_per_gpu = 1
learning_rate = 5.0e-5
wd = 0.01
num_train_epochs = 10

revision = "main"

processor = AutoProcessor.from_pretrained(
    model_name_or_path,
    revision=revision,
    trust_remote_code=True,
)

model = create_model(
    model_name_or_path,
    revision=revision,
    use_flash_attention=use_flash_attention,
)

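# Build the training set. MultiturnAudioDataset (and the covost_collate_fn used
# by the Trainer below) are expected to come from the local ASRDataset module
# that is star-imported above. The same JSON file is loaded twice: once as a
# text-only variant and once with audio, so each dialogue is seen in both forms.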
train_datasets = []

pickup_dataset = MultiturnAudioDataset(
    processor=processor,
    text_only=True,
    json_path='/mnt/jeff/InCar/data/multiturn_data/pickup_processed.json',
)
train_datasets.append(pickup_dataset)

pickup_dataset = MultiturnAudioDataset(
    processor=processor,
    json_path='/mnt/jeff/InCar/data/multiturn_data/pickup_processed.json',
)
train_datasets.append(pickup_dataset)

print("Count Num of Datasets", len(train_datasets)) |
|
|
print([len(dataset) for dataset in train_datasets]) |
|
|
|
|
|
|
|
|
train_dataset = ConcatDataset(train_datasets) if len(train_datasets) > 1 else train_datasets[0] |
|
|
print("Count Length of Datas", len(train_dataset)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
num_gpus = torch.cuda.device_count()
print(f'training on {num_gpus} GPUs')

assert (
    batch_size % (num_gpus * batch_size_per_gpu) == 0
), 'batch_size must be divisible by num_gpus * batch_size_per_gpu'
gradient_accumulation_steps = batch_size // (num_gpus * batch_size_per_gpu)
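# For example, with the defaults above (batch_size=16, batch_size_per_gpu=1) on
# a single GPU this gives gradient_accumulation_steps = 16 // (1 * 1) = 16,
# i.e. 16 micro-batches are accumulated per optimizer step to reach the global
# batch size of 16.
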
# DeepSpeed configuration. Note that it is not passed to TrainingArguments
# below (deepspeed=None), so it has no effect as written.
dp_config = {
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },
    "gradient_clipping": 1.0,
    # A dict literal cannot hold two "zero_optimization" entries (the later one
    # silently wins), so only one is kept here; switch the stage as needed.
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": True,
        "allgather_bucket_size": 5e8,
        "overlap_comm": False,
        "reduce_scatter": True,
        "reduce_bucket_size": 5e8,
        "contiguous_gradients": True,
        "cpu_offload": True
    }
}

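# To actually train with DeepSpeed, pass the config to the arguments below
# (TrainingArguments(..., deepspeed=dp_config); the argument accepts either a
# dict or a path to a JSON file). As written, the Trainer's native bf16 path
# is used instead.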
training_args = TrainingArguments(
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size_per_gpu,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim='adamw_torch',
    adam_beta1=0.9,
    adam_beta2=0.95,
    adam_epsilon=1e-7,
    learning_rate=learning_rate,
    weight_decay=wd,
    max_grad_norm=1.0,
    lr_scheduler_type='cosine',
    warmup_steps=50,
    logging_steps=10,
    output_dir=output_dir,
    save_total_limit=10,
    save_only_model=True,
    bf16=True,
    fp16=False,
    remove_unused_columns=False,
    report_to='none',
    deepspeed=None,
    disable_tqdm=False,
    dataloader_num_workers=16,
    save_strategy='epoch',
    ddp_find_unused_parameters=True,
)

out_path = Path(training_args.output_dir)
out_path.mkdir(parents=True, exist_ok=True)

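# Only the parameters left trainable in create_model (the 'speech' LoRA adapter
# and the gradient-masked embedding matrix) are handed to the optimizer.
# Passing (optimizer, None) to the Trainer keeps this optimizer and lets the
# Trainer build the cosine schedule from training_args.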
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=learning_rate,
    weight_decay=wd,
    betas=(0.9, 0.95),
    eps=1e-7,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=covost_collate_fn,
    train_dataset=train_dataset,
    optimizers=(optimizer, None),
)

trainer.train()

# Save the inner language model first, then the full multimodal model. Both
# calls write into the same output_dir, so files produced by the first call may
# be overwritten by the second.
model.language_model.model.save_pretrained(output_dir)

model.save_pretrained(output_dir)