import os
import torch
import yaml
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaConfig
from torch.utils.data import DataLoader, IterableDataset
from pytorch_lightning import Trainer, LightningModule
from pytorch_lightning.callbacks import (
    ModelCheckpoint,
    LearningRateMonitor,
    RichProgressBar,
)
from pytorch_lightning.loggers import TensorBoardLogger
from torch.nn.utils.rnn import pad_sequence
from pytorch_lightning.callbacks.progress.rich_progress import RichProgressBarTheme
from pytorch_lightning.callbacks import TQDMProgressBar
# Set environment variable for memory management
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


# Function to log GPU memory usage
def log_memory_usage(step):
    if torch.cuda.is_available():
        print(
            f"Step {step}: "
            f"Allocated = {torch.cuda.memory_allocated() / 1e9:.2f} GB, "
            f"Reserved = {torch.cuda.memory_reserved() / 1e9:.2f} GB"
        )
# Custom collate function: pad each field to the longest sequence in the batch
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    labels = [item["labels"] for item in batch]
    input_ids = pad_sequence(
        input_ids, batch_first=True, padding_value=tokenizer.pad_token_id
    )
    labels = pad_sequence(
        labels, batch_first=True, padding_value=tokenizer.pad_token_id
    )
    return {"input_ids": input_ids, "labels": labels}
# Streaming Dataset
class StreamingDataset(IterableDataset):
    def __init__(self, dataset, tokenizer, max_length=2048):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __iter__(self):
        for example in iter(self.dataset):
            tokenized = self.tokenizer(
                example["text"],
                truncation=True,
                max_length=self.max_length,
                return_overflowing_tokens=True,
                return_tensors="pt",
            )
            for chunk in tokenized["input_ids"]:
                yield {
                    "input_ids": chunk.squeeze(0),
                    "labels": chunk.squeeze(0),
                }
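
# Because return_overflowing_tokens=True is combined with truncation, documents longer
# than max_length are split into several max_length-sized chunks rather than truncated
# to a single window, and each chunk is yielded as an independent training example.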
# Lightning Module
class SmolLMModule(LightningModule):
    def __init__(self, config, learning_rate=1e-4):
        super().__init__()
        self.config = config
        self.learning_rate = learning_rate
        self.save_hyperparameters()

        model_config = LlamaConfig(
            vocab_size=49152,
            hidden_size=config["model"]["model_config"]["hidden_size"],
            intermediate_size=config["model"]["model_config"]["intermediate_size"],
            num_hidden_layers=config["model"]["model_config"]["num_hidden_layers"],
            num_attention_heads=config["model"]["model_config"]["num_attention_heads"],
            num_key_value_heads=config["model"]["model_config"]["num_key_value_heads"],
            hidden_act=config["model"]["model_config"]["hidden_act"],
            max_position_embeddings=config["model"]["model_config"][
                "max_position_embeddings"
            ],
            initializer_range=config["model"]["model_config"]["initializer_range"],
            rms_norm_eps=1e-5,
            use_cache=True,
            pad_token_id=config["model"]["model_config"]["pad_token_id"],
            bos_token_id=config["model"]["model_config"]["bos_token_id"],
            eos_token_id=config["model"]["model_config"]["eos_token_id"],
        )
        self.model = AutoModelForCausalLM.from_config(model_config)

    def training_step(self, batch, batch_idx):
        outputs = self.model(input_ids=batch["input_ids"], labels=batch["labels"])
        loss = outputs.loss
        self.log(
            "train_loss", loss, prog_bar=True, on_step=True, on_epoch=True
        )  # Log loss

        # # Log memory usage
        # if batch_idx % 10 == 0:
        #     log_memory_usage(batch_idx)

        # Release intermediate tensors
        del outputs
        torch.cuda.empty_cache()
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(
            self.model.parameters(),
            lr=self.learning_rate,
            betas=(0.9, 0.95),
            eps=1e-8,
            weight_decay=self.config["optimizer"]["weight_decay"],
        )
# Main Script
if __name__ == "__main__":
    # Load config
    with open("config_smollm2_135.yaml", "r") as file:
        config = yaml.safe_load(file)
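
    # The YAML file itself is not reproduced in this script; judging from the keys
    # accessed below, it is assumed to look roughly like this sketch (all values are
    # illustrative placeholders, not the actual SmolLM2-135 settings):
    #
    # model:
    #   model_config:
    #     hidden_size: ...
    #     intermediate_size: ...
    #     num_hidden_layers: ...
    #     num_attention_heads: ...
    #     num_key_value_heads: ...
    #     hidden_act: silu
    #     max_position_embeddings: 2048
    #     initializer_range: 0.02
    #     pad_token_id: ...
    #     bos_token_id: ...
    #     eos_token_id: ...
    # optimizer:
    #   weight_decay: 0.01
    #   learning_rate_scheduler:
    #     learning_rate: 3.0e-4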
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
    tokenizer.pad_token = tokenizer.eos_token

    # Load dataset
    dataset = load_dataset(
        "HuggingFaceTB/smollm-corpus", "cosmopedia-v2", streaming=True
    )
    train_dataset = dataset["train"]

    # Create DataLoader
    streaming_dataset = StreamingDataset(train_dataset, tokenizer, max_length=2048)
    train_loader = DataLoader(
        streaming_dataset,
        batch_size=1,  # Reduced batch size
        num_workers=4,
        collate_fn=collate_fn,
        pin_memory=True,
    )
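
    # Caveat: with an IterableDataset, each of the 4 DataLoader workers and each DDP
    # rank iterates its own copy of the stream, so examples can be duplicated across
    # workers/ranks unless the stream is sharded (e.g. by checking
    # torch.utils.data.get_worker_info() inside __iter__, or splitting the streaming
    # dataset per rank).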
    # Create model
    model = SmolLMModule(
        config,
        learning_rate=config["optimizer"]["learning_rate_scheduler"]["learning_rate"],
    )

    # Initialize TensorBoard logger
    logger = TensorBoardLogger("logs", name="smollm2")
    # Checkpoint callback configuration
    checkpoint_callback = ModelCheckpoint(
        dirpath="checkpoints",
        filename="model-{epoch:02d}-{step}-{train_loss:.2f}",  # Include training loss in filename
        monitor="train_loss",  # Monitor training loss
        mode="min",  # Lower loss is better
        save_top_k=3,  # Save the best 3 models
        save_last=True,  # Additionally save the last model
        every_n_train_steps=5000,  # Save every 5000 steps
        save_weights_only=False,  # Save the full model state
        auto_insert_metric_name=False,  # Don't insert metric name in filename
    )
    # Progress bar
    # progress_bar = RichProgressBar(
    #     refresh_rate=1,
    #     leave=False,
    #     theme=RichProgressBarTheme(
    #         description="",
    #         progress_bar="#6206E0",
    #         progress_bar_finished="#6206E0",
    #         progress_bar_pulse="#6206E0",
    #         batch_progress="",
    #         time="dim",
    #         processing_speed="dim underline",
    #         metrics="italic",
    #         metrics_text_delimiter=" ",
    #         metrics_format=".3f",
    #     ),
    #     console_kwargs=None,
    # )
    progress_bar = TQDMProgressBar(refresh_rate=10)
    # Create trainer
    trainer = Trainer(
        logger=logger,
        strategy="ddp",
        accelerator="gpu",
        devices=2,
        precision="16-mixed",
        max_steps=500000,
        accumulate_grad_batches=1,
        enable_checkpointing=True,
        callbacks=[
            LearningRateMonitor(logging_interval="step"),
            progress_bar,
            checkpoint_callback,
        ],
        enable_progress_bar=True,
        enable_model_summary=True,
        log_every_n_steps=10,
    )
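
    # With devices=2, a per-device batch_size of 1, and accumulate_grad_batches=1,
    # each optimizer step sees an effective global batch of 2 sequences.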
    # Find the latest checkpoint if one exists
    if os.path.exists("checkpoints/last.ckpt"):
        resume_from_checkpoint = "checkpoints/last.ckpt"
        print(f"Resuming from checkpoint: {resume_from_checkpoint}")
    else:
        resume_from_checkpoint = None
        print("Starting training from scratch")

    # Train with automatic checkpoint resumption
    trainer.fit(model, train_loader, ckpt_path=resume_from_checkpoint)

    # Inspect the optimizer state after training
    optimizers = trainer.optimizers
    if optimizers:
        optimizer = optimizers[0]
        print("optimizer state:", optimizer.state_dict())
    # After training, print the best model path and score
    print(f"Best model path: {checkpoint_callback.best_model_path}")
    # print(f"Best train loss: {checkpoint_callback.best_model_score:.4f}")

    # Save final model
    if trainer.is_global_zero:
        output_dir = "final_model"
        os.makedirs(output_dir, exist_ok=True)
        model.model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
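
    # To launch: run this script directly (e.g. `python train.py`, or whatever name
    # this file is saved under); with strategy="ddp" and devices=2, Lightning starts
    # one process per GPU from this entry point.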