| | """
|
| | Step 3: STREAMLINED Training - Minimal, Fast
|
| | """
|
| |
|
| | import torch
|
| | from torch.utils.data import Dataset, DataLoader
|
| | from transformers import GPT2LMHeadModel, GPT2Config
|
| | import sentencepiece as spm
|
| | from tqdm import tqdm
|
| | import time
|
| |
|
| |
|
# Hyper-parameters for the streamlined training run.
# NOTE(review): paths assume the corpus-preparation step has already run.
CONFIG = {
    # Corpus and tokenizer artifacts produced by earlier pipeline steps.
    'train_file': './final_corpus/multilingual_corpus_train.txt',
    'val_file': './final_corpus/multilingual_corpus_val.txt',
    'tokenizer_path': './final_corpus/multilingual_spm.model',

    # Tiny GPT-2 architecture (deliberately small for fast iteration).
    'n_positions': 128,   # max sequence length
    'n_embd': 256,        # hidden size
    'n_layer': 4,         # transformer blocks
    'n_head': 4,          # attention heads
    'n_inner': 512,       # feed-forward inner dimension

    # Optimization schedule. Effective batch size = batch_size * grad_accum = 16.
    'batch_size': 2,
    'grad_accum': 8,
    'learning_rate': 2e-4,
    'total_steps': 5000,   # optimizer steps, not micro-batches
    'save_every': 1000,    # checkpoint interval in optimizer steps
}
|
| |
|
class SimpleDataset(Dataset):
    """In-memory dataset of fixed-length token-id sequences for LM training.

    Each non-empty line of the corpus file is tokenized; lines producing
    ``min_tokens`` tokens or fewer are dropped, and the rest are truncated
    or right-padded with id 0 to exactly ``block_size`` tokens.
    """

    def __init__(self, filepath, tokenizer, block_size,
                 max_lines=600000, min_tokens=10):
        """Load and tokenize the corpus eagerly.

        Args:
            filepath: UTF-8 text file with one training example per line.
            tokenizer: object exposing ``encode(str) -> list[int]``
                (e.g. a SentencePiece processor).
            block_size: fixed length of every emitted example.
            max_lines: cap on corpus lines read (generalizes the previously
                hard-coded 600000 slice; default preserves old behavior).
            min_tokens: lines tokenizing to this many tokens or fewer are
                skipped (default preserves the old hard-coded 10).
        """
        # Progress bar is a convenience only; fall back to a no-op wrapper
        # so the class also works in minimal environments without tqdm.
        try:
            from tqdm import tqdm
        except ImportError:
            def tqdm(iterable, **_kwargs):
                return iterable

        self.tokenizer = tokenizer
        self.block_size = block_size

        print("Loading data...")
        with open(filepath, 'r', encoding='utf-8') as f:
            lines = [line.strip() for line in f if line.strip()]

        self.examples = []
        for line in tqdm(lines[:max_lines], desc="Tokenizing"):
            tokens = tokenizer.encode(line)
            if len(tokens) > min_tokens:
                if len(tokens) > block_size:
                    tokens = tokens[:block_size]
                else:
                    # NOTE(review): pads with id 0, which is also the default
                    # SentencePiece <unk> id — the training loop should mask
                    # these positions out of the loss; confirm it does.
                    tokens = tokens + [0] * (block_size - len(tokens))
                self.examples.append(tokens)

        print(f"Created {len(self.examples)} examples")

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        # Tensors are built lazily per item to keep the stored lists small.
        return torch.tensor(self.examples[idx], dtype=torch.long)
|
| |
|
def train_streamlined():
    """Train a tiny GPT-2 LM on the prepared corpus and save checkpoints.

    Reads all settings from the module-level ``CONFIG`` dict, trains for
    ``CONFIG['total_steps']`` optimizer steps with gradient accumulation,
    and writes HuggingFace-format checkpoints under ``./checkpoints_tiny/``.

    Raises:
        RuntimeError: if the training corpus yields zero usable examples.
    """
    print("\n" + "="*60)
    print("STREAMLINED TRAINING - FASTEST POSSIBLE")
    print("="*60)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: {device}")

    # Tokenizer from the earlier corpus-preparation step; its vocab size
    # determines the model's embedding table.
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.load(CONFIG['tokenizer_path'])
    vocab_size = tokenizer.get_piece_size()

    config = GPT2Config(
        vocab_size=vocab_size,
        n_positions=CONFIG['n_positions'],
        n_embd=CONFIG['n_embd'],
        n_layer=CONFIG['n_layer'],
        n_head=CONFIG['n_head'],
        n_inner=CONFIG['n_inner'],
        pad_token_id=0,
        use_cache=False,  # KV caching is incompatible with grad checkpointing
    )

    model = GPT2LMHeadModel(config)
    model.to(device)
    model.train()

    # Trade compute for memory so training fits on small GPUs.
    model.gradient_checkpointing_enable()

    dataset = SimpleDataset(CONFIG['train_file'], tokenizer, CONFIG['n_positions'])
    if len(dataset) == 0:
        # BUGFIX: without this guard an empty dataset makes the `while`
        # loop below spin forever (the inner `for` never runs).
        raise RuntimeError(f"No usable training examples in {CONFIG['train_file']}")
    dataloader = DataLoader(dataset, batch_size=CONFIG['batch_size'], shuffle=True)

    optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'])

    print(f"\nModel: {sum(p.numel() for p in model.parameters())/1e6:.1f}M params")
    print(f"Training steps: {CONFIG['total_steps']}")
    print(f"Estimated time: {CONFIG['total_steps']*0.3/3600:.1f} hours\n")

    global_step = 0
    accumulation_steps = 0
    start_time = time.time()

    while global_step < CONFIG['total_steps']:
        for batch in dataloader:
            batch = batch.to(device)

            # BUGFIX: the original passed labels=batch, so the loss was also
            # computed over padding positions (pad id 0). Mask them out
            # (-100 is ignored by the HF LM loss) and hide them from
            # attention as well.
            # NOTE(review): SentencePiece id 0 is usually <unk>, so genuine
            # <unk> tokens are excluded too — confirm <unk> is rare in this
            # corpus before relying on this.
            attention_mask = (batch != 0).long()
            labels = batch.masked_fill(batch == 0, -100)

            outputs = model(input_ids=batch, attention_mask=attention_mask,
                            labels=labels)
            # Scale so accumulated gradients average over the macro-batch.
            loss = outputs.loss / CONFIG['grad_accum']

            loss.backward()
            accumulation_steps += 1

            if accumulation_steps == CONFIG['grad_accum']:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                optimizer.zero_grad()

                global_step += 1
                accumulation_steps = 0

                if global_step % 100 == 0:
                    elapsed = time.time() - start_time
                    steps_per_second = global_step / elapsed
                    remaining = (CONFIG['total_steps'] - global_step) / steps_per_second

                    print(f"Step {global_step}/{CONFIG['total_steps']} | "
                          f"Loss: {loss.item()*CONFIG['grad_accum']:.3f} | "
                          f"Remaining: {remaining/3600:.1f}h")

                if global_step % CONFIG['save_every'] == 0:
                    save_path = f"./checkpoints_tiny/step{global_step}"
                    model.save_pretrained(save_path)
                    print(f"Saved checkpoint: {save_path}")

            if global_step >= CONFIG['total_steps']:
                break

    print(f"\nTraining completed in {(time.time()-start_time)/3600:.2f} hours")

    model.save_pretrained("./checkpoints_tiny/final")
    print("Final model saved to ./checkpoints_tiny/final")
|
| |
|
| | if __name__ == "__main__":
|
| | train_streamlined() |