import torch
from torch.utils.data import Dataset
from typing import List, Tuple


class MathDataset(Dataset):
    """
    A custom PyTorch Dataset that handles the encoded math problem sequences.
    It performs the crucial language-model shift (X is the input, Y is X
    shifted one position to the left) and pads every example to a fixed length.
    """

    def __init__(self, data: List[str], tokenizer, max_len: int):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad_token_id = tokenizer.pad_token_id  # Use the ID stored in the tokenizer

    def __len__(self):
        # Total number of problems in the dataset
        return len(self.data)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        # 1. Get the raw text and encode it into a list of token IDs
        raw_text = self.data[idx]
        sequence_ids = self.tokenizer.encode(raw_text)

        # Guard against sequences longer than the block size: keep at most
        # max_len + 1 IDs so that x and y are each at most max_len long.
        sequence_ids = sequence_ids[: self.max_len + 1]

        # 2. Sequence shift: the core of language modeling.
        #    X (input):  what the Transformer sees,         e.g. [7, +, 2, =]
        #    Y (target): what it must predict at each step, e.g. [+, 2, =, 9]

        # X: all tokens except the last one; there is nothing left for the
        # model to predict after the final token.
        x = sequence_ids[:-1]

        # Y: all tokens except the first one. This is the "correct next token"
        # for every position in X.
        y = sequence_ids[1:]

        # 3. Padding: all sequences in a batch must share the same length
        #    (T, the block size).
        padding_length = self.max_len - len(x)
        x_padded = x + [self.pad_token_id] * padding_length
        y_padded = y + [self.pad_token_id] * padding_length

        # 4. Convert to PyTorch tensors (dtype=torch.long is standard for token IDs)
        return (
            torch.tensor(x_padded, dtype=torch.long),
            torch.tensor(y_padded, dtype=torch.long),
        )
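
# A minimal usage sketch, not part of the original class: it assumes a
# hypothetical `CharTokenizer` exposing `encode()` and `pad_token_id`, which is
# the interface MathDataset expects, and wraps the dataset in a standard
# DataLoader to show the shapes of one padded (X, Y) batch.
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    class CharTokenizer:
        """Toy character-level tokenizer used only for this demo."""

        def __init__(self, vocab: str):
            self.stoi = {ch: i for i, ch in enumerate(vocab)}
            self.pad_token_id = len(vocab)  # reserve the last ID for padding

        def encode(self, text: str) -> List[int]:
            return [self.stoi[ch] for ch in text]

    tokenizer = CharTokenizer("0123456789+=")
    problems = ["7+2=9", "3+4=7"]
    dataset = MathDataset(problems, tokenizer, max_len=8)

    loader = DataLoader(dataset, batch_size=2, shuffle=True)
    xb, yb = next(iter(loader))
    print(xb.shape, yb.shape)  # both (2, 8): padded inputs and shifted targets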