from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, PreTrainedTokenizerBase

def preprocess_dataset(dataset: Dataset | DatasetDict, tokenizer: PreTrainedTokenizerBase) -> Dataset | DatasetDict:
    # Tokenize every example, dropping the raw text columns the model cannot consume.
    tokenized_dataset = dataset.map(
        lambda examples: tokenize_function(examples, tokenizer),
        batched=True,
        remove_columns=["text", "text_clean", "language"],
    )
    # Concatenate the token streams and re-chunk them into fixed-size blocks.
    return tokenized_dataset.map(group_texts, batched=True)

def tokenize_function(examples, tokenizer: PreTrainedTokenizerBase):
    result = tokenizer(examples["text"])
    # Fast (Rust-backed) tokenizers expose word IDs, which a whole-word-masking
    # collator needs downstream.
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

def group_texts(examples, chunk_size: int = 128):
    # Concatenate every sequence in the batch into one long list per column.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples["input_ids"])
    # Drop the final partial chunk so every block holds exactly chunk_size tokens.
    total_length = (total_length // chunk_size) * chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels start as a copy of the inputs; the language-modeling collator
    # applies the masking at training time.
    result["labels"] = result["input_ids"].copy()
    return result
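
# Minimal usage sketch under assumed inputs: the "distilbert-base-uncased"
# checkpoint and the toy dataset below are illustrative, not part of the
# original pipeline; any dataset with "text", "text_clean", and "language"
# columns (as implied by remove_columns above) would work the same way.
if __name__ == "__main__":
    raw = Dataset.from_dict(
        {
            # Repeat the sentence so the concatenated token stream exceeds
            # one 128-token chunk; otherwise group_texts yields zero rows.
            "text": ["A short example sentence."] * 40,
            "text_clean": ["a short example sentence"] * 40,
            "language": ["en"] * 40,
        }
    )
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    chunked = preprocess_dataset(raw, tokenizer)
    # Expected columns: input_ids, attention_mask, word_ids, labels,
    # with every row exactly 128 tokens long.
    print(chunked)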