from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer


def preprocess_dataset(dataset: Dataset | DatasetDict, tokenizer: AutoTokenizer) -> Dataset | DatasetDict:
    """Tokenize the raw text columns and regroup the tokens into fixed-size chunks."""
    tokenized_dataset = dataset.map(
        lambda examples: tokenize_function(examples, tokenizer),
        batched=True,
        remove_columns=["text", "text_clean", "language"],
    )

    # Concatenate the tokenized texts and re-split them into equal-length chunks.
    return tokenized_dataset.map(group_texts, batched=True)


def tokenize_function(examples, tokenizer: AutoTokenizer):
    """Tokenize a batch of texts, keeping word ids when a fast tokenizer is available."""
    result = tokenizer(examples["text"])

    # Fast tokenizers expose word_ids(), which maps each token back to its source word;
    # keeping them makes whole-word masking possible later on.
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]

    return result


def group_texts(examples, chunk_size: int = 128):
    """Concatenate all tokenized examples and split them into chunks of `chunk_size` tokens."""
    # Flatten every column (input_ids, attention_mask, word_ids, ...) into one long list.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}

    # Drop the final partial chunk so every example has exactly chunk_size tokens.
    total_length = len(concatenated_examples["input_ids"])
    total_length = (total_length // chunk_size) * chunk_size

    # Slice each column into chunk_size-long pieces.
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # For masked language modelling the labels are simply a copy of the inputs.
    result["labels"] = result["input_ids"].copy()

    return result
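

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original pipeline: it builds a tiny in-memory
    # dataset with the columns this module expects ("text", "text_clean", "language") and
    # runs the full preprocessing. The "distilbert-base-uncased" checkpoint is only an
    # assumed example; any fast tokenizer from the Hugging Face Hub would work the same way.
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    demo = Dataset.from_dict(
        {
            "text": ["This is a short example sentence."] * 32,
            "text_clean": ["this is a short example sentence"] * 32,
            "language": ["en"] * 32,
        }
    )

    processed = preprocess_dataset(demo, tokenizer)
    print(processed)
    print(f"each example is a chunk of {len(processed['input_ids'][0])} tokens")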