File size: 695 Bytes
788217c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b78ff52
788217c
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

from argparse import ArgumentParser
from tokenizer import Tokenizer




# Module-level command-line parser for the tokenizer training script.
# Only the program name and an (empty) description are set here.
parser = ArgumentParser(prog='Flop Tokenizer Python code', description='')


if __name__ == '__main__':
    # Script entry point: read a text file, train a Tokenizer on its
    # contents, write the trained tokenizer to disk and report its vocab size.
    print('Hello world')

    # -i: path of the text file used as training data. Marked required so a
    # missing value is reported as a usage error instead of open(None) failing.
    parser.add_argument('-i', '--input_file', required=True)
    # -o: path passed to Tokenizer.to_file for the trained tokenizer.
    parser.add_argument('-o', '--output_file', default='tokenizer.bin')
    # -n: value forwarded to Tokenizer.train as max_length. type=int ensures a
    # value supplied on the command line is an int, like the default, rather
    # than a str.
    parser.add_argument('-n', '--max_vocab_size', type=int, default=32000)

    args = parser.parse_args()

    tokenizer = Tokenizer()

    # Read the whole training text into memory; the encoding is explicit so
    # the result does not depend on the platform's locale default.
    with open(args.input_file, 'r', encoding='utf-8') as f:
        dataset = f.read()

    tokenizer.train(dataset, max_length=args.max_vocab_size)
    tokenizer.to_file(args.output_file)

    print(f"Tokenizer has vocab size: {tokenizer.vocab_size}")