File size: 695 Bytes
788217c b78ff52 788217c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
from argparse import ArgumentParser
from tokenizer import Tokenizer
# Shared command-line parser for this script; its options are added further
# down the file before the arguments are parsed.
parser = ArgumentParser(prog='Flop Tokenizer Python code', description='')
if __name__ == '__main__':
    # Script entry point: read a text file, train a Tokenizer on its contents,
    # write the trained tokenizer to disk and report its vocabulary size.
    print('Hello world')

    # The input file is mandatory: without it open() below would receive None
    # and fail with a TypeError instead of a readable usage message.
    parser.add_argument('-i', '--input_file', required=True)
    parser.add_argument('-o', '--output_file', default='tokenizer.bin')
    # type=int makes a value given on the command line an integer, matching
    # the integer default; otherwise argparse would pass it along as a string.
    parser.add_argument('-n', '--max_vocab_size', type=int, default=32000)
    args = parser.parse_args()

    tokenizer = Tokenizer()
    # NOTE(review): the file is decoded with the platform default encoding;
    # confirm the expected corpus encoding and pass encoding=... if it is fixed.
    with open(args.input_file, 'r') as f:
        dataset = f.read()

    tokenizer.train(dataset, max_length=args.max_vocab_size)
    tokenizer.to_file(args.output_file)
    print(f"Tokenizer has vocab size: {tokenizer.vocab_size}")
|