"""Train a Tokenizer on a plain-text corpus and serialize it to disk."""
from argparse import ArgumentParser

from tokenizer import Tokenizer

parser = ArgumentParser(
    prog='Flop Tokenizer Python code',
    description='Train a tokenizer on a text file and write it to disk.',
)

if __name__ == '__main__':
    # required=True: previously a missing -i slipped through argparse and
    # crashed later as open(None, 'r').
    parser.add_argument('-i', '--input_file', required=True,
                        help='path to the plain-text training corpus')
    parser.add_argument('-o', '--output_file', default='tokenizer.bin',
                        help='where to write the serialized tokenizer')
    # type=int: argparse delivers CLI values as str, so a user-supplied -n
    # used to reach tokenizer.train() as a string; only the default was int.
    parser.add_argument('-n', '--max_vocab_size', type=int, default=32000,
                        help='maximum vocabulary size (default: 32000)')
    args = parser.parse_args()

    tokenizer = Tokenizer()
    # Read the whole corpus into memory as one string for training.
    # NOTE(review): assumes the corpus is UTF-8 encoded — confirm.
    with open(args.input_file, 'r', encoding='utf-8') as f:
        dataset = f.read()

    tokenizer.train(dataset, max_length=args.max_vocab_size)
    tokenizer.to_file(args.output_file)
    print(f"Tokenizer has vocab size: {tokenizer.vocab_size}")