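"""Training entry point.

Loads a config file, builds the dataset, tokenizer, and model it describes,
runs training with logging via the local Wandb wrapper, and exports the
resulting artifacts.

Usage (script and config file names are illustrative):

    python train.py -p config.yaml
"""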
from argparse import ArgumentParser

from dataset import Dataset
from export import ExportAll
from logger import Wandb
from model import Model
from tokenizer import Tokenizer
from trainer import Trainer
from util import ConfigParser
parser = ArgumentParser(
    description='Trainer implementation, using PyTorch'
)
if __name__ == '__main__':
    parser.add_argument('-p', '--config_path', required=True,
                        help='path to the training config file')
    args = parser.parse_args()

    # Load the run configuration from the given file.
    config = ConfigParser(args.config_path).config

    # Build the raw text dataset and fit the tokenizer on it.
    dataset = Dataset(config.dataset)
    tokenizer = Tokenizer()
    tokenizer.train(dataset.text, max_length=config.tokenizer.max_length)
    ids = tokenizer.c_encode(dataset.text)

    # The model needs the tokenizer and its final vocabulary size.
    config.model.tokenizer = tokenizer
    config.model.params.vocab_size = tokenizer.vocab_size

    # Split the encoded ids into training batches.
    batches, num_batches = dataset.batch(ids)
    config.trainer.num_batches = num_batches

    # Wire the model and the Wandb logger into the trainer config.
    model = Model(config.model)
    wandb = Wandb(config.wandb)
    config.trainer.model = model
    config.trainer.wandb = wandb

    trainer = Trainer(config.trainer)
    trainer.train(batches)

    # Persist the trained model and tokenizer, then export everything.
    model.save_pretrained()
    tokenizer.to_file('tokenizer.bin')
    ExportAll()
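
# A sketch of the config shape this script assumes, inferred from the
# attribute accesses above (any other fields and the on-disk format are
# up to util.ConfigParser and are not specified here):
#
#   config.dataset              -> passed to Dataset(...)
#   config.tokenizer.max_length -> cap used when training the tokenizer
#   config.model.params         -> vocab_size is filled in at runtime
#   config.wandb                -> passed to Wandb(...)
#   config.trainer              -> model, wandb, num_batches set at runtime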