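"""Training entry point.

Builds the dataset, tokenizer, model, and W&B logger from the config file
given via -p/--config_path, runs the training loop, and saves/exports the
resulting artifacts.
"""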
from argparse import ArgumentParser

from util import ConfigParser
from logger import Wandb

from trainer import Trainer
from dataset import Dataset
from tokenizer import Tokenizer
from model import Model

from export import ExportAll


parser = ArgumentParser(
    description='Trainer implementation, using PyTorch'
)


if __name__ == '__main__':
    parser.add_argument('-p', '--config_path')
    args = parser.parse_args()

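    # Load the run configuration from the file given on the command line.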
    config = ConfigParser(args.config_path).config
    
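    # Build the dataset from its section of the config.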
    dataset = Dataset(config.dataset)

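    # Train the tokenizer on the corpus, then encode the corpus into token ids.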
    tokenizer = Tokenizer()
    tokenizer.train(dataset.text, max_length=config.tokenizer.max_length)
    
    ids = tokenizer.c_encode(dataset.text)

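    # The model config needs the tokenizer and its vocabulary size.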
    config.model.tokenizer = tokenizer
    config.model.params.vocab_size = tokenizer.vocab_size
    
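    # Split the encoded ids into batches; the trainer needs the batch count.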
    batches, num_batches = dataset.batch(ids)
    config.trainer.num_batches = num_batches

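    # Instantiate the model and the Weights & Biases logger.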
    model = Model(config.model)
    wandb = Wandb(config.wandb)

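    # Hand the model and the logger to the trainer through its config.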
    config.trainer.model = model
    config.trainer.wandb = wandb

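    # Build the trainer and run the training loop over the batches.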
    trainer = Trainer(config.trainer)
    trainer.train(batches)

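    # Persist the trained model and tokenizer, then run the export step.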
    model.save_pretrained()
    tokenizer.to_file('tokenizer.bin')
    ExportAll()