Update 3 files
Browse files
- /trainer.py
- /trainer.cli.py
- /tokenizer.py
- tokenizer.py +1 -1
- trainer.cli.py +5 -1
- trainer.py +4 -1
tokenizer.py
CHANGED
|
@@ -145,4 +145,4 @@ class Tokenizer:
|
|
| 145 |
|
| 146 |
|
| 147 |
def c_encode(self, text): #TODO: Implement
|
| 148 |
-
return []
|
|
|
|
| 145 |
|
| 146 |
|
| 147 |
def c_encode(self, text): #TODO: Implement
|
| 148 |
+
return [1, 2, 3, 4]
|
trainer.cli.py
CHANGED
|
@@ -31,8 +31,12 @@ if __name__ == '__main__':
|
|
| 31 |
tokenizer.train(dataset.text, max_length=config.tokenizer.max_length)
|
| 32 |
ids = tokenizer.c_encode(dataset.text)
|
| 33 |
|
|
|
|
| 34 |
dataset += ids
|
| 35 |
-
dataset.batch(ids)
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
trainer = Trainer(config)
|
| 38 |
trainer.train(dataset)
|
|
|
|
| 31 |
tokenizer.train(dataset.text, max_length=config.tokenizer.max_length)
|
| 32 |
ids = tokenizer.c_encode(dataset.text)
|
| 33 |
|
| 34 |
+
|
| 35 |
dataset += ids
|
| 36 |
+
#dataset.batch(ids)
|
| 37 |
+
|
| 38 |
+
print(f"dataset ids: {dataset.ids}")
|
| 39 |
+
|
| 40 |
|
| 41 |
trainer = Trainer(config)
|
| 42 |
trainer.train(dataset)
|
trainer.py
CHANGED
|
@@ -11,4 +11,7 @@ class Trainer:
|
|
| 11 |
#self.wandb = Wandb(config.wandb)
|
| 12 |
|
| 13 |
self.model = Model(config.model)
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
#self.wandb = Wandb(config.wandb)
|
| 12 |
|
| 13 |
self.model = Model(config.model)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def train(self, dataset): # TODO: Implement
|
| 17 |
+
pass
|