import json

from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

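# Load hyperparameters from the shared config file. Assumed config.json
# shape (hypothetical example values; adjust to the actual config):
# {
#   "model_name": "bert-base-uncased",
#   "num_labels": 2,
#   "learning_rate": 2e-5,
#   "batch_size": 16,
#   "num_epochs": 3
# }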
with open('../config/config.json') as f:
    config = json.load(f)

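# Load the train/validation splits from CSV. The files are assumed to have
# 'text' and 'label' columns; the Trainer expects the label column.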
dataset = load_dataset(
    'csv',
    data_files={'train': '../data/train.csv', 'validation': '../data/valid.csv'},
)

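# Instantiate the classification model and its matching tokenizer from the
# checkpoint named in the config.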
model = AutoModelForSequenceClassification.from_pretrained(
    config['model_name'], num_labels=config['num_labels']
)
tokenizer = AutoTokenizer.from_pretrained(config['model_name'])

def tokenize_function(examples):
    # Pad every example to the model's max length and truncate longer ones.
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

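# Training hyperparameters. Note: on newer transformers releases the
# `evaluation_strategy` argument has been renamed to `eval_strategy`.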
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=config['learning_rate'],
    per_device_train_batch_size=config['batch_size'],
    num_train_epochs=config['num_epochs'],
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
)

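# Passing the tokenizer to the Trainer lets save_model() persist its files
# alongside the model weights.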
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
)

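# Fine-tune, evaluating and checkpointing once per epoch as configured above,
# then write the final model to ../model.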
trainer.train()
trainer.save_model('../model')