import os
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
import evaluate
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer,
EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
class RobertaRetrainer:
"""
    A class to retrain the RoBERTa-large model using labeled financial news data.
This follows the Hugging Face fine-tuning approach outlined in the RETRAIN.MD guide.
"""
def __init__(self,
model_name="Farshid/roberta-large-financial-phrasebank-allagree1",
output_dir="./nimou-RoBERTa",
csv_path=None):
"""
Initialize the retrainer with model configuration.
Args:
model_name (str): HuggingFace model identifier
output_dir (str): Directory where fine-tuned model will be saved
csv_path (str): Path to the labeled dataset CSV
"""
self.model_name = model_name
self.output_dir = output_dir
self.csv_path = csv_path
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.tokenizer = None
self.model = None
self.train_dataset = None
self.val_dataset = None
self.accuracy_metric = evaluate.load("accuracy")
print(f"Using device: {self.device}")
        # Create the output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
def load_data(self, csv_path=None):
"""
Load and prepare the dataset from CSV.
Args:
csv_path (str, optional): Override the CSV path provided in the constructor
Returns:
tuple: Processed train and validation datasets
"""
if csv_path:
self.csv_path = csv_path
if not self.csv_path:
raise ValueError("CSV path must be provided")
print(f"Loading data from {self.csv_path}")
df = pd.read_csv(self.csv_path)
# Basic data validation
if 'text' not in df.columns or 'label' not in df.columns:
raise ValueError("CSV must contain 'text' and 'label' columns")
# Split into train and validation sets
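        # stratify=df['label'] keeps the class proportions identical in
        # both splits, which matters for an imbalanced sentiment dataset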
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
        # Convert to HuggingFace datasets (preserve_index=False drops the
        # pandas index so no stray __index_level_0__ column is carried along)
        train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
        val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
# Display label distribution
print("Label distribution in training set:")
for label, count in train_df['label'].value_counts().items():
print(f" Label {label}: {count} samples ({count / len(train_df) * 100:.2f}%)")
self.train_dataset = train_dataset
self.val_dataset = val_dataset
return train_dataset, val_dataset
def load_model(self):
"""
Load the pretrained model and tokenizer.
Returns:
tuple: Loaded tokenizer and model
"""
print(f"Loading model {self.model_name}")
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.model = AutoModelForSequenceClassification.from_pretrained(
self.model_name,
num_labels=3 # NEGATIVE=0, NEUTRAL=1, POSITIVE=2
)
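        # Optional sketch (an assumption, not verified against this checkpoint):
        # attach readable label names so downstream pipelines report
        # "NEGATIVE"/"NEUTRAL"/"POSITIVE" instead of "LABEL_0".."LABEL_2".
        # Uncomment only after confirming the mapping matches the training labels.
        # self.model.config.id2label = {0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"}
        # self.model.config.label2id = {"NEGATIVE": 0, "NEUTRAL": 1, "POSITIVE": 2}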
return self.tokenizer, self.model
def tokenize_data(self, max_length=128):
"""
Tokenize the datasets.
Args:
max_length (int): Maximum sequence length for tokenization
Returns:
tuple: Tokenized training and validation datasets
"""
if not self.tokenizer:
self.load_model()
if not self.train_dataset or not self.val_dataset:
self.load_data()
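        # padding="max_length" pads every example to the full max_length;
        # dynamic padding with DataCollatorWithPadding would be faster,
        # but fixed-length padding keeps this script simple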
def preprocess(examples):
return self.tokenizer(
examples["text"],
padding="max_length",
truncation=True,
max_length=max_length
)
tokenized_train = self.train_dataset.map(preprocess, batched=True)
tokenized_val = self.val_dataset.map(preprocess, batched=True)
print("Datasets tokenized")
self.train_dataset = tokenized_train
self.val_dataset = tokenized_val
return tokenized_train, tokenized_val
def compute_metrics(self, eval_pred):
"""
Compute evaluation metrics during training.
Args:
eval_pred (tuple): Tuple of predictions and labels from the trainer
Returns:
dict: Dictionary containing evaluation metrics
"""
logits, labels = eval_pred
predictions = np.argmax(logits, axis=1)
acc = self.accuracy_metric.compute(predictions=predictions, references=labels)
        # Accuracy is the only metric reported here; it also drives checkpoint
        # selection via metric_for_best_model="accuracy" in the Trainer
        results = {"accuracy": acc["accuracy"]}
return results
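    def compute_metrics_with_f1(self, eval_pred):
        """
        Optional variant of compute_metrics that also reports macro F1.
        A minimal sketch, assuming the `evaluate` library's "f1" metric;
        swap it in by passing compute_metrics=self.compute_metrics_with_f1
        to the Trainer. The method name is illustrative, not part of the
        original pipeline.
        """
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=1)
        acc = self.accuracy_metric.compute(predictions=predictions, references=labels)
        # Macro-averaged F1 weights all three sentiment classes equally,
        # which is informative when the label distribution is skewed
        f1 = evaluate.load("f1").compute(
            predictions=predictions, references=labels, average="macro"
        )
        return {"accuracy": acc["accuracy"], "f1_macro": f1["f1"]}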
def train(self,
num_train_epochs=3,
learning_rate=2e-5,
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
weight_decay=0.01,
warmup_ratio=0.1,
logging_steps=100,
eval_steps=500,
save_steps=1000,
load_best_model_at_end=True):
"""
Train the model on the prepared dataset.
Args:
num_train_epochs (int): Number of training epochs
learning_rate (float): Learning rate for optimizer
per_device_train_batch_size (int): Batch size for training
per_device_eval_batch_size (int): Batch size for evaluation
weight_decay (float): Weight decay for regularization
warmup_ratio (float): Ratio of warmup steps
logging_steps (int): Number of steps between logging
eval_steps (int): Number of steps between evaluations
save_steps (int): Number of steps between checkpoints
load_best_model_at_end (bool): Whether to load the best model at the end
Returns:
Trainer: Trained model trainer
"""
if not self.model:
self.load_model()
if not self.train_dataset or not self.val_dataset:
self.tokenize_data()
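        # (this fallback only fires when no datasets are loaded at all; if
        # load_data() ran without tokenize_data(), the Trainer below would
        # receive raw text and fail, so call tokenize_data() explicitly)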
# Set training arguments
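        # Notes: with load_best_model_at_end=True the Trainer requires
        # save_steps to be a round multiple of eval_steps (1000 and 500 here),
        # and on newer transformers releases the evaluation_strategy argument
        # is spelled eval_strategy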
training_args = TrainingArguments(
output_dir=self.output_dir,
num_train_epochs=num_train_epochs,
per_device_train_batch_size=per_device_train_batch_size,
per_device_eval_batch_size=per_device_eval_batch_size,
learning_rate=learning_rate,
weight_decay=weight_decay,
warmup_ratio=warmup_ratio,
evaluation_strategy="steps",
eval_steps=eval_steps,
logging_steps=logging_steps,
save_steps=save_steps,
load_best_model_at_end=load_best_model_at_end,
metric_for_best_model="accuracy",
            save_total_limit=2,  # keep at most 2 checkpoints on disk; the best one is always retained
)
        # Move the model to the target device (the Trainer would also do this automatically)
self.model.to(self.device)
# Initialize the Trainer
trainer = Trainer(
model=self.model,
args=training_args,
train_dataset=self.train_dataset,
eval_dataset=self.val_dataset,
tokenizer=self.tokenizer,
compute_metrics=self.compute_metrics,
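            # Stop training if accuracy fails to improve for 3 consecutive evaluations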
callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)
# Train the model
print("Starting training...")
trainer.train()
# Save the best model
trainer.save_model(self.output_dir)
print(f"Model saved to {self.output_dir}")
# Evaluate the model
print("Evaluating model...")
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")
return trainer
def main():
"""
Main function to demonstrate the retraining process.
"""
# Define paths
csv_path = "c:/Users/M/Desktop/repos/gotti/LLaMAVestor/src/logs/prepared_training_data.csv"
output_dir = "c:/Users/M/Desktop/repos/gotti/LLaMAVestor/src/models/finetuned-roberta"
# Initialize the retrainer
retrainer = RobertaRetrainer(
model_name="Farshid/roberta-large-financial-phrasebank-allagree1",
output_dir=output_dir,
csv_path=csv_path
)
# Start the training process
retrainer.load_data()
retrainer.tokenize_data(max_length=128)
trainer = retrainer.train(
num_train_epochs=5,
learning_rate=1e-5,
per_device_train_batch_size=8,
per_device_eval_batch_size=8
)
print("Training completed successfully!")
if __name__ == "__main__":
main()
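
# The commented-out snippet below is a usage sketch, not part of the training
# script: it shows one way the fine-tuned model could be loaded for inference.
# The local path and example sentence are illustrative assumptions.
#
# from transformers import pipeline
# classifier = pipeline(
#     "sentiment-analysis",
#     model="./models/finetuned-roberta",
#     tokenizer="./models/finetuned-roberta",
# )
# print(classifier("Quarterly revenue beat analyst expectations."))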