import os
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
import evaluate
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split

class RobertaRetrainer:
    """
    A class to retrain the RoBERTa-large model using labeled financial news data.
    
    This follows the Hugging Face fine-tuning approach outlined in the RETRAIN.MD guide.
    """
    
    def __init__(self, 
                 model_name="Farshid/roberta-large-financial-phrasebank-allagree1", 
                 output_dir="./nimou-RoBERTa", 
                 csv_path=None):
        """
        Initialize the retrainer with model configuration.
        
        Args:
            model_name (str): HuggingFace model identifier
            output_dir (str): Directory where fine-tuned model will be saved
            csv_path (str): Path to the labeled dataset CSV
        """
        self.model_name = model_name
        self.output_dir = output_dir
        self.csv_path = csv_path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = None
        self.model = None
        self.train_dataset = None
        self.val_dataset = None
        self.accuracy_metric = evaluate.load("accuracy")
        
        print(f"Using device: {self.device}")
        
        # Create output directory if it doesn't exist
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    
    def load_data(self, csv_path=None):
        """
        Load and prepare the dataset from CSV.
        
        Args:
            csv_path (str, optional): Override the CSV path provided in the constructor
            
        Returns:
            tuple: Processed train and validation datasets
        """
        if csv_path:
            self.csv_path = csv_path
        
        if not self.csv_path:
            raise ValueError("CSV path must be provided")
        
        print(f"Loading data from {self.csv_path}")
        df = pd.read_csv(self.csv_path)
        
        # Basic data validation
        if 'text' not in df.columns or 'label' not in df.columns:
            raise ValueError("CSV must contain 'text' and 'label' columns")
        
        # Split into train and validation sets
        train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
        
        # Convert to HuggingFace datasets
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
        
        print(f"Training samples: {len(train_dataset)}")
        print(f"Validation samples: {len(val_dataset)}")
        
        # Display label distribution
        print("Label distribution in training set:")
        for label, count in train_df['label'].value_counts().items():
            print(f"  Label {label}: {count} samples ({count / len(train_df) * 100:.2f}%)")
        
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        return train_dataset, val_dataset
    
    def load_model(self):
        """
        Load the pretrained model and tokenizer.
        
        Returns:
            tuple: Loaded tokenizer and model
        """
        print(f"Loading model {self.model_name}")
        
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=3  # NEGATIVE=0, NEUTRAL=1, POSITIVE=2
        )
        
        return self.tokenizer, self.model
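
    # A possible refinement (a sketch, not required by the current setup): passing
    # explicit id2label / label2id mappings to from_pretrained makes the saved
    # checkpoint report human-readable labels at inference time:
    #
    #   self.model = AutoModelForSequenceClassification.from_pretrained(
    #       self.model_name,
    #       num_labels=3,
    #       id2label={0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"},
    #       label2id={"NEGATIVE": 0, "NEUTRAL": 1, "POSITIVE": 2},
    #   )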
    
    def tokenize_data(self, max_length=128):
        """
        Tokenize the datasets.
        
        Args:
            max_length (int): Maximum sequence length for tokenization
            
        Returns:
            tuple: Tokenized training and validation datasets
        """
        if not self.tokenizer:
            self.load_model()
        
        if not self.train_dataset or not self.val_dataset:
            self.load_data()
            
        def preprocess(examples):
            return self.tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=max_length
            )
        
        tokenized_train = self.train_dataset.map(preprocess, batched=True)
        tokenized_val = self.val_dataset.map(preprocess, batched=True)
        
        print("Datasets tokenized")
        self.train_dataset = tokenized_train
        self.val_dataset = tokenized_val
        
        return tokenized_train, tokenized_val
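
    # An optional alternative (sketch): instead of padding every example to
    # max_length, dynamic per-batch padding with DataCollatorWithPadding is
    # usually faster and uses less memory:
    #
    #   from transformers import DataCollatorWithPadding
    #   data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
    #   # ...then pass data_collator=data_collator to the Trainer in train()
    #   # and drop padding="max_length" from preprocess().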
    
    def compute_metrics(self, eval_pred):
        """
        Compute evaluation metrics during training.
        
        Args:
            eval_pred (tuple): Tuple of predictions and labels from the trainer
            
        Returns:
            dict: Dictionary containing evaluation metrics
        """
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=1)
        
        acc = self.accuracy_metric.compute(predictions=predictions, references=labels)
        
        # Only accuracy is reported here; see the macro-F1 sketch below for one way to extend this
        results = {"accuracy": acc["accuracy"]}
        
        return results
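
    # A minimal sketch of adding macro-averaged F1 (an assumption, using the
    # "f1" metric from the evaluate library; it would be loaded once, e.g. in
    # __init__ as self.f1_metric = evaluate.load("f1")):
    #
    #   f1 = self.f1_metric.compute(
    #       predictions=predictions, references=labels, average="macro"
    #   )
    #   results["f1_macro"] = f1["f1"]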
    
    def train(self, 
              num_train_epochs=3,
              learning_rate=2e-5,
              per_device_train_batch_size=8,
              per_device_eval_batch_size=8,
              weight_decay=0.01,
              warmup_ratio=0.1,
              logging_steps=100,
              eval_steps=500,
              save_steps=1000,
              load_best_model_at_end=True):
        """
        Train the model on the prepared dataset.
        
        Args:
            num_train_epochs (int): Number of training epochs
            learning_rate (float): Learning rate for optimizer
            per_device_train_batch_size (int): Batch size for training
            per_device_eval_batch_size (int): Batch size for evaluation
            weight_decay (float): Weight decay for regularization
            warmup_ratio (float): Ratio of warmup steps
            logging_steps (int): Number of steps between logging
            eval_steps (int): Number of steps between evaluations
            save_steps (int): Number of steps between checkpoints
            load_best_model_at_end (bool): Whether to load the best model at the end
            
        Returns:
            Trainer: Trained model trainer
        """
        if not self.model:
            self.load_model()
        
        if not self.train_dataset or not self.val_dataset:
            self.tokenize_data()
        
        # Set training arguments
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=per_device_eval_batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            warmup_ratio=warmup_ratio,
            evaluation_strategy="steps",
            eval_steps=eval_steps,
            logging_steps=logging_steps,
            save_steps=save_steps,
            load_best_model_at_end=load_best_model_at_end,
            metric_for_best_model="accuracy",
            save_total_limit=2,  # Keep at most 2 checkpoints on disk
        )
        
        # Move model to the correct device
        self.model.to(self.device)
        
        # Initialize the Trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset,
            tokenizer=self.tokenizer,
            compute_metrics=self.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )
        
        # Train the model
        print("Starting training...")
        trainer.train()
        
        # Save the best model
        trainer.save_model(self.output_dir)
        print(f"Model saved to {self.output_dir}")
        
        # Evaluate the model
        print("Evaluating model...")
        eval_results = trainer.evaluate()
        print(f"Evaluation results: {eval_results}")
        
        return trainer
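
    # Usage note: an interrupted run can typically be resumed from the most
    # recent checkpoint saved in output_dir:
    #
    #   trainer.train(resume_from_checkpoint=True)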

def main():
    """
    Main function to demonstrate the retraining process.
    """
    # Define paths
    csv_path = "c:/Users/M/Desktop/repos/gotti/LLaMAVestor/src/logs/prepared_training_data.csv"
    output_dir = "c:/Users/M/Desktop/repos/gotti/LLaMAVestor/src/models/finetuned-roberta"
    
    # Initialize the retrainer
    retrainer = RobertaRetrainer(
        model_name="Farshid/roberta-large-financial-phrasebank-allagree1",
        output_dir=output_dir,
        csv_path=csv_path
    )
    
    # Start the training process
    retrainer.load_data()
    retrainer.tokenize_data(max_length=128)
    trainer = retrainer.train(
        num_train_epochs=5,
        learning_rate=1e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8
    )
    
    print("Training completed successfully!")

if __name__ == "__main__":
    main()
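
# A minimal inference sketch, assuming training has finished and the model was
# saved to the output_dir used in main() (the path below is illustrative):
#
#   from transformers import pipeline
#   classifier = pipeline(
#       "text-classification",
#       model="c:/Users/M/Desktop/repos/gotti/LLaMAVestor/src/models/finetuned-roberta",
#   )
#   print(classifier("The company reported record quarterly revenue."))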