import os
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
import evaluate
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split

class RobertaRetrainer:
    """
    A class to retrain the RoBERTa-large model using labeled financial news data.
    
    This follows the Hugging Face fine-tuning approach outlined in the RETRAIN.MD guide.
    """
    
    def __init__(self, 
                 model_name="Farshid/roberta-large-financial-phrasebank-allagree1", 
                 output_dir="./nimou-RoBERTa", 
                 csv_path=None):
        """
        Initialize the retrainer with model configuration.
        
        Args:
            model_name (str): HuggingFace model identifier
            output_dir (str): Directory where fine-tuned model will be saved
            csv_path (str): Path to the labeled dataset CSV
        """
        self.model_name = model_name
        self.output_dir = output_dir
        self.csv_path = csv_path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = None
        self.model = None
        self.train_dataset = None
        self.val_dataset = None
        self.accuracy_metric = evaluate.load("accuracy")
        
        print(f"Using device: {self.device}")
        
        # Create output directory if it doesn't exist
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    
    def load_data(self, csv_path=None):
        """
        Load and prepare the dataset from CSV.
        
        Args:
            csv_path (str, optional): Override the CSV path provided in the constructor
            
        Returns:
            tuple: Processed train and validation datasets
        """
        if csv_path:
            self.csv_path = csv_path
        
        if not self.csv_path:
            raise ValueError("CSV path must be provided")
        
        print(f"Loading data from {self.csv_path}")
        df = pd.read_csv(self.csv_path)
        
        # Basic data validation
        if 'text' not in df.columns or 'label' not in df.columns:
            raise ValueError("CSV must contain 'text' and 'label' columns")
        
        # Split into train and validation sets
        train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
        
        # Convert to HuggingFace datasets
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
        
        print(f"Training samples: {len(train_dataset)}")
        print(f"Validation samples: {len(val_dataset)}")
        
        # Display label distribution
        print("Label distribution in training set:")
        for label, count in train_df['label'].value_counts().items():
            print(f"  Label {label}: {count} samples ({count / len(train_df) * 100:.2f}%)")
        
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        return train_dataset, val_dataset
    
    def load_model(self):
        """
        Load the pretrained model and tokenizer.
        
        Returns:
            tuple: Loaded tokenizer and model
        """
        print(f"Loading model {self.model_name}")
        
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=3  # NEGATIVE=0, NEUTRAL=1, POSITIVE=2
        )
        
        return self.tokenizer, self.model
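
    # A possible refinement (a sketch, not required by the current setup): passing
    # explicit id2label / label2id mappings to from_pretrained makes the saved
    # checkpoint report human-readable labels at inference time:
    #
    #   self.model = AutoModelForSequenceClassification.from_pretrained(
    #       self.model_name,
    #       num_labels=3,
    #       id2label={0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"},
    #       label2id={"NEGATIVE": 0, "NEUTRAL": 1, "POSITIVE": 2},
    #   )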
    
    def tokenize_data(self, max_length=128):
        """
        Tokenize the datasets.
        
        Args:
            max_length (int): Maximum sequence length for tokenization
            
        Returns:
            tuple: Tokenized training and validation datasets
        """
        if not self.tokenizer:
            self.load_model()
        
        if not self.train_dataset or not self.val_dataset:
            self.load_data()
            
        def preprocess(examples):
            return self.tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=max_length
            )
        
        tokenized_train = self.train_dataset.map(preprocess, batched=True)
        tokenized_val = self.val_dataset.map(preprocess, batched=True)
        
        print("Datasets tokenized")
        self.train_dataset = tokenized_train
        self.val_dataset = tokenized_val
        
        return tokenized_train, tokenized_val
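
    # An optional alternative (sketch): instead of padding every example to
    # max_length, dynamic per-batch padding with DataCollatorWithPadding is
    # usually faster and uses less memory:
    #
    #   from transformers import DataCollatorWithPadding
    #   data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
    #   # ...then pass data_collator=data_collator to the Trainer in train()
    #   # and drop padding="max_length" from preprocess().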
    
    def compute_metrics(self, eval_pred):
        """
        Compute evaluation metrics during training.
        
        Args:
            eval_pred (tuple): Tuple of predictions and labels from the trainer
            
        Returns:
            dict: Dictionary containing evaluation metrics
        """
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=1)
        
        acc = self.accuracy_metric.compute(predictions=predictions, references=labels)
        
        # Only accuracy is reported here; see the macro-F1 sketch below for one way to extend this
        results = {"accuracy": acc["accuracy"]}
        
        return results
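
    # A minimal sketch of adding macro-averaged F1 (an assumption, using the
    # "f1" metric from the evaluate library; it would be loaded once, e.g. in
    # __init__ as self.f1_metric = evaluate.load("f1")):
    #
    #   f1 = self.f1_metric.compute(
    #       predictions=predictions, references=labels, average="macro"
    #   )
    #   results["f1_macro"] = f1["f1"]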
    
    def train(self, 
              num_train_epochs=3,
              learning_rate=2e-5,
              per_device_train_batch_size=8,
              per_device_eval_batch_size=8,
              weight_decay=0.01,
              warmup_ratio=0.1,
              logging_steps=100,
              eval_steps=500,
              save_steps=1000,
              load_best_model_at_end=True):
        """
        Train the model on the prepared dataset.
        
        Args:
            num_train_epochs (int): Number of training epochs
            learning_rate (float): Learning rate for optimizer
            per_device_train_batch_size (int): Batch size for training
            per_device_eval_batch_size (int): Batch size for evaluation
            weight_decay (float): Weight decay for regularization
            warmup_ratio (float): Ratio of warmup steps
            logging_steps (int): Number of steps between logging
            eval_steps (int): Number of steps between evaluations
            save_steps (int): Number of steps between checkpoints
            load_best_model_at_end (bool): Whether to load the best model at the end
            
        Returns:
            Trainer: Trained model trainer
        """
        if not self.model:
            self.load_model()
        
        if not self.train_dataset or not self.val_dataset:
            self.tokenize_data()
        
        # Set training arguments
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=per_device_eval_batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            warmup_ratio=warmup_ratio,
            evaluation_strategy="steps",
            eval_steps=eval_steps,
            logging_steps=logging_steps,
            save_steps=save_steps,
            load_best_model_at_end=load_best_model_at_end,
            metric_for_best_model="accuracy",
            save_total_limit=2,  # Keep at most 2 checkpoints on disk
        )
        
        # Move model to the correct device
        self.model.to(self.device)
        
        # Initialize the Trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset,
            tokenizer=self.tokenizer,
            compute_metrics=self.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )
        
        # Train the model
        print("Starting training...")
        trainer.train()
        
        # Save the best model
        trainer.save_model(self.output_dir)
        print(f"Model saved to {self.output_dir}")
        
        # Evaluate the model
        print("Evaluating model...")
        eval_results = trainer.evaluate()
        print(f"Evaluation results: {eval_results}")
        
        return trainer
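
    # Usage note: an interrupted run can typically be resumed from the most
    # recent checkpoint saved in output_dir:
    #
    #   trainer.train(resume_from_checkpoint=True)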

def main():
    """
    Main function to demonstrate the retraining process.
    """
    # Define paths
    csv_path = "c:/Users/M/Desktop/repos/gotti/LLaMAVestor/src/logs/prepared_training_data.csv"
    output_dir = "c:/Users/M/Desktop/repos/gotti/LLaMAVestor/src/models/finetuned-roberta"
    
    # Initialize the retrainer
    retrainer = RobertaRetrainer(
        model_name="Farshid/roberta-large-financial-phrasebank-allagree1",
        output_dir=output_dir,
        csv_path=csv_path
    )
    
    # Start the training process
    retrainer.load_data()
    retrainer.tokenize_data(max_length=128)
    trainer = retrainer.train(
        num_train_epochs=5,
        learning_rate=1e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8
    )
    
    print("Training completed successfully!")

if __name__ == "__main__":
    main()
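
# A minimal inference sketch, assuming training has finished and the model was
# saved to the output_dir used in main() (the path below is illustrative):
#
#   from transformers import pipeline
#   classifier = pipeline(
#       "text-classification",
#       model="c:/Users/M/Desktop/repos/gotti/LLaMAVestor/src/models/finetuned-roberta",
#   )
#   print(classifier("The company reported record quarterly revenue."))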