import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split


class RobertaRetrainer:
    """
    A class to retrain the RoBERTa-large model using labeled financial news data.
    This follows the Hugging Face fine-tuning approach outlined in the RETRAIN.MD guide.
    """

    def __init__(self,
                 model_name="Farshid/roberta-large-financial-phrasebank-allagree1",
                 output_dir="./nimou-RoBERTa",
                 csv_path=None):
        """
        Initialize the retrainer with model configuration.

        Args:
            model_name (str): HuggingFace model identifier
            output_dir (str): Directory where the fine-tuned model will be saved
            csv_path (str): Path to the labeled dataset CSV
        """
        self.model_name = model_name
        self.output_dir = output_dir
        self.csv_path = csv_path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = None
        self.model = None
        self.train_dataset = None
        self.val_dataset = None
        self.accuracy_metric = evaluate.load("accuracy")

        print(f"Using device: {self.device}")

        # Create the output directory if it doesn't exist
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

    def load_data(self, csv_path=None):
        """
        Load and prepare the dataset from CSV.

        Args:
            csv_path (str, optional): Override the CSV path provided in the constructor

        Returns:
            tuple: Processed train and validation datasets
        """
        if csv_path:
            self.csv_path = csv_path
        if not self.csv_path:
            raise ValueError("CSV path must be provided")

        print(f"Loading data from {self.csv_path}")
        df = pd.read_csv(self.csv_path)

        # Basic data validation
        if 'text' not in df.columns or 'label' not in df.columns:
            raise ValueError("CSV must contain 'text' and 'label' columns")

        # Split into train and validation sets
        train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

        # Convert to HuggingFace datasets
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)

        print(f"Training samples: {len(train_dataset)}")
        print(f"Validation samples: {len(val_dataset)}")

        # Display label distribution
        print("Label distribution in training set:")
        for label, count in train_df['label'].value_counts().items():
            print(f"  Label {label}: {count} samples ({count / len(train_df) * 100:.2f}%)")

        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        return train_dataset, val_dataset
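
    # Sketch of the CSV layout load_data expects. The example rows below are
    # hypothetical; only the 'text'/'label' column names (validated above) and
    # the 0/1/2 label ids (see load_model) come from this script.
    #
    #   text,label
    #   "Quarterly revenue rose 12% year over year",2
    #   "The board left its full-year guidance unchanged",1
    #   "The company issued a profit warning",0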

    def load_model(self):
        """
        Load the pretrained model and tokenizer.

        Returns:
            tuple: Loaded tokenizer and model
        """
        print(f"Loading model {self.model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=3  # NEGATIVE=0, NEUTRAL=1, POSITIVE=2
        )
        return self.tokenizer, self.model

    def tokenize_data(self, max_length=128):
        """
        Tokenize the datasets.

        Args:
            max_length (int): Maximum sequence length for tokenization

        Returns:
            tuple: Tokenized training and validation datasets
        """
        if not self.tokenizer:
            self.load_model()
        if not self.train_dataset or not self.val_dataset:
            self.load_data()

        def preprocess(examples):
            return self.tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=max_length
            )

        tokenized_train = self.train_dataset.map(preprocess, batched=True)
        tokenized_val = self.val_dataset.map(preprocess, batched=True)
        print("Datasets tokenized")

        self.train_dataset = tokenized_train
        self.val_dataset = tokenized_val
        return tokenized_train, tokenized_val

    def compute_metrics(self, eval_pred):
        """
        Compute evaluation metrics during training.

        Args:
            eval_pred (tuple): Tuple of predictions and labels from the trainer

        Returns:
            dict: Dictionary containing evaluation metrics
        """
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=1)
        acc = self.accuracy_metric.compute(predictions=predictions, references=labels)
        results = {"accuracy": acc["accuracy"]}
        return results

    def train(self,
              num_train_epochs=3,
              learning_rate=2e-5,
              per_device_train_batch_size=8,
              per_device_eval_batch_size=8,
              weight_decay=0.01,
              warmup_ratio=0.1,
              logging_steps=100,
              eval_steps=500,
              save_steps=1000,
              load_best_model_at_end=True):
        """
        Train the model on the prepared dataset.

        Args:
            num_train_epochs (int): Number of training epochs
            learning_rate (float): Learning rate for the optimizer
            per_device_train_batch_size (int): Batch size for training
            per_device_eval_batch_size (int): Batch size for evaluation
            weight_decay (float): Weight decay for regularization
            warmup_ratio (float): Ratio of warmup steps
            logging_steps (int): Number of steps between logging
            eval_steps (int): Number of steps between evaluations
            save_steps (int): Number of steps between checkpoints
            load_best_model_at_end (bool): Whether to load the best model at the end

        Returns:
            Trainer: Trained model trainer
        """
        if not self.model:
            self.load_model()
        if not self.train_dataset or not self.val_dataset:
            self.tokenize_data()

        # Set training arguments
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=per_device_eval_batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            warmup_ratio=warmup_ratio,
            evaluation_strategy="steps",
            eval_steps=eval_steps,
            logging_steps=logging_steps,
            save_steps=save_steps,
            load_best_model_at_end=load_best_model_at_end,
            metric_for_best_model="accuracy",
            save_total_limit=2,  # Keep at most 2 checkpoints on disk (the best one is retained)
        )

        # Move model to the correct device
        self.model.to(self.device)

        # Initialize the Trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset,
            tokenizer=self.tokenizer,
            compute_metrics=self.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        # Train the model
        print("Starting training...")
        trainer.train()

        # Save the best model
        trainer.save_model(self.output_dir)
        print(f"Model saved to {self.output_dir}")

        # Evaluate the model
        print("Evaluating model...")
        eval_results = trainer.evaluate()
        print(f"Evaluation results: {eval_results}")

        return trainer


def main():
    """
    Main function to demonstrate the retraining process.
    """
    # Define paths
    csv_path = "c:/Users/M/Desktop/repos/gotti/LLaMAVestor/src/logs/prepared_training_data.csv"
    output_dir = "c:/Users/M/Desktop/repos/gotti/LLaMAVestor/src/models/finetuned-roberta"

    # Initialize the retrainer
    retrainer = RobertaRetrainer(
        model_name="Farshid/roberta-large-financial-phrasebank-allagree1",
        output_dir=output_dir,
        csv_path=csv_path
    )

    # Start the training process
    retrainer.load_data()
    retrainer.tokenize_data(max_length=128)
    trainer = retrainer.train(
        num_train_epochs=5,
        learning_rate=1e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8
    )

    print("Training completed successfully!")


if __name__ == "__main__":
    main()