Snaseem2026 committed
Commit 4089b4a · verified · 1 Parent(s): 403d57b

Upload folder using huggingface_hub

Files changed (5):
  1. src/__init__.py +41 -0
  2. src/data_loader.py +205 -0
  3. src/model.py +108 -0
  4. src/utils.py +255 -0
  5. src/validation.py +174 -0
src/__init__.py ADDED
@@ -0,0 +1,41 @@
"""
Initialization for src package
"""
from .data_loader import load_config, prepare_datasets_for_training
from .model import (
    create_model,
    get_model_size,
    get_trainable_params,
    apply_class_weights
)
from .utils import (
    compute_metrics,
    compute_metrics_factory,
    plot_confusion_matrix,
    print_classification_report,
    plot_training_curves
)
from .validation import (
    validate_config,
    validate_model_path,
    validate_data_file,
    validate_config_file
)

__all__ = [
    'load_config',
    'prepare_datasets_for_training',
    'create_model',
    'get_model_size',
    'get_trainable_params',
    'apply_class_weights',
    'compute_metrics',
    'compute_metrics_factory',
    'plot_confusion_matrix',
    'print_classification_report',
    'plot_training_curves',
    'validate_config',
    'validate_model_path',
    'validate_data_file',
    'validate_config_file'
]
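With these re-exports in place, downstream scripts can pull everything from the package root instead of the individual modules. A small illustration (assuming the repository root is on the import path; the config path is a placeholder):

from src import load_config, create_model, compute_metrics_factory

config = load_config("config.yaml")  # same as src.data_loader.load_config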
src/data_loader.py ADDED
@@ -0,0 +1,205 @@
"""
Data loader utilities for Code Comment Quality Classifier
"""
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from typing import Tuple, Dict, List, Optional
import yaml
import logging
import os
from pathlib import Path


def load_config(config_path: str = "config.yaml") -> dict:
    """Load configuration from YAML file."""
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)
    return config


def load_data(data_path: str) -> pd.DataFrame:
    """
    Load data from CSV file with validation.

    Expected format:
    - comment: str (the code comment text)
    - label: str (excellent, helpful, unclear, or outdated)

    Args:
        data_path: Path to the CSV file

    Returns:
        DataFrame with validated data

    Raises:
        FileNotFoundError: If data file doesn't exist
        ValueError: If data format is invalid
    """
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Data file not found: {data_path}")

    df = pd.read_csv(data_path)

    # Validate required columns
    required_columns = ['comment', 'label']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    # Remove rows with missing values
    initial_len = len(df)
    df = df.dropna(subset=required_columns)
    if len(df) < initial_len:
        logging.warning(f"Removed {initial_len - len(df)} rows with missing values")

    # Remove empty comments
    df = df[df['comment'].str.strip().str.len() > 0]

    # Validate labels
    if df['label'].isna().any():
        logging.warning("Found NaN labels, removing those rows")
        df = df.dropna(subset=['label'])

    logging.info(f"Loaded {len(df)} samples from {data_path}")
    return df


def create_label_mapping(labels: list) -> Tuple[Dict[str, int], Dict[int, str]]:
    """Create bidirectional label mapping."""
    label2id = {label: idx for idx, label in enumerate(labels)}
    id2label = {idx: label for idx, label in enumerate(labels)}
    return label2id, id2label


def prepare_dataset(
    df: pd.DataFrame,
    label2id: Dict[str, int],
    train_size: float = 0.8,
    val_size: float = 0.1,
    test_size: float = 0.1,
    seed: int = 42,
    stratify: bool = True
) -> DatasetDict:
    """
    Prepare dataset splits for training.

    Args:
        df: DataFrame with 'comment' and 'label' columns
        label2id: Mapping from label names to IDs
        train_size: Proportion of training data
        val_size: Proportion of validation data
        test_size: Proportion of test data
        seed: Random seed for reproducibility
        stratify: Whether to maintain class distribution in splits

    Returns:
        DatasetDict with train, validation, and test splits
    """
    # Validate label distribution
    invalid_labels = set(df['label'].unique()) - set(label2id.keys())
    if invalid_labels:
        raise ValueError(f"Found invalid labels: {invalid_labels}. Expected: {list(label2id.keys())}")

    # Convert labels to IDs
    df['label_id'] = df['label'].map(label2id)

    # Check for missing mappings
    if df['label_id'].isna().any():
        missing_labels = df[df['label_id'].isna()]['label'].unique()
        raise ValueError(f"Labels not found in label2id mapping: {missing_labels}")

    # Validate split proportions
    total_size = train_size + val_size + test_size
    if abs(total_size - 1.0) > 1e-6:
        raise ValueError(f"Split sizes must sum to 1.0, got {total_size}")

    # Stratification column
    stratify_col = df['label_id'] if stratify else None

    # First split: separate test set
    train_val_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=seed,
        stratify=stratify_col
    )

    # Second split: separate train and validation
    val_size_adjusted = val_size / (train_size + val_size)
    stratify_col_train = train_val_df['label_id'] if stratify else None
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=val_size_adjusted,
        random_state=seed,
        stratify=stratify_col_train
    )

    # Log distribution
    logging.info(f"Dataset splits - Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")
    logging.info(f"Train label distribution:\n{train_df['label'].value_counts().sort_index()}")

    # Create datasets
    dataset_dict = DatasetDict({
        'train': Dataset.from_pandas(train_df[['comment', 'label_id']], preserve_index=False),
        'validation': Dataset.from_pandas(val_df[['comment', 'label_id']], preserve_index=False),
        'test': Dataset.from_pandas(test_df[['comment', 'label_id']], preserve_index=False)
    })

    return dataset_dict


def tokenize_function(examples, tokenizer, max_length: int = 512):
    """Tokenize the input text."""
    return tokenizer(
        examples['comment'],
        padding='max_length',
        truncation=True,
        max_length=max_length
    )


def prepare_datasets_for_training(config_path: str = "config.yaml"):
    """
    Complete pipeline to prepare datasets for training.

    Returns:
        Tuple of (tokenized_datasets, label2id, id2label, tokenizer)
    """
    from transformers import AutoTokenizer

    config = load_config(config_path)

    # Load data
    df = load_data(config['data']['data_path'])

    # Create label mappings
    labels = config['labels']
    label2id, id2label = create_label_mapping(labels)

    # Prepare dataset splits
    stratify = config['data'].get('stratify', True)
    dataset_dict = prepare_dataset(
        df,
        label2id,
        train_size=config['data']['train_size'],
        val_size=config['data']['val_size'],
        test_size=config['data']['test_size'],
        seed=config['training']['seed'],
        stratify=stratify
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(config['model']['name'])

    # Tokenize datasets
    tokenized_datasets = dataset_dict.map(
        lambda x: tokenize_function(x, tokenizer, config['model']['max_length']),
        batched=True,
        remove_columns=['comment']
    )

    # Rename label_id to labels for training
    tokenized_datasets = tokenized_datasets.rename_column('label_id', 'labels')

    return tokenized_datasets, label2id, id2label, tokenizer
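For context, a minimal sketch of the config.yaml this loader expects and how the pipeline would be invoked. The key names mirror the accesses in this module (model.name, model.max_length, data.data_path, data.train_size/val_size/test_size, data.stratify, training.seed, labels); the concrete values, including the base model name and data path, are placeholder assumptions, not pinned by this commit.

# Hypothetical config.yaml (values are assumptions):
#
#   model:
#     name: bert-base-uncased
#     max_length: 256
#   training:
#     seed: 42
#   data:
#     data_path: data/comments.csv
#     train_size: 0.8
#     val_size: 0.1
#     test_size: 0.1
#     stratify: true
#   labels: [excellent, helpful, unclear, outdated]

from src.data_loader import prepare_datasets_for_training

tokenized_datasets, label2id, id2label, tokenizer = prepare_datasets_for_training("config.yaml")
print(tokenized_datasets)  # DatasetDict with 'train', 'validation', 'test' splits
print(label2id)            # e.g. {'excellent': 0, 'helpful': 1, 'unclear': 2, 'outdated': 3}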
src/model.py ADDED
@@ -0,0 +1,108 @@
"""
Model definition and utilities
"""
from transformers import AutoModelForSequenceClassification, AutoConfig
from typing import Dict, Optional
import logging
import torch
import torch.nn as nn


def create_model(
    model_name: str,
    num_labels: int,
    label2id: Dict[str, int],
    id2label: Dict[int, str],
    dropout: Optional[float] = None
):
    """
    Create a sequence classification model with optional dropout configuration.

    Args:
        model_name: Name of the pretrained model
        num_labels: Number of classification labels
        label2id: Mapping from label names to IDs
        id2label: Mapping from IDs to label names
        dropout: Optional dropout probability for classifier head

    Returns:
        Initialized model
    """
    config = AutoConfig.from_pretrained(
        model_name,
        num_labels=num_labels,
        label2id=label2id,
        id2label=id2label
    )

    # Set dropout if provided
    if dropout is not None:
        if hasattr(config, 'hidden_dropout_prob'):
            config.hidden_dropout_prob = dropout
        if hasattr(config, 'attention_probs_dropout_prob'):
            config.attention_probs_dropout_prob = dropout
        if hasattr(config, 'classifier_dropout'):
            config.classifier_dropout = dropout
        logging.info(f"Set model dropout to {dropout}")

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=config
    )

    return model


def apply_class_weights(
    model: nn.Module,
    class_weights: Optional[list] = None
) -> Optional[torch.Tensor]:
    """
    Build a class-weight tensor for use in the model's loss function.

    Args:
        model: The model to modify
        class_weights: List of weights for each class (must match num_labels)

    Returns:
        Class-weight tensor (if class_weights provided) for use in a custom
        Trainer with weighted loss, otherwise None
    """
    if class_weights is not None:
        weights_tensor = torch.tensor(class_weights, dtype=torch.float32)
        # Note: This requires a custom Trainer with weighted loss
        logging.info(f"Class weights applied: {class_weights}")
        return weights_tensor
    return None


def get_model_size(model: nn.Module) -> float:
    """
    Calculate model size in millions of parameters.

    Args:
        model: PyTorch model

    Returns:
        Number of parameters in millions
    """
    param_size = sum(p.numel() for p in model.parameters())
    return param_size / 1e6


def get_trainable_params(model: nn.Module) -> Dict[str, int]:
    """
    Get count of trainable and non-trainable parameters.

    Args:
        model: PyTorch model

    Returns:
        Dictionary with 'trainable' and 'total' parameter counts
    """
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    return {
        'trainable': trainable,
        'total': total,
        'non_trainable': total - trainable
    }
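apply_class_weights only returns the weight tensor; as its inline note says, the weighting itself has to happen in a custom Trainer. A minimal sketch of how that could be wired up, assuming the class name WeightedLossTrainer, the model name, and the weight values, none of which are part of this commit:

import torch.nn as nn
from transformers import Trainer

from src.model import create_model, apply_class_weights

class WeightedLossTrainer(Trainer):
    """Trainer that applies the class-weight tensor from apply_class_weights."""

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Move the weight tensor to the logits' device before building the loss
        weights = self.class_weights.to(logits.device) if self.class_weights is not None else None
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Example wiring (placeholder model name and weights):
model = create_model("bert-base-uncased", num_labels=len(label2id), label2id=label2id, id2label=id2label, dropout=0.1)
weights = apply_class_weights(model, class_weights=[1.0, 1.0, 2.0, 2.0])
# trainer = WeightedLossTrainer(model=model, args=training_args, class_weights=weights, ...)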
src/utils.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for training and evaluation
3
+ """
4
+ import numpy as np
5
+ from sklearn.metrics import (
6
+ accuracy_score,
7
+ precision_recall_fscore_support,
8
+ confusion_matrix,
9
+ classification_report
10
+ )
11
+ import matplotlib.pyplot as plt
12
+ import seaborn as sns
13
+ from typing import Dict, Tuple, List, Optional
14
+ import os
15
+
16
+
17
+ def compute_metrics(eval_pred, id2label: Optional[Dict[int, str]] = None) -> Dict[str, float]:
18
+ """
19
+ Compute comprehensive metrics for evaluation.
20
+
21
+ Args:
22
+ eval_pred: Tuple of (predictions, labels)
23
+ id2label: Optional mapping from label IDs to label names for per-class metrics
24
+
25
+ Returns:
26
+ Dictionary of metrics including overall and per-class metrics
27
+ """
28
+ predictions, labels = eval_pred
29
+ predictions = np.argmax(predictions, axis=1)
30
+
31
+ # Overall metrics
32
+ accuracy = accuracy_score(labels, predictions)
33
+
34
+ # Weighted metrics (accounts for class imbalance)
35
+ precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
36
+ labels,
37
+ predictions,
38
+ average='weighted',
39
+ zero_division=0
40
+ )
41
+
42
+ # Macro-averaged metrics (treats all classes equally)
43
+ precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
44
+ labels,
45
+ predictions,
46
+ average='macro',
47
+ zero_division=0
48
+ )
49
+
50
+ # Micro-averaged metrics (aggregates contributions of all classes)
51
+ precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
52
+ labels,
53
+ predictions,
54
+ average='micro',
55
+ zero_division=0
56
+ )
57
+
58
+ metrics = {
59
+ 'accuracy': accuracy,
60
+ 'precision_weighted': precision_weighted,
61
+ 'recall_weighted': recall_weighted,
62
+ 'f1_weighted': f1_weighted,
63
+ 'precision_macro': precision_macro,
64
+ 'recall_macro': recall_macro,
65
+ 'f1_macro': f1_macro,
66
+ 'precision_micro': precision_micro,
67
+ 'recall_micro': recall_micro,
68
+ 'f1_micro': f1_micro,
69
+ }
70
+
71
+ # Per-class metrics if label mapping is provided
72
+ if id2label is not None:
73
+ num_classes = len(id2label)
74
+ precision_per_class, recall_per_class, f1_per_class, support = precision_recall_fscore_support(
75
+ labels,
76
+ predictions,
77
+ labels=list(range(num_classes)),
78
+ average=None,
79
+ zero_division=0
80
+ )
81
+
82
+ for i in range(num_classes):
83
+ label_name = id2label[i]
84
+ metrics[f'precision_{label_name}'] = float(precision_per_class[i])
85
+ metrics[f'recall_{label_name}'] = float(recall_per_class[i])
86
+ metrics[f'f1_{label_name}'] = float(f1_per_class[i])
87
+ metrics[f'support_{label_name}'] = int(support[i])
88
+
89
+ return metrics
90
+
91
+
92
+ def compute_metrics_factory(id2label: Optional[Dict[int, str]] = None):
93
+ """
94
+ Factory function to create compute_metrics with label mapping.
95
+
96
+ Args:
97
+ id2label: Mapping from label IDs to label names
98
+
99
+ Returns:
100
+ Function compatible with HuggingFace Trainer
101
+ """
102
+ def compute_metrics_fn(eval_pred):
103
+ return compute_metrics(eval_pred, id2label)
104
+
105
+ return compute_metrics_fn
106
+
107
+
108
+ def plot_confusion_matrix(
109
+ y_true: np.ndarray,
110
+ y_pred: np.ndarray,
111
+ labels: List[str],
112
+ save_path: str = "confusion_matrix.png",
113
+ normalize: bool = False,
114
+ figsize: Tuple[int, int] = (10, 8)
115
+ ) -> None:
116
+ """
117
+ Plot and save confusion matrix with optional normalization.
118
+
119
+ Args:
120
+ y_true: True labels
121
+ y_pred: Predicted labels
122
+ labels: List of label names
123
+ save_path: Path to save the plot
124
+ normalize: If True, normalize confusion matrix to percentages
125
+ figsize: Figure size (width, height)
126
+ """
127
+ cm = confusion_matrix(y_true, y_pred, labels=list(range(len(labels))))
128
+
129
+ if normalize:
130
+ cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
131
+ fmt = '.2f'
132
+ title = 'Normalized Confusion Matrix'
133
+ else:
134
+ fmt = 'd'
135
+ title = 'Confusion Matrix'
136
+
137
+ plt.figure(figsize=figsize)
138
+ sns.heatmap(
139
+ cm,
140
+ annot=True,
141
+ fmt=fmt,
142
+ cmap='Blues',
143
+ xticklabels=labels,
144
+ yticklabels=labels,
145
+ cbar_kws={'label': 'Percentage' if normalize else 'Count'}
146
+ )
147
+ plt.title(title, fontsize=14, fontweight='bold')
148
+ plt.ylabel('True Label', fontsize=12)
149
+ plt.xlabel('Predicted Label', fontsize=12)
150
+ plt.tight_layout()
151
+
152
+ # Create directory if it doesn't exist
153
+ os.makedirs(os.path.dirname(save_path) if os.path.dirname(save_path) else '.', exist_ok=True)
154
+ plt.savefig(save_path, dpi=300, bbox_inches='tight')
155
+ plt.close()
156
+
157
+ print(f"Confusion matrix saved to {save_path}")
158
+
159
+
160
+ def print_classification_report(
161
+ y_true: np.ndarray,
162
+ y_pred: np.ndarray,
163
+ labels: List[str],
164
+ output_dict: bool = False
165
+ ) -> Optional[Dict]:
166
+ """
167
+ Print detailed classification report.
168
+
169
+ Args:
170
+ y_true: True labels
171
+ y_pred: Predicted labels
172
+ labels: List of label names
173
+ output_dict: If True, return report as dictionary instead of printing
174
+
175
+ Returns:
176
+ Classification report as dictionary if output_dict=True, else None
177
+ """
178
+ report = classification_report(
179
+ y_true,
180
+ y_pred,
181
+ target_names=labels,
182
+ digits=4,
183
+ output_dict=output_dict,
184
+ zero_division=0
185
+ )
186
+
187
+ if output_dict:
188
+ return report
189
+
190
+ print("\nClassification Report:")
191
+ print("=" * 60)
192
+ print(report)
193
+ return None
194
+
195
+
196
+ def plot_training_curves(
197
+ train_losses: List[float],
198
+ eval_losses: List[float],
199
+ eval_metrics: Dict[str, List[float]],
200
+ save_path: str = "./results/training_curves.png"
201
+ ) -> None:
202
+ """
203
+ Plot training and evaluation curves.
204
+
205
+ Args:
206
+ train_losses: List of training losses per step/epoch
207
+ eval_losses: List of evaluation losses per step/epoch
208
+ eval_metrics: Dictionary of metric names to lists of values
209
+ save_path: Path to save the plot
210
+ """
211
+ fig, axes = plt.subplots(2, 2, figsize=(15, 10))
212
+
213
+ # Loss curves
214
+ axes[0, 0].plot(train_losses, label='Train Loss', color='blue')
215
+ axes[0, 0].plot(eval_losses, label='Eval Loss', color='red')
216
+ axes[0, 0].set_xlabel('Step/Epoch')
217
+ axes[0, 0].set_ylabel('Loss')
218
+ axes[0, 0].set_title('Training and Validation Loss')
219
+ axes[0, 0].legend()
220
+ axes[0, 0].grid(True, alpha=0.3)
221
+
222
+ # Accuracy
223
+ if 'accuracy' in eval_metrics:
224
+ axes[0, 1].plot(eval_metrics['accuracy'], label='Accuracy', color='green')
225
+ axes[0, 1].set_xlabel('Step/Epoch')
226
+ axes[0, 1].set_ylabel('Accuracy')
227
+ axes[0, 1].set_title('Validation Accuracy')
228
+ axes[0, 1].legend()
229
+ axes[0, 1].grid(True, alpha=0.3)
230
+
231
+ # F1 Score
232
+ if 'f1_weighted' in eval_metrics:
233
+ axes[1, 0].plot(eval_metrics['f1_weighted'], label='F1 (weighted)', color='purple')
234
+ axes[1, 0].set_xlabel('Step/Epoch')
235
+ axes[1, 0].set_ylabel('F1 Score')
236
+ axes[1, 0].set_title('Validation F1 Score')
237
+ axes[1, 0].legend()
238
+ axes[1, 0].grid(True, alpha=0.3)
239
+
240
+ # Precision and Recall
241
+ if 'precision_weighted' in eval_metrics and 'recall_weighted' in eval_metrics:
242
+ axes[1, 1].plot(eval_metrics['precision_weighted'], label='Precision', color='orange')
243
+ axes[1, 1].plot(eval_metrics['recall_weighted'], label='Recall', color='cyan')
244
+ axes[1, 1].set_xlabel('Step/Epoch')
245
+ axes[1, 1].set_ylabel('Score')
246
+ axes[1, 1].set_title('Validation Precision and Recall')
247
+ axes[1, 1].legend()
248
+ axes[1, 1].grid(True, alpha=0.3)
249
+
250
+ plt.tight_layout()
251
+ os.makedirs(os.path.dirname(save_path) if os.path.dirname(save_path) else '.', exist_ok=True)
252
+ plt.savefig(save_path, dpi=300, bbox_inches='tight')
253
+ plt.close()
254
+
255
+ print(f"Training curves saved to {save_path}")
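A possible way to wire these utilities into a HuggingFace Trainer run, continuing from the loader and model sketches above. The output directory, variable names carried over from earlier snippets (model, tokenized_datasets, id2label), and the decision to evaluate on the test split afterwards are assumptions, not part of this commit:

import numpy as np
from transformers import Trainer, TrainingArguments

from src.utils import compute_metrics_factory, plot_confusion_matrix, print_classification_report

trainer = Trainer(
    model=model,                                   # e.g. from create_model(...)
    args=TrainingArguments(output_dir="./results"),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics_factory(id2label),  # adds per-class metrics to eval logs
)
trainer.train()

# Test-set diagnostics using the plotting helpers
pred_output = trainer.predict(tokenized_datasets["test"])
y_pred = np.argmax(pred_output.predictions, axis=1)
y_true = pred_output.label_ids
label_names = [id2label[i] for i in range(len(id2label))]
plot_confusion_matrix(y_true, y_pred, label_names, save_path="./results/confusion_matrix.png")
print_classification_report(y_true, y_pred, label_names)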
src/validation.py ADDED
@@ -0,0 +1,174 @@
"""
Validation utilities for model and data validation
"""
import os
import yaml
from typing import Dict, List, Optional
import logging
from pathlib import Path


def validate_config(config: Dict) -> List[str]:
    """
    Validate configuration file for common issues.

    Args:
        config: Configuration dictionary

    Returns:
        List of validation error messages (empty if valid)
    """
    errors = []

    # Check required sections
    required_sections = ['model', 'training', 'data', 'labels']
    for section in required_sections:
        if section not in config:
            errors.append(f"Missing required section: {section}")

    if errors:
        return errors

    # Validate model section
    if 'name' not in config['model']:
        errors.append("model.name is required")
    if 'num_labels' not in config['model']:
        errors.append("model.num_labels is required")
    elif config['model']['num_labels'] != len(config.get('labels', [])):
        errors.append(f"model.num_labels ({config['model']['num_labels']}) doesn't match number of labels ({len(config['labels'])})")

    # Validate training section
    training = config['training']
    if 'num_train_epochs' in training and training['num_train_epochs'] <= 0:
        errors.append("training.num_train_epochs must be positive")
    if 'learning_rate' in training and training['learning_rate'] <= 0:
        errors.append("training.learning_rate must be positive")
    if 'per_device_train_batch_size' in training and training['per_device_train_batch_size'] <= 0:
        errors.append("training.per_device_train_batch_size must be positive")

    # Validate data section
    data = config['data']
    if 'data_path' in data and not os.path.exists(data['data_path']):
        errors.append(f"Data file not found: {data['data_path']}")

    train_size = data.get('train_size', 0)
    val_size = data.get('val_size', 0)
    test_size = data.get('test_size', 0)
    total = train_size + val_size + test_size
    if abs(total - 1.0) > 1e-6:
        errors.append(f"Data split sizes must sum to 1.0, got {total}")

    # Validate labels
    if 'labels' not in config or not config['labels']:
        errors.append("labels section is required and cannot be empty")
    elif len(set(config['labels'])) != len(config['labels']):
        errors.append("labels must be unique")

    return errors


def validate_model_path(model_path: str) -> bool:
    """
    Validate that model path exists and contains required files.

    Args:
        model_path: Path to model directory

    Returns:
        True if valid, False otherwise
    """
    if not os.path.exists(model_path):
        logging.error(f"Model path does not exist: {model_path}")
        return False

    required_files = ['config.json']
    for file in required_files:
        file_path = os.path.join(model_path, file)
        if not os.path.exists(file_path):
            logging.error(f"Required file missing: {file_path}")
            return False

    return True


def validate_data_file(data_path: str, required_columns: Optional[List[str]] = None) -> List[str]:
    """
    Validate data file format and content.

    Args:
        data_path: Path to data file
        required_columns: List of required column names

    Returns:
        List of validation error messages (empty if valid)
    """
    errors = []

    if required_columns is None:
        required_columns = ['comment', 'label']

    if not os.path.exists(data_path):
        errors.append(f"Data file not found: {data_path}")
        return errors

    try:
        import pandas as pd
        df = pd.read_csv(data_path)

        # Check required columns
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            errors.append(f"Missing required columns: {missing_columns}")

        # Check for empty dataframe
        if len(df) == 0:
            errors.append("Data file is empty")

        # Check for missing values in required columns
        if 'comment' in df.columns:
            empty_comments = df['comment'].isna().sum() + (df['comment'].str.strip().str.len() == 0).sum()
            if empty_comments > 0:
                errors.append(f"Found {empty_comments} empty comments")

        if 'label' in df.columns:
            missing_labels = df['label'].isna().sum()
            if missing_labels > 0:
                errors.append(f"Found {missing_labels} missing labels")

    except Exception as e:
        errors.append(f"Error reading data file: {str(e)}")

    return errors


def validate_config_file(config_path: str) -> bool:
    """
    Validate configuration file.

    Args:
        config_path: Path to configuration file

    Returns:
        True if valid, False otherwise
    """
    if not os.path.exists(config_path):
        logging.error(f"Config file not found: {config_path}")
        return False

    try:
        with open(config_path, 'r') as f:
            config = yaml.safe_load(f)

        errors = validate_config(config)
        if errors:
            logging.error("Configuration validation errors:")
            for error in errors:
                logging.error(f"  - {error}")
            return False

        logging.info("Configuration file is valid")
        return True

    except Exception as e:
        logging.error(f"Error reading config file: {str(e)}")
        return False
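A short pre-flight sketch showing how these checks could gate a training run; the file paths are placeholders consistent with the config keys used elsewhere in this commit:

import sys
import logging

from src.validation import validate_config_file, validate_data_file

logging.basicConfig(level=logging.INFO)

# Fail fast if the configuration is missing sections or has inconsistent splits
if not validate_config_file("config.yaml"):
    sys.exit("Aborting: config.yaml failed validation")

# Surface data problems (missing columns, empty comments, missing labels) before training
data_errors = validate_data_file("data/comments.csv")
if data_errors:
    for err in data_errors:
        logging.error(err)
    sys.exit("Aborting: data file failed validation")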