Snaseem2026
/

code-comment-classifier

+# Training Configuration for Code Comment Quality Classifier
+model:
+  # Identifier of the base model.
+  name: 'distilbert-base-uncased'
+  # Equals the number of entries under the top-level `labels` key.
+  num_labels: 4
+  # NOTE(review): presumably the maximum sequence length in tokens; confirm
+  # against the code that reads this value.
+  max_length: 512
+  # Dropout probability, applied for regularization.
+  dropout: 0.1
+training:
+  output_dir: './results'
+  num_train_epochs: 3
+  per_device_train_batch_size: 16
+  per_device_eval_batch_size: 32
+  # Effective batch size =
+  #   per_device_batch_size * gradient_accumulation_steps * num_gpus
+  gradient_accumulation_steps: 1
+  learning_rate: 0.00002
+  # Options: linear, cosine, cosine_with_restarts, polynomial, constant,
+  # constant_with_warmup
+  lr_scheduler_type: 'cosine'
+  weight_decay: 0.01
+  warmup_steps: 500
+  # Alternative to warmup_steps, given as a ratio of total training steps.
+  warmup_ratio: null
+  logging_steps: 100
+  eval_steps: 500
+  save_steps: 1000
+  # Upper bound on the number of checkpoints kept.
+  save_total_limit: 3
+  evaluation_strategy: 'steps'
+  save_strategy: 'steps'
+  load_best_model_at_end: true
+  metric_for_best_model: 'f1'
+  greater_is_better: true
+  # Evaluations without improvement that are tolerated before stopping.
+  early_stopping_patience: 3
+  # Smallest improvement that resets the patience counter.
+  early_stopping_threshold: 0.001
+  seed: 42
+  # Mixed precision training; set to true on a GPU with Tensor Cores.
+  fp16: false
+  # Worker count for data loading.
+  dataloader_num_workers: 4
+  # Pin memory for faster transfer to the GPU.
+  dataloader_pin_memory: true
+  remove_unused_columns: true
+  # Options: "wandb", "tensorboard", "none", or a list of these.
+  report_to:
+    - 'none'
+# Class weights for handling imbalanced data; null means equal weights.
+# Example: [1.0, 1.0, 1.2, 1.0] gives extra weight to the third class
+# ("unclear" in the `labels` list) when it needs more emphasis.
+class_weights: null
+data:
+  # Split fractions; the three values below add up to 1.0.
+  train_size: 0.8
+  val_size: 0.1
+  test_size: 0.1
+  # CSV file holding the comments dataset.
+  data_path: './data/comments.csv'
+  shuffle: true
+  # Keep the class distribution the same in every split.
+  stratify: true
+# Class names; four entries, the same count as `model.num_labels`.
+# NOTE(review): list order presumably defines the class indices (the
+# class_weights example treats the third entry as "unclear") - confirm
+# against the code that reads this list before reordering.
+labels:
+  - 'excellent'
+  - 'helpful'
+  - 'unclear'
+  - 'outdated'
+# Logging settings
+logging:
+  # One of: DEBUG, INFO, WARNING, ERROR
+  level: 'INFO'
+  log_file: './results/training.log'