stmasson committed
Commit 188cdd5 · verified · Parent: 9c8cf56

Upload scripts/train_qwen3_sft_multitask.py with huggingface_hub
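(For context, uploads like the one in this commit message are typically done with huggingface_hub's HfApi.upload_file. A minimal sketch, assuming a hypothetical repo id; only the file path and the commit message come from this page:)

from huggingface_hub import HfApi

api = HfApi()  # uses the token from `huggingface-cli login` by default
api.upload_file(
    path_or_fileobj="scripts/train_qwen3_sft_multitask.py",  # local file to upload
    path_in_repo="scripts/train_qwen3_sft_multitask.py",     # destination path in the repo
    repo_id="stmasson/qwen3-sft-multitask",                  # hypothetical repo id
    commit_message="Upload scripts/train_qwen3_sft_multitask.py with huggingface_hub",
)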

scripts/train_qwen3_sft_multitask.py CHANGED
@@ -65,7 +65,7 @@ NUM_EPOCHS = int(os.environ.get("NUM_EPOCHS", "1"))
 BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "1"))
 GRAD_ACCUM = int(os.environ.get("GRAD_ACCUM", "8"))
 LEARNING_RATE = float(os.environ.get("LEARNING_RATE", "1e-5"))
-MAX_SEQ_LENGTH = int(os.environ.get("MAX_SEQ_LENGTH", "8192"))
+MAX_SEQ_LENGTH = int(os.environ.get("MAX_SEQ_LENGTH", "4096"))
 
 # LoRA (continuing from DPO adapter)
 LORA_R = int(os.environ.get("LORA_R", "32"))
@@ -164,6 +164,17 @@ val_dataset = load_jsonl_dataset(DATASET_REPO, VAL_FILE)
 print(f"Train: {len(train_dataset)} examples")
 print(f"Validation: {len(val_dataset)} examples")
 
+# Filter out very long examples to avoid OOM
+def filter_by_length(example):
+    """Filter examples that would be too long."""
+    total_len = sum(len(m.get('content', '')) for m in example['messages'])
+    return total_len < 30000  # ~7500 tokens max
+
+print("Filtering long examples...")
+train_dataset = train_dataset.filter(filter_by_length)
+val_dataset = val_dataset.filter(filter_by_length)
+print(f"After filtering - Train: {len(train_dataset)}, Val: {len(val_dataset)}")
+
 # Format examples
 def format_example(example):
     """Format messages to text for training."""