#!/bin/bash
# Fine-tuning launcher: parses CLI flags, configures CUDA/threading
# environment variables, then invokes the L2 training script.
#
# Expects MODEL_BASE_PATH, USER_NAME, and MODEL_PERSONAL_DIR to be set in the
# environment; PLATFORM (e.g. "apple") may be set to control precision.

# Initialize variables
LEARNING_RATE="2e-4"
NUM_TRAIN_EPOCHS="3"
CONCURRENCY_THREADS="2"
DATA_SYNTHESIS_MODE="low"
HALF=False
USE_CUDA=False  # Default to False; overridden by the --cuda parameter
IS_COT=False

# Process parameters
while [[ "$#" -gt 0 ]]; do
    case $1 in
        --lr) LEARNING_RATE="$2"; shift ;;
        --epochs) NUM_TRAIN_EPOCHS="$2"; shift ;;
        --threads) CONCURRENCY_THREADS="$2"; shift ;;
        --mode) DATA_SYNTHESIS_MODE="$2"; shift ;;
        --cuda)
            # Convert the value to lowercase for consistent comparison
            cuda_value=$(echo "$2" | tr '[:upper:]' '[:lower:]')
            if [[ "$cuda_value" == "true" || "$cuda_value" == "1" || "$cuda_value" == "yes" ]]; then
                USE_CUDA=True
                echo "CUDA enabled by user configuration."
            else
                USE_CUDA=False
                echo "CUDA disabled by user configuration."
            fi
            shift
            ;;
        --is_cot) IS_COT="$2"; shift ;;
        *) echo "Unknown parameter: $1"; exit 1 ;;
    esac
    shift
done

# Explicitly log the CUDA setting passed from the command line
echo "CUDA parameter received: $USE_CUDA"

# Configure CUDA environment variables if CUDA is enabled
if [[ "$USE_CUDA" == "True" ]]; then
    # Point PyTorch at the first GPU
    export CUDA_VISIBLE_DEVICES=0
    echo "CUDA_VISIBLE_DEVICES set to 0"

    # CUDA_LAUNCH_BLOCKING=0 keeps kernel launches asynchronous (better performance)
    export CUDA_LAUNCH_BLOCKING=0
    echo "CUDA_LAUNCH_BLOCKING set to 0 for better performance"
else
    # Explicitly hide all GPUs so PyTorch falls back to CPU
    export CUDA_VISIBLE_DEVICES=""
    echo "CUDA_VISIBLE_DEVICES explicitly disabled"
fi

# Log the parameters being used
echo "Using training parameters:"
echo "  Learning rate: $LEARNING_RATE"
echo "  Number of epochs: $NUM_TRAIN_EPOCHS"
echo "  Concurrency threads: $CONCURRENCY_THREADS"
echo "  Data synthesis mode: $DATA_SYNTHESIS_MODE"
echo "  Use CUDA: $USE_CUDA"
echo "  Is chain of thought: $IS_COT"

# If more than one concurrency thread is requested, configure the related
# environment variables
if [ "$CONCURRENCY_THREADS" != "1" ]; then
    # Cap the number of parallel threads to avoid memory issues
    export OMP_NUM_THREADS=$CONCURRENCY_THREADS
    export MKL_NUM_THREADS=$CONCURRENCY_THREADS
    export NUMEXPR_NUM_THREADS=$CONCURRENCY_THREADS

    # Cap the CUDA allocator's block size to reduce memory fragmentation
    export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
    echo "Set thread environment variables to $CONCURRENCY_THREADS"
fi

# Enable BF16 half precision based on the platform and CUDA availability
if [ "$PLATFORM" != "apple" ] && [ "$USE_CUDA" == "True" ]; then
    HALF=True
    echo "Enabling BF16 half precision for non-Apple platform with CUDA"
else
    echo "Using standard precision (not using BF16)"
fi

# Print environment for debugging
echo "Environment configuration:"
echo "  CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"
echo "  PYTORCH_CUDA_ALLOC_CONF: ${PYTORCH_CUDA_ALLOC_CONF}"
echo "  Using half precision: ${HALF}"
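
# Pre-flight check (added sketch): the training command below reads
# MODEL_BASE_PATH, USER_NAME, and MODEL_PERSONAL_DIR from the environment, so
# fail fast with a clear message if any of them is missing. How these values
# are normally provided is an assumption about the calling environment.
for required_var in MODEL_BASE_PATH USER_NAME MODEL_PERSONAL_DIR; do
    if [ -z "${!required_var}" ]; then
        echo "Error: required environment variable $required_var is not set"
        exit 1
    fi
done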
# Launch the training script with the parameters assembled above; model paths
# and user identity come from the environment
python lpm_kernel/L2/train.py \
    --seed 42 \
    --model_name_or_path "${MODEL_BASE_PATH}" \
    --user_name "${USER_NAME}" \
    --dataset_name "resources/L2/data/merged.json" \
    --chat_template_format "chatml" \
    --add_special_tokens False \
    --append_concat_token False \
    --max_seq_length 2048 \
    --num_train_epochs "$NUM_TRAIN_EPOCHS" \
    --save_total_limit 2 \
    --logging_steps 20 \
    --log_level "info" \
    --logging_strategy "steps" \
    --save_strategy "steps" \
    --save_steps 5 \
    --push_to_hub False \
    --bf16 "$HALF" \
    --packing False \
    --learning_rate "$LEARNING_RATE" \
    --lr_scheduler_type "cosine" \
    --weight_decay 1e-4 \
    --max_grad_norm 0.3 \
    --output_dir "${MODEL_PERSONAL_DIR}" \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps "$CONCURRENCY_THREADS" \
    --gradient_checkpointing True \
    --use_reentrant False \
    --use_peft_lora True \
    --lora_r 8 \
    --lora_alpha 16 \
    --lora_dropout 0.1 \
    --lora_target_modules "all-linear" \
    --use_4bit_quantization False \
    --use_nested_quant False \
    --bnb_4bit_compute_dtype "bfloat16" \
    --is_cot "$IS_COT" \
    --use_cuda "$USE_CUDA"
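
# Exit-status handling (added sketch): surface the trainer's exit code so a
# calling process can distinguish success from failure. The original script
# simply ended after the python invocation; propagating the code like this is
# an assumption about how the script is consumed.
status=$?
if [ "$status" -ne 0 ]; then
    echo "Training failed with exit code $status"
    exit "$status"
fi
echo "Training completed successfully"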
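
# Example invocation (illustrative placeholder values, not from the source;
# the script filename is assumed):
#   MODEL_BASE_PATH=/path/to/base-model \
#   USER_NAME=alice \
#   MODEL_PERSONAL_DIR=/path/to/output \
#   bash train_for_user.sh --lr 1e-4 --epochs 5 --threads 4 --mode low \
#       --cuda true --is_cot true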