Spaces:
Sleeping
Sleeping
# Training launcher: parses command-line options, configures CUDA and
# threading environment variables, then runs lpm_kernel/L2/train.py.
#
# Options (every option takes exactly one value):
#   --lr RATE        learning rate                      (default 2e-4)
#   --epochs N       number of training epochs          (default 3)
#   --threads N      thread cap / accumulation steps    (default 2)
#   --mode MODE      data synthesis mode                (default low)
#   --cuda VALUE     true|1|yes (case-insensitive) enables CUDA;
#                    any other value disables it        (default off)
#   --is_cot VALUE   forwarded verbatim to train.py     (default False)
#
# Environment read but not set here:
#   MODEL_BASE_PATH, USER_NAME, MODEL_PERSONAL_DIR, PLATFORM
#
# Exit status: 1 on an invalid command line, otherwise the status of the
# python training command (it is the last command executed).

# Defaults; each may be overridden by the matching command-line option.
LEARNING_RATE="2e-4"
NUM_TRAIN_EPOCHS="3"
CONCURRENCY_THREADS="2"
DATA_SYNTHESIS_MODE="low"
HALF=False
USE_CUDA=False # Stays False unless --cuda is given a truthy value.
IS_COT=False

#######################################
# Print a message to stderr and terminate the script with status 1.
# Arguments: the message words.
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Parse command-line options into the global configuration variables.
# Globals:   LEARNING_RATE, NUM_TRAIN_EPOCHS, CONCURRENCY_THREADS,
#            DATA_SYNTHESIS_MODE, USE_CUDA, IS_COT (written)
# Arguments: the script's positional parameters.
# Outputs:   a one-line notice on stdout when --cuda is processed;
#            an error on stderr for a bad command line.
# Returns:   0 on success; exits 1 on an unknown option or when an
#            option is not followed by a value.
#######################################
parse_args() {
  local opt value
  while (($# > 0)); do
    opt="$1"
    case "$opt" in
      --lr | --epochs | --threads | --mode | --cuda | --is_cot)
        # Without this check a trailing option would silently store an
        # empty value and corrupt the training command line.
        (($# >= 2)) || die "Missing value for parameter: $opt"
        value="$2"
        shift 2
        ;;
      *)
        die "Unknown parameter: $opt"
        ;;
    esac

    case "$opt" in
      --lr) LEARNING_RATE="$value" ;;
      --epochs) NUM_TRAIN_EPOCHS="$value" ;;
      --threads) CONCURRENCY_THREADS="$value" ;;
      --mode) DATA_SYNTHESIS_MODE="$value" ;;
      --is_cot) IS_COT="$value" ;;
      --cuda)
        # Lowercase first so TRUE/True/true (etc.) compare equal.
        value=$(printf '%s' "$value" | tr '[:upper:]' '[:lower:]')
        if [[ "$value" == "true" || "$value" == "1" || "$value" == "yes" ]]; then
          USE_CUDA=True
          echo "CUDA enabled by user configuration."
        else
          USE_CUDA=False
          echo "CUDA disabled by user configuration."
        fi
        ;;
    esac
  done
}

#######################################
# Log the CUDA decision and export the matching CUDA environment.
# Globals:   USE_CUDA (read); CUDA_VISIBLE_DEVICES, CUDA_LAUNCH_BLOCKING
#            (exported)
#######################################
configure_cuda() {
  echo "CUDA parameter received: $USE_CUDA"
  if [[ "$USE_CUDA" == "True" ]]; then
    # Expose only device 0 to the training process.
    export CUDA_VISIBLE_DEVICES=0
    echo "CUDA_VISIBLE_DEVICES set to 0"
    # 0 keeps kernel launches asynchronous.
    export CUDA_LAUNCH_BLOCKING=0
    echo "CUDA_LAUNCH_BLOCKING set to 0 for better performance"
  else
    # An empty device list hides every GPU from the training process.
    export CUDA_VISIBLE_DEVICES=""
    echo "CUDA_VISIBLE_DEVICES explicitly disabled"
  fi
}

#######################################
# Echo the effective training parameters.
# Note: DATA_SYNTHESIS_MODE is only logged here; this script does not
# forward it to train.py.
#######################################
log_parameters() {
  echo "Using training parameters:"
  echo " Learning rate: $LEARNING_RATE"
  echo " Number of epochs: $NUM_TRAIN_EPOCHS"
  echo " Concurrency threads: $CONCURRENCY_THREADS"
  echo " Data synthesis mode: $DATA_SYNTHESIS_MODE"
  echo " Use CUDA: $USE_CUDA"
  echo " Is chain of thought: $IS_COT"
}

#######################################
# Export thread caps for OpenMP / MKL / NumExpr.
# The exports happen only when CONCURRENCY_THREADS differs from "1";
# with exactly "1" none of these variables are touched.
# Globals:   CONCURRENCY_THREADS (read); OMP_NUM_THREADS, MKL_NUM_THREADS,
#            NUMEXPR_NUM_THREADS, PYTORCH_CUDA_ALLOC_CONF (exported)
#######################################
configure_threads() {
  if [[ "$CONCURRENCY_THREADS" != "1" ]]; then
    # Limit the number of parallel threads to avoid memory issues.
    export OMP_NUM_THREADS="$CONCURRENCY_THREADS"
    export MKL_NUM_THREADS="$CONCURRENCY_THREADS"
    export NUMEXPR_NUM_THREADS="$CONCURRENCY_THREADS"
    # CUDA caching-allocator setting (not a threading control); it is
    # exported on this branch only, as it always has been.
    export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
    echo "Set thread environment variables to $CONCURRENCY_THREADS"
  fi
}

#######################################
# Turn on BF16 when CUDA is enabled and PLATFORM is not "apple".
# HALF is left at its current value otherwise.
# Globals:   PLATFORM, USE_CUDA (read); HALF (written)
#######################################
select_precision() {
  if [[ "${PLATFORM:-}" != "apple" && "$USE_CUDA" == "True" ]]; then
    HALF=True
    echo "Enabling BF16 half precision for non-Apple platform with CUDA"
  else
    echo "Using standard precision (not using BF16)"
  fi
}

#######################################
# Echo the environment the training process will inherit (debug aid).
#######################################
log_environment() {
  echo "Environment configuration:"
  echo " CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"
  echo " PYTORCH_CUDA_ALLOC_CONF: ${PYTORCH_CUDA_ALLOC_CONF}"
  echo " Using half precision: ${HALF}"
}

#######################################
# Run the training script with the resolved configuration.
# Every expansion is quoted so that an empty or space-containing value
# stays a single argument instead of shifting the flags that follow it.
# Globals:   MODEL_BASE_PATH, USER_NAME, MODEL_PERSONAL_DIR,
#            NUM_TRAIN_EPOCHS, HALF, LEARNING_RATE, CONCURRENCY_THREADS,
#            IS_COT, USE_CUDA (read)
# Returns:   the exit status of the python command.
#######################################
run_training() {
  python lpm_kernel/L2/train.py \
    --seed 42 \
    --model_name_or_path "${MODEL_BASE_PATH}" \
    --user_name "${USER_NAME}" \
    --dataset_name "resources/L2/data/merged.json" \
    --chat_template_format "chatml" \
    --add_special_tokens False \
    --append_concat_token False \
    --max_seq_length 2048 \
    --num_train_epochs "$NUM_TRAIN_EPOCHS" \
    --save_total_limit 2 \
    --logging_steps 20 \
    --log_level "info" \
    --logging_strategy "steps" \
    --save_strategy "steps" \
    --save_steps 5 \
    --push_to_hub False \
    --bf16 "$HALF" \
    --packing False \
    --learning_rate "$LEARNING_RATE" \
    --lr_scheduler_type "cosine" \
    --weight_decay 1e-4 \
    --max_grad_norm 0.3 \
    --output_dir "${MODEL_PERSONAL_DIR}" \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps "$CONCURRENCY_THREADS" \
    --gradient_checkpointing True \
    --use_reentrant False \
    --use_peft_lora True \
    --lora_r 8 \
    --lora_alpha 16 \
    --lora_dropout 0.1 \
    --lora_target_modules "all-linear" \
    --use_4bit_quantization False \
    --use_nested_quant False \
    --bnb_4bit_compute_dtype "bfloat16" \
    --is_cot "$IS_COT" \
    --use_cuda "$USE_CUDA"
}

#######################################
# Entry point: same step order as the original flat script.
#######################################
main() {
  parse_args "$@"
  configure_cuda
  log_parameters
  configure_threads
  select_precision
  log_environment
  run_training
}

main "$@"