secondme-api / lpm_kernel/L2/train_for_user.sh
#!/bin/bash
# Initialize variables
LEARNING_RATE="2e-4"
NUM_TRAIN_EPOCHS="3"
CONCURRENCY_THREADS="2"
DATA_SYNTHESIS_MODE="low"
HALF=False
USE_CUDA=False # Default to False, will be overridden by parameter
IS_COT=False
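# The following environment variables are expected to be provided by the caller (they are
# referenced below but never set in this script):
#   MODEL_BASE_PATH     - base model path passed to --model_name_or_path
#   USER_NAME           - user identifier passed to --user_name
#   MODEL_PERSONAL_DIR  - output directory for the fine-tuned model
#   PLATFORM            - platform hint; BF16 is only enabled when this is not "apple"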
# Process parameters
while [[ "$#" -gt 0 ]]; do
    case $1 in
        --lr) LEARNING_RATE="$2"; shift ;;
        --epochs) NUM_TRAIN_EPOCHS="$2"; shift ;;
        --threads) CONCURRENCY_THREADS="$2"; shift ;;
        --mode) DATA_SYNTHESIS_MODE="$2"; shift ;;
        --cuda)
            # Convert string to lowercase for consistent comparison
            cuda_value=$(echo "$2" | tr '[:upper:]' '[:lower:]')
            if [[ "$cuda_value" == "true" || "$cuda_value" == "1" || "$cuda_value" == "yes" ]]; then
                USE_CUDA=True
                echo "CUDA enabled by user configuration."
            else
                USE_CUDA=False
                echo "CUDA disabled by user configuration."
            fi
            shift ;;
        --is_cot) IS_COT="$2"; shift ;;
        *) echo "Unknown parameter: $1"; exit 1 ;;
    esac
    shift
done
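# Example invocation (illustrative values only; adjust to your environment):
#   bash lpm_kernel/L2/train_for_user.sh --lr 2e-4 --epochs 3 --threads 2 --mode low --cuda true --is_cot false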
# Log the CUDA setting resolved from the command-line parameter
echo "CUDA setting after parsing: $USE_CUDA"
# Configure CUDA environment variables if enabled
if [[ "$USE_CUDA" == "True" ]]; then
    # Set CUDA environment variables so PyTorch detects the GPU
    export CUDA_VISIBLE_DEVICES=0
    echo "CUDA_VISIBLE_DEVICES set to 0"
    # Set CUDA_LAUNCH_BLOCKING to 0 for asynchronous kernel launches (better performance)
    export CUDA_LAUNCH_BLOCKING=0
    echo "CUDA_LAUNCH_BLOCKING set to 0 for better performance"
else
    # Explicitly disable CUDA
    export CUDA_VISIBLE_DEVICES=""
    echo "CUDA_VISIBLE_DEVICES explicitly disabled"
fi
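# Optional sanity check (assumes PyTorch is installed in this environment); uncomment to
# confirm whether torch actually sees a GPU with the settings above:
#   python -c "import torch; print('CUDA available:', torch.cuda.is_available())"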
# Log the parameters being used
echo "Using training parameters:"
echo " Learning rate: $LEARNING_RATE"
echo " Number of epochs: $NUM_TRAIN_EPOCHS"
echo " Concurrency threads: $CONCURRENCY_THREADS"
echo " Data synthesis mode: $DATA_SYNTHESIS_MODE"
echo " Use CUDA: $USE_CUDA"
echo " Is chain of thought: $IS_COT"
# If the thread count is not 1, configure related threading environment variables
if [ "$CONCURRENCY_THREADS" != "1" ]; then
    # Limit the number of parallel threads to avoid memory issues
    export OMP_NUM_THREADS="$CONCURRENCY_THREADS"
    export MKL_NUM_THREADS="$CONCURRENCY_THREADS"
    export NUMEXPR_NUM_THREADS="$CONCURRENCY_THREADS"
    # Cap the CUDA caching allocator's split size to reduce memory fragmentation
    export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
    echo "Set thread environment variables to $CONCURRENCY_THREADS"
fi
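# Optional: verify that the intra-op thread limit is picked up (assumes PyTorch is installed):
#   python -c "import torch; print('torch threads:', torch.get_num_threads())"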
# Add BF16 option based on the platform and CUDA availability
if [ "$PLATFORM" != "apple" ] && [ "$USE_CUDA" == "True" ]; then
    HALF=True
    echo "Enabling BF16 half precision for non-Apple platform with CUDA"
else
    echo "Using standard precision (not using BF16)"
fi
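# Note: BF16 generally requires an Ampere-or-newer GPU (compute capability >= 8.0);
# on older GPUs the run may fail or fall back, depending on the framework's own checks.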
# Print environment for debugging
echo "Environment configuration:"
echo " CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"
echo " PYTORCH_CUDA_ALLOC_CONF: ${PYTORCH_CUDA_ALLOC_CONF}"
echo " Using half precision: ${HALF}"
# Launch the training script with the parsed options and caller-provided environment variables
python lpm_kernel/L2/train.py \
    --seed 42 \
    --model_name_or_path "${MODEL_BASE_PATH}" \
    --user_name "${USER_NAME}" \
    --dataset_name "resources/L2/data/merged.json" \
    --chat_template_format "chatml" \
    --add_special_tokens False \
    --append_concat_token False \
    --max_seq_length 2048 \
    --num_train_epochs "$NUM_TRAIN_EPOCHS" \
    --save_total_limit 2 \
    --logging_steps 20 \
    --log_level "info" \
    --logging_strategy "steps" \
    --save_strategy "steps" \
    --save_steps 5 \
    --push_to_hub False \
    --bf16 "$HALF" \
    --packing False \
    --learning_rate "$LEARNING_RATE" \
    --lr_scheduler_type "cosine" \
    --weight_decay 1e-4 \
    --max_grad_norm 0.3 \
    --output_dir "${MODEL_PERSONAL_DIR}" \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps "$CONCURRENCY_THREADS" \
    --gradient_checkpointing True \
    --use_reentrant False \
    --use_peft_lora True \
    --lora_r 8 \
    --lora_alpha 16 \
    --lora_dropout 0.1 \
    --lora_target_modules "all-linear" \
    --use_4bit_quantization False \
    --use_nested_quant False \
    --bnb_4bit_compute_dtype "bfloat16" \
    --is_cot "$IS_COT" \
    --use_cuda "$USE_CUDA"