#SBATCH --job-name=taiyi-sd-dreambooth # create a short name for your job
#SBATCH --nodes=1 # node count
#SBATCH --ntasks-per-node=1 # number of tasks to run per node
#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks)
#SBATCH --gres=gpu:1 # number of gpus per node
#SBATCH -o %x-%j.log # output and error log file names (%x = job name, %j = job id)
#SBATCH -x dgx050

# Launches train.py (from the current working directory) with the data, model,
# checkpoint and trainer arguments assembled below. Can be run directly or
# submitted through Slurm using the #SBATCH directives above.
#
# Requires bash: the argument lists are bash arrays so that values containing
# whitespace or glob characters (e.g. a multi-word instance prompt) reach
# train.py as single, unmodified arguments.

# Abort on the first failing command, on use of an unset variable, and on
# failures inside pipelines, so training never starts from a half-prepared
# workspace.
set -euo pipefail

# pwd=Fengshenbang-LM/fengshen/examples/pretrain_erlangshen
# NOTE(review): the path above names a different example than this job;
# confirm the directory this script is expected to be launched from.
ROOT_DIR=../../workspace
# export CUDA_VISIBLE_DEVICES='7'
# NOTE(review): "torch_extendsions" looks like a typo for "torch_extensions";
# kept byte-for-byte so the existing directory keeps being used.
export TORCH_EXTENSIONS_DIR="${ROOT_DIR}/torch_extendsions"

MODEL_NAME=taiyi-sd-dreambooth
MODEL_ROOT_DIR="${ROOT_DIR}/${MODEL_NAME}"
# -p also creates ROOT_DIR when it is missing and is a no-op when the
# directory already exists, so no separate existence test is needed.
mkdir -p -- "${MODEL_ROOT_DIR}"

NNODES=1
GPUS_PER_NODE=1

MICRO_BATCH_SIZE=1

INSTANCE_PROMPT="小黄鸭"
# NOTE(review): OUTPUT_DIR is not referenced anywhere else in this script.
OUTPUT_DIR="saved_model_tinyduck"
INSTANCE_DIR="train_images_duck"

# Data loading: one shared micro batch size for train/val/test, plus the
# instance image directory and the prompt associated with it.
DATA_ARGS=(
  --dataloader_workers 2
  --train_batchsize "${MICRO_BATCH_SIZE}"
  --val_batchsize "${MICRO_BATCH_SIZE}"
  --test_batchsize "${MICRO_BATCH_SIZE}"
  "--instance_data_dir=${INSTANCE_DIR}"
  "--instance_prompt=${INSTANCE_PROMPT}"
  --resolution=512
)

# Model location and optimisation settings.
MODEL_ARGS=(
  --model_path "${MODEL_ROOT_DIR}/pretrain/Taiyi-Stable-Diffusion-1B-Chinese-v0.1/"
  --train_text_encoder
  --learning_rate 1e-6
  --scheduler_type constant
  --warmup_steps 100
)

# Where checkpoints are written and which checkpoint path is passed for loading.
MODEL_CHECKPOINT_ARGS=(
  --save_ckpt_path "${MODEL_ROOT_DIR}/ckpt"
  --load_ckpt_path "${MODEL_ROOT_DIR}/ckpt/last.ckpt"
)

# Trainer settings. Validation is switched off through the last two
# parameters: num_sanity_val_steps and limit_val_batches.
TRAINER_ARGS=(
  --max_steps 1200
  --gpus "${GPUS_PER_NODE}"
  --num_nodes "${NNODES}"
  --strategy ddp
  --log_every_n_steps 100
  --precision 32
  --default_root_dir "${MODEL_ROOT_DIR}"
  --replace_sampler_ddp False
  --num_sanity_val_steps 0
  --limit_val_batches 0
)

# Full argument vector handed to train.py, in the original group order.
ALL_ARGS=(
  "${DATA_ARGS[@]}"
  "${MODEL_ARGS[@]}"
  "${MODEL_CHECKPOINT_ARGS[@]}"
  "${TRAINER_ARGS[@]}"
)

# Kept for backward compatibility: the script has always exported the
# flattened argument string as `options`. The launch below uses the array
# instead, so arguments are never re-split by the shell.
export options="${ALL_ARGS[*]}"

# run local
python train.py "${ALL_ARGS[@]}"

# run on slurm
# srun python train.py "${ALL_ARGS[@]}"