IvanHU committed on
Commit a796796 · verified · 1 Parent(s): a635351

Add files using upload-large-folder tool

LOG_NODE_RANK_3.log ADDED
The diff for this file is too large to render. See raw diff
 
dsv3_0.5b_pretrain_template.sh ADDED
@@ -0,0 +1,255 @@
+ #!/bin/bash
+ # DeepSeek V3 aux-free load balancing
+ # 0421 update rate = 1e-3
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ export OMP_NUM_THREADS=4
+
+ # Dir Arguments
+ DIR=`pwd`
+ PRETRAINED_CKPT_ROOT_PATH=${PRETRAINED_CKPT_ROOT_PATH:-"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/megatron_lm_workspace"}
+ PRETRAINED_CKPT_ID=${PRETRAINED_CKPT_ID:-"NOT_EXISTS"}
+ PRETRAINED_CKPT_NAME=${PRETRAINED_CKPT_NAME:-"NOT_EXISTS"}
+ OUTPUT_CHECKPOINT_PATH=${OUTPUT_CHECKPOINT_PATH:-"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/megatron_lm_workspace"}
+ OUTPUT_BASE_PATH=${OUTPUT_BASE_PATH:-"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/megatron_lm_workspace"}
+
+ # Training Arguments
+ SEQ_LEN=8192
+ BATCH_SIZE=${BATCH_SIZE:-1}
+ GLOBAL_BATCH_SIZE=${GLOBAL_BATCH_SIZE:-4096}
+ MP_SIZE=${MP_SIZE:-1}
+ PP_SIZE=${PP_SIZE:-1}
+ EP_SIZE=${EP_SIZE:-4}
+ ACTIVATION_CHECKPOINT=${ACTIVATION_CHECKPOINT:-"false"}
+ LOG_INTERVAL=${LOG_INTERVAL:-1}
+
+ # Learning Rate Arguments
+ LR=${LR:-"2e-3"}
+ MIN_LR=${MIN_LR:-"3.0e-5"}
+ LR_DECAY_STYLE=${LR_DECAY_STYLE:-"linear"}
+ TRAIN_TOKENS=${TRAIN_TOKENS:-1_000_000_000}
+ LR_WARMUP_TOKENS=${LR_WARMUP_TOKENS:-10_000_000}
+ LR_DECAY_TOKENS=${LR_DECAY_TOKENS:-990_000_000}
+ SAVE_TOKENS=${SAVE_TOKENS:-1_000_000_000}
+
+ # Sample-based training
+ TRAIN_SAMPLES=$(( ${TRAIN_TOKENS//_/} / ${SEQ_LEN} ))
+ LR_DECAY_SAMPLES=$(( ${LR_DECAY_TOKENS//_/} / ${SEQ_LEN} ))
+ LR_WARMUP_SAMPLES=$(( ${LR_WARMUP_TOKENS//_/} / ${SEQ_LEN} ))
+ SAVE_INTERVAL=$(( ${SAVE_TOKENS//_/} / ${SEQ_LEN} / ${GLOBAL_BATCH_SIZE} ))
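+ # Note: ${VAR//_/} strips the underscore separators before the integer arithmetic.
+ # With the defaults above: TRAIN_SAMPLES=122070, LR_WARMUP_SAMPLES=1220,
+ # LR_DECAY_SAMPLES=120849, and SAVE_INTERVAL=29 iterations between checkpoints.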
+
+ # MoE Arguments
+ MOE_FFN_HIDDEN_SIZE=${MOE_FFN_HIDDEN_SIZE:-768}
+ MOE_TOPK=${MOE_TOPK:-2}
+ NUM_EXPERTS=${NUM_EXPERTS:-16}
+ NUM_SHARED_EXPERTS=${NUM_SHARED_EXPERTS:-0}
+ LOAD_BALANCING=${LOAD_BALANCING:-"dsv3"}
+ MOE_ROUTER_SCORE_FUNCTION=${MOE_ROUTER_SCORE_FUNCTION:-"sigmoid"}
+ MOE_EXPERT_CAPACITY_FACTOR=${MOE_EXPERT_CAPACITY_FACTOR:-2}
+ MOE_ROUTER_BIAS_UPDATE_RATE=${MOE_ROUTER_BIAS_UPDATE_RATE:-1e-3}
+
+ # Model Arguments
+ INIT_STD=${INIT_STD:-0.006}
+ NUM_LAYERS=${NUM_LAYERS:-12}
+ HIDDEN_SIZE=${HIDDEN_SIZE:-1024}
+ NUM_ATTN_HEADS=16
+ NUM_QUERY_GROUPS=2
+ ROTARY_BASE=${ROTARY_BASE:-"100000"}
+ TIE_EMBEDDING=${TIE_EMBEDDING:-"true"}
+
+ # Multi-node Arguments
+ GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+ MASTER_ADDR=${MASTER_ADDR:-"localhost"}
+ MASTER_PORT=${MASTER_PORT:-"6000"}
+ NNODES=${NNODES:-"1"}
+ NODE_RANK=${NODE_RANK:-"0"}
+ WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+ EXTRA_ARGS=${EXTRA_ARGS:-""}
+
+ # ###################################################
+ # ################# Process Arguments
+ # ###################################################
+
+ current_time=$(date "+%Y.%m.%d-%H.%M.%S")
+ JOB_ID=${TASK_UUID:-$current_time}
+ MODEL_SIZE='0.5b'
+ NAME="${NAME_PREFIX}dsv3-${MODEL_SIZE}-q${NUM_ATTN_HEADS}-kv${NUM_QUERY_GROUPS}-ep-${NUM_EXPERTS}-sep-${NUM_SHARED_EXPERTS}-top${MOE_TOPK}-cf-${MOE_EXPERT_CAPACITY_FACTOR}-bias-${MOE_ROUTER_BIAS_UPDATE_RATE}-bf16-ep${EP_SIZE}-mp${MP_SIZE}-pp${PP_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${WORLD_SIZE}-seqlen-${SEQ_LEN}"
+ CHECKPOINT_PATH="${OUTPUT_CHECKPOINT_PATH}/checkpoint/${NAME}"
+ LOG_DIR="${OUTPUT_CHECKPOINT_PATH}/log/${JOB_ID}_${NAME}"
+ mkdir -p ${CHECKPOINT_PATH}
+ mkdir -p ${LOG_DIR}
+ ln -s $CHECKPOINT_PATH $LOG_DIR/checkpoint
+ echo $JOB_ID >> $CHECKPOINT_PATH/linked_runs.txt
+ cp $0 ${LOG_DIR}
+
+ # check continual-pretrain or from-scratch
+ if [ -f "$CHECKPOINT_PATH/latest_checkpointed_iteration.txt" ]; then
+ LOAD_CHECKPOINT_PATH="${CHECKPOINT_PATH}"
+ CONTINUE_TRAIN=${CONTINUE_TRAIN:-'true'}
+ echo -e "\033[32mFound existing checkpoint $CHECKPOINT_PATH\033[0m"
+ else
+ LOAD_CHECKPOINT_PATH="${PRETRAINED_CKPT_ROOT_PATH}/${PRETRAINED_CKPT_NAME}"
+ CONTINUE_TRAIN=${CONTINUE_TRAIN:-'false'}
+ echo -e "\033[32mCheckpoint '$CHECKPOINT_PATH' does not exist. Trying to load from '$LOAD_CHECKPOINT_PATH'\033[0m"
+ fi
+
+ # setup tokenizer
+ TOKENIZER_TYPE=${TOKENIZER_TYPE:-'hf_tokenizer_yulan_mini'}
+ DATA_PATH_CACHE="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/cache"
+ if [[ ${TOKENIZER_TYPE} == "hf_tokenizer_qwen" ]]; then
+ DATA_PATH_TOKENIZED="${DATA_PATH}/qwen2.5"
+ TOKENIZER_ARGS="--tokenizer-type HuggingFaceTokenizer --tokenizer-model ../../tokenizer"
+ elif [[ ${TOKENIZER_TYPE} == "gpt2bpe" ]]; then
+ DATA_PATH_TOKENIZED="${DATA_PATH}"
+ TOKENIZER_ARGS="--vocab-file /volume/ailab4sci/models/gpt2/vocab.json --merge-file /volume/ailab4sci/models/gpt2/merges.txt"
+ elif [[ ${TOKENIZER_TYPE} == "hf_tokenizer_yulan_mini" ]]; then
+ # DATA_PATH_TOKENIZED="${DATA_PATH}/yulan_mini"
+ echo "DATA_PATH: ${DATA_PATH}"
+ DATA_PATH_TOKENIZED="${DATA_PATH}"
+ echo "DATA_PATH_TOKENIZED: ${DATA_PATH_TOKENIZED}"
+ TOKENIZER_ARGS="--tokenizer-type HuggingFaceTokenizer --tokenizer-model /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/YuLan-Mini"
+ else
+ echo "ERROR: Unknown tokenizer type ${TOKENIZER_TYPE}"
+ exit 1
+ fi
+
+ # setup embedding tying
+ if [[ "1${TIE_EMBEDDING}" == "1false" ]]; then
+ EXTRA_ARGS="${EXTRA_ARGS} \
+ --untie-embeddings-and-output-weights
+ "
+ fi
+
+ # moe
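+ # "dsv3" load balancing here means DeepSeek-V3-style aux-loss-free balancing:
+ # enable the router expert bias and drop the auxiliary loss (type becomes "none").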
+ if [[ ${LOAD_BALANCING} == "dsv3" ]]; then
+ EXTRA_ARGS="${EXTRA_ARGS} \
+ --moe-router-enable-expert-bias
+ "
+ LOAD_BALANCING=none
+ fi
+ if [ -n "$MOE_AUX_LOSS_COEFF" ]; then
+ echo "ERROR: DeepSeek V3 does not support MOE_AUX_LOSS_COEFF=$MOE_AUX_LOSS_COEFF"
+ exit 1
+ fi
+
+ # ###################################################
+ # ################# models
+ # ###################################################
+
+
+ DISTRIBUTED_ARGS=(
+ --nproc_per_node $GPUS_PER_NODE
+ --nnodes $NNODES
+ --node_rank $NODE_RANK
+ --master_addr $MASTER_ADDR
+ --master_port $MASTER_PORT
+ )
+
+
+ MODEL_ARGS=(
+ --use-mcore-models
+ --disable-bias-linear
+ --seq-length ${SEQ_LEN}
+ --max-position-embeddings ${SEQ_LEN}
+ --num-layers ${NUM_LAYERS}
+ --hidden-size ${HIDDEN_SIZE}
+ --ffn-hidden-size ${MOE_FFN_HIDDEN_SIZE}
+ --num-attention-heads ${NUM_ATTN_HEADS}
+ --init-method-std ${INIT_STD}
+ --attention-dropout 0.0
+ --hidden-dropout 0.0
+ --normalization RMSNorm
+ --position-embedding-type rope
+ --swiglu
+ --group-query-attention
+ --num-query-groups ${NUM_QUERY_GROUPS}
+ --no-masked-softmax-fusion
+ --no-position-embedding
+ --rotary-base ${ROTARY_BASE}
+ --use-flash-attn
+ )
+
+ MOE_ARGS=(
+ --num-experts ${NUM_EXPERTS}
+ --expert-tensor-parallel-size 1
+ --moe-grouped-gemm
+ --moe-router-topk ${MOE_TOPK}
+ --moe-router-load-balancing-type ${LOAD_BALANCING}
+ --moe-router-score-function ${MOE_ROUTER_SCORE_FUNCTION}
+ --moe-token-dispatcher-type alltoall
+ --overlap-param-gather
+ --overlap-grad-reduce
+ --moe-expert-capacity-factor ${MOE_EXPERT_CAPACITY_FACTOR}
+ --moe-router-bias-update-rate ${MOE_ROUTER_BIAS_UPDATE_RATE}
+ )
+
+ TRAINING_ARGS=(
+ --micro-batch-size ${BATCH_SIZE}
+ --global-batch-size ${GLOBAL_BATCH_SIZE}
+ --lr ${LR}
+ --train-samples ${TRAIN_SAMPLES}
+ --lr-warmup-samples ${LR_WARMUP_SAMPLES}
+ --lr-decay-samples ${LR_DECAY_SAMPLES}
+ --lr-decay-style ${LR_DECAY_STYLE}
+ --min-lr ${MIN_LR}
+ --split 100,0,0
+ --weight-decay 0.1
+ --clip-grad 0.5
+ --num-workers 2
+ --bf16
+ --save ${CHECKPOINT_PATH}
+ --load ${LOAD_CHECKPOINT_PATH}
+ )
+
+ DATA_ARGS=(
+ --data-path ${DATA_PATH_TOKENIZED}
+ --data-cache-path ${DATA_PATH_CACHE}
+ )
+
+ MODEL_PARALLEL_ARGS=(
+ --tensor-model-parallel-size ${MP_SIZE}
+ --pipeline-model-parallel-size ${PP_SIZE}
+ --expert-model-parallel-size ${EP_SIZE}
+ --use-distributed-optimizer
+ --sequence-parallel
+ )
+
+ LOGGING_ARGS=(
+ --log-interval ${LOG_INTERVAL}
+ --log-throughput
+ --save-interval ${SAVE_INTERVAL}
+ --eval-interval 1000
+ --eval-iters 10
+ --tensorboard-dir ${LOG_DIR}
+ --log-timers-to-tensorboard
+ --log-memory-to-tensorboard
+ )
+
+ if [ -n "${WANDB_API_KEY}" ]; then
+ LOGGING_ARGS+=(
+ --wandb-project ${WANDB_PROJECT:-"DSV3"}
+ --wandb-exp-name ${NAME}
+ )
+ fi
+
+ if [ "1${ACTIVATION_CHECKPOINT}" = "1true" ]; then
+ EXTRA_ARGS="${EXTRA_ARGS} \
+ --recompute-granularity selective
+ "
+ fi
+
+ if [ $NODE_RANK == "0" ]; then
+ env >> ${LOG_DIR}/ENV-${HOSTNAME}.log
+ echo $(which torchrun) ${DISTRIBUTED_ARGS[@]} ../../pretrain_gpt.py ${MODEL_ARGS[@]} ${DATA_ARGS[@]} ${MOE_ARGS[@]} ${TRAINING_ARGS[@]} ${MODEL_PARALLEL_ARGS[@]} ${LOGGING_ARGS[@]} ${TOKENIZER_ARGS} ${EXTRA_ARGS} >> ${LOG_DIR}/ENV-${HOSTNAME}.log
+ fi
+ set -x
+
+ torchrun ${DISTRIBUTED_ARGS[@]} ../../pretrain_gpt.py \
+ ${MODEL_ARGS[@]} \
+ ${DATA_ARGS[@]} \
+ ${MOE_ARGS[@]} \
+ ${TRAINING_ARGS[@]} \
+ ${MODEL_PARALLEL_ARGS[@]} \
+ ${LOGGING_ARGS[@]} \
+ ${TOKENIZER_ARGS} \
+ ${EXTRA_ARGS} 2>&1 | tee ${LOG_DIR}/LOG_NODE_RANK_${NODE_RANK}.log
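
A minimal launch sketch, assuming the script is run from its own directory inside a Megatron-LM checkout (it calls ../../pretrain_gpt.py) and that DATA_PATH points to a tokenized dataset prefix; all paths and sizes below are illustrative assumptions:

DATA_PATH=/path/to/tokenized/prefix \
GPUS_PER_NODE=8 NNODES=1 NODE_RANK=0 \
MASTER_ADDR=localhost MASTER_PORT=6000 \
GLOBAL_BATCH_SIZE=512 \
bash dsv3_0.5b_pretrain_template.sh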