Upload folder using huggingface_hub
- model.safetensors +1 -1
- training_artifacts/README.md +1 -1
- training_artifacts/logs/pipeline_cleaned.txt +1144 -76
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:09c851117fac482c71cf54466a6c7f4d8c68bfadc3b913986477652224411dc9
size 988097824
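A git-lfs pointer records only the object id (the sha256 of the file contents) and its byte size, so the change above means the 988MB weights file itself was replaced. A minimal sketch for checking a local copy against this pointer (the local file path is illustrative):

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    # Stream the file so a ~1GB safetensors blob never sits in memory at once.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()

# Compare against the `oid sha256:` and `size` fields of the pointer above.
assert sha256_of("model.safetensors") == "09c851117fac482c71cf54466a6c7f4d8c68bfadc3b913986477652224411dc9"
```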
training_artifacts/README.md
CHANGED
@@ -12,5 +12,5 @@ This directory contains the training configuration and logs for this model.
## Job Information

- Job Name: lf_torch_test__interactive
-- Timestamp: 2025-10-23 00:
+- Timestamp: 2025-10-23 00:42:45 UTC
- Execution Mode: Local
training_artifacts/logs/pipeline_cleaned.txt
CHANGED
@@ -11061,7 +11061,18 @@ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default
warnings.warn(
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
warnings.warn(
-
+Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
+Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
+Dataset Info:
+Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
+Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
+HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
+
+
+Found pre-tokenized dataset at: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/tokenized/my_custom_sft12
+Training will load from cached tokenized data (fast startup)
+
+========================================
STAGE 1: Training Model
Start Time: Wed Oct 22 08:37:00 PM EDT 2025
========================================
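The "fast startup" message refers to reusing a tokenized dataset that an earlier run saved to disk. A minimal sketch of that cache-or-build pattern with 🤗 datasets (the path and the trivial map step are illustrative, not the helper's actual code):

```python
from pathlib import Path
from datasets import Dataset, load_from_disk

cache_dir = Path("tokenized/my_custom_sft12")  # illustrative cache location

if cache_dir.exists():
    # Fast path: reload the cached Arrow files and skip tokenization entirely.
    dataset = load_from_disk(str(cache_dir))
else:
    dataset = Dataset.from_dict({"text": ["example record"]})
    # Stand-in for the real tokenization pass.
    dataset = dataset.map(lambda ex: {"n_chars": len(ex["text"])})
    dataset.save_to_disk(str(cache_dir))
```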
@@ -11283,72 +11294,90 @@ gl064:2627273:2627302 [1] NCCL INFO Using network IB
gl064:2627272:2627301 [0] NCCL INFO Using network IB
gl064:2627272:2627301 [0] NCCL INFO ncclCommInitRankConfig comm 0x15210000 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0xb71ac44899f1b45 - Init START
gl064:2627273:2627302 [1] NCCL INFO ncclCommInitRankConfig comm 0x138c8d70 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0xb71ac44899f1b45 - Init START
-[... 66 removed lines whose content was not captured in this diff view ...]
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
+import pkg_resources
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
+import pkg_resources
+[INFO|2025-10-22 20:37:15] llamafactory.hparams.parser:143 >> Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled.
+[INFO|2025-10-22 20:37:15] llamafactory.hparams.parser:423 >> Process rank: 2, world size: 4, device: cuda:0, distributed training: True, compute dtype: torch.float16
+[INFO|2025-10-22 20:37:15] llamafactory.hparams.parser:423 >> Process rank: 3, world size: 4, device: cuda:1, distributed training: True, compute dtype: torch.float16
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:37:15,296 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:37:15,296 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:37:15,296 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:37:15,296 >> loading file added_tokens.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:37:15,296 >> loading file special_tokens_map.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:37:15,296 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:37:15,296 >> loading file chat_template.jinja from cache at None
+[INFO|tokenization_utils_base.py:2364] 2025-10-22 20:37:15,467 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|configuration_utils.py:765] 2025-10-22 20:37:15,655 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 20:37:15,657 >> Model config Qwen2Config {
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 24,
+  "model_type": "qwen2",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.1",
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:37:15,715 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:37:15,716 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:37:15,716 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:37:15,716 >> loading file added_tokens.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:37:15,716 >> loading file special_tokens_map.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:37:15,716 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:37:15,716 >> loading file chat_template.jinja from cache at None
+[INFO|tokenization_utils_base.py:2364] 2025-10-22 20:37:15,882 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[WARNING|2025-10-22 20:37:15] llamafactory.data.loader:148 >> Loading dataset from disk will ignore other data arguments.
+[INFO|2025-10-22 20:37:15] llamafactory.data.loader:143 >> Loaded tokenized dataset from /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/tokenized/my_custom_sft12.
+[INFO|configuration_utils.py:765] 2025-10-22 20:37:15,984 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 20:37:15,984 >> Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
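All of the `loading file ... from cache` lines above are ordinary cache-first loads: the Qwen2.5-0.5B snapshot is already on disk, so nothing is fetched. A hedged sketch of the same resolution via the transformers API:

```python
from transformers import AutoConfig, AutoTokenizer

# Resolves against the local HF cache (HF_HOME, or ~/.cache/huggingface by
# default) and only touches the network on a cache miss.
config = AutoConfig.from_pretrained("Qwen/Qwen2.5-0.5B")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")

print(config.model_type)         # "qwen2"
print(config.num_hidden_layers)  # 24, matching the dump above
print(config.vocab_size)         # 151936
```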
@@ -11404,14 +11433,119 @@ wandb: View run at https://wandb.ai/ut_nlp_deduce/llamafactory/runs/18s2z8v7
  "vocab_size": 151936
}

-[INFO|
-
-[
-[... 5 more removed lines not captured in this diff view ...]
+[INFO|2025-10-22 20:37:15] llamafactory.model.model_utils.kv_cache:143 >> KV cache is disabled during training.
+`torch_dtype` is deprecated! Use `dtype` instead!
+[WARNING|logging.py:328] 2025-10-22 20:37:16,316 >> `torch_dtype` is deprecated! Use `dtype` instead!
+[INFO|modeling_utils.py:1172] 2025-10-22 20:37:16,317 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
+[INFO|modeling_utils.py:2341] 2025-10-22 20:37:16,318 >> Instantiating Qwen2ForCausalLM model under default dtype torch.float16.
+[INFO|configuration_utils.py:986] 2025-10-22 20:37:16,319 >> Generate config GenerationConfig {
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "use_cache": false
+}
+
+[INFO|configuration_utils.py:941] 2025-10-22 20:37:16,605 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
+[INFO|configuration_utils.py:986] 2025-10-22 20:37:16,605 >> Generate config GenerationConfig {
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 2048
+}
+
+[INFO|dynamic_module_utils.py:423] 2025-10-22 20:37:16,637 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
+[INFO|2025-10-22 20:37:16] llamafactory.model.model_utils.checkpointing:143 >> Gradient checkpointing enabled.
+[INFO|2025-10-22 20:37:16] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
+[INFO|2025-10-22 20:37:16] llamafactory.model.adapter:143 >> Upcasting trainable params to float32.
+[INFO|2025-10-22 20:37:16] llamafactory.model.adapter:143 >> Fine-tuning method: LoRA
+[INFO|2025-10-22 20:37:16] llamafactory.model.model_utils.misc:143 >> Found linear modules: gate_proj,v_proj,down_proj,o_proj,q_proj,k_proj,up_proj
+[INFO|2025-10-22 20:37:17] llamafactory.model.loader:143 >> trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826
+The model is already on multiple devices. Skipping the move to device specified in `args`.
+[WARNING|trainer.py:906] 2025-10-22 20:37:17,117 >> The model is already on multiple devices. Skipping the move to device specified in `args`.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
+[INFO|trainer.py:699] 2025-10-22 20:37:17,119 >> max_steps is given, it will override any value given in num_train_epochs
+[INFO|trainer.py:749] 2025-10-22 20:37:17,119 >> Using auto half precision backend
+[WARNING|trainer.py:982] 2025-10-22 20:37:17,120 >> The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
+gl065:3840251:3840251 [1] NCCL INFO cudaDriverVersion 13000
+gl065:3840251:3840251 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
+gl065:3840251:3840251 [1] NCCL INFO Bootstrap: Using ibs3:10.0.5.1<0>
+gl065:3840251:3840251 [1] NCCL INFO NCCL version 2.27.5+cuda12.9
+gl065:3840251:3840251 [1] NCCL INFO Comm config Blocking set to 1
+gl065:3840250:3840250 [0] NCCL INFO cudaDriverVersion 13000
+gl065:3840250:3840250 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
+gl065:3840250:3840250 [0] NCCL INFO Bootstrap: Using ibs3:10.0.5.1<0>
+gl065:3840250:3840250 [0] NCCL INFO NCCL version 2.27.5+cuda12.9
+gl065:3840250:3840250 [0] NCCL INFO Comm config Blocking set to 1
+gl065:3840251:3840375 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
+gl065:3840251:3840375 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
+gl065:3840251:3840375 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
+gl065:3840251:3840375 [1] NCCL INFO NCCL_IB_HCA set to mlx5
+gl065:3840250:3840376 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
+gl065:3840250:3840376 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
+gl065:3840250:3840376 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
+gl065:3840250:3840376 [0] NCCL INFO NCCL_IB_HCA set to mlx5
+gl065:3840250:3840376 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.1<0>
+gl065:3840250:3840376 [0] NCCL INFO Initialized NET plugin IB
+gl065:3840251:3840375 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.1<0>
+gl065:3840251:3840375 [1] NCCL INFO Initialized NET plugin IB
+gl065:3840250:3840376 [0] NCCL INFO Assigned NET plugin IB to comm
+gl065:3840251:3840375 [1] NCCL INFO Assigned NET plugin IB to comm
+gl065:3840250:3840376 [0] NCCL INFO Using network IB
+gl065:3840251:3840375 [1] NCCL INFO Using network IB
+gl065:3840250:3840376 [0] NCCL INFO ncclCommInitRankConfig comm 0x15430230 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0xb71ac44899f1b45 - Init START
+gl065:3840251:3840375 [1] NCCL INFO ncclCommInitRankConfig comm 0x133fe6a0 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0xb71ac44899f1b45 - Init START
+gl065:3840250:3840376 [0] NCCL INFO RAS client listening socket at ::1<28028>
+gl065:3840251:3840375 [1] NCCL INFO RAS client listening socket at ::1<28028>
+gl065:3840251:3840375 [1] NCCL INFO Bootstrap timings total 0.004547 (create 0.000080, send 0.000522, recv 0.001800, ring 0.001144, delay 0.000000)
+gl065:3840250:3840376 [0] NCCL INFO Bootstrap timings total 0.004873 (create 0.000033, send 0.000581, recv 0.000935, ring 0.002906, delay 0.000000)
+gl065:3840250:3840376 [0] NCCL INFO Setting affinity for GPU 0 to 0-15
+gl065:3840251:3840375 [1] NCCL INFO Setting affinity for GPU 1 to 0-15
+gl065:3840251:3840375 [1] NCCL INFO comm 0x133fe6a0 rank 3 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
+gl065:3840250:3840376 [0] NCCL INFO comm 0x15430230 rank 2 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
+gl065:3840251:3840375 [1] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
+gl065:3840250:3840376 [0] NCCL INFO Trees [0] 3/-1/-1->2->0 [1] 3/0/-1->2->-1
+gl065:3840251:3840375 [1] NCCL INFO P2P Chunksize set to 131072
+gl065:3840250:3840376 [0] NCCL INFO P2P Chunksize set to 131072
+gl065:3840251:3840375 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
+gl065:3840250:3840376 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
+gl065:3840251:3840381 [1] NCCL INFO [Proxy Service] Device 1 CPU core 8
+gl065:3840251:3840383 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 11
+gl065:3840250:3840382 [0] NCCL INFO [Proxy Service] Device 0 CPU core 9
+gl065:3840250:3840384 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 12
+gl065:3840250:3840376 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+gl065:3840250:3840376 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
+gl065:3840251:3840375 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+gl065:3840251:3840375 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
+gl065:3840250:3840376 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+gl065:3840251:3840375 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+gl065:3840250:3840376 [0] NCCL INFO ncclCommInitRankConfig comm 0x15430230 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0xb71ac44899f1b45 - Init COMPLETE
+gl065:3840251:3840375 [1] NCCL INFO ncclCommInitRankConfig comm 0x133fe6a0 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0xb71ac44899f1b45 - Init COMPLETE
+gl065:3840250:3840376 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 0.13 (kernels 0.09, alloc 0.01, bootstrap 0.00, allgathers 0.00, topo 0.03, graphs 0.00, connections 0.00, rest 0.00)
+gl065:3840251:3840375 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 0.13 (kernels 0.09, alloc 0.01, bootstrap 0.00, allgathers 0.00, topo 0.03, graphs 0.00, connections 0.00, rest 0.00)
+gl065:3840250:3840385 [0] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [receive] via NET/IB/0
+gl065:3840250:3840387 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 10
+gl065:3840250:3840385 [0] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [receive] via NET/IB/0
+gl065:3840250:3840385 [0] NCCL INFO Channel 00 : 2[0] -> 3[1] via SHM/direct/direct
+gl065:3840250:3840385 [0] NCCL INFO Channel 01 : 2[0] -> 3[1] via SHM/direct/direct
+gl065:3840251:3840386 [1] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [send] via NET/IB/0
+gl065:3840251:3840386 [1] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [send] via NET/IB/0
+gl065:3840251:3840388 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 1
+gl065:3840251:3840386 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
+gl065:3840250:3840385 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
+[INFO|trainer.py:2519] 2025-10-22 20:37:17,479 >> ***** Running training *****
+[INFO|trainer.py:2520] 2025-10-22 20:37:17,479 >> Num examples = 3,598
+[INFO|trainer.py:2521] 2025-10-22 20:37:17,479 >> Num Epochs = 1
+[INFO|trainer.py:2522] 2025-10-22 20:37:17,479 >> Instantaneous batch size per device = 1
+[INFO|trainer.py:2525] 2025-10-22 20:37:17,479 >> Total train batch size (w. parallel, distributed & accumulation) = 4
+[INFO|trainer.py:2526] 2025-10-22 20:37:17,479 >> Gradient Accumulation steps = 1
+[INFO|trainer.py:2527] 2025-10-22 20:37:17,479 >> Total optimization steps = 100
+[INFO|trainer.py:2528] 2025-10-22 20:37:17,481 >> Number of trainable parameters = 4,399,104
+[INFO|trainer.py:2810] 2025-10-22 20:37:51,948 >>
+
+Training completed. Do not forget to share your model on huggingface.co/models =)
+
+
+gl065:3840250:3840250 [0] NCCL INFO comm 0x15430230 rank 2 nranks 4 cudaDev 0 busId 47000 - Destroy COMPLETE
+gl065:3840251:3840251 [1] NCCL INFO comm 0x133fe6a0 rank 3 nranks 4 cudaDev 1 busId 59000 - Destroy COMPLETE
+100%|| 100/100 [00:33<00:00, 2.92it/s] {'loss': 0.9472, 'grad_norm': 0.45911866426467896, 'learning_rate': 5.000000000000001e-07, 'epoch': 0.11}
100%|| 100/100 [00:33<00:00, 2.92it/s][INFO|trainer.py:4309] 2025-10-22 20:37:51,912 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
[INFO|configuration_utils.py:765] 2025-10-22 20:37:52,016 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
[INFO|configuration_utils.py:839] 2025-10-22 20:37:52,017 >> Model config Qwen2Config {
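The `trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826` line is the usual LoRA arithmetic: a rank-r adapter on a linear layer of shape (out, in) adds r*(in+out) parameters. A sketch that reproduces those numbers with peft, assuming rank 8 (the rank this run actually used lives in train_config.yaml, which is not shown in this diff):

```python
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")
lora_config = LoraConfig(
    # Assumed rank; with hidden 896, intermediate 4864, and 24 layers,
    # r=8 over all seven projections gives exactly 4,399,104 parameters.
    r=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()
# trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826
```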
@@ -11578,7 +11712,7 @@ Checkpoint details:
Training step: 100
Updating merge config to point to checkpoint...
Successfully updated merge config
-2025
+End Time: Wed Oct 22 08:37:55 PM EDT 2025
========================================

========================================
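"Updating merge config to point to checkpoint" prepares the export step that produces the merged/ directory uploaded later. Generically, merging a LoRA checkpoint means folding the adapter deltas into the base weights; a hedged sketch with peft (LLaMA-Factory's own export path differs, and the local paths are illustrative):

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")
# Load the adapter saved at step 100, then fold W + BA into the base weights.
model = PeftModel.from_pretrained(base, "checkpoints/checkpoint-100")
merged = model.merge_and_unload()
merged.save_pretrained("merged/")
```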
@@ -11772,3 +11906,937 @@ Preparing Training Artifacts
========================================
Copying configuration files...
Copying and cleaning training logs...
+Training artifacts prepared in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/training_artifacts
+Contents:
+Log files:
+
+========================================
+STAGE 3: Uploading to HuggingFace Hub
+Repository: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
+Start Time: Wed Oct 22 08:38:09 PM EDT 2025
+========================================
+Uploading contents of: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
+Directory structure:
+
+Executing: huggingface-cli upload TAUR-dev/testing_llamafactory_helper_quick_test__interactive /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged .
+Start hashing 17 files.
+Finished hashing 17 files.
+Warning: 'huggingface-cli upload' is deprecated. Use 'hf upload' instead.
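As the warning notes, `huggingface-cli upload` now forwards to `hf upload`; both are thin wrappers over the Python API. An equivalent call with huggingface_hub (requires a write token via HF_TOKEN or `hf auth login`):

```python
from huggingface_hub import upload_folder

# Same operation as the CLI line above: push everything under merged/
# to the root of the model repo. Files already present on the Hub are
# skipped, which is why the log later removes 13 unchanged files
# from the commit.
upload_folder(
    repo_id="TAUR-dev/testing_llamafactory_helper_quick_test__interactive",
    folder_path="/scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged",
    path_in_repo=".",
    repo_type="model",
)
```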
+[... tqdm redraw residue elided: interleaved "Processing Files" / "New Data Upload" progress bars for model.safetensors (988MB) and tokenizer.json (11.4MB), sustained at roughly 200-300MB/s ...]
+Processing Files (2 / 2) : 100%|| 1.00GB / 1.00GB, 204MB/s
+New Data Upload : 100%|| 716MB / 716MB, 163MB/s
+...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB
+.../merged/model.safetensors: 100%|| 988MB / 988MB
+Removing 13 file(s) from commit that have not changed.
+https://huggingface.co/TAUR-dev/testing_llamafactory_helper_quick_test__interactive/tree/main/.
+
+========================================
+Upload completed successfully
+Model and training artifacts uploaded to: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
+End Time: Wed Oct 22 08:38:17 PM EDT 2025
+========================================
+
+========================================
+STAGE 4: Cleanup
+========================================
+Keeping checkpoints in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
+Keeping merged model in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
+
+========================================
+PIPELINE COMPLETED SUCCESSFULLY
+End Time: Wed Oct 22 08:38:18 PM EDT 2025
+========================================
+
+========================================
+Cleaning up LlamaFactory processes
+========================================
+Cleaned up processes on gl064.hpc.nyu.edu
+Cleaning up processes on worker node: gl065
+Process cleanup complete
+========================================
+Job Name: lf_torch_test__interactive
+Hostname: gl064.hpc.nyu.edu
+Number of nodes: 2
+GPUs per node: 2
+Start Time: Wed Oct 22 08:41:31 PM EDT 2025
+Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/logs/pipeline.log
+========================================
+Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env
+
+========================================
+Configuration Paths
+========================================
+Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
+Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
+Dataset Info:
+Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
+Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
+HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
+
+
+========================================
+Multi-Node Coordination
+========================================
+This is the master node - coordinating worker nodes...
+Master node: gl064
+Master port: 29500
+World size: 2
+
+Launching on worker node 1: gl065
+All worker nodes launched successfully
+Master node (this node) will now join training as rank 0
+
+
+Found pre-tokenized dataset at: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/tokenized/my_custom_sft12
+Training will load from cached tokenized data (fast startup)
+
+========================================
+STAGE 1: Training Model
+Start Time: Wed Oct 22 08:41:34 PM EDT 2025
+========================================
+Multi-node training detected
+Nodes: 2, GPUs per node: 2
+Master address: gl064
+Master port: 29500
+Node rank: 0
+World size: 2
+CUDA_VISIBLE_DEVICES: 0,1
+LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory
+Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
+
+Starting distributed training with torch.distributed.run...
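For the topology this run reports (2 nodes, 2 GPUs each, master gl064:29500, this node rank 0), the torch.distributed.run launch reduces to a torchrun invocation roughly like the sketch below; the entrypoint is a placeholder, since LLaMA-Factory supplies its own:

```python
import subprocess

# Hedged reconstruction of the launcher call, not the helper's literal code.
subprocess.run([
    "torchrun",
    "--nnodes", "2",
    "--nproc_per_node", "2",
    "--node_rank", "0",          # worker node gl065 runs with --node_rank 1
    "--master_addr", "gl064",
    "--master_port", "29500",
    "train.py",                  # placeholder entrypoint
], check=True)
```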
| 12149 |
+
|
| 12150 |
+
*****************************************
|
| 12151 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 12152 |
+
*****************************************
|
| 12153 |
+
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
|
| 12154 |
+
warnings.warn(
|
| 12155 |
+
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
|
| 12156 |
+
warnings.warn(
|
| 12157 |
+
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
|
| 12158 |
+
import pkg_resources
|
| 12159 |
+
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
|
| 12160 |
+
import pkg_resources
|
| 12161 |
+
[INFO|2025-10-22 20:41:51] llamafactory.hparams.parser:423 >> Process rank: 1, world size: 4, device: cuda:1, distributed training: True, compute dtype: torch.float16
|
| 12162 |
+
[INFO|2025-10-22 20:41:51] llamafactory.hparams.parser:143 >> Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled.
|
| 12163 |
+
[INFO|2025-10-22 20:41:51] llamafactory.hparams.parser:423 >> Process rank: 0, world size: 4, device: cuda:0, distributed training: True, compute dtype: torch.float16
|
| 12164 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:41:51,968 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
|
| 12165 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:41:51,968 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
|
| 12166 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:41:51,968 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
|
| 12167 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:41:51,968 >> loading file added_tokens.json from cache at None
|
| 12168 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:41:51,968 >> loading file special_tokens_map.json from cache at None
|
| 12169 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:41:51,968 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
|
| 12170 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:41:51,968 >> loading file chat_template.jinja from cache at None
|
| 12171 |
+
[INFO|tokenization_utils_base.py:2364] 2025-10-22 20:41:52,139 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
| 12172 |
+
[INFO|configuration_utils.py:765] 2025-10-22 20:41:52,336 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
|
| 12173 |
+
[INFO|configuration_utils.py:839] 2025-10-22 20:41:52,337 >> Model config Qwen2Config {
|
| 12174 |
+
"architectures": [
|
| 12175 |
+
"Qwen2ForCausalLM"
|
| 12176 |
+
],
|
| 12177 |
+
"attention_dropout": 0.0,
|
| 12178 |
+
"bos_token_id": 151643,
|
| 12179 |
+
"dtype": "bfloat16",
|
| 12180 |
+
"eos_token_id": 151643,
|
| 12181 |
+
"hidden_act": "silu",
|
| 12182 |
+
"hidden_size": 896,
|
| 12183 |
+
"initializer_range": 0.02,
|
| 12184 |
+
"intermediate_size": 4864,
|
| 12185 |
+
"layer_types": [
|
| 12186 |
+
"full_attention",
|
| 12187 |
+
"full_attention",
|
| 12188 |
+
"full_attention",
|
| 12189 |
+
"full_attention",
|
| 12190 |
+
"full_attention",
|
| 12191 |
+
"full_attention",
|
| 12192 |
+
"full_attention",
|
| 12193 |
+
"full_attention",
|
| 12194 |
+
"full_attention",
|
| 12195 |
+
"full_attention",
|
| 12196 |
+
"full_attention",
|
| 12197 |
+
"full_attention",
|
| 12198 |
+
"full_attention",
|
| 12199 |
+
"full_attention",
|
| 12200 |
+
"full_attention",
|
| 12201 |
+
"full_attention",
|
| 12202 |
+
"full_attention",
|
| 12203 |
+
"full_attention",
|
| 12204 |
+
"full_attention",
|
| 12205 |
+
"full_attention",
|
| 12206 |
+
"full_attention",
|
| 12207 |
+
"full_attention",
|
| 12208 |
+
"full_attention",
|
| 12209 |
+
"full_attention"
|
| 12210 |
+
],
|
| 12211 |
+
"max_position_embeddings": 32768,
|
| 12212 |
+
"max_window_layers": 24,
|
| 12213 |
+
"model_type": "qwen2",
|
| 12214 |
+
"num_attention_heads": 14,
|
| 12215 |
+
"num_hidden_layers": 24,
|
| 12216 |
+
"num_key_value_heads": 2,
|
| 12217 |
+
"rms_norm_eps": 1e-06,
|
| 12218 |
+
"rope_scaling": null,
|
| 12219 |
+
"rope_theta": 1000000.0,
|
| 12220 |
+
"sliding_window": null,
|
| 12221 |
+
"tie_word_embeddings": true,
|
| 12222 |
+
"transformers_version": "4.57.1",
|
| 12223 |
+
"use_cache": true,
|
| 12224 |
+
"use_mrope": false,
|
| 12225 |
+
"use_sliding_window": false,
|
| 12226 |
+
"vocab_size": 151936
|
| 12227 |
+
}
|
| 12228 |
+
|
| 12229 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:41:52,405 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
|
| 12230 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:41:52,405 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
|
| 12231 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:41:52,405 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
|
| 12232 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:41:52,405 >> loading file added_tokens.json from cache at None
|
| 12233 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:41:52,405 >> loading file special_tokens_map.json from cache at None
|
| 12234 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:41:52,405 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
|
| 12235 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:41:52,405 >> loading file chat_template.jinja from cache at None
[INFO|tokenization_utils_base.py:2364] 2025-10-22 20:41:52,571 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[WARNING|2025-10-22 20:41:52] llamafactory.data.loader:148 >> Loading dataset from disk will ignore other data arguments.
[INFO|2025-10-22 20:41:52] llamafactory.data.loader:143 >> Loaded tokenized dataset from /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/tokenized/my_custom_sft12.
[INFO|configuration_utils.py:765] 2025-10-22 20:41:52,629 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
[INFO|configuration_utils.py:839] 2025-10-22 20:41:52,629 >> Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "dtype": "bfloat16",
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 32768,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "transformers_version": "4.57.1",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}

[INFO|2025-10-22 20:41:52] llamafactory.model.model_utils.kv_cache:143 >> KV cache is disabled during training.
[WARNING|logging.py:328] 2025-10-22 20:41:52,961 >> `torch_dtype` is deprecated! Use `dtype` instead!
`torch_dtype` is deprecated! Use `dtype` instead!
[INFO|modeling_utils.py:1172] 2025-10-22 20:41:52,962 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
[INFO|modeling_utils.py:2341] 2025-10-22 20:41:52,963 >> Instantiating Qwen2ForCausalLM model under default dtype torch.float16.
[INFO|configuration_utils.py:986] 2025-10-22 20:41:52,964 >> Generate config GenerationConfig {
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "use_cache": false
}

[INFO|configuration_utils.py:941] 2025-10-22 20:41:53,264 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
[INFO|configuration_utils.py:986] 2025-10-22 20:41:53,264 >> Generate config GenerationConfig {
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "max_new_tokens": 2048
}

[INFO|dynamic_module_utils.py:423] 2025-10-22 20:41:53,294 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
[INFO|2025-10-22 20:41:53] llamafactory.model.model_utils.checkpointing:143 >> Gradient checkpointing enabled.
[INFO|2025-10-22 20:41:53] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
[INFO|2025-10-22 20:41:53] llamafactory.model.adapter:143 >> Upcasting trainable params to float32.
[INFO|2025-10-22 20:41:53] llamafactory.model.adapter:143 >> Fine-tuning method: LoRA
[INFO|2025-10-22 20:41:53] llamafactory.model.model_utils.misc:143 >> Found linear modules: v_proj,gate_proj,k_proj,down_proj,o_proj,up_proj,q_proj
[INFO|2025-10-22 20:41:53] llamafactory.model.loader:143 >> trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826
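Note: the trainable-parameter count above is consistent with rank-8 LoRA adapters on the seven projection modules listed, given this config (hidden_size 896, intermediate_size 4864, 24 layers, 128-dim key/value projections). A minimal Python sketch of that arithmetic, assuming rank r = 8 (the rank is an assumption, not read from this log):

    # Rough check of the LoRA parameter count for Qwen2.5-0.5B, assuming rank 8.
    hidden, inter, kv_dim, layers, r = 896, 4864, 128, 24, 8
    per_layer = r * ((hidden + hidden)        # q_proj
                     + (hidden + kv_dim)      # k_proj
                     + (hidden + kv_dim)      # v_proj
                     + (hidden + hidden)      # o_proj
                     + 3 * (hidden + inter))  # gate_proj, up_proj, down_proj
    print(per_layer * layers)  # 4399104, matching the "trainable params" line above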
[WARNING|trainer.py:906] 2025-10-22 20:41:53,535 >> The model is already on multiple devices. Skipping the move to device specified in `args`.
[INFO|trainer.py:699] 2025-10-22 20:41:53,537 >> max_steps is given, it will override any value given in num_train_epochs
[INFO|trainer.py:749] 2025-10-22 20:41:53,538 >> Using auto half precision backend
[WARNING|2025-10-22 20:41:53] llamafactory.train.callbacks:154 >> Previous trainer log in this folder will be deleted.
[WARNING|trainer.py:982] 2025-10-22 20:41:53,541 >> The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
gl064:2628226:2628226 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
gl064:2628226:2628226 [0] NCCL INFO Bootstrap: Using ibs3:10.0.5.0<0>
gl064:2628226:2628226 [0] NCCL INFO cudaDriverVersion 13000
gl064:2628226:2628226 [0] NCCL INFO NCCL version 2.27.5+cuda12.9
gl064:2628226:2628226 [0] NCCL INFO Comm config Blocking set to 1
gl064:2628227:2628227 [1] NCCL INFO cudaDriverVersion 13000
gl064:2628227:2628227 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
gl064:2628227:2628227 [1] NCCL INFO Bootstrap: Using ibs3:10.0.5.0<0>
gl064:2628227:2628227 [1] NCCL INFO NCCL version 2.27.5+cuda12.9
gl064:2628227:2628227 [1] NCCL INFO Comm config Blocking set to 1
gl064:2628226:2628285 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
gl064:2628226:2628285 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
gl064:2628226:2628285 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
gl064:2628226:2628285 [0] NCCL INFO NCCL_IB_HCA set to mlx5
gl064:2628226:2628285 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.0<0>
gl064:2628226:2628285 [0] NCCL INFO Initialized NET plugin IB
gl064:2628226:2628285 [0] NCCL INFO Assigned NET plugin IB to comm
gl064:2628226:2628285 [0] NCCL INFO Using network IB
gl064:2628226:2628285 [0] NCCL INFO ncclCommInitRankConfig comm 0x14bded40 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0xe09b1d45be9f2d6a - Init START
gl064:2628227:2628286 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
gl064:2628227:2628286 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
gl064:2628227:2628286 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
gl064:2628227:2628286 [1] NCCL INFO NCCL_IB_HCA set to mlx5
gl064:2628227:2628286 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.0<0>
gl064:2628227:2628286 [1] NCCL INFO Initialized NET plugin IB
gl064:2628227:2628286 [1] NCCL INFO Assigned NET plugin IB to comm
gl064:2628227:2628286 [1] NCCL INFO Using network IB
gl064:2628227:2628286 [1] NCCL INFO ncclCommInitRankConfig comm 0x13dac3f0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0xe09b1d45be9f2d6a - Init START
gl064:2628227:2628286 [1] NCCL INFO RAS client listening socket at ::1<28028>
gl064:2628226:2628285 [0] NCCL INFO RAS client listening socket at ::1<28028>
gl064:2628226:2628285 [0] NCCL INFO Bootstrap timings total 1.801443 (create 0.000027, send 0.000089, recv 0.023424, ring 0.000382, delay 0.000000)
gl064:2628227:2628286 [1] NCCL INFO Bootstrap timings total 1.778394 (create 0.000024, send 0.000070, recv 1.775558, ring 0.002096, delay 0.000000)
gl064:2628226:2628285 [0] NCCL INFO Setting affinity for GPU 0 to 0-15
gl064:2628227:2628286 [1] NCCL INFO Setting affinity for GPU 1 to 0-15
gl064:2628227:2628286 [1] NCCL INFO comm 0x13dac3f0 rank 1 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
gl064:2628226:2628285 [0] NCCL INFO comm 0x14bded40 rank 0 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
gl064:2628227:2628286 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
gl064:2628227:2628286 [1] NCCL INFO P2P Chunksize set to 131072
gl064:2628226:2628285 [0] NCCL INFO Channel 00/02 : 0 1 2 3
gl064:2628226:2628285 [0] NCCL INFO Channel 01/02 : 0 1 2 3
gl064:2628226:2628285 [0] NCCL INFO Trees [0] 1/2/-1->0->-1 [1] 1/-1/-1->0->2
gl064:2628226:2628285 [0] NCCL INFO P2P Chunksize set to 131072
gl064:2628227:2628286 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
gl064:2628226:2628285 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
gl064:2628226:2628285 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
gl064:2628226:2628293 [0] NCCL INFO [Proxy Service] Device 0 CPU core 7
gl064:2628226:2628294 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 12
gl064:2628227:2628291 [1] NCCL INFO [Proxy Service] Device 1 CPU core 1
gl064:2628227:2628292 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 2
gl064:2628226:2628285 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
gl064:2628226:2628285 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
gl064:2628227:2628286 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
gl064:2628227:2628286 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
gl064:2628226:2628285 [0] NCCL INFO CC Off, workFifoBytes 1048576
gl064:2628227:2628286 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
gl064:2628227:2628286 [1] NCCL INFO ncclCommInitRankConfig comm 0x13dac3f0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0xe09b1d45be9f2d6a - Init COMPLETE
gl064:2628227:2628286 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 1.90 (kernels 0.08, alloc 0.01, bootstrap 1.78, allgathers 0.01, topo 0.02, graphs 0.00, connections 0.00, rest 0.00)
gl064:2628226:2628285 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
gl064:2628226:2628285 [0] NCCL INFO ncclCommInitRankConfig comm 0x14bded40 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0xe09b1d45be9f2d6a - Init COMPLETE
gl064:2628226:2628285 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 1.93 (kernels 0.09, alloc 0.01, bootstrap 1.80, allgathers 0.01, topo 0.02, graphs 0.00, connections 0.00, rest 0.00)
gl064:2628226:2628295 [0] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [receive] via NET/IB/0
gl064:2628226:2628295 [0] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [receive] via NET/IB/0
gl064:2628226:2628297 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 3
gl064:2628226:2628295 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
gl064:2628226:2628295 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
gl064:2628227:2628296 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [send] via NET/IB/0
gl064:2628227:2628296 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [send] via NET/IB/0
gl064:2628227:2628298 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 4
gl064:2628227:2628296 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
gl064:2628226:2628295 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
[INFO|trainer.py:2519] 2025-10-22 20:41:55,692 >> ***** Running training *****
[INFO|trainer.py:2520] 2025-10-22 20:41:55,693 >> Num examples = 3,598
[INFO|trainer.py:2521] 2025-10-22 20:41:55,693 >> Num Epochs = 1
[INFO|trainer.py:2522] 2025-10-22 20:41:55,693 >> Instantaneous batch size per device = 1
[INFO|trainer.py:2525] 2025-10-22 20:41:55,693 >> Total train batch size (w. parallel, distributed & accumulation) = 4
[INFO|trainer.py:2526] 2025-10-22 20:41:55,693 >> Gradient Accumulation steps = 1
[INFO|trainer.py:2527] 2025-10-22 20:41:55,693 >> Total optimization steps = 100
[INFO|trainer.py:2528] 2025-10-22 20:41:55,694 >> Number of trainable parameters = 4,399,104
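Note: these settings imply an effective batch of 4 sequences per optimizer step (1 per device x 4 ranks x 1 accumulation step), so the 100 optimization steps cover roughly 400 of the 3,598 examples, i.e. about 0.11 of an epoch, which matches the epoch value reported at the end of training. A quick sketch of that arithmetic:

    # Effective batch size and epoch fraction implied by the trainer settings above.
    per_device, ranks, grad_accum, steps, num_examples = 1, 4, 1, 100, 3598
    effective_batch = per_device * ranks * grad_accum   # 4
    samples_seen = effective_batch * steps               # 400
    print(round(samples_seen / num_examples, 2))          # ~0.11 epoch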
[INFO|integration_utils.py:867] 2025-10-22 20:41:55,714 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
wandb: Currently logged in as: zsprague (ut_nlp_deduce) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: Tracking run with wandb version 0.22.2
wandb: Run data is saved locally in /scratch/zrs2020/LlamaFactoryHelper/wandb/run-20251022_204155-v2077oxb
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run interactive_test
wandb: View project at https://wandb.ai/ut_nlp_deduce/llamafactory
wandb: View run at https://wandb.ai/ut_nlp_deduce/llamafactory/runs/v2077oxb
0%| | 0/100 [00:00<?, ?it/s] 1%| | 1/100 [00:00<00:56, 1.77it/s] 2%| | 2/100 [00:00<00:39, 2.47it/s] 3%| | 3/100 [00:01<00:39, 2.47it/s] 4%| | 4/100 [00:01<00:33, 2.88it/s] 5%| | 5/100 [00:02<00:40, 2.33it/s] 6%| | 6/100 [00:02<00:34, 2.73it/s] 7%| | 7/100 [00:02<00:31, 2.98it/s] 8%| | 8/100 [00:02<00:29, 3.14it/s] 9%| | 9/100 [00:03<00:27, 3.31it/s] 10%| | 10/100 [00:03<00:29, 3.06it/s] {'loss': 1.286, 'grad_norm': 0.3636094033718109, 'learning_rate': 4.55e-05, 'epoch': 0.01}
10%| | 10/100 [00:03<00:29, 3.06it/s] 11%| | 11/100 [00:04<00:35, 2.52it/s] 12%| | 12/100 [00:04<00:33, 2.61it/s] 13%| | 13/100 [00:04<00:30, 2.88it/s]
[INFO|trainer.py:2521] 2025-10-22 20:41:55,693 >> Num Epochs = 1
[INFO|trainer.py:2522] 2025-10-22 20:41:55,693 >> Instantaneous batch size per device = 1
[INFO|trainer.py:2525] 2025-10-22 20:41:55,693 >> Total train batch size (w. parallel, distributed & accumulation) = 4
[INFO|trainer.py:2526] 2025-10-22 20:41:55,693 >> Gradient Accumulation steps = 1
[INFO|trainer.py:2527] 2025-10-22 20:41:55,693 >> Total optimization steps = 100
[INFO|trainer.py:2528] 2025-10-22 20:41:55,694 >> Number of trainable parameters = 4,399,104
14%| | 14/100 [00:05<00:29, 2.95it/s] 15%| | 15/100 [00:05<00:29, 2.87it/s] 16%| | 16/100 [00:05<00:26, 3.15it/s] 17%| | 17/100 [00:05<00:26, 3.08it/s] 18%| | 18/100 [00:06<00:27, 2.98it/s] 19%| | 19/100 [00:06<00:27, 2.90it/s] 20%| | 20/100 [00:07<00:26, 3.02it/s] {'loss': 1.1751, 'grad_norm': 0.3897131383419037, 'learning_rate': 4.05e-05, 'epoch': 0.02}
20%| | 20/100 [00:07<00:26, 3.02it/s] 21%| | 21/100 [00:07<00:31, 2.50it/s] 22%| | 22/100 [00:07<00:29, 2.61it/s] 23%| | 23/100 [00:08<00:29, 2.64it/s] 24%| | 24/100 [00:08<00:26, 2.88it/s] 25%| | 25/100 [00:09<00:31, 2.35it/s] 26%| | 26/100 [00:09<00:28, 2.64it/s] 27%| | 27/100 [00:09<00:25, 2.88it/s] 28%| | 28/100 [00:10<00:24, 2.98it/s] 29%| | 29/100 [00:10<00:22, 3.11it/s] 30%| | 30/100 [00:10<00:21, 3.19it/s] {'loss': 1.1373, 'grad_norm': 0.42557743191719055, 'learning_rate': 3.55e-05, 'epoch': 0.03}
30%| | 30/100 [00:10<00:21, 3.19it/s] 31%| | 31/100 [00:10<00:22, 3.01it/s] 32%| | 32/100 [00:11<00:21, 3.11it/s] 33%| | 33/100 [00:11<00:20, 3.25it/s] 34%| | 34/100 [00:12<00:25, 2.62it/s] 35%| | 35/100 [00:12<00:23, 2.71it/s] 36%| | 36/100 [00:12<00:22, 2.90it/s] 37%| | 37/100 [00:12<00:19, 3.30it/s] 38%| | 38/100 [00:13<00:18, 3.40it/s] 39%| | 39/100 [00:13<00:16, 3.70it/s] 40%| | 40/100 [00:13<00:18, 3.17it/s] {'loss': 1.0636, 'grad_norm': 0.42947664856910706, 'learning_rate': 3.05e-05, 'epoch': 0.04}
40%| | 40/100 [00:13<00:18, 3.17it/s] 41%| | 41/100 [00:14<00:17, 3.28it/s] 42%| | 42/100 [00:14<00:17, 3.40it/s] 43%| | 43/100 [00:14<00:17, 3.20it/s] 44%| | 44/100 [00:15<00:16, 3.35it/s] 45%| | 45/100 [00:15<00:16, 3.36it/s] 46%| | 46/100 [00:15<00:16, 3.18it/s] 47%| | 47/100 [00:15<00:15, 3.43it/s] 48%| | 48/100 [00:16<00:15, 3.40it/s] 49%| | 49/100 [00:16<00:15, 3.31it/s] 50%| | 50/100 [00:16<00:15, 3.28it/s] {'loss': 1.0329, 'grad_norm': 0.43117761611938477, 'learning_rate': 2.5500000000000003e-05, 'epoch': 0.06}
50%| | 50/100 [00:16<00:15, 3.28it/s][INFO|trainer.py:4309] 2025-10-22 20:42:13,492 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50
[INFO|configuration_utils.py:765] 2025-10-22 20:42:13,619 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
[INFO|configuration_utils.py:839] 2025-10-22 20:42:13,619 >> Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "dtype": "bfloat16",
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 32768,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "transformers_version": "4.57.1",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}

[INFO|tokenization_utils_base.py:2421] 2025-10-22 20:42:13,785 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/chat_template.jinja
[INFO|tokenization_utils_base.py:2590] 2025-10-22 20:42:13,791 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/tokenizer_config.json
[INFO|tokenization_utils_base.py:2599] 2025-10-22 20:42:13,810 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/special_tokens_map.json
51%| | 51/100 [00:17<00:26, 1.84it/s] 52%| | 52/100 [00:18<00:22, 2.09it/s] 53%| | 53/100 [00:18<00:21, 2.16it/s] 54%| | 54/100 [00:19<00:20, 2.28it/s] 55%| | 55/100 [00:19<00:18, 2.43it/s] 56%| | 56/100 [00:19<00:16, 2.65it/s] 57%| | 57/100 [00:20<00:15, 2.82it/s] 58%| | 58/100 [00:20<00:14, 2.86it/s] 59%| | 59/100 [00:20<00:13, 2.97it/s] 60%| | 60/100 [00:20<00:13, 3.07it/s] {'loss': 0.9981, 'grad_norm': 0.45059695839881897, 'learning_rate': 2.05e-05, 'epoch': 0.07}
60%| | 60/100 [00:20<00:13, 3.07it/s] 61%| | 61/100 [00:21<00:12, 3.13it/s] 62%| | 62/100 [00:21<00:11, 3.20it/s] 63%| | 63/100 [00:21<00:11, 3.33it/s] 64%| | 64/100 [00:22<00:10, 3.48it/s] 65%| | 65/100 [00:22<00:10, 3.42it/s] 66%| | 66/100 [00:22<00:10, 3.20it/s] 67%| | 67/100 [00:23<00:09, 3.37it/s] 68%| | 68/100 [00:23<00:09, 3.40it/s] 69%| | 69/100 [00:23<00:09, 3.25it/s] 70%| | 70/100 [00:23<00:09, 3.23it/s] {'loss': 0.9991, 'grad_norm': 0.43518301844596863, 'learning_rate': 1.55e-05, 'epoch': 0.08}
70%| | 70/100 [00:23<00:09, 3.23it/s] 71%| | 71/100 [00:24<00:08, 3.37it/s] 72%| | 72/100 [00:24<00:07, 3.60it/s] 73%| | 73/100 [00:24<00:07, 3.83it/s] 74%| | 74/100 [00:25<00:09, 2.81it/s] 75%| | 75/100 [00:25<00:08, 3.04it/s] 76%| | 76/100 [00:25<00:07, 3.36it/s] 77%| | 77/100 [00:26<00:07, 3.23it/s] 78%| | 78/100 [00:26<00:06, 3.56it/s] 79%| | 79/100 [00:26<00:05, 3.56it/s] 80%| | 80/100 [00:26<00:05, 3.81it/s] {'loss': 0.9537, 'grad_norm': 0.46800264716148376, 'learning_rate': 1.05e-05, 'epoch': 0.09}
80%| | 80/100 [00:26<00:05, 3.81it/s] 81%| | 81/100 [00:27<00:04, 3.81it/s] 82%| | 82/100 [00:27<00:04, 3.77it/s] 83%| | 83/100 [00:27<00:05, 3.35it/s] 84%| | 84/100 [00:28<00:04, 3.41it/s] 85%| | 85/100 [00:28<00:04, 3.45it/s] 86%| | 86/100 [00:28<00:04, 3.48it/s] 87%| | 87/100 [00:28<00:03, 3.51it/s] 88%| | 88/100 [00:29<00:03, 3.61it/s] 89%| | 89/100 [00:29<00:02, 3.69it/s] 90%| | 90/100 [00:29<00:02, 3.51it/s] {'loss': 0.9677, 'grad_norm': 0.4698624014854431, 'learning_rate': 5.500000000000001e-06, 'epoch': 0.1}
90%| | 90/100 [00:29<00:02, 3.51it/s] 91%| | 91/100 [00:29<00:02, 3.46it/s] 92%|| 92/100 [00:30<00:02, 2.73it/s] 93%|| 93/100 [00:30<00:02, 2.80it/s] 94%|| 94/100 [00:31<00:02, 2.85it/s] 95%|| 95/100 [00:31<00:01, 3.02it/s] 96%|| 96/100 [00:31<00:01, 3.16it/s] 97%|| 97/100 [00:32<00:00, 3.29it/s] 98%|| 98/100 [00:32<00:00, 3.16it/s] 99%|| 99/100 [00:32<00:00, 3.02it/s]100%|| 100/100 [00:33<00:00, 2.94it/s] {'loss': 0.9472, 'grad_norm': 0.45893919467926025, 'learning_rate': 5.000000000000001e-07, 'epoch': 0.11}
100%|| 100/100 [00:33<00:00, 2.94it/s][INFO|trainer.py:4309] 2025-10-22 20:42:29,757 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
[INFO|configuration_utils.py:765] 2025-10-22 20:42:29,909 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
[INFO|configuration_utils.py:839] 2025-10-22 20:42:29,910 >> Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "dtype": "bfloat16",
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 32768,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "transformers_version": "4.57.1",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}

[INFO|tokenization_utils_base.py:2421] 2025-10-22 20:42:30,068 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/chat_template.jinja
[INFO|tokenization_utils_base.py:2590] 2025-10-22 20:42:30,073 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/tokenizer_config.json
[INFO|tokenization_utils_base.py:2599] 2025-10-22 20:42:30,092 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/special_tokens_map.json
[INFO|trainer.py:2810] 2025-10-22 20:42:30,561 >>

Training completed. Do not forget to share your model on huggingface.co/models =)


{'train_runtime': 34.8678, 'train_samples_per_second': 11.472, 'train_steps_per_second': 2.868, 'train_loss': 1.0560622215270996, 'epoch': 0.11}
100%|| 100/100 [00:33<00:00, 2.94it/s]100%|| 100/100 [00:33<00:00, 2.95it/s]
[INFO|trainer.py:4309] 2025-10-22 20:42:30,573 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
[INFO|configuration_utils.py:765] 2025-10-22 20:42:30,670 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
[INFO|configuration_utils.py:839] 2025-10-22 20:42:30,670 >> Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "dtype": "bfloat16",
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 32768,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "transformers_version": "4.57.1",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}

[INFO|tokenization_utils_base.py:2421] 2025-10-22 20:42:30,782 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/chat_template.jinja
[INFO|tokenization_utils_base.py:2590] 2025-10-22 20:42:30,787 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/tokenizer_config.json
[INFO|tokenization_utils_base.py:2599] 2025-10-22 20:42:30,792 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/special_tokens_map.json
***** train metrics *****
  epoch = 0.1111
  total_flos = 2407106GF
  train_loss = 1.0561
  train_runtime = 0:00:34.86
  train_samples_per_second = 11.472
  train_steps_per_second = 2.868
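Note: these metrics are mutually consistent with the step count and effective batch size logged above (100 steps, roughly 400 samples, 34.87 s of training). A small consistency check:

    # Consistency check of the reported throughput numbers.
    runtime_s, steps, samples = 34.8678, 100, 400
    print(steps / runtime_s)    # ~2.868, the reported train_steps_per_second
    print(samples / runtime_s)  # ~11.47, the reported train_samples_per_second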
[INFO|modelcard.py:456] 2025-10-22 20:42:30,948 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
gl064:2628227:2628227 [1] NCCL INFO comm 0x13dac3f0 rank 1 nranks 4 cudaDev 1 busId 59000 - Destroy COMPLETE
gl064:2628226:2628226 [0] NCCL INFO comm 0x14bded40 rank 0 nranks 4 cudaDev 0 busId 47000 - Destroy COMPLETE
wandb:
wandb: View run interactive_test at: https://wandb.ai/ut_nlp_deduce/llamafactory/runs/v2077oxb
wandb: Find logs at: wandb/run-20251022_204155-v2077oxb/logs

========================================
Training completed successfully
End Time: Wed Oct 22 08:42:32 PM EDT 2025
========================================

========================================
STAGE 2: Merging/Exporting Model
Start Time: Wed Oct 22 08:42:32 PM EDT 2025
========================================
Looking for checkpoints in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
Analyzing checkpoints to find the one from current training run...
  - checkpoint-100: trainer_state.json modified at Wed Oct 22 08:42:30 PM EDT 2025
  - checkpoint-50: trainer_state.json modified at Wed Oct 22 08:42:14 PM EDT 2025

Selected checkpoint: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
This checkpoint has the most recently updated trainer_state.json
Checkpoint details:
  Path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
  Last modified: 2025-10-22 16:54:17.414188691 -0400
  Training step: 100
Updating merge config to point to checkpoint...
Successfully updated merge config
Updated merge config to use: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
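Note: the checkpoint above is chosen as the one whose trainer_state.json was modified most recently; an illustrative sketch of that selection idea (not the helper's actual code):

    # Illustrative only: pick the checkpoint with the newest trainer_state.json.
    import glob, os
    ckpt_root = "/scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints"
    candidates = glob.glob(os.path.join(ckpt_root, "checkpoint-*"))
    latest = max(candidates, key=lambda d: os.path.getmtime(os.path.join(d, "trainer_state.json")))
    print(latest)  # checkpoint-100 for this run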

Merge config contents:
model_name_or_path: Qwen/Qwen2.5-0.5B
finetuning_type: lora
trust_remote_code: true
adapter_name_or_path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
template: default
export_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged

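Note: this config drives the llamafactory-cli export call below; conceptually the export is a LoRA adapter merge followed by saving full weights and the tokenizer. A rough PEFT-based sketch of that idea, using the paths from the config above (illustrative, not LLaMA-Factory's actual implementation):

    # Illustrative adapter merge with transformers + peft.
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel
    base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")
    adapter = "/scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100"
    export_dir = "/scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged"
    merged = PeftModel.from_pretrained(base, adapter).merge_and_unload()  # fold LoRA deltas into the base weights
    merged.save_pretrained(export_dir)
    AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B").save_pretrained(export_dir)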
Executing command: llamafactory-cli export /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
  warnings.warn(
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
  import pkg_resources
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:42:40,849 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:42:40,849 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:42:40,850 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:42:40,850 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:42:40,850 >> loading file special_tokens_map.json from cache at None
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:42:40,850 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:42:40,850 >> loading file chat_template.jinja from cache at None
[INFO|tokenization_utils_base.py:2364] 2025-10-22 20:42:41,020 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|configuration_utils.py:765] 2025-10-22 20:42:41,199 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
[INFO|configuration_utils.py:839] 2025-10-22 20:42:41,201 >> Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "dtype": "bfloat16",
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 32768,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "transformers_version": "4.57.1",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}

[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:42:41,261 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:42:41,261 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:42:41,261 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:42:41,261 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:42:41,261 >> loading file special_tokens_map.json from cache at None
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:42:41,261 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 20:42:41,261 >> loading file chat_template.jinja from cache at None
[INFO|tokenization_utils_base.py:2364] 2025-10-22 20:42:41,425 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|configuration_utils.py:765] 2025-10-22 20:42:41,469 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
[INFO|configuration_utils.py:839] 2025-10-22 20:42:41,470 >> Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "dtype": "bfloat16",
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 32768,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "transformers_version": "4.57.1",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}

[WARNING|logging.py:328] 2025-10-22 20:42:41,470 >> `torch_dtype` is deprecated! Use `dtype` instead!
[INFO|2025-10-22 20:42:41] llamafactory.model.model_utils.kv_cache:143 >> KV cache is enabled for faster generation.
[WARNING|logging.py:328] 2025-10-22 20:42:41,792 >> `torch_dtype` is deprecated! Use `dtype` instead!
[INFO|modeling_utils.py:1172] 2025-10-22 20:42:41,792 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
[INFO|modeling_utils.py:2341] 2025-10-22 20:42:41,793 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.
[INFO|configuration_utils.py:986] 2025-10-22 20:42:41,794 >> Generate config GenerationConfig {
  "bos_token_id": 151643,
  "eos_token_id": 151643
}

[INFO|configuration_utils.py:941] 2025-10-22 20:42:41,880 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
[INFO|configuration_utils.py:986] 2025-10-22 20:42:41,880 >> Generate config GenerationConfig {
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "max_new_tokens": 2048
}

[INFO|dynamic_module_utils.py:423] 2025-10-22 20:42:41,910 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
[INFO|2025-10-22 20:42:41] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
[INFO|2025-10-22 20:42:42] llamafactory.model.adapter:143 >> Merged 1 adapter(s).
[INFO|2025-10-22 20:42:42] llamafactory.model.adapter:143 >> Loaded adapter(s): /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
[INFO|2025-10-22 20:42:42] llamafactory.model.loader:143 >> all params: 494,032,768
[INFO|2025-10-22 20:42:42] llamafactory.train.tuner:143 >> Convert model dtype to: torch.bfloat16.
[INFO|configuration_utils.py:491] 2025-10-22 20:42:42,694 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/config.json
[INFO|configuration_utils.py:757] 2025-10-22 20:42:42,697 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/generation_config.json
[INFO|modeling_utils.py:4181] 2025-10-22 20:42:44,339 >> Model weights saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/model.safetensors
[INFO|tokenization_utils_base.py:2421] 2025-10-22 20:42:44,344 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/chat_template.jinja
[INFO|tokenization_utils_base.py:2590] 2025-10-22 20:42:44,349 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/tokenizer_config.json
[INFO|tokenization_utils_base.py:2599] 2025-10-22 20:42:44,354 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/special_tokens_map.json
[INFO|2025-10-22 20:42:44] llamafactory.train.tuner:143 >> Ollama modelfile saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/Modelfile

========================================
Merge/Export completed successfully
End Time: Wed Oct 22 08:42:45 PM EDT 2025
========================================
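Note: the merged weights written to the export_dir above are a plain transformers checkpoint, so they can be loaded directly for a quick smoke test; a minimal usage sketch (path as reported by the export step):

    # Minimal smoke test of the exported model.
    from transformers import AutoModelForCausalLM, AutoTokenizer
    export_dir = "/scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged"
    tok = AutoTokenizer.from_pretrained(export_dir)
    model = AutoModelForCausalLM.from_pretrained(export_dir)
    out = model.generate(**tok("Hello", return_tensors="pt"), max_new_tokens=20)
    print(tok.decode(out[0], skip_special_tokens=True))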

========================================
Preparing Training Artifacts
========================================
Copying configuration files...
Copying and cleaning training logs...