Zaynes committed on
Commit fd0a709 · verified · 1 Parent(s): 99d5ec2

Upload folder using huggingface_hub

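The commit title indicates the folder was pushed with `huggingface_hub`'s folder-upload API. As a rough, hedged sketch (not the uploader's actual script), such an upload usually looks like the snippet below; the local path is a hypothetical stand-in and the repo id is the one recorded in the training config further down.

```python
# Minimal sketch (assumption: not the exact script behind this commit).
from huggingface_hub import HfApi

api = HfApi()  # uses the cached login or the HF_TOKEN environment variable
api.upload_folder(
    folder_path="experiments/lf_torch_test__interactive/merged",  # hypothetical local path
    repo_id="TAUR-dev/testing_llamafactory_helper_quick_test__interactive",
    commit_message="Upload folder using huggingface_hub",
)
```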
.gitattributes CHANGED
@@ -1,36 +1,8 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Mark all log files as text to prevent binary file issues
+ *.log text
+ *.txt text
+ *.out text
+ *.err text
+ training_artifacts/logs/* text
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
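To sanity-check the new rules in a local clone (a sketch, not part of this commit), one can ask git which attributes apply to a path: `model.safetensors` should still resolve to the `lfs` filter, while the cleaned log file should only be marked `text`.

```python
# Sketch: confirm which .gitattributes rules apply in a local clone (requires git on PATH).
import subprocess

for path in ["model.safetensors", "training_artifacts/logs/pipeline_cleaned.txt"]:
    result = subprocess.run(
        ["git", "check-attr", "filter", "text", "--", path],
        capture_output=True, text=True, check=True,
    )
    print(result.stdout.strip())
# Expected: the safetensors file reports "filter: lfs"; the log file reports "text: set".
```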
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3f60fb98dff649a27c956bcb87cddc95c45fe2a6804ba093301a8595820f8858
+ oid sha256:258d5aa4b5952c31da6d4f45ad8ad8c963d4577e0800b99258da45caf9e41f18
  size 988097824
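Only the LFS pointer changes here: `oid` is the SHA-256 of the real weights blob and `size` is its byte count. A small sketch for verifying a downloaded copy against this pointer (the local filename is an assumption):

```python
# Sketch: check a downloaded model.safetensors against the LFS pointer above.
import hashlib
import os

EXPECTED_OID = "258d5aa4b5952c31da6d4f45ad8ad8c963d4577e0800b99258da45caf9e41f18"
EXPECTED_SIZE = 988097824
path = "model.safetensors"  # assumed download location

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)

assert os.path.getsize(path) == EXPECTED_SIZE, "size does not match the pointer"
assert sha.hexdigest() == EXPECTED_OID, "sha256 does not match the pointer oid"
print("local file matches the LFS pointer")
```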
training_artifacts/README.md ADDED
@@ -0,0 +1,16 @@
+ # Training Artifacts
+
+ This directory contains the training configuration and logs for this model.
+
+ ## Contents
+
+ - **hydra_config.yaml**: Complete Hydra configuration used for training
+ - **train_config.yaml**: LlamaFactory training configuration
+ - **merge_config.yaml**: LlamaFactory merge/export configuration
+ - **logs/**: Training logs from the job (cleaned for text format)
+
+ ## Job Information
+
+ - Job Name: lf_torch_test__interactive
+ - Timestamp: 2025-10-22 20:02:45 UTC
+ - Execution Mode: Local
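The configs listed above are plain YAML, so they can be read back with any YAML parser once the repo is downloaded. A small sketch assuming PyYAML and a local copy; the exact keys inside `train_config.yaml` are not shown in this commit, so they are accessed defensively.

```python
# Sketch: load one of the artifacts listed in the README (assumes a local copy of the repo).
import yaml

with open("training_artifacts/train_config.yaml") as f:
    train_cfg = yaml.safe_load(f)

# Keys mirrored from the Hydra dump below; .get() keeps this safe if they differ.
print(train_cfg.get("model_name_or_path"), train_cfg.get("finetuning_type"))
```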
training_artifacts/hydra_config.yaml ADDED
@@ -0,0 +1,216 @@
1
+ ? ''
2
+ : ? ''
3
+ : ? ''
4
+ : hydra:
5
+ run:
6
+ dir: .
7
+ output_subdir: null
8
+ job:
9
+ chdir: false
10
+ _target_: null
11
+ job:
12
+ name: ???
13
+ mode: slurm
14
+ work_dir: null
15
+ dry_run: false
16
+ slurm:
17
+ time_limit: ???
18
+ constraint:
19
+ - h200
20
+ memory: 200
21
+ cpus_per_task: 16
22
+ partition: null
23
+ mail_user: user@example.com
24
+ execution:
25
+ nodes: null
26
+ gpus_per_node: null
27
+ num_gpus: null
28
+ hostfile: null
29
+ secrets_file: null
30
+ model:
31
+ name_or_path: ???
32
+ finetuning_type: lora
33
+ dataset:
34
+ name: ???
35
+ dir: null
36
+ info_json: null
37
+ template: default
38
+ cutoff_len: 1024
39
+ val_size: 0.1
40
+ hf_hub_url: null
41
+ formatting: alpaca
42
+ ranking: false
43
+ subset: null
44
+ split: train
45
+ folder: null
46
+ num_samples: null
47
+ columns:
48
+ prompt: null
49
+ query: null
50
+ response: null
51
+ history: null
52
+ messages: null
53
+ system: null
54
+ tools: null
55
+ images: null
56
+ videos: null
57
+ audios: null
58
+ chosen: null
59
+ rejected: null
60
+ kto_tag: null
61
+ tags:
62
+ role: null
63
+ content: null
64
+ user: null
65
+ assistant: null
66
+ observation: null
67
+ function: null
68
+ system: null
69
+ training:
70
+ stage: sft
71
+ do_train: true
72
+ model_name_or_path: null
73
+ finetuning_type: lora
74
+ trust_remote_code: true
75
+ dataset: null
76
+ dataset_dir: null
77
+ template: default
78
+ cutoff_len: 1024
79
+ val_size: 0.1
80
+ preprocessing_num_workers: 1
81
+ dataset_num_proc: 1
82
+ dataloader_num_workers: 0
83
+ streaming: false
84
+ learning_rate: 5.0e-05
85
+ num_train_epochs: 3.0
86
+ per_device_train_batch_size: 1
87
+ per_device_eval_batch_size: 1
88
+ gradient_accumulation_steps: 8
89
+ lr_scheduler_type: cosine
90
+ warmup_ratio: 0.1
91
+ warmup_steps: 0
92
+ lora_rank: 8
93
+ lora_alpha: 16
94
+ lora_dropout: 0.05
95
+ lora_target: all
96
+ optim: adamw_torch
97
+ bf16: true
98
+ fp16: false
99
+ output_dir: null
100
+ save_strategy: epoch
101
+ save_steps: 500
102
+ save_total_limit: 3
103
+ save_only_model: false
104
+ eval_strategy: steps
105
+ eval_steps: 500
106
+ do_eval: true
107
+ logging_steps: 10
108
+ plot_loss: true
109
+ report_to: none
110
+ gradient_checkpointing: true
111
+ ddp_timeout: 180000000
112
+ include_num_input_tokens_seen: true
113
+ overwrite_output_dir: true
114
+ overwrite_cache: false
115
+ seed: 42
116
+ lora:
117
+ rank: 8
118
+ alpha: 16
119
+ dropout: 0.05
120
+ target: all
121
+ output:
122
+ experiment_dir: ./experiments
123
+ merge:
124
+ stage: export
125
+ model_name_or_path: null
126
+ adapter_name_or_path: null
127
+ template: default
128
+ export_dir: null
129
+ export_size: 2
130
+ export_device: auto
131
+ export_legacy_format: false
132
+ finetuning_type: lora
133
+ wandb:
134
+ project: null
135
+ run_name: null
136
+ entity: null
137
+ hf:
138
+ repo_id: null
139
+ private: false
140
+ upload_artifacts: true
141
+ cleanup:
142
+ checkpoints: false
143
+ merged: false
144
+ job:
145
+ name: lf_torch_test__interactive
146
+ mode: local
147
+ work_dir: null
148
+ dry_run: false
149
+ slurm:
150
+ time_limit: null
151
+ constraint: null
152
+ memory: null
153
+ partition: null
154
+ mail_user: null
155
+ execution:
156
+ nodes: 2
157
+ gpus_per_node: 2
158
+ num_gpus: null
159
+ hostfile: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/hostfile_auto_generated.txt
160
+ secrets_file: ./secrets.env
161
+ model:
162
+ name_or_path: Qwen/Qwen2.5-0.5B
163
+ finetuning_type: lora
164
+ lora:
165
+ rank: 8
166
+ alpha: 16
167
+ dropout: 0.05
168
+ target: all
169
+ dataset:
170
+ name: my_custom_sft
171
+ dir: null
172
+ info_json: null
173
+ template: default
174
+ cutoff_len: 8096
175
+ val_size: 0.1
176
+ hf_hub_url: TAUR-dev/D-SFT_C-sft_exp_AT_pvv2__fixed-sft-data
177
+ formatting: sharegpt
178
+ ranking: false
179
+ subset: null
180
+ split: train
181
+ folder: null
182
+ num_samples: null
183
+ columns:
184
+ messages: conversations
185
+ tags:
186
+ role: role
187
+ content: content
188
+ user: user
189
+ assistant: assistant
190
+ output:
191
+ experiment_dir: ./experiments
192
+ wandb:
193
+ project: null
194
+ run_name: interactive_test
195
+ entity: null
196
+ hf:
197
+ repo_id: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
198
+ private: false
199
+ cleanup:
200
+ checkpoints: false
201
+ merged: false
202
+ training:
203
+ stage: sft
204
+ do_train: true
205
+ max_steps: 150
206
+ do_eval: false
207
+ save_strategy: steps
208
+ save_steps: 50
209
+ logging_steps: 10
210
+ fp16: true
211
+ bf16: false
212
+ overwrite_output_dir: true
213
+ per_device_train_batch_size: 1
214
+ gradient_accumulation_steps: 1
215
+ gradient_checkpointing: true
216
+ merge: {}
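The dump above is the fully composed Hydra configuration: framework defaults first, then this run's values (Qwen/Qwen2.5-0.5B with LoRA rank 8, 2 nodes × 2 GPUs, `max_steps: 150`, fp16). The training job itself consumes the companion `train_config.yaml` through a `torch.distributed.run` launch, as the log below records. A hedged sketch of that kind of launch for rank 0, using the node/GPU counts and master port from this run; the LLaMA-Factory entry-point path is an assumption, not taken from this repo.

```python
# Sketch: multi-node launch mirroring the recorded settings (2 nodes x 2 GPUs, port 29500).
# Assumptions: a LLaMA-Factory checkout with src/train.py, and this command run on rank 0.
import subprocess

subprocess.run(
    [
        "torchrun",
        "--nnodes", "2",
        "--nproc_per_node", "2",
        "--node_rank", "0",
        "--master_addr", "gl064",
        "--master_port", "29500",
        "src/train.py",  # assumed LLaMA-Factory training entry point
        "training_artifacts/train_config.yaml",
    ],
    check=True,
)
```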
training_artifacts/logs/pipeline_cleaned.txt ADDED
@@ -0,0 +1,1050 @@
1
+ ========================================
2
+ Job Name: lf_torch_test__interactive
3
+ Hostname: gl064.hpc.nyu.edu
4
+ Number of nodes: 2
5
+ GPUs per node: 2
6
+ Start Time: Wed Oct 22 04:01:29 PM EDT 2025
7
+ Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/logs/pipeline.log
8
+ ========================================
9
+ Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env
10
+
11
+ ========================================
12
+ Configuration Paths
13
+ ========================================
14
+ Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
15
+ Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
16
+ Dataset Info:
17
+ Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
18
+ Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
19
+ HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
20
+
21
+
22
+ ========================================
23
+ Multi-Node Coordination
24
+ ========================================
25
+ This is the master node - coordinating worker nodes...
26
+ Master node: gl064
27
+ Master port: 29500
28
+ World size: 2
29
+
30
+ Launching on worker node 1: gl065
31
+ All worker nodes launched successfully
32
+ Master node (this node) will now join training as rank 0
33
+
34
+
35
+ ========================================
36
+ STAGE 1: Training Model
37
+ Start Time: Wed Oct 22 04:01:31 PM EDT 2025
38
+ ========================================
39
+ Multi-node training detected
40
+ Nodes: 2, GPUs per node: 2
41
+ Master address: gl064
42
+ Master port: 29500
43
+ Node rank: 0
44
+ World size: 2
45
+ CUDA_VISIBLE_DEVICES: 0,1
46
+ LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory
47
+ Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
48
+
49
+ Starting distributed training with torch.distributed.run...
50
+
51
+ *****************************************
52
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
53
+ *****************************************
54
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
55
+ warnings.warn(
56
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
57
+ warnings.warn(
58
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
59
+ import pkg_resources
60
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
61
+ import pkg_resources
62
+ [INFO|2025-10-22 16:01:48] llamafactory.hparams.parser:423 >> Process rank: 1, world size: 4, device: cuda:1, distributed training: True, compute dtype: torch.float16
63
+ [INFO|2025-10-22 16:01:48] llamafactory.hparams.parser:143 >> Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled.
64
+ [INFO|2025-10-22 16:01:48] llamafactory.hparams.parser:423 >> Process rank: 0, world size: 4, device: cuda:0, distributed training: True, compute dtype: torch.float16
65
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,287 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
66
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,287 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
67
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,288 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
68
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,288 >> loading file added_tokens.json from cache at None
69
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,288 >> loading file special_tokens_map.json from cache at None
70
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,288 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
71
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,288 >> loading file chat_template.jinja from cache at None
72
+ [INFO|tokenization_utils_base.py:2364] 2025-10-22 16:01:48,457 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
73
+ [INFO|configuration_utils.py:765] 2025-10-22 16:01:48,674 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
74
+ [INFO|configuration_utils.py:839] 2025-10-22 16:01:48,676 >> Model config Qwen2Config {
75
+ "architectures": [
76
+ "Qwen2ForCausalLM"
77
+ ],
78
+ "attention_dropout": 0.0,
79
+ "bos_token_id": 151643,
80
+ "dtype": "bfloat16",
81
+ "eos_token_id": 151643,
82
+ "hidden_act": "silu",
83
+ "hidden_size": 896,
84
+ "initializer_range": 0.02,
85
+ "intermediate_size": 4864,
86
+ "layer_types": [
87
+ "full_attention",
88
+ "full_attention",
89
+ "full_attention",
90
+ "full_attention",
91
+ "full_attention",
92
+ "full_attention",
93
+ "full_attention",
94
+ "full_attention",
95
+ "full_attention",
96
+ "full_attention",
97
+ "full_attention",
98
+ "full_attention",
99
+ "full_attention",
100
+ "full_attention",
101
+ "full_attention",
102
+ "full_attention",
103
+ "full_attention",
104
+ "full_attention",
105
+ "full_attention",
106
+ "full_attention",
107
+ "full_attention",
108
+ "full_attention",
109
+ "full_attention",
110
+ "full_attention"
111
+ ],
112
+ "max_position_embeddings": 32768,
113
+ "max_window_layers": 24,
114
+ "model_type": "qwen2",
115
+ "num_attention_heads": 14,
116
+ "num_hidden_layers": 24,
117
+ "num_key_value_heads": 2,
118
+ "rms_norm_eps": 1e-06,
119
+ "rope_scaling": null,
120
+ "rope_theta": 1000000.0,
121
+ "sliding_window": null,
122
+ "tie_word_embeddings": true,
123
+ "transformers_version": "4.57.1",
124
+ "use_cache": true,
125
+ "use_mrope": false,
126
+ "use_sliding_window": false,
127
+ "vocab_size": 151936
128
+ }
129
+
130
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,738 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
131
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,738 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
132
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,738 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
133
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,738 >> loading file added_tokens.json from cache at None
134
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,738 >> loading file special_tokens_map.json from cache at None
135
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,738 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
136
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,738 >> loading file chat_template.jinja from cache at None
137
+ [INFO|tokenization_utils_base.py:2364] 2025-10-22 16:01:48,904 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
138
+ [INFO|2025-10-22 16:01:48] llamafactory.data.loader:143 >> Loading dataset TAUR-dev/D-SFT_C-sft_exp_AT_pvv2__fixed-sft-data...
139
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4876: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.
140
+ warnings.warn( # warn only once
141
+ [rank0]:[W1022 16:01:49.085275271 ProcessGroupNCCL.cpp:5068] Guessing device ID based on global rank. This can cause a hang if rank to GPU mapping is heterogeneous. You can specify device_id in init_process_group()
142
+ gl064:2368555:2368555 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
143
+ gl064:2368555:2368555 [0] NCCL INFO Bootstrap: Using ibs3:10.0.5.0<0>
144
+ gl064:2368555:2368555 [0] NCCL INFO cudaDriverVersion 13000
145
+ gl064:2368555:2368555 [0] NCCL INFO NCCL version 2.27.5+cuda12.9
146
+ gl064:2368555:2368555 [0] NCCL INFO Comm config Blocking set to 1
147
+ gl064:2368556:2368556 [1] NCCL INFO cudaDriverVersion 13000
148
+ gl064:2368556:2368556 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
149
+ gl064:2368556:2368556 [1] NCCL INFO Bootstrap: Using ibs3:10.0.5.0<0>
150
+ gl064:2368556:2368556 [1] NCCL INFO NCCL version 2.27.5+cuda12.9
151
+ gl064:2368556:2368556 [1] NCCL INFO Comm config Blocking set to 1
152
+ gl064:2368555:2368616 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
153
+ gl064:2368555:2368616 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
154
+ gl064:2368555:2368616 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
155
+ gl064:2368555:2368616 [0] NCCL INFO NCCL_IB_HCA set to mlx5
156
+ gl064:2368556:2368617 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
157
+ gl064:2368556:2368617 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
158
+ gl064:2368556:2368617 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
159
+ gl064:2368556:2368617 [1] NCCL INFO NCCL_IB_HCA set to mlx5
160
+ gl064:2368555:2368616 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.0<0>
161
+ gl064:2368555:2368616 [0] NCCL INFO Initialized NET plugin IB
162
+ gl064:2368555:2368616 [0] NCCL INFO Assigned NET plugin IB to comm
163
+ gl064:2368555:2368616 [0] NCCL INFO Using network IB
164
+ gl064:2368555:2368616 [0] NCCL INFO ncclCommInitRankConfig comm 0x14bb0450 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x503fff15b9ea778b - Init START
165
+ gl064:2368556:2368617 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.0<0>
166
+ gl064:2368556:2368617 [1] NCCL INFO Initialized NET plugin IB
167
+ gl064:2368556:2368617 [1] NCCL INFO Assigned NET plugin IB to comm
168
+ gl064:2368556:2368617 [1] NCCL INFO Using network IB
169
+ gl064:2368556:2368617 [1] NCCL INFO ncclCommInitRankConfig comm 0x15c0db00 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x503fff15b9ea778b - Init START
170
+ gl064:2368555:2368616 [0] NCCL INFO RAS client listening socket at ::1<28028>
171
+ gl064:2368556:2368617 [1] NCCL INFO RAS client listening socket at ::1<28028>
172
+ gl064:2368555:2368616 [0] NCCL INFO Bootstrap timings total 0.321405 (create 0.000022, send 0.000239, recv 0.002956, ring 0.302954, delay 0.000000)
173
+ gl064:2368556:2368617 [1] NCCL INFO Bootstrap timings total 0.319316 (create 0.000023, send 0.000069, recv 0.316285, ring 0.001306, delay 0.000000)
174
+ gl064:2368555:2368616 [0] NCCL INFO Setting affinity for GPU 0 to 0-15
175
+ gl064:2368556:2368617 [1] NCCL INFO Setting affinity for GPU 1 to 0-15
176
+ gl064:2368556:2368617 [1] NCCL INFO comm 0x15c0db00 rank 1 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
177
+ gl064:2368555:2368616 [0] NCCL INFO comm 0x14bb0450 rank 0 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
178
+ gl064:2368555:2368616 [0] NCCL INFO Channel 00/02 : 0 1 2 3
179
+ gl064:2368556:2368617 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
180
+ gl064:2368555:2368616 [0] NCCL INFO Channel 01/02 : 0 1 2 3
181
+ gl064:2368556:2368617 [1] NCCL INFO P2P Chunksize set to 131072
182
+ gl064:2368555:2368616 [0] NCCL INFO Trees [0] 1/2/-1->0->-1 [1] 1/-1/-1->0->2
183
+ gl064:2368555:2368616 [0] NCCL INFO P2P Chunksize set to 131072
184
+ gl064:2368556:2368617 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
185
+ gl064:2368555:2368616 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
186
+ gl064:2368555:2368616 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
187
+ gl064:2368555:2368623 [0] NCCL INFO [Proxy Service] Device 0 CPU core 9
188
+ gl064:2368556:2368624 [1] NCCL INFO [Proxy Service] Device 1 CPU core 3
189
+ gl064:2368555:2368625 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 10
190
+ gl064:2368556:2368626 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 6
191
+ gl064:2368556:2368617 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
192
+ gl064:2368556:2368617 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
193
+ gl064:2368555:2368616 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
194
+ gl064:2368555:2368616 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
195
+ gl064:2368555:2368616 [0] NCCL INFO CC Off, workFifoBytes 1048576
196
+ gl064:2368556:2368617 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
197
+ gl064:2368556:2368617 [1] NCCL INFO ncclCommInitRankConfig comm 0x15c0db00 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x503fff15b9ea778b - Init COMPLETE
198
+ gl064:2368556:2368617 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.45 (kernels 0.09, alloc 0.01, bootstrap 0.32, allgathers 0.01, topo 0.02, graphs 0.00, connections 0.00, rest 0.00)
199
+ gl064:2368555:2368616 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
200
+ gl064:2368555:2368616 [0] NCCL INFO ncclCommInitRankConfig comm 0x14bb0450 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x503fff15b9ea778b - Init COMPLETE
201
+ gl064:2368555:2368616 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 0.45 (kernels 0.09, alloc 0.01, bootstrap 0.32, allgathers 0.01, topo 0.02, graphs 0.00, connections 0.00, rest 0.00)
202
+ gl064:2368555:2368627 [0] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [receive] via NET/IB/0
203
+ gl064:2368555:2368629 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 3
204
+ gl064:2368555:2368627 [0] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [receive] via NET/IB/0
205
+ gl064:2368555:2368627 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
206
+ gl064:2368555:2368627 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
207
+ gl064:2368556:2368628 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [send] via NET/IB/0
208
+ gl064:2368556:2368628 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [send] via NET/IB/0
209
+ gl064:2368556:2368630 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 7
210
+ gl064:2368556:2368628 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
211
+ gl064:2368555:2368627 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
212
+ training example:
213
+ input_ids:
214
+ [33975, 25, 21806, 279, 2701, 3491, 13, 81917, 697, 32711, 3019, 553, 3019, 13, 3197, 498, 525, 8060, 11, 2968, 697, 4226, 304, 419, 3561, 25, 366, 9217, 2235, 21732, 4226, 12533, 9217, 94367, 2, 22079, 198, 16429, 279, 5109, 304, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 1125, 1855, 458, 23606, 429, 16819, 220, 16, 21, 24, 13, 1446, 646, 990, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 8, 323, 1817, 1372, 646, 1172, 387, 1483, 3055, 13, 4615, 6291, 1265, 2924, 264, 4013, 315, 7354, 330, 8304, 1599, 2974, 1380, 1817, 3019, 374, 264, 35972, 5666, 323, 279, 1590, 3019, 13653, 11508, 311, 279, 2169, 1372, 476, 432, 1265, 387, 264, 3175, 23606, 429, 3059, 304, 279, 2169, 382, 35127, 697, 4226, 304, 279, 2701, 3561, 510, 27, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 1339, 9064, 11993, 21732, 4226, 9940, 374, 279, 1140, 315, 7354, 311, 5545, 279, 2169, 1372, 476, 432, 1265, 387, 264, 3175, 23606, 429, 3059, 304, 279, 2169, 13, 4710, 2461, 3110, 510, 2679, 279, 1140, 315, 5109, 572, 508, 16, 11, 220, 17, 11, 220, 18, 60, 323, 279, 2169, 572, 220, 16, 11, 498, 1410, 3270, 510, 27, 9217, 397, 8304, 220, 16, 25, 220, 16, 488, 220, 17, 284, 220, 18, 198, 8304, 220, 17, 25, 220, 18, 608, 220, 18, 284, 220, 16, 198, 522, 9217, 1339, 269, 4710, 27, 9217, 397, 7, 16, 488, 220, 17, 8, 608, 220, 18, 198, 522, 9217, 1339, 10061, 594, 1744, 3019, 553, 3019, 13, 151643, 198, 71703, 25, 13708, 766, 397, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 8304, 220, 16, 25, 64547, 279, 5795, 481, 1205, 1184, 311, 1855, 458, 23606, 1667, 279, 5109, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 17, 25, 21144, 3204, 7525, 481, 78646, 11, 75240, 11, 46444, 11, 323, 12804, 624, 8304, 220, 18, 25, 9735, 2155, 27459, 315, 1493, 7525, 448, 279, 2661, 5109, 624, 8304, 220, 19, 25, 3393, 5257, 37906, 3080, 582, 1477, 825, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 20, 25, 9646, 582, 614, 1730, 279, 4396, 23606, 11, 3042, 432, 304, 279, 5189, 3561, 624, 6025, 2087, 1526, 678, 1493, 7354, 11, 1588, 594, 279, 1590, 23606, 429, 16819, 220, 16, 21, 24, 1447, 27, 9217, 397, 7, 21, 22, 353, 220, 17, 8, 488, 220, 22, 16, 481, 220, 18, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 488, 220, 22, 16, 284, 220, 16, 18, 23, 11, 1221, 220, 16, 18, 23, 481, 220, 18, 16, 284, 220, 16, 15, 22, 13, 1096, 1558, 537, 6144, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 10061, 752, 10146, 847, 4226, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 10253, 279, 5109, 504, 279, 2661, 738, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 311, 1352, 458, 23606, 6144, 287, 220, 16, 21, 24, 1667, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 3593, 8304, 220, 16, 25, 71059, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 2691, 220, 22, 16, 311, 279, 1102, 504, 14822, 220, 16, 624, 8304, 220, 18, 25, 576, 1590, 1102, 686, 387, 220, 16, 21, 24, 382, 27, 9217, 397, 7, 21, 22, 353, 220, 18, 16, 8, 488, 220, 22, 16, 284, 220, 16, 21, 24, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 11, 15, 22, 22, 11, 1221, 220, 17, 11, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 11, 15, 15, 21, 13, 1096, 374, 537, 6144, 
311, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 1462, 311, 3395, 847, 2033, 825, 803, 882, 624, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 785, 5795, 374, 311, 1855, 458, 23606, 1667, 279, 5109, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 429, 16819, 220, 16, 21, 24, 624, 40, 686, 1191, 553, 4460, 2155, 27459, 315, 1493, 2326, 5109, 323, 862, 3204, 7525, 17973, 11, 85922, 11777, 608, 568, 715, 12209, 358, 686, 15442, 1817, 10601, 323, 1779, 421, 807, 6144, 220, 16, 21, 24, 13, 1416, 537, 11, 358, 686, 3271, 389, 311, 279, 1790, 10601, 3080, 358, 1477, 825, 429, 4278, 624, 6025, 9271, 279, 4396, 10601, 11, 358, 686, 3042, 847, 6291, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 624, 8420, 594, 847, 11682, 3119, 1447, 16, 13, 4230, 678, 3204, 27459, 315, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 1667, 5256, 11, 75240, 11, 46444, 11, 323, 12804, 624, 17, 13, 54115, 1817, 10601, 311, 1490, 421, 432, 16819, 220, 16, 21, 24, 624, 18, 13, 1416, 902, 10601, 16819, 220, 16, 21, 24, 11, 13153, 279, 1882, 448, 501, 27459, 3080, 358, 1477, 825, 429, 4278, 624, 19, 13, 9646, 358, 614, 1730, 279, 4396, 10601, 11, 3158, 432, 438, 264, 3175, 23606, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 382, 7039, 1077, 752, 3161, 15039, 369, 279, 4396, 10601, 4894, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 16, 13, 14822, 220, 16, 25, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 15, 22, 22, 198, 17, 13, 14822, 220, 17, 25, 220, 17, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 15, 15, 21, 271, 54815, 11, 279, 1590, 1102, 374, 220, 17, 15, 15, 21, 11, 892, 1558, 537, 6144, 220, 16, 21, 24, 13, 576, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 3983, 3783, 11, 1077, 752, 1744, 911, 432, 1549, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 3838, 374, 279, 1102, 979, 498, 912, 1378, 5109, 504, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 3786, 1939, 8304, 220, 16, 25, 2691, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 11778, 279, 2629, 504, 14822, 220, 16, 323, 912, 220, 22, 16, 382, 19357, 84670, 25, 320, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 271, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 25, 5512, 11, 220, 21, 22, 488, 220, 18, 16, 284, 220, 24, 23, 11, 1221, 220, 24, 23, 488, 220, 22, 16, 284, 220, 16, 21, 24, 13, 576, 4226, 374, 4396, 382, 27, 423, 8477, 397, 33092, 198, 522, 423, 8477, 397, 522, 34913, 397, 522, 26865, 10370, 39, 763, 11, 279, 4396, 4226, 374, 1447, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 29, 151643, 198]
215
+ inputs:
216
+ Human: Answer the following problem. Explain your reasoning step by step. When you are finished, give your answer in this format: <answer>(your answer)</answer>.
217
+
218
+ # Problem
219
+ Using the numbers in the list [67, 71, 31], create an equation that equals 169. You can use basic arithmetic operations (+, -, *, /) and each number can only be used once. Your solution should include a series of steps "Step X:" where each step is a mathematical operation and the final step ultimately leads to the target number or it should be a single equation that results in the target.
220
+
221
+ Give your answer in the following format:
222
+ <answer>
223
+ (your answer)
224
+ </answer>
225
+
226
+ Where "(your answer)" is the list of steps to reach the target number or it should be a single equation that results in the target.
227
+
228
+ For example:
229
+ If the list of numbers was [1, 2, 3] and the target was 1, you could write:
230
+ <answer>
231
+ Step 1: 1 + 2 = 3
232
+ Step 2: 3 / 3 = 1
233
+ </answer>
234
+
235
+ or
236
+
237
+ <answer>
238
+ (1 + 2) / 3
239
+ </answer>
240
+
241
+ Let's think step by step.<|endoftext|>
242
+ Assistant:<think>
243
+ <sample>
244
+ First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this.
245
+ Step 1: Identify the goal - We need to create an equation using the numbers [67, 71, 31] that equals 169.
246
+ Step 2: Consider possible operations - Addition, subtraction, multiplication, and division.
247
+ Step 3: Try different combinations of these operations with the given numbers.
248
+ Step 4: Test various equations until we find one that equals 169.
249
+ Step 5: Once we have found the correct equation, present it in the specified format.
250
+ After going through all these steps, here's the final equation that equals 169:
251
+
252
+ <answer>
253
+ (67 * 2) + 71 - 31
254
+ </answer>
255
+ </sample>
256
+ <reflect>
257
+ Let us verify this answer:
258
+
259
+ First, 67 + 71 = 138, then 138 - 31 = 107. This does not equal 169, so the answer is incorrect.
260
+
261
+ <verdict>
262
+ Incorrect
263
+ </verdict>
264
+ </reflect>
265
+ Let me verify my answer.
266
+ <sample>
267
+ Rewritten Question:
268
+
269
+ Use the numbers from the given set [67, 71, 31] to form an equation equaling 169 using basic arithmetic operations (+, -, *, /).
270
+
271
+ Step 1: Multiply 67 and 31.
272
+ Step 2: Add 71 to the result from Step 1.
273
+ Step 3: The final result will be 169.
274
+
275
+ <answer>
276
+ (67 * 31) + 71 = 169
277
+ </answer>
278
+ </sample>
279
+ <reflect>
280
+ Let us verify this answer:
281
+
282
+ First, 67 * 31 = 2,077, then 2,077 - 71 = 2,006. This is not equal to 169, so the answer is incorrect.
283
+
284
+ <verdict>
285
+ Incorrect
286
+ </verdict>
287
+ </reflect>
288
+ Time to review my response one more time.
289
+ <sample>
290
+ First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this.
291
+ The goal is to create an equation using the numbers 67, 71, and 31 that equals 169.
292
+ I will start by trying different combinations of these three numbers and their possible operations (+, -, *, /).
293
+ Then I will evaluate each combination and check if they equal 169. If not, I will move on to the next combination until I find one that works.
294
+ After finding the correct combination, I will present my solution in the "<answer>
295
+ (your answer)
296
+ </answer>" format.
297
+ Here's my detailed plan:
298
+
299
+ 1. Create all possible combinations of 67, 71, and 31 using addition, subtraction, multiplication, and division.
300
+ 2. Evaluate each combination to see if it equals 169.
301
+ 3. If no combination equals 169, repeat the process with new combinations until I find one that works.
302
+ 4. Once I have found the correct combination, express it as a single equation in the "<answer>
303
+ (your answer)
304
+ </answer>" format.
305
+
306
+ Now let me begin searching for the correct combination!
307
+ </sample>
308
+ <reflect>
309
+ Let us verify this answer:
310
+
311
+ 1. Step 1: 67 * 31 = 2077
312
+ 2. Step 2: 2077 - 71 = 2006
313
+
314
+ Therefore, the final result is 2006, which does not equal 169. The answer is incorrect.
315
+
316
+ <verdict>
317
+ Incorrect
318
+ </verdict>
319
+ </reflect>
320
+ But wait, let me think about it again.
321
+ <sample>
322
+ Rewritten Question:
323
+
324
+ What is the result when you add two numbers from the list [67, 71, 31] together?
325
+
326
+ Step 1: Add 67 and 31.
327
+ Step 2: Take the sum from Step 1 and add 71.
328
+
329
+ Final Equation: (67 + 31) + 71
330
+
331
+ <answer>
332
+ (67 + 31) + 71
333
+ </answer>
334
+ </sample>
335
+ <reflect>
336
+ Let us verify this answer: First, 67 + 31 = 98, then 98 + 71 = 169. The answer is correct.
337
+
338
+ <verdict>
339
+ Correct
340
+ </verdict>
341
+ </reflect>
342
+ </think>
343
+
344
+
345
+ Hence, the correct answer is:
346
+
347
+ <answer>
348
+ (67 + 31) + 71
349
+ </answer><|endoftext|>
350
+
351
+ label_ids:
352
+ [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 13708, 766, 397, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 8304, 220, 16, 25, 64547, 279, 5795, 481, 1205, 1184, 311, 1855, 458, 23606, 1667, 279, 5109, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 17, 25, 21144, 3204, 7525, 481, 78646, 11, 75240, 11, 46444, 11, 323, 12804, 624, 8304, 220, 18, 25, 9735, 2155, 27459, 315, 1493, 7525, 448, 279, 2661, 5109, 624, 8304, 220, 19, 25, 3393, 5257, 37906, 3080, 582, 1477, 825, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 20, 25, 9646, 582, 614, 1730, 279, 4396, 23606, 11, 3042, 432, 304, 279, 5189, 3561, 624, 6025, 2087, 1526, 678, 1493, 7354, 11, 1588, 594, 279, 1590, 23606, 429, 16819, 220, 16, 21, 24, 1447, 27, 9217, 397, 7, 21, 22, 353, 220, 17, 8, 488, 220, 22, 16, 481, 220, 18, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 488, 220, 22, 16, 284, 220, 16, 18, 23, 11, 1221, 220, 16, 18, 23, 481, 220, 18, 16, 284, 220, 16, 15, 22, 13, 1096, 1558, 537, 6144, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 10061, 752, 10146, 847, 4226, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 10253, 279, 5109, 504, 279, 2661, 738, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 311, 1352, 458, 23606, 6144, 287, 220, 16, 21, 24, 1667, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 3593, 8304, 220, 16, 25, 71059, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 2691, 220, 22, 16, 311, 279, 1102, 504, 14822, 220, 16, 624, 8304, 220, 18, 25, 576, 1590, 1102, 686, 387, 220, 16, 21, 24, 382, 27, 9217, 397, 7, 21, 22, 353, 220, 18, 16, 8, 488, 220, 22, 16, 284, 220, 16, 21, 24, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 
5338, 11, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 11, 15, 22, 22, 11, 1221, 220, 17, 11, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 11, 15, 15, 21, 13, 1096, 374, 537, 6144, 311, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 1462, 311, 3395, 847, 2033, 825, 803, 882, 624, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 785, 5795, 374, 311, 1855, 458, 23606, 1667, 279, 5109, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 429, 16819, 220, 16, 21, 24, 624, 40, 686, 1191, 553, 4460, 2155, 27459, 315, 1493, 2326, 5109, 323, 862, 3204, 7525, 17973, 11, 85922, 11777, 608, 568, 715, 12209, 358, 686, 15442, 1817, 10601, 323, 1779, 421, 807, 6144, 220, 16, 21, 24, 13, 1416, 537, 11, 358, 686, 3271, 389, 311, 279, 1790, 10601, 3080, 358, 1477, 825, 429, 4278, 624, 6025, 9271, 279, 4396, 10601, 11, 358, 686, 3042, 847, 6291, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 624, 8420, 594, 847, 11682, 3119, 1447, 16, 13, 4230, 678, 3204, 27459, 315, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 1667, 5256, 11, 75240, 11, 46444, 11, 323, 12804, 624, 17, 13, 54115, 1817, 10601, 311, 1490, 421, 432, 16819, 220, 16, 21, 24, 624, 18, 13, 1416, 902, 10601, 16819, 220, 16, 21, 24, 11, 13153, 279, 1882, 448, 501, 27459, 3080, 358, 1477, 825, 429, 4278, 624, 19, 13, 9646, 358, 614, 1730, 279, 4396, 10601, 11, 3158, 432, 438, 264, 3175, 23606, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 382, 7039, 1077, 752, 3161, 15039, 369, 279, 4396, 10601, 4894, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 16, 13, 14822, 220, 16, 25, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 15, 22, 22, 198, 17, 13, 14822, 220, 17, 25, 220, 17, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 15, 15, 21, 271, 54815, 11, 279, 1590, 1102, 374, 220, 17, 15, 15, 21, 11, 892, 1558, 537, 6144, 220, 16, 21, 24, 13, 576, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 3983, 3783, 11, 1077, 752, 1744, 911, 432, 1549, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 3838, 374, 279, 1102, 979, 498, 912, 1378, 5109, 504, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 3786, 1939, 8304, 220, 16, 25, 2691, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 11778, 279, 2629, 504, 14822, 220, 16, 323, 912, 220, 22, 16, 382, 19357, 84670, 25, 320, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 271, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 25, 5512, 11, 220, 21, 22, 488, 220, 18, 16, 284, 220, 24, 23, 11, 1221, 220, 24, 23, 488, 220, 22, 16, 284, 220, 16, 21, 24, 13, 576, 4226, 374, 4396, 382, 27, 423, 8477, 397, 33092, 198, 522, 423, 8477, 397, 522, 34913, 397, 522, 26865, 10370, 39, 763, 11, 279, 4396, 4226, 374, 1447, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 29, 151643, 198]
353
+ labels:
354
+ <think>
355
+ <sample>
356
+ First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this.
357
+ Step 1: Identify the goal - We need to create an equation using the numbers [67, 71, 31] that equals 169.
358
+ Step 2: Consider possible operations - Addition, subtraction, multiplication, and division.
359
+ Step 3: Try different combinations of these operations with the given numbers.
360
+ Step 4: Test various equations until we find one that equals 169.
361
+ Step 5: Once we have found the correct equation, present it in the specified format.
362
+ After going through all these steps, here's the final equation that equals 169:
363
+
364
+ <answer>
365
+ (67 * 2) + 71 - 31
366
+ </answer>
367
+ </sample>
368
+ <reflect>
369
+ Let us verify this answer:
370
+
371
+ First, 67 + 71 = 138, then 138 - 31 = 107. This does not equal 169, so the answer is incorrect.
372
+
373
+ <verdict>
374
+ Incorrect
375
+ </verdict>
376
+ </reflect>
377
+ Let me verify my answer.
378
+ <sample>
379
+ Rewritten Question:
380
+
381
+ Use the numbers from the given set [67, 71, 31] to form an equation equaling 169 using basic arithmetic operations (+, -, *, /).
382
+
383
+ Step 1: Multiply 67 and 31.
384
+ Step 2: Add 71 to the result from Step 1.
385
+ Step 3: The final result will be 169.
386
+
387
+ <answer>
388
+ (67 * 31) + 71 = 169
389
+ </answer>
390
+ </sample>
391
+ <reflect>
392
+ Let us verify this answer:
393
+
394
+ First, 67 * 31 = 2,077, then 2,077 - 71 = 2,006. This is not equal to 169, so the answer is incorrect.
395
+
396
+ <verdict>
397
+ Incorrect
398
+ </verdict>
399
+ </reflect>
400
+ Time to review my response one more time.
401
+ <sample>
402
+ First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this.
403
+ The goal is to create an equation using the numbers 67, 71, and 31 that equals 169.
404
+ I will start by trying different combinations of these three numbers and their possible operations (+, -, *, /).
405
+ Then I will evaluate each combination and check if they equal 169. If not, I will move on to the next combination until I find one that works.
406
+ After finding the correct combination, I will present my solution in the "<answer>
407
+ (your answer)
408
+ </answer>" format.
409
+ Here's my detailed plan:
410
+
411
+ 1. Create all possible combinations of 67, 71, and 31 using addition, subtraction, multiplication, and division.
412
+ 2. Evaluate each combination to see if it equals 169.
413
+ 3. If no combination equals 169, repeat the process with new combinations until I find one that works.
414
+ 4. Once I have found the correct combination, express it as a single equation in the "<answer>
415
+ (your answer)
416
+ </answer>" format.
417
+
418
+ Now let me begin searching for the correct combination!
419
+ </sample>
420
+ <reflect>
421
+ Let us verify this answer:
422
+
423
+ 1. Step 1: 67 * 31 = 2077
424
+ 2. Step 2: 2077 - 71 = 2006
425
+
426
+ Therefore, the final result is 2006, which does not equal 169. The answer is incorrect.
427
+
428
+ <verdict>
429
+ Incorrect
430
+ </verdict>
431
+ </reflect>
432
+ But wait, let me think about it again.
433
+ <sample>
434
+ Rewritten Question:
435
+
436
+ What is the result when you add two numbers from the list [67, 71, 31] together?
437
+
438
+ Step 1: Add 67 and 31.
439
+ Step 2: Take the sum from Step 1 and add 71.
440
+
441
+ Final Equation: (67 + 31) + 71
442
+
443
+ <answer>
444
+ (67 + 31) + 71
445
+ </answer>
446
+ </sample>
447
+ <reflect>
448
+ Let us verify this answer: First, 67 + 31 = 98, then 98 + 71 = 169. The answer is correct.
449
+
450
+ <verdict>
451
+ Correct
452
+ </verdict>
453
+ </reflect>
454
+ </think>
455
+
456
+
457
+ Hence, the correct answer is:
458
+
459
+ <answer>
460
+ (67 + 31) + 71
461
+ </answer><|endoftext|>
462
+
463
+ [INFO|configuration_utils.py:765] 2025-10-22 16:01:50,484 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
464
+ [INFO|configuration_utils.py:839] 2025-10-22 16:01:50,485 >> Model config Qwen2Config {
465
+ "architectures": [
466
+ "Qwen2ForCausalLM"
467
+ ],
468
+ "attention_dropout": 0.0,
469
+ "bos_token_id": 151643,
470
+ "dtype": "bfloat16",
471
+ "eos_token_id": 151643,
472
+ "hidden_act": "silu",
473
+ "hidden_size": 896,
474
+ "initializer_range": 0.02,
475
+ "intermediate_size": 4864,
476
+ "layer_types": [
477
+ "full_attention",
478
+ "full_attention",
479
+ "full_attention",
480
+ "full_attention",
481
+ "full_attention",
482
+ "full_attention",
483
+ "full_attention",
484
+ "full_attention",
485
+ "full_attention",
486
+ "full_attention",
487
+ "full_attention",
488
+ "full_attention",
489
+ "full_attention",
490
+ "full_attention",
491
+ "full_attention",
492
+ "full_attention",
493
+ "full_attention",
494
+ "full_attention",
495
+ "full_attention",
496
+ "full_attention",
497
+ "full_attention",
498
+ "full_attention",
499
+ "full_attention",
500
+ "full_attention"
501
+ ],
502
+ "max_position_embeddings": 32768,
503
+ "max_window_layers": 24,
504
+ "model_type": "qwen2",
505
+ "num_attention_heads": 14,
506
+ "num_hidden_layers": 24,
507
+ "num_key_value_heads": 2,
508
+ "rms_norm_eps": 1e-06,
509
+ "rope_scaling": null,
510
+ "rope_theta": 1000000.0,
511
+ "sliding_window": null,
512
+ "tie_word_embeddings": true,
513
+ "transformers_version": "4.57.1",
514
+ "use_cache": true,
515
+ "use_mrope": false,
516
+ "use_sliding_window": false,
517
+ "vocab_size": 151936
518
+ }
519
+
520
+ [INFO|2025-10-22 16:01:50] llamafactory.model.model_utils.kv_cache:143 >> KV cache is disabled during training.
521
+ [WARNING|logging.py:328] 2025-10-22 16:01:50,806 >> `torch_dtype` is deprecated! Use `dtype` instead!
522
+ [INFO|modeling_utils.py:1172] 2025-10-22 16:01:50,807 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
523
+ [INFO|modeling_utils.py:2341] 2025-10-22 16:01:50,808 >> Instantiating Qwen2ForCausalLM model under default dtype torch.float16.
524
+ [INFO|configuration_utils.py:986] 2025-10-22 16:01:50,808 >> Generate config GenerationConfig {
525
+ "bos_token_id": 151643,
526
+ "eos_token_id": 151643,
527
+ "use_cache": false
528
+ }
529
+
530
+ `torch_dtype` is deprecated! Use `dtype` instead!
531
+ [INFO|configuration_utils.py:941] 2025-10-22 16:01:51,084 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
532
+ [INFO|configuration_utils.py:986] 2025-10-22 16:01:51,085 >> Generate config GenerationConfig {
533
+ "bos_token_id": 151643,
534
+ "eos_token_id": 151643,
535
+ "max_new_tokens": 2048
536
+ }
537
+
538
+ [INFO|dynamic_module_utils.py:423] 2025-10-22 16:01:51,114 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
539
+ [INFO|2025-10-22 16:01:51] llamafactory.model.model_utils.checkpointing:143 >> Gradient checkpointing enabled.
540
+ [INFO|2025-10-22 16:01:51] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
541
+ [INFO|2025-10-22 16:01:51] llamafactory.model.adapter:143 >> Upcasting trainable params to float32.
542
+ [INFO|2025-10-22 16:01:51] llamafactory.model.adapter:143 >> Fine-tuning method: LoRA
543
+ [INFO|2025-10-22 16:01:51] llamafactory.model.model_utils.misc:143 >> Found linear modules: up_proj,v_proj,q_proj,down_proj,gate_proj,k_proj,o_proj
544
+ [INFO|2025-10-22 16:01:51] llamafactory.model.loader:143 >> trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826
545
+ [WARNING|trainer.py:906] 2025-10-22 16:01:51,639 >> The model is already on multiple devices. Skipping the move to device specified in `args`.
546
+ [INFO|trainer.py:699] 2025-10-22 16:01:51,642 >> max_steps is given, it will override any value given in num_train_epochs
547
+ [INFO|trainer.py:749] 2025-10-22 16:01:51,642 >> Using auto half precision backend
548
+ [WARNING|trainer.py:982] 2025-10-22 16:01:51,643 >> The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
549
+ The model is already on multiple devices. Skipping the move to device specified in `args`.
550
+ The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
551
+ [INFO|trainer.py:2519] 2025-10-22 16:01:51,823 >> ***** Running training *****
552
+ [INFO|trainer.py:2520] 2025-10-22 16:01:51,823 >> Num examples = 48,600
553
+ [INFO|trainer.py:2521] 2025-10-22 16:01:51,823 >> Num Epochs = 1
554
+ [INFO|trainer.py:2522] 2025-10-22 16:01:51,823 >> Instantaneous batch size per device = 1
555
+ [INFO|trainer.py:2525] 2025-10-22 16:01:51,823 >> Total train batch size (w. parallel, distributed & accumulation) = 4
556
+ [INFO|trainer.py:2526] 2025-10-22 16:01:51,823 >> Gradient Accumulation steps = 1
557
+ [INFO|trainer.py:2527] 2025-10-22 16:01:51,823 >> Total optimization steps = 150
558
+ [INFO|trainer.py:2528] 2025-10-22 16:01:51,825 >> Number of trainable parameters = 4,399,104
559
+ [INFO|integration_utils.py:867] 2025-10-22 16:01:51,847 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
560
+ wandb: Currently logged in as: zsprague (ut_nlp_deduce) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
561
+ wandb: Tracking run with wandb version 0.22.2
562
+ wandb: Run data is saved locally in /scratch/zrs2020/LlamaFactoryHelper/wandb/run-20251022_160152-f7vqjhyf
563
+ wandb: Run `wandb offline` to turn off syncing.
564
+ wandb: Syncing run interactive_test
565
+ wandb: View project at https://wandb.ai/ut_nlp_deduce/llamafactory
566
+ wandb: View run at https://wandb.ai/ut_nlp_deduce/llamafactory/runs/f7vqjhyf
567
+ 0%| | 0/150 [00:00<?, ?it/s] 1%| | 1/150 [00:00<01:26, 1.72it/s] 1%| | 2/150 [00:00<00:50, 2.95it/s] 2%| | 3/150 [00:00<00:37, 3.88it/s] 3%| | 4/150 [00:01<00:35, 4.09it/s] 3%| | 5/150 [00:01<00:54, 2.68it/s] 4%| | 6/150 [00:01<00:47, 3.02it/s] 5%| | 7/150 [00:02<00:43, 3.27it/s] 5%| | 8/150 [00:02<00:38, 3.65it/s] 6%| | 9/150 [00:02<00:35, 3.92it/s] 7%| | 10/150 [00:02<00:32, 4.32it/s] {'loss': 0.8092, 'grad_norm': 0.4081718623638153, 'learning_rate': 4.7e-05, 'epoch': 0.0}
568
+ 7%| | 10/150 [00:02<00:32, 4.32it/s] 7%| | 11/150 [00:03<00:31, 4.39it/s] 8%| | 12/150 [00:03<00:41, 3.29it/s] 9%| | 13/150 [00:03<00:35, 3.88it/s] 9%| | 14/150 [00:03<00:30, 4.43it/s] 10%| | 15/150 [00:04<00:28, 4.78it/s] 11%| | 16/150 [00:04<00:37, 3.59it/s] 11%| | 17/150 [00:04<00:36, 3.62it/s] 12%| | 18/150 [00:04<00:35, 3.69it/s] 13%| | 19/150 [00:05<00:35, 3.69it/s] 13%| | 20/150 [00:05<00:32, 3.98it/s] {'loss': 0.751, 'grad_norm': 0.3975396752357483, 'learning_rate': 4.3666666666666666e-05, 'epoch': 0.0}
569
+ 13%| | 20/150 [00:05<00:32, 3.98it/s] 14%| | 21/150 [00:05<00:32, 3.94it/s] 15%| | 22/150 [00:05<00:30, 4.18it/s] 15%| | 23/150 [00:06<00:31, 4.07it/s] 16%| | 24/150 [00:06<00:28, 4.40it/s] 17%| | 25/150 [00:06<00:31, 4.02it/s] 17%| | 26/150 [00:06<00:29, 4.17it/s] 18%| | 27/150 [00:07<00:30, 4.10it/s] 19%| | 28/150 [00:07<00:33, 3.65it/s] 19%| | 29/150 [00:07<00:32, 3.75it/s] 20%| | 30/150 [00:07<00:31, 3.85it/s] {'loss': 0.7344, 'grad_norm': 0.46849244832992554, 'learning_rate': 4.0333333333333336e-05, 'epoch': 0.0}
570
+ 20%| | 30/150 [00:07<00:31, 3.85it/s] 21%| | 31/150 [00:08<00:31, 3.83it/s] 21%| | 32/150 [00:08<00:29, 4.05it/s] 22%| | 33/150 [00:08<00:26, 4.43it/s] 23%| | 34/150 [00:08<00:23, 4.87it/s] 23%| | 35/150 [00:09<00:25, 4.58it/s] 24%| | 36/150 [00:09<00:22, 5.04it/s] 25%| | 37/150 [00:09<00:24, 4.71it/s] 25%| | 38/150 [00:09<00:24, 4.67it/s] 26%| | 39/150 [00:09<00:22, 4.98it/s] 27%| | 40/150 [00:10<00:23, 4.58it/s] {'loss': 0.7063, 'grad_norm': 0.3817349970340729, 'learning_rate': 3.7e-05, 'epoch': 0.0}
571
+ 27%| | 40/150 [00:10<00:23, 4.58it/s] 27%| | 41/150 [00:10<00:26, 4.09it/s] 28%| | 42/150 [00:10<00:26, 4.07it/s] 29%| | 43/150 [00:10<00:23, 4.58it/s] 29%| | 44/150 [00:10<00:21, 4.90it/s] 30%| | 45/150 [00:11<00:19, 5.33it/s] 31%| | 46/150 [00:11<00:20, 4.98it/s] 31%| | 47/150 [00:11<00:21, 4.88it/s] 32%| | 48/150 [00:11<00:19, 5.14it/s] 33%| | 49/150 [00:12<00:22, 4.50it/s] 33%| | 50/150 [00:12<00:22, 4.49it/s] {'loss': 0.6382, 'grad_norm': 0.650374710559845, 'learning_rate': 3.366666666666667e-05, 'epoch': 0.0}
572
+ 33%| | 50/150 [00:12<00:22, 4.49it/s][INFO|trainer.py:4309] 2025-10-22 16:02:05,111 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50
573
+ [INFO|configuration_utils.py:765] 2025-10-22 16:02:05,262 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
574
+ [INFO|configuration_utils.py:839] 2025-10-22 16:02:05,263 >> Model config Qwen2Config {
575
+ "architectures": [
576
+ "Qwen2ForCausalLM"
577
+ ],
578
+ "attention_dropout": 0.0,
579
+ "bos_token_id": 151643,
580
+ "dtype": "bfloat16",
581
+ "eos_token_id": 151643,
582
+ "hidden_act": "silu",
583
+ "hidden_size": 896,
584
+ "initializer_range": 0.02,
585
+ "intermediate_size": 4864,
586
+ "layer_types": [
587
+ "full_attention",
588
+ "full_attention",
589
+ "full_attention",
590
+ "full_attention",
591
+ "full_attention",
592
+ "full_attention",
593
+ "full_attention",
594
+ "full_attention",
595
+ "full_attention",
596
+ "full_attention",
597
+ "full_attention",
598
+ "full_attention",
599
+ "full_attention",
600
+ "full_attention",
601
+ "full_attention",
602
+ "full_attention",
603
+ "full_attention",
604
+ "full_attention",
605
+ "full_attention",
606
+ "full_attention",
607
+ "full_attention",
608
+ "full_attention",
609
+ "full_attention",
610
+ "full_attention"
611
+ ],
612
+ "max_position_embeddings": 32768,
613
+ "max_window_layers": 24,
614
+ "model_type": "qwen2",
615
+ "num_attention_heads": 14,
616
+ "num_hidden_layers": 24,
617
+ "num_key_value_heads": 2,
618
+ "rms_norm_eps": 1e-06,
619
+ "rope_scaling": null,
620
+ "rope_theta": 1000000.0,
621
+ "sliding_window": null,
622
+ "tie_word_embeddings": true,
623
+ "transformers_version": "4.57.1",
624
+ "use_cache": true,
625
+ "use_mrope": false,
626
+ "use_sliding_window": false,
627
+ "vocab_size": 151936
628
+ }
629
+
630
+ [INFO|tokenization_utils_base.py:2421] 2025-10-22 16:02:05,402 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/chat_template.jinja
631
+ [INFO|tokenization_utils_base.py:2590] 2025-10-22 16:02:05,406 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/tokenizer_config.json
632
+ [INFO|tokenization_utils_base.py:2599] 2025-10-22 16:02:05,410 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/special_tokens_map.json
633
+ 34%| | 51/150 [00:13<00:44, 2.20it/s] 35%| | 52/150 [00:13<00:39, 2.47it/s] 35%| | 53/150 [00:13<00:34, 2.82it/s] 36%| | 54/150 [00:13<00:28, 3.40it/s] 37%| | 55/150 [00:14<00:27, 3.45it/s] 37%| | 56/150 [00:14<00:23, 4.06it/s] 38%| | 57/150 [00:14<00:22, 4.10it/s] 39%| | 58/150 [00:14<00:20, 4.51it/s] 39%| | 59/150 [00:14<00:17, 5.19it/s] 40%| | 60/150 [00:15<00:16, 5.57it/s] {'loss': 0.6139, 'grad_norm': 0.4990316331386566, 'learning_rate': 3.0333333333333337e-05, 'epoch': 0.0}
634
+ 40%| | 60/150 [00:15<00:16, 5.57it/s] 41%| | 61/150 [00:15<00:17, 5.19it/s] 41%| | 62/150 [00:15<00:15, 5.74it/s] 42%| | 63/150 [00:15<00:16, 5.17it/s] 43%| | 64/150 [00:15<00:15, 5.45it/s] 43%| | 65/150 [00:16<00:17, 4.97it/s] 44%| | 66/150 [00:16<00:18, 4.59it/s] 45%| | 67/150 [00:16<00:17, 4.86it/s] 45%| | 68/150 [00:16<00:18, 4.54it/s] 46%| | 69/150 [00:16<00:19, 4.15it/s] 47%| | 70/150 [00:17<00:19, 4.10it/s] {'loss': 0.597, 'grad_norm': 0.5236718058586121, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.01}
635
+ 47%| | 70/150 [00:17<00:19, 4.10it/s] 47%| | 71/150 [00:17<00:19, 3.97it/s] 48%| | 72/150 [00:17<00:17, 4.48it/s] 49%| | 73/150 [00:17<00:19, 4.00it/s] 49%| | 74/150 [00:18<00:18, 4.19it/s] 50%| | 75/150 [00:18<00:15, 4.70it/s] 51%| | 76/150 [00:18<00:15, 4.73it/s] 51%| | 77/150 [00:18<00:13, 5.23it/s] 52%| | 78/150 [00:18<00:15, 4.68it/s] 53%| | 79/150 [00:19<00:15, 4.55it/s] 53%| | 80/150 [00:19<00:16, 4.27it/s] {'loss': 0.6205, 'grad_norm': 0.41710713505744934, 'learning_rate': 2.3666666666666668e-05, 'epoch': 0.01}
636
+ 53%| | 80/150 [00:19<00:16, 4.27it/s] 54%| | 81/150 [00:19<00:14, 4.65it/s] 55%| | 82/150 [00:19<00:16, 4.06it/s] 55%| | 83/150 [00:20<00:15, 4.45it/s] 56%| | 84/150 [00:20<00:15, 4.39it/s] 57%| | 85/150 [00:20<00:14, 4.45it/s] 57%| | 86/150 [00:20<00:12, 5.07it/s] 58%| | 87/150 [00:20<00:12, 5.19it/s] 59%| | 88/150 [00:21<00:12, 4.88it/s] 59%| | 89/150 [00:21<00:13, 4.59it/s] 60%| | 90/150 [00:21<00:11, 5.22it/s] {'loss': 0.6038, 'grad_norm': 0.5673879981040955, 'learning_rate': 2.0333333333333334e-05, 'epoch': 0.01}
637
+ 60%| | 90/150 [00:21<00:11, 5.22it/s] 61%| | 91/150 [00:21<00:12, 4.64it/s] 61%| | 92/150 [00:22<00:12, 4.53it/s] 62%| | 93/150 [00:22<00:12, 4.75it/s] 63%| | 94/150 [00:22<00:11, 4.69it/s] 63%| | 95/150 [00:22<00:11, 4.78it/s] 64%| | 96/150 [00:22<00:12, 4.42it/s] 65%| | 97/150 [00:23<00:13, 3.84it/s] 65%| | 98/150 [00:23<00:12, 4.26it/s] 66%| | 99/150 [00:23<00:11, 4.53it/s] 67%| | 100/150 [00:23<00:11, 4.31it/s] {'loss': 0.5934, 'grad_norm': 0.49819639325141907, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.01}
638
+ 67%| | 100/150 [00:23<00:11, 4.31it/s][INFO|trainer.py:4309] 2025-10-22 16:02:16,719 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
639
+ [INFO|configuration_utils.py:765] 2025-10-22 16:02:16,928 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
640
+ [INFO|configuration_utils.py:839] 2025-10-22 16:02:16,929 >> Model config Qwen2Config {
641
+ "architectures": [
642
+ "Qwen2ForCausalLM"
643
+ ],
644
+ "attention_dropout": 0.0,
645
+ "bos_token_id": 151643,
646
+ "dtype": "bfloat16",
647
+ "eos_token_id": 151643,
648
+ "hidden_act": "silu",
649
+ "hidden_size": 896,
650
+ "initializer_range": 0.02,
651
+ "intermediate_size": 4864,
652
+ "layer_types": [
653
+ "full_attention",
654
+ "full_attention",
655
+ "full_attention",
656
+ "full_attention",
657
+ "full_attention",
658
+ "full_attention",
659
+ "full_attention",
660
+ "full_attention",
661
+ "full_attention",
662
+ "full_attention",
663
+ "full_attention",
664
+ "full_attention",
665
+ "full_attention",
666
+ "full_attention",
667
+ "full_attention",
668
+ "full_attention",
669
+ "full_attention",
670
+ "full_attention",
671
+ "full_attention",
672
+ "full_attention",
673
+ "full_attention",
674
+ "full_attention",
675
+ "full_attention",
676
+ "full_attention"
677
+ ],
678
+ "max_position_embeddings": 32768,
679
+ "max_window_layers": 24,
680
+ "model_type": "qwen2",
681
+ "num_attention_heads": 14,
682
+ "num_hidden_layers": 24,
683
+ "num_key_value_heads": 2,
684
+ "rms_norm_eps": 1e-06,
685
+ "rope_scaling": null,
686
+ "rope_theta": 1000000.0,
687
+ "sliding_window": null,
688
+ "tie_word_embeddings": true,
689
+ "transformers_version": "4.57.1",
690
+ "use_cache": true,
691
+ "use_mrope": false,
692
+ "use_sliding_window": false,
693
+ "vocab_size": 151936
694
+ }
695
+
696
+ [INFO|tokenization_utils_base.py:2421] 2025-10-22 16:02:17,110 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/chat_template.jinja
697
+ [INFO|tokenization_utils_base.py:2590] 2025-10-22 16:02:17,130 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/tokenizer_config.json
698
+ [INFO|tokenization_utils_base.py:2599] 2025-10-22 16:02:17,134 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/special_tokens_map.json
699
+ 67%| | 101/150 [00:25<00:25, 1.91it/s] 68%| | 102/150 [00:25<00:21, 2.24it/s] 69%| | 103/150 [00:25<00:17, 2.71it/s] 69%| | 104/150 [00:25<00:18, 2.54it/s] 70%| | 105/150 [00:26<00:17, 2.56it/s] 71%| | 106/150 [00:26<00:16, 2.71it/s] 71%| | 107/150 [00:26<00:14, 2.87it/s] 72%| | 108/150 [00:27<00:12, 3.30it/s] 73%| | 109/150 [00:27<00:11, 3.59it/s] 73%| | 110/150 [00:27<00:10, 3.89it/s] {'loss': 0.5548, 'grad_norm': 0.48188939690589905, 'learning_rate': 1.3666666666666666e-05, 'epoch': 0.01}
700
+ 73%| | 110/150 [00:27<00:10, 3.89it/s] 74%| | 111/150 [00:27<00:10, 3.80it/s] 75%| | 112/150 [00:28<00:08, 4.25it/s] 75%| | 113/150 [00:28<00:08, 4.41it/s] 76%| | 114/150 [00:28<00:07, 4.81it/s] 77%| | 115/150 [00:28<00:08, 4.33it/s] 77%| | 116/150 [00:29<00:09, 3.70it/s] 78%| | 117/150 [00:29<00:07, 4.23it/s] 79%| | 118/150 [00:29<00:06, 4.74it/s] 79%| | 119/150 [00:29<00:06, 4.49it/s] 80%| | 120/150 [00:29<00:06, 4.79it/s] {'loss': 0.5132, 'grad_norm': 0.5217602252960205, 'learning_rate': 1.0333333333333333e-05, 'epoch': 0.01}
701
+ 80%| | 120/150 [00:29<00:06, 4.79it/s] 81%| | 121/150 [00:30<00:06, 4.48it/s] 81%| | 122/150 [00:30<00:05, 4.81it/s] 82%| | 123/150 [00:30<00:05, 5.05it/s] 83%| | 124/150 [00:30<00:05, 4.71it/s] 83%| | 125/150 [00:30<00:05, 4.71it/s] 84%| | 126/150 [00:31<00:05, 4.01it/s] 85%| | 127/150 [00:31<00:05, 3.95it/s] 85%| | 128/150 [00:31<00:05, 4.01it/s] 86%| | 129/150 [00:31<00:05, 3.99it/s] 87%| | 130/150 [00:32<00:04, 4.54it/s] {'loss': 0.5586, 'grad_norm': 0.8095545172691345, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.01}
702
+ 87%| | 130/150 [00:32<00:04, 4.54it/s] 87%| | 131/150 [00:32<00:04, 4.34it/s] 88%| | 132/150 [00:32<00:03, 4.82it/s] 89%| | 133/150 [00:32<00:03, 4.39it/s] 89%| | 134/150 [00:33<00:03, 4.06it/s] 90%| | 135/150 [00:33<00:03, 3.91it/s] 91%| | 136/150 [00:33<00:03, 4.34it/s] 91%|| 137/150 [00:33<00:02, 4.52it/s] 92%|| 138/150 [00:33<00:02, 4.40it/s] 93%|| 139/150 [00:34<00:02, 3.95it/s] 93%|| 140/150 [00:34<00:02, 4.35it/s] {'loss': 0.563, 'grad_norm': 0.4983977973461151, 'learning_rate': 3.666666666666667e-06, 'epoch': 0.01}
703
+ 93%|| 140/150 [00:34<00:02, 4.35it/s] 94%|| 141/150 [00:34<00:02, 4.24it/s] 95%|| 142/150 [00:34<00:01, 4.53it/s] 95%|| 143/150 [00:35<00:01, 4.31it/s] 96%|| 144/150 [00:35<00:01, 4.96it/s] 97%|| 145/150 [00:35<00:01, 4.96it/s] 97%|| 146/150 [00:35<00:00, 4.70it/s] 98%|| 147/150 [00:35<00:00, 5.18it/s] 99%|| 148/150 [00:36<00:00, 5.32it/s] 99%|| 149/150 [00:36<00:00, 5.52it/s]100%|| 150/150 [00:36<00:00, 4.81it/s] {'loss': 0.5749, 'grad_norm': 0.4249863624572754, 'learning_rate': 3.3333333333333335e-07, 'epoch': 0.01}
704
+ 100%|| 150/150 [00:36<00:00, 4.81it/s][INFO|trainer.py:4309] 2025-10-22 16:02:29,334 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
705
+ [INFO|configuration_utils.py:765] 2025-10-22 16:02:29,507 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
706
+ [INFO|configuration_utils.py:839] 2025-10-22 16:02:29,508 >> Model config Qwen2Config {
707
+ "architectures": [
708
+ "Qwen2ForCausalLM"
709
+ ],
710
+ "attention_dropout": 0.0,
711
+ "bos_token_id": 151643,
712
+ "dtype": "bfloat16",
713
+ "eos_token_id": 151643,
714
+ "hidden_act": "silu",
715
+ "hidden_size": 896,
716
+ "initializer_range": 0.02,
717
+ "intermediate_size": 4864,
718
+ "layer_types": [
719
+ "full_attention",
720
+ "full_attention",
721
+ "full_attention",
722
+ "full_attention",
723
+ "full_attention",
724
+ "full_attention",
725
+ "full_attention",
726
+ "full_attention",
727
+ "full_attention",
728
+ "full_attention",
729
+ "full_attention",
730
+ "full_attention",
731
+ "full_attention",
732
+ "full_attention",
733
+ "full_attention",
734
+ "full_attention",
735
+ "full_attention",
736
+ "full_attention",
737
+ "full_attention",
738
+ "full_attention",
739
+ "full_attention",
740
+ "full_attention",
741
+ "full_attention",
742
+ "full_attention"
743
+ ],
744
+ "max_position_embeddings": 32768,
745
+ "max_window_layers": 24,
746
+ "model_type": "qwen2",
747
+ "num_attention_heads": 14,
748
+ "num_hidden_layers": 24,
749
+ "num_key_value_heads": 2,
750
+ "rms_norm_eps": 1e-06,
751
+ "rope_scaling": null,
752
+ "rope_theta": 1000000.0,
753
+ "sliding_window": null,
754
+ "tie_word_embeddings": true,
755
+ "transformers_version": "4.57.1",
756
+ "use_cache": true,
757
+ "use_mrope": false,
758
+ "use_sliding_window": false,
759
+ "vocab_size": 151936
760
+ }
761
+
762
+ [INFO|tokenization_utils_base.py:2421] 2025-10-22 16:02:29,679 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150/chat_template.jinja
763
+ [INFO|tokenization_utils_base.py:2590] 2025-10-22 16:02:29,683 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150/tokenizer_config.json
764
+ [INFO|tokenization_utils_base.py:2599] 2025-10-22 16:02:29,703 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150/special_tokens_map.json
765
+ [INFO|trainer.py:2810] 2025-10-22 16:02:30,219 >>
766
+
767
+ Training completed. Do not forget to share your model on huggingface.co/models =)
768
+
769
+
770
+ {'train_runtime': 38.3946, 'train_samples_per_second': 15.627, 'train_steps_per_second': 3.907, 'train_loss': 0.6288003253936768, 'epoch': 0.01}
771
+ 100%|| 150/150 [00:37<00:00, 4.81it/s]100%|| 150/150 [00:37<00:00, 4.01it/s]
772
+ [INFO|trainer.py:4309] 2025-10-22 16:02:30,229 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
773
+ [INFO|configuration_utils.py:765] 2025-10-22 16:02:30,323 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
774
+ [INFO|configuration_utils.py:839] 2025-10-22 16:02:30,323 >> Model config Qwen2Config {
775
+ "architectures": [
776
+ "Qwen2ForCausalLM"
777
+ ],
778
+ "attention_dropout": 0.0,
779
+ "bos_token_id": 151643,
780
+ "dtype": "bfloat16",
781
+ "eos_token_id": 151643,
782
+ "hidden_act": "silu",
783
+ "hidden_size": 896,
784
+ "initializer_range": 0.02,
785
+ "intermediate_size": 4864,
786
+ "layer_types": [
787
+ "full_attention",
788
+ "full_attention",
789
+ "full_attention",
790
+ "full_attention",
791
+ "full_attention",
792
+ "full_attention",
793
+ "full_attention",
794
+ "full_attention",
795
+ "full_attention",
796
+ "full_attention",
797
+ "full_attention",
798
+ "full_attention",
799
+ "full_attention",
800
+ "full_attention",
801
+ "full_attention",
802
+ "full_attention",
803
+ "full_attention",
804
+ "full_attention",
805
+ "full_attention",
806
+ "full_attention",
807
+ "full_attention",
808
+ "full_attention",
809
+ "full_attention",
810
+ "full_attention"
811
+ ],
812
+ "max_position_embeddings": 32768,
813
+ "max_window_layers": 24,
814
+ "model_type": "qwen2",
815
+ "num_attention_heads": 14,
816
+ "num_hidden_layers": 24,
817
+ "num_key_value_heads": 2,
818
+ "rms_norm_eps": 1e-06,
819
+ "rope_scaling": null,
820
+ "rope_theta": 1000000.0,
821
+ "sliding_window": null,
822
+ "tie_word_embeddings": true,
823
+ "transformers_version": "4.57.1",
824
+ "use_cache": true,
825
+ "use_mrope": false,
826
+ "use_sliding_window": false,
827
+ "vocab_size": 151936
828
+ }
829
+
830
+ [INFO|tokenization_utils_base.py:2421] 2025-10-22 16:02:30,422 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/chat_template.jinja
831
+ [INFO|tokenization_utils_base.py:2590] 2025-10-22 16:02:30,426 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/tokenizer_config.json
832
+ [INFO|tokenization_utils_base.py:2599] 2025-10-22 16:02:30,430 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/special_tokens_map.json
833
+ ***** train metrics *****
834
+ epoch = 0.0123
835
+ total_flos = 2243462GF
836
+ train_loss = 0.6288
837
+ train_runtime = 0:00:38.39
838
+ train_samples_per_second = 15.627
839
+ train_steps_per_second = 3.907
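The reported throughput is consistent with the step count and wall-clock time (a total batch of 4 over 150 steps in 38.39 s):

# Cross-check of the train metrics above
steps, total_batch, runtime_s = 150, 4, 38.3946
print(round(steps * total_batch / runtime_s, 3))  # 15.627 samples/s
print(round(steps / runtime_s, 3))                # 3.907 steps/s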
840
+ [INFO|modelcard.py:456] 2025-10-22 16:02:30,648 >> Dropping the following result as it does not have all the necessary fields:
841
+ {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
842
+ gl064:2368556:2368556 [1] NCCL INFO comm 0x15c0db00 rank 1 nranks 4 cudaDev 1 busId 59000 - Destroy COMPLETE
843
+ gl064:2368555:2368555 [0] NCCL INFO comm 0x14bb0450 rank 0 nranks 4 cudaDev 0 busId 47000 - Destroy COMPLETE
844
+ wandb:
845
+ wandb: View run interactive_test at:
846
+ wandb: Find logs at: wandb/run-20251022_160152-f7vqjhyf/logs
847
+
848
+ ========================================
849
+ Training completed successfully
850
+ End Time: Wed Oct 22 04:02:32 PM EDT 2025
851
+ ========================================
852
+
853
+ ========================================
854
+ STAGE 2: Merging/Exporting Model
855
+ Start Time: Wed Oct 22 04:02:32 PM EDT 2025
856
+ ========================================
857
+ Looking for checkpoints in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
858
+ Found most recent checkpoint: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
859
+ Checkpoint details:
860
+ Path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
861
+ Last modified: 2025-10-22 16:02:30.204175325 -0400
862
+ Training step: 150
863
+ Updating merge config to point to checkpoint...
864
+ Successfully updated merge config
865
+ Updated merge config to use: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
866
+
867
+ Merge config contents:
868
+ model_name_or_path: Qwen/Qwen2.5-0.5B
869
+ finetuning_type: lora
870
+ trust_remote_code: true
871
+ adapter_name_or_path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
872
+ template: default
873
+ export_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
874
+
875
+ Executing command: llamafactory-cli export /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
876
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
877
+ warnings.warn(
878
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
879
+ import pkg_resources
880
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,686 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
881
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,686 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
882
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,686 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
883
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,686 >> loading file added_tokens.json from cache at None
884
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,687 >> loading file special_tokens_map.json from cache at None
885
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,687 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
886
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,687 >> loading file chat_template.jinja from cache at None
887
+ [INFO|tokenization_utils_base.py:2364] 2025-10-22 16:02:40,863 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
888
+ [INFO|configuration_utils.py:765] 2025-10-22 16:02:41,054 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
889
+ [INFO|configuration_utils.py:839] 2025-10-22 16:02:41,056 >> Model config Qwen2Config {
890
+ "architectures": [
891
+ "Qwen2ForCausalLM"
892
+ ],
893
+ "attention_dropout": 0.0,
894
+ "bos_token_id": 151643,
895
+ "dtype": "bfloat16",
896
+ "eos_token_id": 151643,
897
+ "hidden_act": "silu",
898
+ "hidden_size": 896,
899
+ "initializer_range": 0.02,
900
+ "intermediate_size": 4864,
901
+ "layer_types": [
902
+ "full_attention",
903
+ "full_attention",
904
+ "full_attention",
905
+ "full_attention",
906
+ "full_attention",
907
+ "full_attention",
908
+ "full_attention",
909
+ "full_attention",
910
+ "full_attention",
911
+ "full_attention",
912
+ "full_attention",
913
+ "full_attention",
914
+ "full_attention",
915
+ "full_attention",
916
+ "full_attention",
917
+ "full_attention",
918
+ "full_attention",
919
+ "full_attention",
920
+ "full_attention",
921
+ "full_attention",
922
+ "full_attention",
923
+ "full_attention",
924
+ "full_attention",
925
+ "full_attention"
926
+ ],
927
+ "max_position_embeddings": 32768,
928
+ "max_window_layers": 24,
929
+ "model_type": "qwen2",
930
+ "num_attention_heads": 14,
931
+ "num_hidden_layers": 24,
932
+ "num_key_value_heads": 2,
933
+ "rms_norm_eps": 1e-06,
934
+ "rope_scaling": null,
935
+ "rope_theta": 1000000.0,
936
+ "sliding_window": null,
937
+ "tie_word_embeddings": true,
938
+ "transformers_version": "4.57.1",
939
+ "use_cache": true,
940
+ "use_mrope": false,
941
+ "use_sliding_window": false,
942
+ "vocab_size": 151936
943
+ }
944
+
945
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
946
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
947
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
948
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file added_tokens.json from cache at None
949
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file special_tokens_map.json from cache at None
950
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
951
+ [INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file chat_template.jinja from cache at None
952
+ [INFO|tokenization_utils_base.py:2364] 2025-10-22 16:02:41,298 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
953
+ [INFO|configuration_utils.py:765] 2025-10-22 16:02:41,348 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
954
+ [INFO|configuration_utils.py:839] 2025-10-22 16:02:41,348 >> Model config Qwen2Config {
955
+ "architectures": [
956
+ "Qwen2ForCausalLM"
957
+ ],
958
+ "attention_dropout": 0.0,
959
+ "bos_token_id": 151643,
960
+ "dtype": "bfloat16",
961
+ "eos_token_id": 151643,
962
+ "hidden_act": "silu",
963
+ "hidden_size": 896,
964
+ "initializer_range": 0.02,
965
+ "intermediate_size": 4864,
966
+ "layer_types": [
967
+ "full_attention",
968
+ "full_attention",
969
+ "full_attention",
970
+ "full_attention",
971
+ "full_attention",
972
+ "full_attention",
973
+ "full_attention",
974
+ "full_attention",
975
+ "full_attention",
976
+ "full_attention",
977
+ "full_attention",
978
+ "full_attention",
979
+ "full_attention",
980
+ "full_attention",
981
+ "full_attention",
982
+ "full_attention",
983
+ "full_attention",
984
+ "full_attention",
985
+ "full_attention",
986
+ "full_attention",
987
+ "full_attention",
988
+ "full_attention",
989
+ "full_attention",
990
+ "full_attention"
991
+ ],
992
+ "max_position_embeddings": 32768,
993
+ "max_window_layers": 24,
994
+ "model_type": "qwen2",
995
+ "num_attention_heads": 14,
996
+ "num_hidden_layers": 24,
997
+ "num_key_value_heads": 2,
998
+ "rms_norm_eps": 1e-06,
999
+ "rope_scaling": null,
1000
+ "rope_theta": 1000000.0,
1001
+ "sliding_window": null,
1002
+ "tie_word_embeddings": true,
1003
+ "transformers_version": "4.57.1",
1004
+ "use_cache": true,
1005
+ "use_mrope": false,
1006
+ "use_sliding_window": false,
1007
+ "vocab_size": 151936
1008
+ }
1009
+
1010
+ [WARNING|logging.py:328] 2025-10-22 16:02:41,348 >> `torch_dtype` is deprecated! Use `dtype` instead!
1011
+ [INFO|2025-10-22 16:02:41] llamafactory.model.model_utils.kv_cache:143 >> KV cache is enabled for faster generation.
1012
+ [WARNING|logging.py:328] 2025-10-22 16:02:41,741 >> `torch_dtype` is deprecated! Use `dtype` instead!
1013
+ [INFO|modeling_utils.py:1172] 2025-10-22 16:02:41,742 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
1014
+ [INFO|modeling_utils.py:2341] 2025-10-22 16:02:41,743 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.
1015
+ [INFO|configuration_utils.py:986] 2025-10-22 16:02:41,743 >> Generate config GenerationConfig {
1016
+ "bos_token_id": 151643,
1017
+ "eos_token_id": 151643
1018
+ }
1019
+
1020
+ [INFO|configuration_utils.py:941] 2025-10-22 16:02:41,844 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
1021
+ [INFO|configuration_utils.py:986] 2025-10-22 16:02:41,844 >> Generate config GenerationConfig {
1022
+ "bos_token_id": 151643,
1023
+ "eos_token_id": 151643,
1024
+ "max_new_tokens": 2048
1025
+ }
1026
+
1027
+ [INFO|dynamic_module_utils.py:423] 2025-10-22 16:02:41,879 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
1028
+ [INFO|2025-10-22 16:02:41] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
1029
+ [INFO|2025-10-22 16:02:42] llamafactory.model.adapter:143 >> Merged 1 adapter(s).
1030
+ [INFO|2025-10-22 16:02:42] llamafactory.model.adapter:143 >> Loaded adapter(s): /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
1031
+ [INFO|2025-10-22 16:02:42] llamafactory.model.loader:143 >> all params: 494,032,768
1032
+ [INFO|2025-10-22 16:02:42] llamafactory.train.tuner:143 >> Convert model dtype to: torch.bfloat16.
1033
+ [INFO|configuration_utils.py:491] 2025-10-22 16:02:42,967 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/config.json
1034
+ [INFO|configuration_utils.py:757] 2025-10-22 16:02:42,971 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/generation_config.json
1035
+ [INFO|modeling_utils.py:4181] 2025-10-22 16:02:44,581 >> Model weights saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/model.safetensors
1036
+ [INFO|tokenization_utils_base.py:2421] 2025-10-22 16:02:44,587 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/chat_template.jinja
1037
+ [INFO|tokenization_utils_base.py:2590] 2025-10-22 16:02:44,591 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/tokenizer_config.json
1038
+ [INFO|tokenization_utils_base.py:2599] 2025-10-22 16:02:44,595 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/special_tokens_map.json
1039
+ [INFO|2025-10-22 16:02:44] llamafactory.train.tuner:143 >> Ollama modelfile saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/Modelfile
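The export directory written above is a self-contained Hugging Face checkpoint (config, safetensors weights, tokenizer, chat template), so it can be loaded directly with transformers. A minimal usage sketch, using the merged path from the log:

from transformers import AutoModelForCausalLM, AutoTokenizer

merged_dir = "/scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged"
tokenizer = AutoTokenizer.from_pretrained(merged_dir)
model = AutoModelForCausalLM.from_pretrained(merged_dir)

inputs = tokenizer("Hello, world", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output[0], skip_special_tokens=True))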
1040
+
1041
+ ========================================
1042
+ Merge/Export completed successfully
1043
+ End Time: Wed Oct 22 04:02:45 PM EDT 2025
1044
+ ========================================
1045
+
1046
+ ========================================
1047
+ Preparing Training Artifacts
1048
+ ========================================
1049
+ Copying configuration files...
1050
+ Copying and cleaning training logs...
training_artifacts/merge_config.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ model_name_or_path: Qwen/Qwen2.5-0.5B
2
+ finetuning_type: lora
3
+ trust_remote_code: true
4
+ adapter_name_or_path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
5
+ template: default
6
+ export_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
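For reference, this export is conceptually the standard LoRA merge: load the Qwen2.5-0.5B base weights, apply the adapter from checkpoint-150, fold it into the dense weights, and save the result to export_dir. A rough sketch of the equivalent PEFT calls (not LlamaFactory's internal code path; paths taken from the config above):

from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")
adapter = "/scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150"
export_dir = "/scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged"

merged = PeftModel.from_pretrained(base, adapter).merge_and_unload()
merged.save_pretrained(export_dir)
AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B").save_pretrained(export_dir)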
training_artifacts/train_config.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ stage: sft
2
+ do_train: true
3
+ max_steps: 150
4
+ do_eval: false
5
+ save_strategy: steps
6
+ save_steps: 50
7
+ logging_steps: 10
8
+ fp16: true
9
+ bf16: false
10
+ overwrite_output_dir: true
11
+ per_device_train_batch_size: 1
12
+ gradient_accumulation_steps: 1
13
+ gradient_checkpointing: true
14
+ model_name_or_path: Qwen/Qwen2.5-0.5B
15
+ finetuning_type: lora
16
+ dataset: my_custom_sft
17
+ dataset_dir: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory/data
18
+ template: default
19
+ cutoff_len: 8096
20
+ val_size: 0.1
21
+ lora_rank: 8
22
+ lora_alpha: 16
23
+ lora_dropout: 0.05
24
+ lora_target: all
25
+ output_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
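This is the flat config consumed by llamafactory-cli train for the run logged above (150 steps, rank-8 LoRA on all modules, a checkpoint every 50 steps). To adapt it for another run it can be edited as ordinary YAML; a small sketch, where the new output_dir and step count are only placeholders:

import yaml

with open("training_artifacts/train_config.yaml") as f:
    cfg = yaml.safe_load(f)

cfg["output_dir"] = "./checkpoints_rerun"  # hypothetical destination
cfg["max_steps"] = 300                     # example tweak

with open("train_config_rerun.yaml", "w") as f:
    yaml.safe_dump(cfg, f, sort_keys=False)
# then: llamafactory-cli train train_config_rerun.yaml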