Upload folder using huggingface_hub
- .gitattributes +7 -35
- model.safetensors +1 -1
- training_artifacts/README.md +16 -0
- training_artifacts/hydra_config.yaml +216 -0
- training_artifacts/logs/pipeline_cleaned.txt +1050 -0
- training_artifacts/merge_config.yaml +6 -0
- training_artifacts/train_config.yaml +25 -0
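The commit title indicates these files were pushed with the huggingface_hub client. A minimal sketch of such an upload (illustrative only: HfApi.upload_folder and its arguments are the standard huggingface_hub API, the local folder path is hypothetical, and the repo id is the hf.repo_id value recorded in the configs below):

    from huggingface_hub import HfApi

    api = HfApi()  # uses the cached login / HF token for authentication
    api.upload_folder(
        folder_path="experiments/lf_torch_test__interactive/merged",  # hypothetical local export dir
        repo_id="TAUR-dev/testing_llamafactory_helper_quick_test__interactive",
        repo_type="model",
        commit_message="Upload folder using huggingface_hub",
    )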
.gitattributes
CHANGED
@@ -1,36 +1,8 @@
-
-*.
-*.
-*.
-*.
-
-
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Mark all log files as text to prevent binary file issues
+*.log text
+*.txt text
+*.out text
+*.err text
+training_artifacts/logs/* text
+model.safetensors filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:258d5aa4b5952c31da6d4f45ad8ad8c963d4577e0800b99258da45caf9e41f18
 size 988097824
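The LFS pointer above records the expected SHA-256 digest and byte size of model.safetensors. A minimal sketch for checking a downloaded copy against that pointer (illustrative, not part of the commit; hf_hub_download is the standard huggingface_hub API and the repo id is the hf.repo_id value recorded in the configs below):

    import hashlib
    from huggingface_hub import hf_hub_download

    repo_id = "TAUR-dev/testing_llamafactory_helper_quick_test__interactive"
    expected_oid = "258d5aa4b5952c31da6d4f45ad8ad8c963d4577e0800b99258da45caf9e41f18"
    expected_size = 988097824

    path = hf_hub_download(repo_id=repo_id, filename="model.safetensors")

    sha = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            sha.update(chunk)
            size += len(chunk)

    print("sha256 ok:", sha.hexdigest() == expected_oid)
    print("size ok:", size == expected_size)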
training_artifacts/README.md
ADDED
@@ -0,0 +1,16 @@
# Training Artifacts

This directory contains the training configuration and logs for this model.

## Contents

- **hydra_config.yaml**: Complete Hydra configuration used for training
- **train_config.yaml**: LlamaFactory training configuration
- **merge_config.yaml**: LlamaFactory merge/export configuration
- **logs/**: Training logs from the job (cleaned for text format)

## Job Information

- Job Name: lf_torch_test__interactive
- Timestamp: 2025-10-22 20:02:45 UTC
- Execution Mode: Local
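The README lists what ships under training_artifacts/. As an illustrative sketch (not part of the commit; snapshot_download with allow_patterns is the standard huggingface_hub API, and the repo id is again taken from hf.repo_id below), the artifacts can be fetched without pulling the ~1 GB weights:

    from huggingface_hub import snapshot_download

    # Pull only the training_artifacts/ subtree; skip model.safetensors.
    local_dir = snapshot_download(
        repo_id="TAUR-dev/testing_llamafactory_helper_quick_test__interactive",
        allow_patterns=["training_artifacts/*"],
    )
    print("artifacts downloaded to:", local_dir)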
training_artifacts/hydra_config.yaml
ADDED
@@ -0,0 +1,216 @@
? ''
: ? ''
  : ? ''
    : hydra:
        run:
          dir: .
        output_subdir: null
        job:
          chdir: false
      _target_: null
job:
  name: ???
  mode: slurm
  work_dir: null
  dry_run: false
slurm:
  time_limit: ???
  constraint:
  - h200
  memory: 200
  cpus_per_task: 16
  partition: null
  mail_user: user@example.com
execution:
  nodes: null
  gpus_per_node: null
  num_gpus: null
  hostfile: null
  secrets_file: null
model:
  name_or_path: ???
  finetuning_type: lora
dataset:
  name: ???
  dir: null
  info_json: null
  template: default
  cutoff_len: 1024
  val_size: 0.1
  hf_hub_url: null
  formatting: alpaca
  ranking: false
  subset: null
  split: train
  folder: null
  num_samples: null
  columns:
    prompt: null
    query: null
    response: null
    history: null
    messages: null
    system: null
    tools: null
    images: null
    videos: null
    audios: null
    chosen: null
    rejected: null
    kto_tag: null
  tags:
    role: null
    content: null
    user: null
    assistant: null
    observation: null
    function: null
    system: null
training:
  stage: sft
  do_train: true
  model_name_or_path: null
  finetuning_type: lora
  trust_remote_code: true
  dataset: null
  dataset_dir: null
  template: default
  cutoff_len: 1024
  val_size: 0.1
  preprocessing_num_workers: 1
  dataset_num_proc: 1
  dataloader_num_workers: 0
  streaming: false
  learning_rate: 5.0e-05
  num_train_epochs: 3.0
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 8
  lr_scheduler_type: cosine
  warmup_ratio: 0.1
  warmup_steps: 0
  lora_rank: 8
  lora_alpha: 16
  lora_dropout: 0.05
  lora_target: all
  optim: adamw_torch
  bf16: true
  fp16: false
  output_dir: null
  save_strategy: epoch
  save_steps: 500
  save_total_limit: 3
  save_only_model: false
  eval_strategy: steps
  eval_steps: 500
  do_eval: true
  logging_steps: 10
  plot_loss: true
  report_to: none
  gradient_checkpointing: true
  ddp_timeout: 180000000
  include_num_input_tokens_seen: true
  overwrite_output_dir: true
  overwrite_cache: false
  seed: 42
lora:
  rank: 8
  alpha: 16
  dropout: 0.05
  target: all
output:
  experiment_dir: ./experiments
merge:
  stage: export
  model_name_or_path: null
  adapter_name_or_path: null
  template: default
  export_dir: null
  export_size: 2
  export_device: auto
  export_legacy_format: false
  finetuning_type: lora
wandb:
  project: null
  run_name: null
  entity: null
hf:
  repo_id: null
  private: false
  upload_artifacts: true
cleanup:
  checkpoints: false
  merged: false
job:
  name: lf_torch_test__interactive
  mode: local
  work_dir: null
  dry_run: false
slurm:
  time_limit: null
  constraint: null
  memory: null
  partition: null
  mail_user: null
execution:
  nodes: 2
  gpus_per_node: 2
  num_gpus: null
  hostfile: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/hostfile_auto_generated.txt
  secrets_file: ./secrets.env
model:
  name_or_path: Qwen/Qwen2.5-0.5B
  finetuning_type: lora
lora:
  rank: 8
  alpha: 16
  dropout: 0.05
  target: all
dataset:
  name: my_custom_sft
  dir: null
  info_json: null
  template: default
  cutoff_len: 8096
  val_size: 0.1
  hf_hub_url: TAUR-dev/D-SFT_C-sft_exp_AT_pvv2__fixed-sft-data
  formatting: sharegpt
  ranking: false
  subset: null
  split: train
  folder: null
  num_samples: null
  columns:
    messages: conversations
  tags:
    role: role
    content: content
    user: user
    assistant: assistant
output:
  experiment_dir: ./experiments
wandb:
  project: null
  run_name: interactive_test
  entity: null
hf:
  repo_id: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
  private: false
cleanup:
  checkpoints: false
  merged: false
training:
  stage: sft
  do_train: true
  max_steps: 150
  do_eval: false
  save_strategy: steps
  save_steps: 50
  logging_steps: 10
  fp16: true
  bf16: false
  overwrite_output_dir: true
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 1
  gradient_checkpointing: true
merge: {}
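hydra_config.yaml is a plain YAML dump, so it can be reloaded for inspection. A minimal sketch (assumptions: omegaconf is installed, the indentation shown above is faithful, and the duplicated top-level keys resolve to the later, job-specific values, which is how a standard YAML loader treats repeated keys):

    from omegaconf import OmegaConf

    cfg = OmegaConf.load("training_artifacts/hydra_config.yaml")

    # Job-specific values recorded in this commit (later duplicates win).
    print(cfg.model.name_or_path)   # Qwen/Qwen2.5-0.5B
    print(cfg.dataset.hf_hub_url)   # TAUR-dev/D-SFT_C-sft_exp_AT_pvv2__fixed-sft-data
    print(cfg.training.max_steps)   # 150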
training_artifacts/logs/pipeline_cleaned.txt
ADDED
@@ -0,0 +1,1050 @@
========================================
Job Name: lf_torch_test__interactive
Hostname: gl064.hpc.nyu.edu
Number of nodes: 2
GPUs per node: 2
Start Time: Wed Oct 22 04:01:29 PM EDT 2025
Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/logs/pipeline.log
========================================
Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env

========================================
Configuration Paths
========================================
Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
Dataset Info:
Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive


========================================
Multi-Node Coordination
========================================
This is the master node - coordinating worker nodes...
Master node: gl064
Master port: 29500
World size: 2

Launching on worker node 1: gl065
All worker nodes launched successfully
Master node (this node) will now join training as rank 0


========================================
STAGE 1: Training Model
Start Time: Wed Oct 22 04:01:31 PM EDT 2025
========================================
Multi-node training detected
Nodes: 2, GPUs per node: 2
Master address: gl064
Master port: 29500
Node rank: 0
World size: 2
CUDA_VISIBLE_DEVICES: 0,1
LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory
Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml

Starting distributed training with torch.distributed.run...

*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
warnings.warn(
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
warnings.warn(
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
import pkg_resources
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
import pkg_resources
[INFO|2025-10-22 16:01:48] llamafactory.hparams.parser:423 >> Process rank: 1, world size: 4, device: cuda:1, distributed training: True, compute dtype: torch.float16
[INFO|2025-10-22 16:01:48] llamafactory.hparams.parser:143 >> Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled.
[INFO|2025-10-22 16:01:48] llamafactory.hparams.parser:423 >> Process rank: 0, world size: 4, device: cuda:0, distributed training: True, compute dtype: torch.float16
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,287 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,287 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,288 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,288 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,288 >> loading file special_tokens_map.json from cache at None
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,288 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,288 >> loading file chat_template.jinja from cache at None
[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:01:48,457 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|configuration_utils.py:765] 2025-10-22 16:01:48,674 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
[INFO|configuration_utils.py:839] 2025-10-22 16:01:48,676 >> Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "dtype": "bfloat16",
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 32768,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "transformers_version": "4.57.1",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}

[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,738 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,738 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,738 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,738 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,738 >> loading file special_tokens_map.json from cache at None
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,738 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,738 >> loading file chat_template.jinja from cache at None
[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:01:48,904 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|2025-10-22 16:01:48] llamafactory.data.loader:143 >> Loading dataset TAUR-dev/D-SFT_C-sft_exp_AT_pvv2__fixed-sft-data...
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4876: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.
warnings.warn( # warn only once
[rank0]:[W1022 16:01:49.085275271 ProcessGroupNCCL.cpp:5068] Guessing device ID based on global rank. This can cause a hang if rank to GPU mapping is heterogeneous. You can specify device_id in init_process_group()
|
| 142 |
+
gl064:2368555:2368555 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
|
| 143 |
+
gl064:2368555:2368555 [0] NCCL INFO Bootstrap: Using ibs3:10.0.5.0<0>
|
| 144 |
+
gl064:2368555:2368555 [0] NCCL INFO cudaDriverVersion 13000
|
| 145 |
+
gl064:2368555:2368555 [0] NCCL INFO NCCL version 2.27.5+cuda12.9
|
| 146 |
+
gl064:2368555:2368555 [0] NCCL INFO Comm config Blocking set to 1
|
| 147 |
+
gl064:2368556:2368556 [1] NCCL INFO cudaDriverVersion 13000
|
| 148 |
+
gl064:2368556:2368556 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
|
| 149 |
+
gl064:2368556:2368556 [1] NCCL INFO Bootstrap: Using ibs3:10.0.5.0<0>
|
| 150 |
+
gl064:2368556:2368556 [1] NCCL INFO NCCL version 2.27.5+cuda12.9
|
| 151 |
+
gl064:2368556:2368556 [1] NCCL INFO Comm config Blocking set to 1
|
| 152 |
+
gl064:2368555:2368616 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
|
| 153 |
+
gl064:2368555:2368616 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
|
| 154 |
+
gl064:2368555:2368616 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
|
| 155 |
+
gl064:2368555:2368616 [0] NCCL INFO NCCL_IB_HCA set to mlx5
|
| 156 |
+
gl064:2368556:2368617 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
|
| 157 |
+
gl064:2368556:2368617 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
|
| 158 |
+
gl064:2368556:2368617 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
|
| 159 |
+
gl064:2368556:2368617 [1] NCCL INFO NCCL_IB_HCA set to mlx5
|
| 160 |
+
gl064:2368555:2368616 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.0<0>
|
| 161 |
+
gl064:2368555:2368616 [0] NCCL INFO Initialized NET plugin IB
|
| 162 |
+
gl064:2368555:2368616 [0] NCCL INFO Assigned NET plugin IB to comm
|
| 163 |
+
gl064:2368555:2368616 [0] NCCL INFO Using network IB
|
| 164 |
+
gl064:2368555:2368616 [0] NCCL INFO ncclCommInitRankConfig comm 0x14bb0450 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x503fff15b9ea778b - Init START
|
| 165 |
+
gl064:2368556:2368617 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.0<0>
|
| 166 |
+
gl064:2368556:2368617 [1] NCCL INFO Initialized NET plugin IB
|
| 167 |
+
gl064:2368556:2368617 [1] NCCL INFO Assigned NET plugin IB to comm
|
| 168 |
+
gl064:2368556:2368617 [1] NCCL INFO Using network IB
|
| 169 |
+
gl064:2368556:2368617 [1] NCCL INFO ncclCommInitRankConfig comm 0x15c0db00 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x503fff15b9ea778b - Init START
|
| 170 |
+
gl064:2368555:2368616 [0] NCCL INFO RAS client listening socket at ::1<28028>
|
| 171 |
+
gl064:2368556:2368617 [1] NCCL INFO RAS client listening socket at ::1<28028>
|
| 172 |
+
gl064:2368555:2368616 [0] NCCL INFO Bootstrap timings total 0.321405 (create 0.000022, send 0.000239, recv 0.002956, ring 0.302954, delay 0.000000)
|
| 173 |
+
gl064:2368556:2368617 [1] NCCL INFO Bootstrap timings total 0.319316 (create 0.000023, send 0.000069, recv 0.316285, ring 0.001306, delay 0.000000)
|
| 174 |
+
gl064:2368555:2368616 [0] NCCL INFO Setting affinity for GPU 0 to 0-15
|
| 175 |
+
gl064:2368556:2368617 [1] NCCL INFO Setting affinity for GPU 1 to 0-15
|
| 176 |
+
gl064:2368556:2368617 [1] NCCL INFO comm 0x15c0db00 rank 1 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
|
| 177 |
+
gl064:2368555:2368616 [0] NCCL INFO comm 0x14bb0450 rank 0 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
|
| 178 |
+
gl064:2368555:2368616 [0] NCCL INFO Channel 00/02 : 0 1 2 3
|
| 179 |
+
gl064:2368556:2368617 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
|
| 180 |
+
gl064:2368555:2368616 [0] NCCL INFO Channel 01/02 : 0 1 2 3
|
| 181 |
+
gl064:2368556:2368617 [1] NCCL INFO P2P Chunksize set to 131072
|
| 182 |
+
gl064:2368555:2368616 [0] NCCL INFO Trees [0] 1/2/-1->0->-1 [1] 1/-1/-1->0->2
|
| 183 |
+
gl064:2368555:2368616 [0] NCCL INFO P2P Chunksize set to 131072
|
| 184 |
+
gl064:2368556:2368617 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 185 |
+
gl064:2368555:2368616 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
|
| 186 |
+
gl064:2368555:2368616 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
|
| 187 |
+
gl064:2368555:2368623 [0] NCCL INFO [Proxy Service] Device 0 CPU core 9
|
| 188 |
+
gl064:2368556:2368624 [1] NCCL INFO [Proxy Service] Device 1 CPU core 3
|
| 189 |
+
gl064:2368555:2368625 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 10
|
| 190 |
+
gl064:2368556:2368626 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 6
|
| 191 |
+
gl064:2368556:2368617 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
|
| 192 |
+
gl064:2368556:2368617 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
|
| 193 |
+
gl064:2368555:2368616 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
|
| 194 |
+
gl064:2368555:2368616 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
|
| 195 |
+
gl064:2368555:2368616 [0] NCCL INFO CC Off, workFifoBytes 1048576
|
| 196 |
+
gl064:2368556:2368617 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
|
| 197 |
+
gl064:2368556:2368617 [1] NCCL INFO ncclCommInitRankConfig comm 0x15c0db00 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x503fff15b9ea778b - Init COMPLETE
|
| 198 |
+
gl064:2368556:2368617 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.45 (kernels 0.09, alloc 0.01, bootstrap 0.32, allgathers 0.01, topo 0.02, graphs 0.00, connections 0.00, rest 0.00)
|
| 199 |
+
gl064:2368555:2368616 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
|
| 200 |
+
gl064:2368555:2368616 [0] NCCL INFO ncclCommInitRankConfig comm 0x14bb0450 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x503fff15b9ea778b - Init COMPLETE
|
| 201 |
+
gl064:2368555:2368616 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 0.45 (kernels 0.09, alloc 0.01, bootstrap 0.32, allgathers 0.01, topo 0.02, graphs 0.00, connections 0.00, rest 0.00)
|
| 202 |
+
gl064:2368555:2368627 [0] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [receive] via NET/IB/0
|
| 203 |
+
gl064:2368555:2368629 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 3
|
| 204 |
+
gl064:2368555:2368627 [0] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [receive] via NET/IB/0
|
| 205 |
+
gl064:2368555:2368627 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
|
| 206 |
+
gl064:2368555:2368627 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
|
| 207 |
+
gl064:2368556:2368628 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [send] via NET/IB/0
|
| 208 |
+
gl064:2368556:2368628 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [send] via NET/IB/0
|
| 209 |
+
gl064:2368556:2368630 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 7
|
| 210 |
+
gl064:2368556:2368628 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
|
| 211 |
+
gl064:2368555:2368627 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
training example:
input_ids:
[33975, 25, 21806, 279, 2701, 3491, 13, 81917, 697, 32711, 3019, 553, 3019, 13, 3197, 498, 525, 8060, 11, 2968, 697, 4226, 304, 419, 3561, 25, 366, 9217, 2235, 21732, 4226, 12533, 9217, 94367, 2, 22079, 198, 16429, 279, 5109, 304, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 1125, 1855, 458, 23606, 429, 16819, 220, 16, 21, 24, 13, 1446, 646, 990, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 8, 323, 1817, 1372, 646, 1172, 387, 1483, 3055, 13, 4615, 6291, 1265, 2924, 264, 4013, 315, 7354, 330, 8304, 1599, 2974, 1380, 1817, 3019, 374, 264, 35972, 5666, 323, 279, 1590, 3019, 13653, 11508, 311, 279, 2169, 1372, 476, 432, 1265, 387, 264, 3175, 23606, 429, 3059, 304, 279, 2169, 382, 35127, 697, 4226, 304, 279, 2701, 3561, 510, 27, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 1339, 9064, 11993, 21732, 4226, 9940, 374, 279, 1140, 315, 7354, 311, 5545, 279, 2169, 1372, 476, 432, 1265, 387, 264, 3175, 23606, 429, 3059, 304, 279, 2169, 13, 4710, 2461, 3110, 510, 2679, 279, 1140, 315, 5109, 572, 508, 16, 11, 220, 17, 11, 220, 18, 60, 323, 279, 2169, 572, 220, 16, 11, 498, 1410, 3270, 510, 27, 9217, 397, 8304, 220, 16, 25, 220, 16, 488, 220, 17, 284, 220, 18, 198, 8304, 220, 17, 25, 220, 18, 608, 220, 18, 284, 220, 16, 198, 522, 9217, 1339, 269, 4710, 27, 9217, 397, 7, 16, 488, 220, 17, 8, 608, 220, 18, 198, 522, 9217, 1339, 10061, 594, 1744, 3019, 553, 3019, 13, 151643, 198, 71703, 25, 13708, 766, 397, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 8304, 220, 16, 25, 64547, 279, 5795, 481, 1205, 1184, 311, 1855, 458, 23606, 1667, 279, 5109, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 17, 25, 21144, 3204, 7525, 481, 78646, 11, 75240, 11, 46444, 11, 323, 12804, 624, 8304, 220, 18, 25, 9735, 2155, 27459, 315, 1493, 7525, 448, 279, 2661, 5109, 624, 8304, 220, 19, 25, 3393, 5257, 37906, 3080, 582, 1477, 825, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 20, 25, 9646, 582, 614, 1730, 279, 4396, 23606, 11, 3042, 432, 304, 279, 5189, 3561, 624, 6025, 2087, 1526, 678, 1493, 7354, 11, 1588, 594, 279, 1590, 23606, 429, 16819, 220, 16, 21, 24, 1447, 27, 9217, 397, 7, 21, 22, 353, 220, 17, 8, 488, 220, 22, 16, 481, 220, 18, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 488, 220, 22, 16, 284, 220, 16, 18, 23, 11, 1221, 220, 16, 18, 23, 481, 220, 18, 16, 284, 220, 16, 15, 22, 13, 1096, 1558, 537, 6144, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 10061, 752, 10146, 847, 4226, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 10253, 279, 5109, 504, 279, 2661, 738, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 311, 1352, 458, 23606, 6144, 287, 220, 16, 21, 24, 1667, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 3593, 8304, 220, 16, 25, 71059, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 2691, 220, 22, 16, 311, 279, 1102, 504, 14822, 220, 16, 624, 8304, 220, 18, 25, 576, 1590, 1102, 686, 387, 220, 16, 21, 24, 382, 27, 9217, 397, 7, 21, 22, 353, 220, 18, 16, 8, 488, 220, 22, 16, 284, 220, 16, 21, 24, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 11, 15, 22, 22, 11, 1221, 220, 17, 11, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 11, 15, 15, 21, 13, 1096, 374, 537, 6144, 311, 
220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 1462, 311, 3395, 847, 2033, 825, 803, 882, 624, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 785, 5795, 374, 311, 1855, 458, 23606, 1667, 279, 5109, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 429, 16819, 220, 16, 21, 24, 624, 40, 686, 1191, 553, 4460, 2155, 27459, 315, 1493, 2326, 5109, 323, 862, 3204, 7525, 17973, 11, 85922, 11777, 608, 568, 715, 12209, 358, 686, 15442, 1817, 10601, 323, 1779, 421, 807, 6144, 220, 16, 21, 24, 13, 1416, 537, 11, 358, 686, 3271, 389, 311, 279, 1790, 10601, 3080, 358, 1477, 825, 429, 4278, 624, 6025, 9271, 279, 4396, 10601, 11, 358, 686, 3042, 847, 6291, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 624, 8420, 594, 847, 11682, 3119, 1447, 16, 13, 4230, 678, 3204, 27459, 315, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 1667, 5256, 11, 75240, 11, 46444, 11, 323, 12804, 624, 17, 13, 54115, 1817, 10601, 311, 1490, 421, 432, 16819, 220, 16, 21, 24, 624, 18, 13, 1416, 902, 10601, 16819, 220, 16, 21, 24, 11, 13153, 279, 1882, 448, 501, 27459, 3080, 358, 1477, 825, 429, 4278, 624, 19, 13, 9646, 358, 614, 1730, 279, 4396, 10601, 11, 3158, 432, 438, 264, 3175, 23606, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 382, 7039, 1077, 752, 3161, 15039, 369, 279, 4396, 10601, 4894, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 16, 13, 14822, 220, 16, 25, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 15, 22, 22, 198, 17, 13, 14822, 220, 17, 25, 220, 17, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 15, 15, 21, 271, 54815, 11, 279, 1590, 1102, 374, 220, 17, 15, 15, 21, 11, 892, 1558, 537, 6144, 220, 16, 21, 24, 13, 576, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 3983, 3783, 11, 1077, 752, 1744, 911, 432, 1549, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 3838, 374, 279, 1102, 979, 498, 912, 1378, 5109, 504, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 3786, 1939, 8304, 220, 16, 25, 2691, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 11778, 279, 2629, 504, 14822, 220, 16, 323, 912, 220, 22, 16, 382, 19357, 84670, 25, 320, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 271, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 25, 5512, 11, 220, 21, 22, 488, 220, 18, 16, 284, 220, 24, 23, 11, 1221, 220, 24, 23, 488, 220, 22, 16, 284, 220, 16, 21, 24, 13, 576, 4226, 374, 4396, 382, 27, 423, 8477, 397, 33092, 198, 522, 423, 8477, 397, 522, 34913, 397, 522, 26865, 10370, 39, 763, 11, 279, 4396, 4226, 374, 1447, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 29, 151643, 198]
|
| 215 |
+
inputs:
|
| 216 |
+
Human: Answer the following problem. Explain your reasoning step by step. When you are finished, give your answer in this format: <answer>(your answer)</answer>.
|
| 217 |
+
|
| 218 |
+
# Problem
|
| 219 |
+
Using the numbers in the list [67, 71, 31], create an equation that equals 169. You can use basic arithmetic operations (+, -, *, /) and each number can only be used once. Your solution should include a series of steps "Step X:" where each step is a mathematical operation and the final step ultimately leads to the target number or it should be a single equation that results in the target.
|
| 220 |
+
|
| 221 |
+
Give your answer in the following format:
|
| 222 |
+
<answer>
|
| 223 |
+
(your answer)
|
| 224 |
+
</answer>
|
| 225 |
+
|
| 226 |
+
Where "(your answer)" is the list of steps to reach the target number or it should be a single equation that results in the target.
|
| 227 |
+
|
| 228 |
+
For example:
|
| 229 |
+
If the list of numbers was [1, 2, 3] and the target was 1, you could write:
|
| 230 |
+
<answer>
|
| 231 |
+
Step 1: 1 + 2 = 3
|
| 232 |
+
Step 2: 3 / 3 = 1
|
| 233 |
+
</answer>
|
| 234 |
+
|
| 235 |
+
or
|
| 236 |
+
|
| 237 |
+
<answer>
|
| 238 |
+
(1 + 2) / 3
|
| 239 |
+
</answer>
|
| 240 |
+
|
| 241 |
+
Let's think step by step.<|endoftext|>
|
| 242 |
+
Assistant:<think>
|
| 243 |
+
<sample>
|
| 244 |
+
First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this.
|
| 245 |
+
Step 1: Identify the goal - We need to create an equation using the numbers [67, 71, 31] that equals 169.
|
| 246 |
+
Step 2: Consider possible operations - Addition, subtraction, multiplication, and division.
|
| 247 |
+
Step 3: Try different combinations of these operations with the given numbers.
|
| 248 |
+
Step 4: Test various equations until we find one that equals 169.
|
| 249 |
+
Step 5: Once we have found the correct equation, present it in the specified format.
|
| 250 |
+
After going through all these steps, here's the final equation that equals 169:
|
| 251 |
+
|
| 252 |
+
<answer>
|
| 253 |
+
(67 * 2) + 71 - 31
|
| 254 |
+
</answer>
|
| 255 |
+
</sample>
|
| 256 |
+
<reflect>
|
| 257 |
+
Let us verify this answer:
|
| 258 |
+
|
| 259 |
+
First, 67 + 71 = 138, then 138 - 31 = 107. This does not equal 169, so the answer is incorrect.
|
| 260 |
+
|
| 261 |
+
<verdict>
|
| 262 |
+
Incorrect
|
| 263 |
+
</verdict>
|
| 264 |
+
</reflect>
|
| 265 |
+
Let me verify my answer.
|
| 266 |
+
<sample>
|
| 267 |
+
Rewritten Question:
|
| 268 |
+
|
| 269 |
+
Use the numbers from the given set [67, 71, 31] to form an equation equaling 169 using basic arithmetic operations (+, -, *, /).
|
| 270 |
+
|
| 271 |
+
Step 1: Multiply 67 and 31.
|
| 272 |
+
Step 2: Add 71 to the result from Step 1.
|
| 273 |
+
Step 3: The final result will be 169.
|
| 274 |
+
|
| 275 |
+
<answer>
|
| 276 |
+
(67 * 31) + 71 = 169
|
| 277 |
+
</answer>
|
| 278 |
+
</sample>
|
| 279 |
+
<reflect>
|
| 280 |
+
Let us verify this answer:
|
| 281 |
+
|
| 282 |
+
First, 67 * 31 = 2,077, then 2,077 - 71 = 2,006. This is not equal to 169, so the answer is incorrect.
|
| 283 |
+
|
| 284 |
+
<verdict>
|
| 285 |
+
Incorrect
|
| 286 |
+
</verdict>
|
| 287 |
+
</reflect>
|
| 288 |
+
Time to review my response one more time.
|
| 289 |
+
<sample>
|
| 290 |
+
First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this.
|
| 291 |
+
The goal is to create an equation using the numbers 67, 71, and 31 that equals 169.
|
| 292 |
+
I will start by trying different combinations of these three numbers and their possible operations (+, -, *, /).
|
| 293 |
+
Then I will evaluate each combination and check if they equal 169. If not, I will move on to the next combination until I find one that works.
|
| 294 |
+
After finding the correct combination, I will present my solution in the "<answer>
|
| 295 |
+
(your answer)
|
| 296 |
+
</answer>" format.
|
| 297 |
+
Here's my detailed plan:
|
| 298 |
+
|
| 299 |
+
1. Create all possible combinations of 67, 71, and 31 using addition, subtraction, multiplication, and division.
|
| 300 |
+
2. Evaluate each combination to see if it equals 169.
|
| 301 |
+
3. If no combination equals 169, repeat the process with new combinations until I find one that works.
|
| 302 |
+
4. Once I have found the correct combination, express it as a single equation in the "<answer>
|
| 303 |
+
(your answer)
|
| 304 |
+
</answer>" format.
|
| 305 |
+
|
| 306 |
+
Now let me begin searching for the correct combination!
|
| 307 |
+
</sample>
|
| 308 |
+
<reflect>
|
| 309 |
+
Let us verify this answer:
|
| 310 |
+
|
| 311 |
+
1. Step 1: 67 * 31 = 2077
|
| 312 |
+
2. Step 2: 2077 - 71 = 2006
|
| 313 |
+
|
| 314 |
+
Therefore, the final result is 2006, which does not equal 169. The answer is incorrect.
|
| 315 |
+
|
| 316 |
+
<verdict>
|
| 317 |
+
Incorrect
|
| 318 |
+
</verdict>
|
| 319 |
+
</reflect>
|
| 320 |
+
But wait, let me think about it again.
|
| 321 |
+
<sample>
|
| 322 |
+
Rewritten Question:
|
| 323 |
+
|
| 324 |
+
What is the result when you add two numbers from the list [67, 71, 31] together?
|
| 325 |
+
|
| 326 |
+
Step 1: Add 67 and 31.
|
| 327 |
+
Step 2: Take the sum from Step 1 and add 71.
|
| 328 |
+
|
| 329 |
+
Final Equation: (67 + 31) + 71
|
| 330 |
+
|
| 331 |
+
<answer>
|
| 332 |
+
(67 + 31) + 71
|
| 333 |
+
</answer>
|
| 334 |
+
</sample>
|
| 335 |
+
<reflect>
|
| 336 |
+
Let us verify this answer: First, 67 + 31 = 98, then 98 + 71 = 169. The answer is correct.
|
| 337 |
+
|
| 338 |
+
<verdict>
|
| 339 |
+
Correct
|
| 340 |
+
</verdict>
|
| 341 |
+
</reflect>
|
| 342 |
+
</think>
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
Hence, the correct answer is:
|
| 346 |
+
|
| 347 |
+
<answer>
|
| 348 |
+
(67 + 31) + 71
|
| 349 |
+
</answer><|endoftext|>
|
| 350 |
+
|
| 351 |
+
label_ids:
|
| 352 |
+
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 13708, 766, 397, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 8304, 220, 16, 25, 64547, 279, 5795, 481, 1205, 1184, 311, 1855, 458, 23606, 1667, 279, 5109, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 17, 25, 21144, 3204, 7525, 481, 78646, 11, 75240, 11, 46444, 11, 323, 12804, 624, 8304, 220, 18, 25, 9735, 2155, 27459, 315, 1493, 7525, 448, 279, 2661, 5109, 624, 8304, 220, 19, 25, 3393, 5257, 37906, 3080, 582, 1477, 825, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 20, 25, 9646, 582, 614, 1730, 279, 4396, 23606, 11, 3042, 432, 304, 279, 5189, 3561, 624, 6025, 2087, 1526, 678, 1493, 7354, 11, 1588, 594, 279, 1590, 23606, 429, 16819, 220, 16, 21, 24, 1447, 27, 9217, 397, 7, 21, 22, 353, 220, 17, 8, 488, 220, 22, 16, 481, 220, 18, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 488, 220, 22, 16, 284, 220, 16, 18, 23, 11, 1221, 220, 16, 18, 23, 481, 220, 18, 16, 284, 220, 16, 15, 22, 13, 1096, 1558, 537, 6144, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 10061, 752, 10146, 847, 4226, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 10253, 279, 5109, 504, 279, 2661, 738, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 311, 1352, 458, 23606, 6144, 287, 220, 16, 21, 24, 1667, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 3593, 8304, 220, 16, 25, 71059, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 2691, 220, 22, 16, 311, 279, 1102, 504, 14822, 220, 16, 624, 8304, 220, 18, 25, 576, 1590, 1102, 686, 387, 220, 16, 21, 24, 382, 27, 9217, 397, 7, 21, 22, 353, 220, 18, 16, 8, 488, 220, 22, 16, 284, 220, 16, 21, 24, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 
11, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 11, 15, 22, 22, 11, 1221, 220, 17, 11, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 11, 15, 15, 21, 13, 1096, 374, 537, 6144, 311, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 1462, 311, 3395, 847, 2033, 825, 803, 882, 624, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 785, 5795, 374, 311, 1855, 458, 23606, 1667, 279, 5109, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 429, 16819, 220, 16, 21, 24, 624, 40, 686, 1191, 553, 4460, 2155, 27459, 315, 1493, 2326, 5109, 323, 862, 3204, 7525, 17973, 11, 85922, 11777, 608, 568, 715, 12209, 358, 686, 15442, 1817, 10601, 323, 1779, 421, 807, 6144, 220, 16, 21, 24, 13, 1416, 537, 11, 358, 686, 3271, 389, 311, 279, 1790, 10601, 3080, 358, 1477, 825, 429, 4278, 624, 6025, 9271, 279, 4396, 10601, 11, 358, 686, 3042, 847, 6291, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 624, 8420, 594, 847, 11682, 3119, 1447, 16, 13, 4230, 678, 3204, 27459, 315, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 1667, 5256, 11, 75240, 11, 46444, 11, 323, 12804, 624, 17, 13, 54115, 1817, 10601, 311, 1490, 421, 432, 16819, 220, 16, 21, 24, 624, 18, 13, 1416, 902, 10601, 16819, 220, 16, 21, 24, 11, 13153, 279, 1882, 448, 501, 27459, 3080, 358, 1477, 825, 429, 4278, 624, 19, 13, 9646, 358, 614, 1730, 279, 4396, 10601, 11, 3158, 432, 438, 264, 3175, 23606, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 382, 7039, 1077, 752, 3161, 15039, 369, 279, 4396, 10601, 4894, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 16, 13, 14822, 220, 16, 25, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 15, 22, 22, 198, 17, 13, 14822, 220, 17, 25, 220, 17, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 15, 15, 21, 271, 54815, 11, 279, 1590, 1102, 374, 220, 17, 15, 15, 21, 11, 892, 1558, 537, 6144, 220, 16, 21, 24, 13, 576, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 3983, 3783, 11, 1077, 752, 1744, 911, 432, 1549, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 3838, 374, 279, 1102, 979, 498, 912, 1378, 5109, 504, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 3786, 1939, 8304, 220, 16, 25, 2691, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 11778, 279, 2629, 504, 14822, 220, 16, 323, 912, 220, 22, 16, 382, 19357, 84670, 25, 320, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 271, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 25, 5512, 11, 220, 21, 22, 488, 220, 18, 16, 284, 220, 24, 23, 11, 1221, 220, 24, 23, 488, 220, 22, 16, 284, 220, 16, 21, 24, 13, 576, 4226, 374, 4396, 382, 27, 423, 8477, 397, 33092, 198, 522, 423, 8477, 397, 522, 34913, 397, 522, 26865, 10370, 39, 763, 11, 279, 4396, 4226, 374, 1447, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 29, 151643, 198]
|
| 353 |
+
labels:
|
| 354 |
+
<think>
|
| 355 |
+
<sample>
|
| 356 |
+
First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this.
|
| 357 |
+
Step 1: Identify the goal - We need to create an equation using the numbers [67, 71, 31] that equals 169.
|
| 358 |
+
Step 2: Consider possible operations - Addition, subtraction, multiplication, and division.
|
| 359 |
+
Step 3: Try different combinations of these operations with the given numbers.
|
| 360 |
+
Step 4: Test various equations until we find one that equals 169.
|
| 361 |
+
Step 5: Once we have found the correct equation, present it in the specified format.
|
| 362 |
+
After going through all these steps, here's the final equation that equals 169:
|
| 363 |
+
|
| 364 |
+
<answer>
|
| 365 |
+
(67 * 2) + 71 - 31
|
| 366 |
+
</answer>
|
| 367 |
+
</sample>
|
| 368 |
+
<reflect>
|
| 369 |
+
Let us verify this answer:
|
| 370 |
+
|
| 371 |
+
First, 67 + 71 = 138, then 138 - 31 = 107. This does not equal 169, so the answer is incorrect.
|
| 372 |
+
|
| 373 |
+
<verdict>
|
| 374 |
+
Incorrect
|
| 375 |
+
</verdict>
|
| 376 |
+
</reflect>
|
| 377 |
+
Let me verify my answer.
|
| 378 |
+
<sample>
|
| 379 |
+
Rewritten Question:
|
| 380 |
+
|
| 381 |
+
Use the numbers from the given set [67, 71, 31] to form an equation equaling 169 using basic arithmetic operations (+, -, *, /).
|
| 382 |
+
|
| 383 |
+
Step 1: Multiply 67 and 31.
|
| 384 |
+
Step 2: Add 71 to the result from Step 1.
|
| 385 |
+
Step 3: The final result will be 169.
|
| 386 |
+
|
| 387 |
+
<answer>
|
| 388 |
+
(67 * 31) + 71 = 169
|
| 389 |
+
</answer>
|
| 390 |
+
</sample>
|
| 391 |
+
<reflect>
|
| 392 |
+
Let us verify this answer:
|
| 393 |
+
|
| 394 |
+
First, 67 * 31 = 2,077, then 2,077 - 71 = 2,006. This is not equal to 169, so the answer is incorrect.
|
| 395 |
+
|
| 396 |
+
<verdict>
|
| 397 |
+
Incorrect
|
| 398 |
+
</verdict>
|
| 399 |
+
</reflect>
|
| 400 |
+
Time to review my response one more time.
|
| 401 |
+
<sample>
|
| 402 |
+
First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this.
|
| 403 |
+
The goal is to create an equation using the numbers 67, 71, and 31 that equals 169.
|
| 404 |
+
I will start by trying different combinations of these three numbers and their possible operations (+, -, *, /).
|
| 405 |
+
Then I will evaluate each combination and check if they equal 169. If not, I will move on to the next combination until I find one that works.
|
| 406 |
+
After finding the correct combination, I will present my solution in the "<answer>
|
| 407 |
+
(your answer)
|
| 408 |
+
</answer>" format.
|
| 409 |
+
Here's my detailed plan:
|
| 410 |
+
|
| 411 |
+
1. Create all possible combinations of 67, 71, and 31 using addition, subtraction, multiplication, and division.
|
| 412 |
+
2. Evaluate each combination to see if it equals 169.
|
| 413 |
+
3. If no combination equals 169, repeat the process with new combinations until I find one that works.
|
| 414 |
+
4. Once I have found the correct combination, express it as a single equation in the "<answer>
|
| 415 |
+
(your answer)
|
| 416 |
+
</answer>" format.
|
| 417 |
+
|
| 418 |
+
Now let me begin searching for the correct combination!
|
| 419 |
+
</sample>
|
| 420 |
+
<reflect>
|
| 421 |
+
Let us verify this answer:
|
| 422 |
+
|
| 423 |
+
1. Step 1: 67 * 31 = 2077
|
| 424 |
+
2. Step 2: 2077 - 71 = 2006
|
| 425 |
+
|
| 426 |
+
Therefore, the final result is 2006, which does not equal 169. The answer is incorrect.
|
| 427 |
+
|
| 428 |
+
<verdict>
|
| 429 |
+
Incorrect
|
| 430 |
+
</verdict>
|
| 431 |
+
</reflect>
|
| 432 |
+
But wait, let me think about it again.
|
| 433 |
+
<sample>
|
| 434 |
+
Rewritten Question:
|
| 435 |
+
|
| 436 |
+
What is the result when you add two numbers from the list [67, 71, 31] together?
|
| 437 |
+
|
| 438 |
+
Step 1: Add 67 and 31.
|
| 439 |
+
Step 2: Take the sum from Step 1 and add 71.
|
| 440 |
+
|
| 441 |
+
Final Equation: (67 + 31) + 71
|
| 442 |
+
|
| 443 |
+
<answer>
|
| 444 |
+
(67 + 31) + 71
|
| 445 |
+
</answer>
|
| 446 |
+
</sample>
|
| 447 |
+
<reflect>
|
| 448 |
+
Let us verify this answer: First, 67 + 31 = 98, then 98 + 71 = 169. The answer is correct.
|
| 449 |
+
|
| 450 |
+
<verdict>
|
| 451 |
+
Correct
|
| 452 |
+
</verdict>
|
| 453 |
+
</reflect>
|
| 454 |
+
</think>
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
Hence, the correct answer is:
|
| 458 |
+
|
| 459 |
+
<answer>
|
| 460 |
+
(67 + 31) + 71
|
| 461 |
+
</answer><|endoftext|>
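The plan in the decoded example above amounts to a brute-force search over orderings of [67, 71, 31] and pairs of basic operations until the target 169 is hit. A minimal sketch of such a search (the helper name is illustrative and only left-to-right evaluation is covered, so this is not the training pipeline's own code):

from itertools import permutations, product

def find_expression(numbers, target):
    # Try every ordering of the numbers with every pair of +, -, *, /,
    # evaluating left to right as in the step-by-step plan above.
    for a, b, c in permutations(numbers):
        for op1, op2 in product("+-*/", repeat=2):
            expr = f"(({a} {op1} {b}) {op2} {c})"
            try:
                value = eval(expr)
            except ZeroDivisionError:
                continue
            if abs(value - target) < 1e-9:
                return expr
    return None

print(find_expression([67, 71, 31], 169))  # -> ((67 + 71) + 31), equivalent to the (67 + 31) + 71 found above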
|
| 462 |
+
|
| 463 |
+
[INFO|configuration_utils.py:765] 2025-10-22 16:01:50,484 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
|
| 464 |
+
[INFO|configuration_utils.py:839] 2025-10-22 16:01:50,485 >> Model config Qwen2Config {
|
| 465 |
+
"architectures": [
|
| 466 |
+
"Qwen2ForCausalLM"
|
| 467 |
+
],
|
| 468 |
+
"attention_dropout": 0.0,
|
| 469 |
+
"bos_token_id": 151643,
|
| 470 |
+
"dtype": "bfloat16",
|
| 471 |
+
"eos_token_id": 151643,
|
| 472 |
+
"hidden_act": "silu",
|
| 473 |
+
"hidden_size": 896,
|
| 474 |
+
"initializer_range": 0.02,
|
| 475 |
+
"intermediate_size": 4864,
|
| 476 |
+
"layer_types": [
|
| 477 |
+
"full_attention",
|
| 478 |
+
"full_attention",
|
| 479 |
+
"full_attention",
|
| 480 |
+
"full_attention",
|
| 481 |
+
"full_attention",
|
| 482 |
+
"full_attention",
|
| 483 |
+
"full_attention",
|
| 484 |
+
"full_attention",
|
| 485 |
+
"full_attention",
|
| 486 |
+
"full_attention",
|
| 487 |
+
"full_attention",
|
| 488 |
+
"full_attention",
|
| 489 |
+
"full_attention",
|
| 490 |
+
"full_attention",
|
| 491 |
+
"full_attention",
|
| 492 |
+
"full_attention",
|
| 493 |
+
"full_attention",
|
| 494 |
+
"full_attention",
|
| 495 |
+
"full_attention",
|
| 496 |
+
"full_attention",
|
| 497 |
+
"full_attention",
|
| 498 |
+
"full_attention",
|
| 499 |
+
"full_attention",
|
| 500 |
+
"full_attention"
|
| 501 |
+
],
|
| 502 |
+
"max_position_embeddings": 32768,
|
| 503 |
+
"max_window_layers": 24,
|
| 504 |
+
"model_type": "qwen2",
|
| 505 |
+
"num_attention_heads": 14,
|
| 506 |
+
"num_hidden_layers": 24,
|
| 507 |
+
"num_key_value_heads": 2,
|
| 508 |
+
"rms_norm_eps": 1e-06,
|
| 509 |
+
"rope_scaling": null,
|
| 510 |
+
"rope_theta": 1000000.0,
|
| 511 |
+
"sliding_window": null,
|
| 512 |
+
"tie_word_embeddings": true,
|
| 513 |
+
"transformers_version": "4.57.1",
|
| 514 |
+
"use_cache": true,
|
| 515 |
+
"use_mrope": false,
|
| 516 |
+
"use_sliding_window": false,
|
| 517 |
+
"vocab_size": 151936
|
| 518 |
+
}
|
| 519 |
+
|
| 520 |
+
[INFO|2025-10-22 16:01:50] llamafactory.model.model_utils.kv_cache:143 >> KV cache is disabled during training.
|
| 521 |
+
[WARNING|logging.py:328] 2025-10-22 16:01:50,806 >> `torch_dtype` is deprecated! Use `dtype` instead!
|
| 522 |
+
[INFO|modeling_utils.py:1172] 2025-10-22 16:01:50,807 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
|
| 523 |
+
[INFO|modeling_utils.py:2341] 2025-10-22 16:01:50,808 >> Instantiating Qwen2ForCausalLM model under default dtype torch.float16.
|
| 524 |
+
[INFO|configuration_utils.py:986] 2025-10-22 16:01:50,808 >> Generate config GenerationConfig {
|
| 525 |
+
"bos_token_id": 151643,
|
| 526 |
+
"eos_token_id": 151643,
|
| 527 |
+
"use_cache": false
|
| 528 |
+
}
|
| 529 |
+
|
| 530 |
+
`torch_dtype` is deprecated! Use `dtype` instead!
|
| 531 |
+
[INFO|configuration_utils.py:941] 2025-10-22 16:01:51,084 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
|
| 532 |
+
[INFO|configuration_utils.py:986] 2025-10-22 16:01:51,085 >> Generate config GenerationConfig {
|
| 533 |
+
"bos_token_id": 151643,
|
| 534 |
+
"eos_token_id": 151643,
|
| 535 |
+
"max_new_tokens": 2048
|
| 536 |
+
}
|
| 537 |
+
|
| 538 |
+
[INFO|dynamic_module_utils.py:423] 2025-10-22 16:01:51,114 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
|
| 539 |
+
[INFO|2025-10-22 16:01:51] llamafactory.model.model_utils.checkpointing:143 >> Gradient checkpointing enabled.
|
| 540 |
+
[INFO|2025-10-22 16:01:51] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
|
| 541 |
+
[INFO|2025-10-22 16:01:51] llamafactory.model.adapter:143 >> Upcasting trainable params to float32.
|
| 542 |
+
[INFO|2025-10-22 16:01:51] llamafactory.model.adapter:143 >> Fine-tuning method: LoRA
|
| 543 |
+
[INFO|2025-10-22 16:01:51] llamafactory.model.model_utils.misc:143 >> Found linear modules: up_proj,v_proj,q_proj,down_proj,gate_proj,k_proj,o_proj
|
| 544 |
+
[INFO|2025-10-22 16:01:51] llamafactory.model.loader:143 >> trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826
|
| 545 |
+
[WARNING|trainer.py:906] 2025-10-22 16:01:51,639 >> The model is already on multiple devices. Skipping the move to device specified in `args`.
|
| 546 |
+
[INFO|trainer.py:699] 2025-10-22 16:01:51,642 >> max_steps is given, it will override any value given in num_train_epochs
|
| 547 |
+
[INFO|trainer.py:749] 2025-10-22 16:01:51,642 >> Using auto half precision backend
|
| 548 |
+
[WARNING|trainer.py:982] 2025-10-22 16:01:51,643 >> The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
|
| 549 |
+
The model is already on multiple devices. Skipping the move to device specified in `args`.
|
| 550 |
+
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
|
| 551 |
+
[INFO|trainer.py:2519] 2025-10-22 16:01:51,823 >> ***** Running training *****
|
| 552 |
+
[INFO|trainer.py:2520] 2025-10-22 16:01:51,823 >> Num examples = 48,600
|
| 553 |
+
[INFO|trainer.py:2521] 2025-10-22 16:01:51,823 >> Num Epochs = 1
|
| 554 |
+
[INFO|trainer.py:2522] 2025-10-22 16:01:51,823 >> Instantaneous batch size per device = 1
|
| 555 |
+
[INFO|trainer.py:2525] 2025-10-22 16:01:51,823 >> Total train batch size (w. parallel, distributed & accumulation) = 4
|
| 556 |
+
[INFO|trainer.py:2526] 2025-10-22 16:01:51,823 >> Gradient Accumulation steps = 1
|
| 557 |
+
[INFO|trainer.py:2527] 2025-10-22 16:01:51,823 >> Total optimization steps = 150
|
| 558 |
+
[INFO|trainer.py:2528] 2025-10-22 16:01:51,825 >> Number of trainable parameters = 4,399,104
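For context on these numbers: per_device_train_batch_size = 1 with gradient_accumulation_steps = 1 across the 4 ranks visible in the NCCL teardown lines later in the log gives the reported total train batch size of 1 x 4 x 1 = 4, and 150 optimization steps x 4 sequences touch only 600 of the 48,600 examples, which is why the run ends at epoch ~0.01.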
|
| 559 |
+
[INFO|integration_utils.py:867] 2025-10-22 16:01:51,847 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
|
| 560 |
+
wandb: Currently logged in as: zsprague (ut_nlp_deduce) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
|
| 561 |
+
wandb: Tracking run with wandb version 0.22.2
|
| 562 |
+
wandb: Run data is saved locally in /scratch/zrs2020/LlamaFactoryHelper/wandb/run-20251022_160152-f7vqjhyf
|
| 563 |
+
wandb: Run `wandb offline` to turn off syncing.
|
| 564 |
+
wandb: Syncing run interactive_test
|
| 565 |
+
wandb: View project at https://wandb.ai/ut_nlp_deduce/llamafactory
|
| 566 |
+
wandb: View run at https://wandb.ai/ut_nlp_deduce/llamafactory/runs/f7vqjhyf
|
| 567 |
+
0%| | 0/150 [00:00<?, ?it/s] 1%| | 1/150 [00:00<01:26, 1.72it/s] 1%| | 2/150 [00:00<00:50, 2.95it/s] 2%| | 3/150 [00:00<00:37, 3.88it/s] 3%| | 4/150 [00:01<00:35, 4.09it/s] 3%| | 5/150 [00:01<00:54, 2.68it/s] 4%| | 6/150 [00:01<00:47, 3.02it/s] 5%| | 7/150 [00:02<00:43, 3.27it/s] 5%| | 8/150 [00:02<00:38, 3.65it/s] 6%| | 9/150 [00:02<00:35, 3.92it/s] 7%| | 10/150 [00:02<00:32, 4.32it/s] {'loss': 0.8092, 'grad_norm': 0.4081718623638153, 'learning_rate': 4.7e-05, 'epoch': 0.0}
|
| 568 |
+
7%| | 10/150 [00:02<00:32, 4.32it/s] 7%| | 11/150 [00:03<00:31, 4.39it/s] 8%| | 12/150 [00:03<00:41, 3.29it/s] 9%| | 13/150 [00:03<00:35, 3.88it/s] 9%| | 14/150 [00:03<00:30, 4.43it/s] 10%| | 15/150 [00:04<00:28, 4.78it/s] 11%| | 16/150 [00:04<00:37, 3.59it/s] 11%| | 17/150 [00:04<00:36, 3.62it/s] 12%| | 18/150 [00:04<00:35, 3.69it/s] 13%| | 19/150 [00:05<00:35, 3.69it/s] 13%| | 20/150 [00:05<00:32, 3.98it/s] {'loss': 0.751, 'grad_norm': 0.3975396752357483, 'learning_rate': 4.3666666666666666e-05, 'epoch': 0.0}
|
| 569 |
+
13%| | 20/150 [00:05<00:32, 3.98it/s] 14%| | 21/150 [00:05<00:32, 3.94it/s] 15%| | 22/150 [00:05<00:30, 4.18it/s] 15%| | 23/150 [00:06<00:31, 4.07it/s] 16%| | 24/150 [00:06<00:28, 4.40it/s] 17%| | 25/150 [00:06<00:31, 4.02it/s] 17%| | 26/150 [00:06<00:29, 4.17it/s] 18%| | 27/150 [00:07<00:30, 4.10it/s] 19%| | 28/150 [00:07<00:33, 3.65it/s] 19%| | 29/150 [00:07<00:32, 3.75it/s] 20%| | 30/150 [00:07<00:31, 3.85it/s] {'loss': 0.7344, 'grad_norm': 0.46849244832992554, 'learning_rate': 4.0333333333333336e-05, 'epoch': 0.0}
|
| 570 |
+
20%| | 30/150 [00:07<00:31, 3.85it/s] 21%| | 31/150 [00:08<00:31, 3.83it/s] 21%| | 32/150 [00:08<00:29, 4.05it/s] 22%| | 33/150 [00:08<00:26, 4.43it/s] 23%| | 34/150 [00:08<00:23, 4.87it/s] 23%| | 35/150 [00:09<00:25, 4.58it/s] 24%| | 36/150 [00:09<00:22, 5.04it/s] 25%| | 37/150 [00:09<00:24, 4.71it/s] 25%| | 38/150 [00:09<00:24, 4.67it/s] 26%| | 39/150 [00:09<00:22, 4.98it/s] 27%| | 40/150 [00:10<00:23, 4.58it/s] {'loss': 0.7063, 'grad_norm': 0.3817349970340729, 'learning_rate': 3.7e-05, 'epoch': 0.0}
|
| 571 |
+
27%| | 40/150 [00:10<00:23, 4.58it/s] 27%| | 41/150 [00:10<00:26, 4.09it/s] 28%| | 42/150 [00:10<00:26, 4.07it/s] 29%| | 43/150 [00:10<00:23, 4.58it/s] 29%| | 44/150 [00:10<00:21, 4.90it/s] 30%| | 45/150 [00:11<00:19, 5.33it/s] 31%| | 46/150 [00:11<00:20, 4.98it/s] 31%| | 47/150 [00:11<00:21, 4.88it/s] 32%| | 48/150 [00:11<00:19, 5.14it/s] 33%| | 49/150 [00:12<00:22, 4.50it/s] 33%| | 50/150 [00:12<00:22, 4.49it/s] {'loss': 0.6382, 'grad_norm': 0.650374710559845, 'learning_rate': 3.366666666666667e-05, 'epoch': 0.0}
|
| 572 |
+
33%| | 50/150 [00:12<00:22, 4.49it/s][INFO|trainer.py:4309] 2025-10-22 16:02:05,111 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50
|
| 573 |
+
[INFO|configuration_utils.py:765] 2025-10-22 16:02:05,262 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
|
| 574 |
+
[INFO|configuration_utils.py:839] 2025-10-22 16:02:05,263 >> Model config Qwen2Config {
|
| 575 |
+
"architectures": [
|
| 576 |
+
"Qwen2ForCausalLM"
|
| 577 |
+
],
|
| 578 |
+
"attention_dropout": 0.0,
|
| 579 |
+
"bos_token_id": 151643,
|
| 580 |
+
"dtype": "bfloat16",
|
| 581 |
+
"eos_token_id": 151643,
|
| 582 |
+
"hidden_act": "silu",
|
| 583 |
+
"hidden_size": 896,
|
| 584 |
+
"initializer_range": 0.02,
|
| 585 |
+
"intermediate_size": 4864,
|
| 586 |
+
"layer_types": [
|
| 587 |
+
"full_attention",
|
| 588 |
+
"full_attention",
|
| 589 |
+
"full_attention",
|
| 590 |
+
"full_attention",
|
| 591 |
+
"full_attention",
|
| 592 |
+
"full_attention",
|
| 593 |
+
"full_attention",
|
| 594 |
+
"full_attention",
|
| 595 |
+
"full_attention",
|
| 596 |
+
"full_attention",
|
| 597 |
+
"full_attention",
|
| 598 |
+
"full_attention",
|
| 599 |
+
"full_attention",
|
| 600 |
+
"full_attention",
|
| 601 |
+
"full_attention",
|
| 602 |
+
"full_attention",
|
| 603 |
+
"full_attention",
|
| 604 |
+
"full_attention",
|
| 605 |
+
"full_attention",
|
| 606 |
+
"full_attention",
|
| 607 |
+
"full_attention",
|
| 608 |
+
"full_attention",
|
| 609 |
+
"full_attention",
|
| 610 |
+
"full_attention"
|
| 611 |
+
],
|
| 612 |
+
"max_position_embeddings": 32768,
|
| 613 |
+
"max_window_layers": 24,
|
| 614 |
+
"model_type": "qwen2",
|
| 615 |
+
"num_attention_heads": 14,
|
| 616 |
+
"num_hidden_layers": 24,
|
| 617 |
+
"num_key_value_heads": 2,
|
| 618 |
+
"rms_norm_eps": 1e-06,
|
| 619 |
+
"rope_scaling": null,
|
| 620 |
+
"rope_theta": 1000000.0,
|
| 621 |
+
"sliding_window": null,
|
| 622 |
+
"tie_word_embeddings": true,
|
| 623 |
+
"transformers_version": "4.57.1",
|
| 624 |
+
"use_cache": true,
|
| 625 |
+
"use_mrope": false,
|
| 626 |
+
"use_sliding_window": false,
|
| 627 |
+
"vocab_size": 151936
|
| 628 |
+
}
|
| 629 |
+
|
| 630 |
+
[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:02:05,402 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/chat_template.jinja
|
| 631 |
+
[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:02:05,406 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/tokenizer_config.json
|
| 632 |
+
[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:02:05,410 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/special_tokens_map.json
|
| 633 |
+
34%| | 51/150 [00:13<00:44, 2.20it/s] 35%| | 52/150 [00:13<00:39, 2.47it/s] 35%| | 53/150 [00:13<00:34, 2.82it/s] 36%| | 54/150 [00:13<00:28, 3.40it/s] 37%| | 55/150 [00:14<00:27, 3.45it/s] 37%| | 56/150 [00:14<00:23, 4.06it/s] 38%| | 57/150 [00:14<00:22, 4.10it/s] 39%| | 58/150 [00:14<00:20, 4.51it/s] 39%| | 59/150 [00:14<00:17, 5.19it/s] 40%| | 60/150 [00:15<00:16, 5.57it/s] {'loss': 0.6139, 'grad_norm': 0.4990316331386566, 'learning_rate': 3.0333333333333337e-05, 'epoch': 0.0}
|
| 634 |
+
40%| | 60/150 [00:15<00:16, 5.57it/s] 41%| | 61/150 [00:15<00:17, 5.19it/s] 41%| | 62/150 [00:15<00:15, 5.74it/s] 42%| | 63/150 [00:15<00:16, 5.17it/s] 43%| | 64/150 [00:15<00:15, 5.45it/s] 43%| | 65/150 [00:16<00:17, 4.97it/s] 44%| | 66/150 [00:16<00:18, 4.59it/s] 45%| | 67/150 [00:16<00:17, 4.86it/s] 45%| | 68/150 [00:16<00:18, 4.54it/s] 46%| | 69/150 [00:16<00:19, 4.15it/s] 47%| | 70/150 [00:17<00:19, 4.10it/s] {'loss': 0.597, 'grad_norm': 0.5236718058586121, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.01}
|
| 635 |
+
47%| | 70/150 [00:17<00:19, 4.10it/s] 47%| | 71/150 [00:17<00:19, 3.97it/s] 48%| | 72/150 [00:17<00:17, 4.48it/s] 49%| | 73/150 [00:17<00:19, 4.00it/s] 49%| | 74/150 [00:18<00:18, 4.19it/s] 50%| | 75/150 [00:18<00:15, 4.70it/s] 51%| | 76/150 [00:18<00:15, 4.73it/s] 51%| | 77/150 [00:18<00:13, 5.23it/s] 52%| | 78/150 [00:18<00:15, 4.68it/s] 53%| | 79/150 [00:19<00:15, 4.55it/s] 53%| | 80/150 [00:19<00:16, 4.27it/s] {'loss': 0.6205, 'grad_norm': 0.41710713505744934, 'learning_rate': 2.3666666666666668e-05, 'epoch': 0.01}
|
| 636 |
+
53%| | 80/150 [00:19<00:16, 4.27it/s] 54%| | 81/150 [00:19<00:14, 4.65it/s] 55%| | 82/150 [00:19<00:16, 4.06it/s] 55%| | 83/150 [00:20<00:15, 4.45it/s] 56%| | 84/150 [00:20<00:15, 4.39it/s] 57%| | 85/150 [00:20<00:14, 4.45it/s] 57%| | 86/150 [00:20<00:12, 5.07it/s] 58%| | 87/150 [00:20<00:12, 5.19it/s] 59%| | 88/150 [00:21<00:12, 4.88it/s] 59%| | 89/150 [00:21<00:13, 4.59it/s] 60%| | 90/150 [00:21<00:11, 5.22it/s] {'loss': 0.6038, 'grad_norm': 0.5673879981040955, 'learning_rate': 2.0333333333333334e-05, 'epoch': 0.01}
|
| 637 |
+
60%| | 90/150 [00:21<00:11, 5.22it/s] 61%| | 91/150 [00:21<00:12, 4.64it/s] 61%| | 92/150 [00:22<00:12, 4.53it/s] 62%| | 93/150 [00:22<00:12, 4.75it/s] 63%| | 94/150 [00:22<00:11, 4.69it/s] 63%| | 95/150 [00:22<00:11, 4.78it/s] 64%| | 96/150 [00:22<00:12, 4.42it/s] 65%| | 97/150 [00:23<00:13, 3.84it/s] 65%| | 98/150 [00:23<00:12, 4.26it/s] 66%| | 99/150 [00:23<00:11, 4.53it/s] 67%| | 100/150 [00:23<00:11, 4.31it/s] {'loss': 0.5934, 'grad_norm': 0.49819639325141907, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.01}
|
| 638 |
+
67%| | 100/150 [00:23<00:11, 4.31it/s][INFO|trainer.py:4309] 2025-10-22 16:02:16,719 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
|
| 639 |
+
[INFO|configuration_utils.py:765] 2025-10-22 16:02:16,928 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
|
| 640 |
+
[INFO|configuration_utils.py:839] 2025-10-22 16:02:16,929 >> Model config Qwen2Config {
|
| 641 |
+
"architectures": [
|
| 642 |
+
"Qwen2ForCausalLM"
|
| 643 |
+
],
|
| 644 |
+
"attention_dropout": 0.0,
|
| 645 |
+
"bos_token_id": 151643,
|
| 646 |
+
"dtype": "bfloat16",
|
| 647 |
+
"eos_token_id": 151643,
|
| 648 |
+
"hidden_act": "silu",
|
| 649 |
+
"hidden_size": 896,
|
| 650 |
+
"initializer_range": 0.02,
|
| 651 |
+
"intermediate_size": 4864,
|
| 652 |
+
"layer_types": [
|
| 653 |
+
"full_attention",
|
| 654 |
+
"full_attention",
|
| 655 |
+
"full_attention",
|
| 656 |
+
"full_attention",
|
| 657 |
+
"full_attention",
|
| 658 |
+
"full_attention",
|
| 659 |
+
"full_attention",
|
| 660 |
+
"full_attention",
|
| 661 |
+
"full_attention",
|
| 662 |
+
"full_attention",
|
| 663 |
+
"full_attention",
|
| 664 |
+
"full_attention",
|
| 665 |
+
"full_attention",
|
| 666 |
+
"full_attention",
|
| 667 |
+
"full_attention",
|
| 668 |
+
"full_attention",
|
| 669 |
+
"full_attention",
|
| 670 |
+
"full_attention",
|
| 671 |
+
"full_attention",
|
| 672 |
+
"full_attention",
|
| 673 |
+
"full_attention",
|
| 674 |
+
"full_attention",
|
| 675 |
+
"full_attention",
|
| 676 |
+
"full_attention"
|
| 677 |
+
],
|
| 678 |
+
"max_position_embeddings": 32768,
|
| 679 |
+
"max_window_layers": 24,
|
| 680 |
+
"model_type": "qwen2",
|
| 681 |
+
"num_attention_heads": 14,
|
| 682 |
+
"num_hidden_layers": 24,
|
| 683 |
+
"num_key_value_heads": 2,
|
| 684 |
+
"rms_norm_eps": 1e-06,
|
| 685 |
+
"rope_scaling": null,
|
| 686 |
+
"rope_theta": 1000000.0,
|
| 687 |
+
"sliding_window": null,
|
| 688 |
+
"tie_word_embeddings": true,
|
| 689 |
+
"transformers_version": "4.57.1",
|
| 690 |
+
"use_cache": true,
|
| 691 |
+
"use_mrope": false,
|
| 692 |
+
"use_sliding_window": false,
|
| 693 |
+
"vocab_size": 151936
|
| 694 |
+
}
|
| 695 |
+
|
| 696 |
+
[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:02:17,110 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/chat_template.jinja
|
| 697 |
+
[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:02:17,130 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/tokenizer_config.json
|
| 698 |
+
[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:02:17,134 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/special_tokens_map.json
|
| 699 |
+
67%| | 101/150 [00:25<00:25, 1.91it/s] 68%| | 102/150 [00:25<00:21, 2.24it/s] 69%| | 103/150 [00:25<00:17, 2.71it/s] 69%| | 104/150 [00:25<00:18, 2.54it/s] 70%| | 105/150 [00:26<00:17, 2.56it/s] 71%| | 106/150 [00:26<00:16, 2.71it/s] 71%| | 107/150 [00:26<00:14, 2.87it/s] 72%| | 108/150 [00:27<00:12, 3.30it/s] 73%| | 109/150 [00:27<00:11, 3.59it/s] 73%| | 110/150 [00:27<00:10, 3.89it/s] {'loss': 0.5548, 'grad_norm': 0.48188939690589905, 'learning_rate': 1.3666666666666666e-05, 'epoch': 0.01}
|
| 700 |
+
73%| | 110/150 [00:27<00:10, 3.89it/s] 74%| | 111/150 [00:27<00:10, 3.80it/s] 75%| | 112/150 [00:28<00:08, 4.25it/s] 75%| | 113/150 [00:28<00:08, 4.41it/s] 76%| | 114/150 [00:28<00:07, 4.81it/s] 77%| | 115/150 [00:28<00:08, 4.33it/s] 77%| | 116/150 [00:29<00:09, 3.70it/s] 78%| | 117/150 [00:29<00:07, 4.23it/s] 79%| | 118/150 [00:29<00:06, 4.74it/s] 79%| | 119/150 [00:29<00:06, 4.49it/s] 80%| | 120/150 [00:29<00:06, 4.79it/s] {'loss': 0.5132, 'grad_norm': 0.5217602252960205, 'learning_rate': 1.0333333333333333e-05, 'epoch': 0.01}
|
| 701 |
+
80%| | 120/150 [00:29<00:06, 4.79it/s] 81%| | 121/150 [00:30<00:06, 4.48it/s] 81%| | 122/150 [00:30<00:05, 4.81it/s] 82%| | 123/150 [00:30<00:05, 5.05it/s] 83%| | 124/150 [00:30<00:05, 4.71it/s] 83%| | 125/150 [00:30<00:05, 4.71it/s] 84%| | 126/150 [00:31<00:05, 4.01it/s] 85%| | 127/150 [00:31<00:05, 3.95it/s] 85%| | 128/150 [00:31<00:05, 4.01it/s] 86%| | 129/150 [00:31<00:05, 3.99it/s] 87%| | 130/150 [00:32<00:04, 4.54it/s] {'loss': 0.5586, 'grad_norm': 0.8095545172691345, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.01}
|
| 702 |
+
87%| | 130/150 [00:32<00:04, 4.54it/s] 87%| | 131/150 [00:32<00:04, 4.34it/s] 88%| | 132/150 [00:32<00:03, 4.82it/s] 89%| | 133/150 [00:32<00:03, 4.39it/s] 89%| | 134/150 [00:33<00:03, 4.06it/s] 90%| | 135/150 [00:33<00:03, 3.91it/s] 91%| | 136/150 [00:33<00:03, 4.34it/s] 91%|| 137/150 [00:33<00:02, 4.52it/s] 92%|| 138/150 [00:33<00:02, 4.40it/s] 93%|| 139/150 [00:34<00:02, 3.95it/s] 93%|| 140/150 [00:34<00:02, 4.35it/s] {'loss': 0.563, 'grad_norm': 0.4983977973461151, 'learning_rate': 3.666666666666667e-06, 'epoch': 0.01}
|
| 703 |
+
93%|| 140/150 [00:34<00:02, 4.35it/s] 94%|| 141/150 [00:34<00:02, 4.24it/s] 95%|| 142/150 [00:34<00:01, 4.53it/s] 95%|| 143/150 [00:35<00:01, 4.31it/s] 96%|| 144/150 [00:35<00:01, 4.96it/s] 97%|| 145/150 [00:35<00:01, 4.96it/s] 97%|| 146/150 [00:35<00:00, 4.70it/s] 98%|| 147/150 [00:35<00:00, 5.18it/s] 99%|| 148/150 [00:36<00:00, 5.32it/s] 99%|| 149/150 [00:36<00:00, 5.52it/s]100%|| 150/150 [00:36<00:00, 4.81it/s] {'loss': 0.5749, 'grad_norm': 0.4249863624572754, 'learning_rate': 3.3333333333333335e-07, 'epoch': 0.01}
|
| 704 |
+
100%|| 150/150 [00:36<00:00, 4.81it/s][INFO|trainer.py:4309] 2025-10-22 16:02:29,334 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
|
| 705 |
+
[INFO|configuration_utils.py:765] 2025-10-22 16:02:29,507 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
|
| 706 |
+
[INFO|configuration_utils.py:839] 2025-10-22 16:02:29,508 >> Model config Qwen2Config {
|
| 707 |
+
"architectures": [
|
| 708 |
+
"Qwen2ForCausalLM"
|
| 709 |
+
],
|
| 710 |
+
"attention_dropout": 0.0,
|
| 711 |
+
"bos_token_id": 151643,
|
| 712 |
+
"dtype": "bfloat16",
|
| 713 |
+
"eos_token_id": 151643,
|
| 714 |
+
"hidden_act": "silu",
|
| 715 |
+
"hidden_size": 896,
|
| 716 |
+
"initializer_range": 0.02,
|
| 717 |
+
"intermediate_size": 4864,
|
| 718 |
+
"layer_types": [
|
| 719 |
+
"full_attention",
|
| 720 |
+
"full_attention",
|
| 721 |
+
"full_attention",
|
| 722 |
+
"full_attention",
|
| 723 |
+
"full_attention",
|
| 724 |
+
"full_attention",
|
| 725 |
+
"full_attention",
|
| 726 |
+
"full_attention",
|
| 727 |
+
"full_attention",
|
| 728 |
+
"full_attention",
|
| 729 |
+
"full_attention",
|
| 730 |
+
"full_attention",
|
| 731 |
+
"full_attention",
|
| 732 |
+
"full_attention",
|
| 733 |
+
"full_attention",
|
| 734 |
+
"full_attention",
|
| 735 |
+
"full_attention",
|
| 736 |
+
"full_attention",
|
| 737 |
+
"full_attention",
|
| 738 |
+
"full_attention",
|
| 739 |
+
"full_attention",
|
| 740 |
+
"full_attention",
|
| 741 |
+
"full_attention",
|
| 742 |
+
"full_attention"
|
| 743 |
+
],
|
| 744 |
+
"max_position_embeddings": 32768,
|
| 745 |
+
"max_window_layers": 24,
|
| 746 |
+
"model_type": "qwen2",
|
| 747 |
+
"num_attention_heads": 14,
|
| 748 |
+
"num_hidden_layers": 24,
|
| 749 |
+
"num_key_value_heads": 2,
|
| 750 |
+
"rms_norm_eps": 1e-06,
|
| 751 |
+
"rope_scaling": null,
|
| 752 |
+
"rope_theta": 1000000.0,
|
| 753 |
+
"sliding_window": null,
|
| 754 |
+
"tie_word_embeddings": true,
|
| 755 |
+
"transformers_version": "4.57.1",
|
| 756 |
+
"use_cache": true,
|
| 757 |
+
"use_mrope": false,
|
| 758 |
+
"use_sliding_window": false,
|
| 759 |
+
"vocab_size": 151936
|
| 760 |
+
}
|
| 761 |
+
|
| 762 |
+
[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:02:29,679 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150/chat_template.jinja
|
| 763 |
+
[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:02:29,683 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150/tokenizer_config.json
|
| 764 |
+
[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:02:29,703 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150/special_tokens_map.json
|
| 765 |
+
[INFO|trainer.py:2810] 2025-10-22 16:02:30,219 >>
|
| 766 |
+
|
| 767 |
+
Training completed. Do not forget to share your model on huggingface.co/models =)
|
| 768 |
+
|
| 769 |
+
|
| 770 |
+
{'train_runtime': 38.3946, 'train_samples_per_second': 15.627, 'train_steps_per_second': 3.907, 'train_loss': 0.6288003253936768, 'epoch': 0.01}
|
| 771 |
+
100%|| 150/150 [00:37<00:00, 4.81it/s]100%|| 150/150 [00:37<00:00, 4.01it/s]
|
| 772 |
+
[INFO|trainer.py:4309] 2025-10-22 16:02:30,229 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
|
| 773 |
+
[INFO|configuration_utils.py:765] 2025-10-22 16:02:30,323 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
|
| 774 |
+
[INFO|configuration_utils.py:839] 2025-10-22 16:02:30,323 >> Model config Qwen2Config {
|
| 775 |
+
"architectures": [
|
| 776 |
+
"Qwen2ForCausalLM"
|
| 777 |
+
],
|
| 778 |
+
"attention_dropout": 0.0,
|
| 779 |
+
"bos_token_id": 151643,
|
| 780 |
+
"dtype": "bfloat16",
|
| 781 |
+
"eos_token_id": 151643,
|
| 782 |
+
"hidden_act": "silu",
|
| 783 |
+
"hidden_size": 896,
|
| 784 |
+
"initializer_range": 0.02,
|
| 785 |
+
"intermediate_size": 4864,
|
| 786 |
+
"layer_types": [
|
| 787 |
+
"full_attention",
|
| 788 |
+
"full_attention",
|
| 789 |
+
"full_attention",
|
| 790 |
+
"full_attention",
|
| 791 |
+
"full_attention",
|
| 792 |
+
"full_attention",
|
| 793 |
+
"full_attention",
|
| 794 |
+
"full_attention",
|
| 795 |
+
"full_attention",
|
| 796 |
+
"full_attention",
|
| 797 |
+
"full_attention",
|
| 798 |
+
"full_attention",
|
| 799 |
+
"full_attention",
|
| 800 |
+
"full_attention",
|
| 801 |
+
"full_attention",
|
| 802 |
+
"full_attention",
|
| 803 |
+
"full_attention",
|
| 804 |
+
"full_attention",
|
| 805 |
+
"full_attention",
|
| 806 |
+
"full_attention",
|
| 807 |
+
"full_attention",
|
| 808 |
+
"full_attention",
|
| 809 |
+
"full_attention",
|
| 810 |
+
"full_attention"
|
| 811 |
+
],
|
| 812 |
+
"max_position_embeddings": 32768,
|
| 813 |
+
"max_window_layers": 24,
|
| 814 |
+
"model_type": "qwen2",
|
| 815 |
+
"num_attention_heads": 14,
|
| 816 |
+
"num_hidden_layers": 24,
|
| 817 |
+
"num_key_value_heads": 2,
|
| 818 |
+
"rms_norm_eps": 1e-06,
|
| 819 |
+
"rope_scaling": null,
|
| 820 |
+
"rope_theta": 1000000.0,
|
| 821 |
+
"sliding_window": null,
|
| 822 |
+
"tie_word_embeddings": true,
|
| 823 |
+
"transformers_version": "4.57.1",
|
| 824 |
+
"use_cache": true,
|
| 825 |
+
"use_mrope": false,
|
| 826 |
+
"use_sliding_window": false,
|
| 827 |
+
"vocab_size": 151936
|
| 828 |
+
}
|
| 829 |
+
|
| 830 |
+
[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:02:30,422 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/chat_template.jinja
|
| 831 |
+
[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:02:30,426 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/tokenizer_config.json
|
| 832 |
+
[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:02:30,430 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/special_tokens_map.json
|
| 833 |
+
***** train metrics *****
|
| 834 |
+
epoch = 0.0123
|
| 835 |
+
total_flos = 2243462GF
|
| 836 |
+
train_loss = 0.6288
|
| 837 |
+
train_runtime = 0:00:38.39
|
| 838 |
+
train_samples_per_second = 15.627
|
| 839 |
+
train_steps_per_second = 3.907
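A quick consistency check on these figures: 150 steps x 4 sequences per step = 600 sequences in roughly 38.4 seconds, i.e. about 15.6 samples per second and 3.9 steps per second, matching the reported throughput.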
|
| 840 |
+
[INFO|modelcard.py:456] 2025-10-22 16:02:30,648 >> Dropping the following result as it does not have all the necessary fields:
|
| 841 |
+
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
|
| 842 |
+
gl064:2368556:2368556 [1] NCCL INFO comm 0x15c0db00 rank 1 nranks 4 cudaDev 1 busId 59000 - Destroy COMPLETE
|
| 843 |
+
gl064:2368555:2368555 [0] NCCL INFO comm 0x14bb0450 rank 0 nranks 4 cudaDev 0 busId 47000 - Destroy COMPLETE
|
| 844 |
+
wandb:
|
| 845 |
+
wandb: View run interactive_test at: https://wandb.ai/ut_nlp_deduce/llamafactory/runs/f7vqjhyf
|
| 846 |
+
wandb: Find logs at: wandb/run-20251022_160152-f7vqjhyf/logs
|
| 847 |
+
|
| 848 |
+
========================================
|
| 849 |
+
Training completed successfully
|
| 850 |
+
End Time: Wed Oct 22 04:02:32 PM EDT 2025
|
| 851 |
+
========================================
|
| 852 |
+
|
| 853 |
+
========================================
|
| 854 |
+
STAGE 2: Merging/Exporting Model
|
| 855 |
+
Start Time: Wed Oct 22 04:02:32 PM EDT 2025
|
| 856 |
+
========================================
|
| 857 |
+
Looking for checkpoints in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
|
| 858 |
+
Found most recent checkpoint: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
|
| 859 |
+
Checkpoint details:
|
| 860 |
+
Path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
|
| 861 |
+
Last modified: 2025-10-22 16:02:30.204175325 -0400
|
| 862 |
+
Training step: 150
|
| 863 |
+
Updating merge config to point to checkpoint...
|
| 864 |
+
Successfully updated merge config
|
| 865 |
+
Updated merge config to use: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
|
| 866 |
+
|
| 867 |
+
Merge config contents:
|
| 868 |
+
model_name_or_path: Qwen/Qwen2.5-0.5B
|
| 869 |
+
finetuning_type: lora
|
| 870 |
+
trust_remote_code: true
|
| 871 |
+
adapter_name_or_path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
|
| 872 |
+
template: default
|
| 873 |
+
export_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
|
| 874 |
+
|
| 875 |
+
Executing command: llamafactory-cli export /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
|
| 876 |
+
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
|
| 877 |
+
warnings.warn(
|
| 878 |
+
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
|
| 879 |
+
import pkg_resources
|
| 880 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,686 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
|
| 881 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,686 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
|
| 882 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,686 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
|
| 883 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,686 >> loading file added_tokens.json from cache at None
|
| 884 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,687 >> loading file special_tokens_map.json from cache at None
|
| 885 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,687 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
|
| 886 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,687 >> loading file chat_template.jinja from cache at None
|
| 887 |
+
[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:02:40,863 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
| 888 |
+
[INFO|configuration_utils.py:765] 2025-10-22 16:02:41,054 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
|
| 889 |
+
[INFO|configuration_utils.py:839] 2025-10-22 16:02:41,056 >> Model config Qwen2Config {
|
| 890 |
+
"architectures": [
|
| 891 |
+
"Qwen2ForCausalLM"
|
| 892 |
+
],
|
| 893 |
+
"attention_dropout": 0.0,
|
| 894 |
+
"bos_token_id": 151643,
|
| 895 |
+
"dtype": "bfloat16",
|
| 896 |
+
"eos_token_id": 151643,
|
| 897 |
+
"hidden_act": "silu",
|
| 898 |
+
"hidden_size": 896,
|
| 899 |
+
"initializer_range": 0.02,
|
| 900 |
+
"intermediate_size": 4864,
|
| 901 |
+
"layer_types": [
|
| 902 |
+
"full_attention",
|
| 903 |
+
"full_attention",
|
| 904 |
+
"full_attention",
|
| 905 |
+
"full_attention",
|
| 906 |
+
"full_attention",
|
| 907 |
+
"full_attention",
|
| 908 |
+
"full_attention",
|
| 909 |
+
"full_attention",
|
| 910 |
+
"full_attention",
|
| 911 |
+
"full_attention",
|
| 912 |
+
"full_attention",
|
| 913 |
+
"full_attention",
|
| 914 |
+
"full_attention",
|
| 915 |
+
"full_attention",
|
| 916 |
+
"full_attention",
|
| 917 |
+
"full_attention",
|
| 918 |
+
"full_attention",
|
| 919 |
+
"full_attention",
|
| 920 |
+
"full_attention",
|
| 921 |
+
"full_attention",
|
| 922 |
+
"full_attention",
|
| 923 |
+
"full_attention",
|
| 924 |
+
"full_attention",
|
| 925 |
+
"full_attention"
|
| 926 |
+
],
|
| 927 |
+
"max_position_embeddings": 32768,
|
| 928 |
+
"max_window_layers": 24,
|
| 929 |
+
"model_type": "qwen2",
|
| 930 |
+
"num_attention_heads": 14,
|
| 931 |
+
"num_hidden_layers": 24,
|
| 932 |
+
"num_key_value_heads": 2,
|
| 933 |
+
"rms_norm_eps": 1e-06,
|
| 934 |
+
"rope_scaling": null,
|
| 935 |
+
"rope_theta": 1000000.0,
|
| 936 |
+
"sliding_window": null,
|
| 937 |
+
"tie_word_embeddings": true,
|
| 938 |
+
"transformers_version": "4.57.1",
|
| 939 |
+
"use_cache": true,
|
| 940 |
+
"use_mrope": false,
|
| 941 |
+
"use_sliding_window": false,
|
| 942 |
+
"vocab_size": 151936
|
| 943 |
+
}
|
| 944 |
+
|
| 945 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
|
| 946 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
|
| 947 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
|
| 948 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file added_tokens.json from cache at None
|
| 949 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file special_tokens_map.json from cache at None
|
| 950 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
|
| 951 |
+
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file chat_template.jinja from cache at None
|
| 952 |
+
[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:02:41,298 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
| 953 |
+
[INFO|configuration_utils.py:765] 2025-10-22 16:02:41,348 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
|
| 954 |
+
[INFO|configuration_utils.py:839] 2025-10-22 16:02:41,348 >> Model config Qwen2Config {
|
| 955 |
+
"architectures": [
|
| 956 |
+
"Qwen2ForCausalLM"
|
| 957 |
+
],
|
| 958 |
+
"attention_dropout": 0.0,
|
| 959 |
+
"bos_token_id": 151643,
|
| 960 |
+
"dtype": "bfloat16",
|
| 961 |
+
"eos_token_id": 151643,
|
| 962 |
+
"hidden_act": "silu",
|
| 963 |
+
"hidden_size": 896,
|
| 964 |
+
"initializer_range": 0.02,
|
| 965 |
+
"intermediate_size": 4864,
|
| 966 |
+
"layer_types": [
|
| 967 |
+
"full_attention",
|
| 968 |
+
"full_attention",
|
| 969 |
+
"full_attention",
|
| 970 |
+
"full_attention",
|
| 971 |
+
"full_attention",
|
| 972 |
+
"full_attention",
|
| 973 |
+
"full_attention",
|
| 974 |
+
"full_attention",
|
| 975 |
+
"full_attention",
|
| 976 |
+
"full_attention",
|
| 977 |
+
"full_attention",
|
| 978 |
+
"full_attention",
|
| 979 |
+
"full_attention",
|
| 980 |
+
"full_attention",
|
| 981 |
+
"full_attention",
|
| 982 |
+
"full_attention",
|
| 983 |
+
"full_attention",
|
| 984 |
+
"full_attention",
|
| 985 |
+
"full_attention",
|
| 986 |
+
"full_attention",
|
| 987 |
+
"full_attention",
|
| 988 |
+
"full_attention",
|
| 989 |
+
"full_attention",
|
| 990 |
+
"full_attention"
|
| 991 |
+
],
|
| 992 |
+
"max_position_embeddings": 32768,
|
| 993 |
+
"max_window_layers": 24,
|
| 994 |
+
"model_type": "qwen2",
|
| 995 |
+
"num_attention_heads": 14,
|
| 996 |
+
"num_hidden_layers": 24,
|
| 997 |
+
"num_key_value_heads": 2,
|
| 998 |
+
"rms_norm_eps": 1e-06,
|
| 999 |
+
"rope_scaling": null,
|
| 1000 |
+
"rope_theta": 1000000.0,
|
| 1001 |
+
"sliding_window": null,
|
| 1002 |
+
"tie_word_embeddings": true,
|
| 1003 |
+
"transformers_version": "4.57.1",
|
| 1004 |
+
"use_cache": true,
|
| 1005 |
+
"use_mrope": false,
|
| 1006 |
+
"use_sliding_window": false,
|
| 1007 |
+
"vocab_size": 151936
|
| 1008 |
+
}
|
| 1009 |
+
|
| 1010 |
+
[WARNING|logging.py:328] 2025-10-22 16:02:41,348 >> `torch_dtype` is deprecated! Use `dtype` instead!
|
| 1011 |
+
[INFO|2025-10-22 16:02:41] llamafactory.model.model_utils.kv_cache:143 >> KV cache is enabled for faster generation.
|
| 1012 |
+
[WARNING|logging.py:328] 2025-10-22 16:02:41,741 >> `torch_dtype` is deprecated! Use `dtype` instead!
|
| 1013 |
+
[INFO|modeling_utils.py:1172] 2025-10-22 16:02:41,742 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
|
| 1014 |
+
[INFO|modeling_utils.py:2341] 2025-10-22 16:02:41,743 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.
|
| 1015 |
+
[INFO|configuration_utils.py:986] 2025-10-22 16:02:41,743 >> Generate config GenerationConfig {
|
| 1016 |
+
"bos_token_id": 151643,
|
| 1017 |
+
"eos_token_id": 151643
|
| 1018 |
+
}
|
| 1019 |
+
|
| 1020 |
+
[INFO|configuration_utils.py:941] 2025-10-22 16:02:41,844 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
|
| 1021 |
+
[INFO|configuration_utils.py:986] 2025-10-22 16:02:41,844 >> Generate config GenerationConfig {
|
| 1022 |
+
"bos_token_id": 151643,
|
| 1023 |
+
"eos_token_id": 151643,
|
| 1024 |
+
"max_new_tokens": 2048
|
| 1025 |
+
}
|
| 1026 |
+
|
| 1027 |
+
[INFO|dynamic_module_utils.py:423] 2025-10-22 16:02:41,879 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
|
| 1028 |
+
[INFO|2025-10-22 16:02:41] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
|
| 1029 |
+
[INFO|2025-10-22 16:02:42] llamafactory.model.adapter:143 >> Merged 1 adapter(s).
|
| 1030 |
+
[INFO|2025-10-22 16:02:42] llamafactory.model.adapter:143 >> Loaded adapter(s): /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
|
| 1031 |
+
[INFO|2025-10-22 16:02:42] llamafactory.model.loader:143 >> all params: 494,032,768
|
| 1032 |
+
[INFO|2025-10-22 16:02:42] llamafactory.train.tuner:143 >> Convert model dtype to: torch.bfloat16.
|
| 1033 |
+
[INFO|configuration_utils.py:491] 2025-10-22 16:02:42,967 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/config.json
|
| 1034 |
+
[INFO|configuration_utils.py:757] 2025-10-22 16:02:42,971 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/generation_config.json
|
| 1035 |
+
[INFO|modeling_utils.py:4181] 2025-10-22 16:02:44,581 >> Model weights saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/model.safetensors
|
| 1036 |
+
[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:02:44,587 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/chat_template.jinja
|
| 1037 |
+
[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:02:44,591 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/tokenizer_config.json
|
| 1038 |
+
[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:02:44,595 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/special_tokens_map.json
|
| 1039 |
+
[INFO|2025-10-22 16:02:44] llamafactory.train.tuner:143 >> Ollama modelfile saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/Modelfile
|
| 1040 |
+
|
| 1041 |
+
========================================
|
| 1042 |
+
Merge/Export completed successfully
|
| 1043 |
+
End Time: Wed Oct 22 04:02:45 PM EDT 2025
|
| 1044 |
+
========================================
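The export stage above leaves a standalone merged checkpoint (weights, config, tokenizer, chat template) in the export_dir, so it should be loadable directly with transformers. A minimal sketch, assuming that directory path and the transformers version recorded in the log (where `dtype` replaces the deprecated `torch_dtype`):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Path written by the export stage above; adjust if the model was downloaded elsewhere.
merged_dir = "/scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged"

tokenizer = AutoTokenizer.from_pretrained(merged_dir)
model = AutoModelForCausalLM.from_pretrained(merged_dir, dtype=torch.bfloat16)

# Illustrative prompt in the spirit of the countdown-style examples shown in the log above.
prompt = "Using the numbers [67, 71, 31], create an equation that equals 169."
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))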
|
| 1045 |
+
|
| 1046 |
+
========================================
|
| 1047 |
+
Preparing Training Artifacts
|
| 1048 |
+
========================================
|
| 1049 |
+
Copying configuration files...
|
| 1050 |
+
Copying and cleaning training logs...
|
training_artifacts/merge_config.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_name_or_path: Qwen/Qwen2.5-0.5B
|
| 2 |
+
finetuning_type: lora
|
| 3 |
+
trust_remote_code: true
|
| 4 |
+
adapter_name_or_path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
|
| 5 |
+
template: default
|
| 6 |
+
export_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
|
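For reference, the pipeline log above shows this exact file being consumed by the export CLI (llamafactory-cli export .../configs/merge_config.yaml): it merges the LoRA adapter from checkpoint-150 into the Qwen2.5-0.5B base and writes the standalone model to export_dir.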
training_artifacts/train_config.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
stage: sft
|
| 2 |
+
do_train: true
|
| 3 |
+
max_steps: 150
|
| 4 |
+
do_eval: false
|
| 5 |
+
save_strategy: steps
|
| 6 |
+
save_steps: 50
|
| 7 |
+
logging_steps: 10
|
| 8 |
+
fp16: true
|
| 9 |
+
bf16: false
|
| 10 |
+
overwrite_output_dir: true
|
| 11 |
+
per_device_train_batch_size: 1
|
| 12 |
+
gradient_accumulation_steps: 1
|
| 13 |
+
gradient_checkpointing: true
|
| 14 |
+
model_name_or_path: Qwen/Qwen2.5-0.5B
|
| 15 |
+
finetuning_type: lora
|
| 16 |
+
dataset: my_custom_sft
|
| 17 |
+
dataset_dir: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory/data
|
| 18 |
+
template: default
|
| 19 |
+
cutoff_len: 8096
|
| 20 |
+
val_size: 0.1
|
| 21 |
+
lora_rank: 8
|
| 22 |
+
lora_alpha: 16
|
| 23 |
+
lora_dropout: 0.05
|
| 24 |
+
lora_target: all
|
| 25 |
+
output_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
|
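This is the LlamaFactory SFT config used for the run; such a config is normally launched with the training CLI, e.g. llamafactory-cli train train_config.yaml (assuming the my_custom_sft dataset is registered under dataset_dir). The LoRA adapter checkpoints it produces in output_dir are what the merge config above folds back into the base model.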