Upload folder using huggingface_hub

- generation_config.json +1 -1
- wandb/debug-internal.log +7 -0
- wandb/debug.log +23 -0
- wandb/run-20250602_203403-j8mvmygb/files/output.log +252 -0
- wandb/run-20250602_203403-j8mvmygb/files/requirements.txt +315 -0
- wandb/run-20250602_203403-j8mvmygb/files/wandb-metadata.json +90 -0
- wandb/run-20250602_203403-j8mvmygb/logs/debug-core.log +7 -0
- wandb/run-20250602_203403-j8mvmygb/logs/debug-internal.log +7 -0
- wandb/run-20250602_203403-j8mvmygb/logs/debug.log +23 -0
- wandb/run-20250602_203403-j8mvmygb/run-j8mvmygb.wandb +0 -0
generation_config.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "bos_token_id": 151643,
   "do_sample": true,
-  "eos_token_id":
+  "eos_token_id": 1516435,
   "max_new_tokens": 2048,
   "transformers_version": "4.51.3"
 }
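The only change in this commit, apart from the wandb run artifacts below, is the `eos_token_id` in generation_config.json (the old value is truncated in the rendered diff). Since the sibling `bos_token_id` of 151643 puts this tokenizer's ids in the ~151k range, a seven-digit eos id of 1516435 is worth double-checking against the vocabulary. A minimal sanity-check sketch, assuming network access to the repo named in the run logs below; this is illustrative, not part of the committed tooling:

    from transformers import AutoTokenizer, GenerationConfig

    repo = "winglian/qwen3-4b-math-kd-jsd-temp1-v2"  # model id taken from the run logs

    gen_cfg = GenerationConfig.from_pretrained(repo)
    tok = AutoTokenizer.from_pretrained(repo)

    # eos_token_id may be an int or a list of ints; normalize before checking.
    eos_ids = gen_cfg.eos_token_id
    if isinstance(eos_ids, int):
        eos_ids = [eos_ids]

    for eos_id in eos_ids:
        # An eos id outside the vocabulary can never be generated, so decoding
        # would only ever stop at max_new_tokens (2048 here).
        assert eos_id < len(tok), f"eos_token_id {eos_id} outside vocab of {len(tok)}"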
wandb/debug-internal.log
ADDED
@@ -0,0 +1,7 @@
+{"time":"2025-06-02T20:34:03.456431945Z","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"/home/ubuntu/axolotl/outputs/out-kd-4b-offline-t1-v2/wandb/run-20250602_203403-j8mvmygb/logs/debug-core.log"}
+{"time":"2025-06-02T20:34:03.732828057Z","level":"INFO","msg":"created new stream","id":"j8mvmygb"}
+{"time":"2025-06-02T20:34:03.732975853Z","level":"INFO","msg":"stream: started","id":"j8mvmygb"}
+{"time":"2025-06-02T20:34:03.733017439Z","level":"INFO","msg":"writer: Do: started","stream_id":"j8mvmygb"}
+{"time":"2025-06-02T20:34:03.733141665Z","level":"INFO","msg":"sender: started","stream_id":"j8mvmygb"}
+{"time":"2025-06-02T20:34:03.733155406Z","level":"INFO","msg":"handler: started","stream_id":"j8mvmygb"}
+{"time":"2025-06-02T20:34:03.978050525Z","level":"INFO","msg":"Starting system monitor"}
wandb/debug.log
ADDED
@@ -0,0 +1,23 @@
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_setup.py:_flush():70] Configure stats pid to 2144651
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_setup.py:_flush():70] Loading settings from /home/ubuntu/.config/wandb/settings
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_setup.py:_flush():70] Loading settings from /home/ubuntu/axolotl/outputs/out-kd-4b-offline-t1-v2/wandb/settings
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_setup.py:_flush():70] Loading settings from environment variables
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /home/ubuntu/axolotl/outputs/out-kd-4b-offline-t1-v2/wandb/run-20250602_203403-j8mvmygb/logs/debug.log
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /home/ubuntu/axolotl/outputs/out-kd-4b-offline-t1-v2/wandb/run-20250602_203403-j8mvmygb/logs/debug-internal.log
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_init.py:init():852] calling init triggers
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_init.py:init():893] starting backend
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_init.py:init():897] sending inform_init request
+2025-06-02 20:34:03,454 INFO MainThread:2144651 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-06-02 20:34:03,454 INFO MainThread:2144651 [wandb_init.py:init():907] backend started and connected
+2025-06-02 20:34:03,456 INFO MainThread:2144651 [wandb_init.py:init():1005] updated telemetry
+2025-06-02 20:34:03,460 INFO MainThread:2144651 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
+2025-06-02 20:34:03,976 INFO MainThread:2144651 [wandb_init.py:init():1104] starting run threads in backend
+2025-06-02 20:34:04,083 INFO MainThread:2144651 [wandb_run.py:_console_start():2573] atexit reg
+2025-06-02 20:34:04,084 INFO MainThread:2144651 [wandb_run.py:_redirect():2421] redirect: wrap_raw
+2025-06-02 20:34:04,084 INFO MainThread:2144651 [wandb_run.py:_redirect():2490] Wrapping output streams.
+2025-06-02 20:34:04,084 INFO MainThread:2144651 [wandb_run.py:_redirect():2513] Redirects installed.
+2025-06-02 20:34:04,085 INFO MainThread:2144651 [wandb_init.py:init():1150] run started, returning control to user process
+2025-06-02 20:34:38,220 INFO MsgRouterThr:2144651 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
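The 23 lines above are the standard trace wandb writes while wandb.init() brings up its backend: settings discovery, spawning the service process, console redirection, then handing control back to the caller. Here the run was started by lighteval's --wandb flag rather than by user code; for orientation, a minimal sketch of the call sequence that produces such a trace (the project name and logged value are placeholders, not taken from this run):

    import wandb

    # wandb.init() performs the steps logged above: load settings, start the
    # backend service, redirect stdout/stderr, then return a Run object.
    run = wandb.init(project="my-eval-project")  # placeholder project name
    run.log({"placeholder_metric": 1.0})
    run.finish()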
wandb/run-20250602_203403-j8mvmygb/files/output.log
ADDED
@@ -0,0 +1,252 @@
+[2025-06-02 20:34:04,085] [ INFO]: --- LOADING MODEL --- (pipeline.py:187)
+tokenizer_config.json: 100%|██████████| 9.76k/9.76k [00:00<00:00, 57.2MB/s]
+vocab.json: 100%|██████████| 2.78M/2.78M [00:00<00:00, 22.8MB/s]
+merges.txt: 100%|██████████| 1.67M/1.67M [00:00<00:00, 25.4MB/s]
+tokenizer.json: 100%|██████████| 11.4M/11.4M [00:00<00:00, 12.8MB/s]
+added_tokens.json: 100%|██████████| 707/707 [00:00<00:00, 6.77MB/s]
+special_tokens_map.json: 100%|██████████| 613/613 [00:00<00:00, 6.56MB/s]
+[2025-06-02 20:34:06,494] [ INFO]: --- INIT SEEDS --- (pipeline.py:258)
+[2025-06-02 20:34:06,495] [ INFO]: --- LOADING TASKS --- (pipeline.py:212)
+[2025-06-02 20:34:06,495] [ INFO]: Found 1 custom tasks in /home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/lighteval/tasks/extended/ifeval/main.py (registry.py:142)
+[2025-06-02 20:34:06,495] [ INFO]: Found 6 custom tasks in /home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/lighteval/tasks/extended/tiny_benchmarks/main.py (registry.py:142)
+[2025-06-02 20:34:06,495] [ INFO]: Found 1 custom tasks in /home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/lighteval/tasks/extended/mt_bench/main.py (registry.py:142)
+[2025-06-02 20:34:06,495] [ INFO]: Found 4 custom tasks in /home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/lighteval/tasks/extended/mix_eval/main.py (registry.py:142)
+[2025-06-02 20:34:06,495] [ INFO]: Found 5 custom tasks in /home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/lighteval/tasks/extended/olympiade_bench/main.py (registry.py:142)
+[2025-06-02 20:34:06,495] [ INFO]: Found 1 custom tasks in /home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/lighteval/tasks/extended/hle/main.py (registry.py:142)
+[2025-06-02 20:34:06,495] [ INFO]: Found 23 custom tasks in /home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/lighteval/tasks/extended/lcb/main.py (registry.py:142)
+[2025-06-02 20:34:06,498] [ INFO]: HuggingFaceH4/aime_2024 default (lighteval_task.py:187)
+[2025-06-02 20:34:06,498] [ WARNING]: Careful, the task lighteval|aime24 is using evaluation data to build the few shot examples. (lighteval_task.py:260)
+[2025-06-02 20:34:07,794] [ INFO]: --- RUNNING MODEL --- (pipeline.py:482)
+[2025-06-02 20:34:07,794] [ INFO]: Running RequestType.GREEDY_UNTIL requests (pipeline.py:468)
+[2025-06-02 20:34:07,804] [ WARNING]: You cannot select the number of dataset splits for a generative evaluation at the moment. Automatically inferring. (data.py:237)
+Splits: 0%| | 0/1 [00:00<?, ?it/s][2025-06-02 20:34:09,102] [ INFO]: Started a local Ray instance. (worker.py:1888)
+(pid=2145898) INFO 06-02 20:34:16 [__init__.py:243] Automatically detected platform cuda.
+(run_inference_one_model pid=2145898) INFO 06-02 20:34:18 [__init__.py:31] Available plugins for group vllm.general_plugins:
+(run_inference_one_model pid=2145898) INFO 06-02 20:34:18 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
+(run_inference_one_model pid=2145898) INFO 06-02 20:34:18 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
+(run_inference_one_model pid=2145912) INFO 06-02 20:34:29 [config.py:793] This model supports multiple tasks: {'score', 'embed', 'classify', 'reward', 'generate'}. Defaulting to 'generate'.
+(run_inference_one_model pid=2145912) INFO 06-02 20:34:29 [config.py:2118] Chunked prefill is enabled with max_num_batched_tokens=2048.
+(pid=2145937) INFO 06-02 20:34:16 [__init__.py:243] Automatically detected platform cuda. [repeated 7x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)
+(run_inference_one_model pid=2145937) INFO 06-02 20:34:18 [__init__.py:31] Available plugins for group vllm.general_plugins: [repeated 7x across cluster]
+(run_inference_one_model pid=2145937) INFO 06-02 20:34:18 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver [repeated 7x across cluster]
+(run_inference_one_model pid=2145937) INFO 06-02 20:34:18 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. [repeated 7x across cluster]
+(run_inference_one_model pid=2145898) INFO 06-02 20:34:29 [config.py:793] This model supports multiple tasks: {'embed', 'score', 'classify', 'reward', 'generate'}. Defaulting to 'generate'.
+(run_inference_one_model pid=2145922) INFO 06-02 20:34:29 [config.py:793] This model supports multiple tasks: {'embed', 'score', 'reward', 'classify', 'generate'}. Defaulting to 'generate'.
+(run_inference_one_model pid=2145923) INFO 06-02 20:34:29 [config.py:793] This model supports multiple tasks: {'classify', 'reward', 'generate', 'score', 'embed'}. Defaulting to 'generate'.
+(run_inference_one_model pid=2145912) WARNING 06-02 20:34:29 [utils.py:2531] We must use the `spawn` multiprocessing start method. Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. Reason: CUDA is initialized
+(run_inference_one_model pid=2145896) INFO 06-02 20:34:29 [config.py:793] This model supports multiple tasks: {'reward', 'score', 'generate', 'embed', 'classify'}. Defaulting to 'generate'.
+(run_inference_one_model pid=2145918) INFO 06-02 20:34:29 [config.py:793] This model supports multiple tasks: {'score', 'embed', 'classify', 'generate', 'reward'}. Defaulting to 'generate'.
+(run_inference_one_model pid=2145937) INFO 06-02 20:34:29 [config.py:793] This model supports multiple tasks: {'generate', 'classify', 'score', 'reward', 'embed'}. Defaulting to 'generate'.
+(run_inference_one_model pid=2145915) INFO 06-02 20:34:30 [config.py:793] This model supports multiple tasks: {'score', 'classify', 'reward', 'generate', 'embed'}. Defaulting to 'generate'.
+(run_inference_one_model pid=2145915) INFO 06-02 20:34:30 [config.py:2118] Chunked prefill is enabled with max_num_batched_tokens=2048. [repeated 7x across cluster]
+(run_inference_one_model pid=2145912) INFO 06-02 20:34:34 [__init__.py:243] Automatically detected platform cuda.
+(run_inference_one_model pid=2145915) WARNING 06-02 20:34:30 [utils.py:2531] We must use the `spawn` multiprocessing start method. Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. Reason: CUDA is initialized [repeated 7x across cluster]
+(run_inference_one_model pid=2145922) INFO 06-02 20:34:34 [__init__.py:243] Automatically detected platform cuda.
+(run_inference_one_model pid=2145912) INFO 06-02 20:34:37 [core.py:438] Waiting for init message from front-end.
+(run_inference_one_model pid=2145912) INFO 06-02 20:34:37 [__init__.py:31] Available plugins for group vllm.general_plugins:
+(run_inference_one_model pid=2145912) INFO 06-02 20:34:37 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
+(run_inference_one_model pid=2145912) INFO 06-02 20:34:37 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
+(run_inference_one_model pid=2145912) INFO 06-02 20:34:37 [core.py:65] Initializing a V1 LLM engine (v0.9.0.1) with config: model='winglian/qwen3-4b-math-kd-jsd-temp1-v2', speculative_config=None, tokenizer='winglian/qwen3-4b-math-kd-jsd-temp1-v2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=main, override_neuron_config={}, tokenizer_revision=main, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=1234, served_model_name=winglian/qwen3-4b-math-kd-jsd-temp1-v2, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level": 3, "custom_ops": ["none"], "splitting_ops": ["vllm.unified_attention", "vllm.unified_attention_with_output"], "compile_sizes": [], "inductor_compile_config": {"enable_auto_functionalized_v2": false}, "use_cudagraph": true, "cudagraph_num_of_warmups": 1, "cudagraph_capture_sizes": [512, 504, 496, 488, 480, 472, 464, 456, 448, 440, 432, 424, 416, 408, 400, 392, 384, 376, 368, 360, 352, 344, 336, 328, 320, 312, 304, 296, 288, 280, 272, 264, 256, 248, 240, 232, 224, 216, 208, 200, 192, 184, 176, 168, 160, 152, 144, 136, 128, 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 4, 2, 1], "max_capture_size": 512}
+(run_inference_one_model pid=2145923) Traceback (most recent call last):
+(run_inference_one_model pid=2145923) File "<string>", line 1, in <module>
+(run_inference_one_model pid=2145923) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/multiprocessing/spawn.py", line 122, in spawn_main
+(run_inference_one_model pid=2145923) exitcode = _main(fd, parent_sentinel)
+(run_inference_one_model pid=2145923) ^^^^^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145923) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/multiprocessing/spawn.py", line 132, in _main
+(run_inference_one_model pid=2145923) self = reduction.pickle.load(from_parent)
+(run_inference_one_model pid=2145923) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145923) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/vllm/__init__.py", line 12, in <module>
+(run_inference_one_model pid=2145923) from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+(run_inference_one_model pid=2145923) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/vllm/engine/arg_utils.py", line 36, in <module>
+(run_inference_one_model pid=2145923) from vllm.reasoning import ReasoningParserManager
+(run_inference_one_model pid=2145923) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/vllm/reasoning/__init__.py", line 3, in <module>
+(run_inference_one_model pid=2145923) from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+(run_inference_one_model pid=2145923) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/vllm/reasoning/abs_reasoning_parsers.py", line 11, in <module>
+(run_inference_one_model pid=2145923) from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+(run_inference_one_model pid=2145923) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/vllm/entrypoints/openai/protocol.py", line 218, in <module>
+(run_inference_one_model pid=2145923) class ChatCompletionRequest(OpenAIBaseModel):
+(run_inference_one_model pid=2145923) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/pydantic/_internal/_model_construction.py", line 224, in __new__
+(run_inference_one_model pid=2145923) complete_model_class(
+(run_inference_one_model pid=2145923) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/pydantic/_internal/_model_construction.py", line 602, in complete_model_class
+(run_inference_one_model pid=2145923) schema = cls.__get_pydantic_core_schema__(cls, handler)
+(run_inference_one_model pid=2145923) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145923) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/pydantic/main.py", line 702, in __get_pydantic_core_schema__
+(run_inference_one_model pid=2145896) from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
+(run_inference_one_model pid=2145896) @config
+(run_inference_one_model pid=2145896) ^^^^^^
+(run_inference_one_model pid=2145896) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/vllm/config.py", line 178, in config
+(run_inference_one_model pid=2145896) attr_docs = get_attr_docs(cls)
+(run_inference_one_model pid=2145896) ^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145896) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/vllm/config.py", line 133, in get_attr_docs
+(run_inference_one_model pid=2145896) cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0]
+(run_inference_one_model pid=2145896) ^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145896) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/inspect.py", line 1258, in getsource
+(run_inference_one_model pid=2145896) lines, lnum = getsourcelines(object)
+(run_inference_one_model pid=2145896) ^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145896) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/inspect.py", line 1240, in getsourcelines
+(run_inference_one_model pid=2145896) lines, lnum = findsource(object)
+(run_inference_one_model pid=2145896) ^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145896) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/inspect.py", line 1088, in findsource
+(run_inference_one_model pid=2145896) class_finder.visit(tree)
+(run_inference_one_model pid=2145896) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/ast.py", line 418, in visit
+(run_inference_one_model pid=2145896) return visitor(node)
+(run_inference_one_model pid=2145896) ^^^^^^^^^^^^^
+(run_inference_one_model pid=2145896) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/ast.py", line 426, in generic_visit
+(run_inference_one_model pid=2145896) self.visit(item)
+(run_inference_one_model pid=2145896) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/ast.py", line 418, in visit
+(run_inference_one_model pid=2145896) return visitor(node)
+(run_inference_one_model pid=2145896) ^^^^^^^^^^^^^
+(run_inference_one_model pid=2145896) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/inspect.py", line 1047, in visit_ClassDef
+(run_inference_one_model pid=2145896) self.generic_visit(node)
+(run_inference_one_model pid=2145896) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/ast.py", line 426, in generic_visit
+(run_inference_one_model pid=2145896) self.visit(item)
+(run_inference_one_model pid=2145896) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/ast.py", line 418, in visit
+(run_inference_one_model pid=2145896) return visitor(node)
+(run_inference_one_model pid=2145896) ^^^^^^^^^^^^^
+(run_inference_one_model pid=2145896) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/inspect.py", line 1029, in visit_FunctionDef
+Splits: 0%| | 0/1 [00:29<?, ?it/s]
+
+(run_inference_one_model pid=2145923) worker.main_loop()
+(run_inference_one_model pid=2145923) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/_private/worker.py", line 946, in main_loop
+(run_inference_one_model pid=2145923) self.core_worker.run_task_loop()
+(run_inference_one_model pid=2145923) KeyboardInterrupt
+(run_inference_one_model pid=2145915) Exception ignored in atexit callback: <function shutdown at 0x74fbc2bcf380>
+(run_inference_one_model pid=2145915) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
+(run_inference_one_model pid=2145915) return func(*args, **kwargs)
+(run_inference_one_model pid=2145915) ^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145915) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/_private/worker.py", line 1950, in shutdown
+(run_inference_one_model pid=2145915) from ray.dag.compiled_dag_node import _shutdown_all_compiled_dags
+(run_inference_one_model pid=2145915) from ray.dag.dag_node import DAGNode
+(run_inference_one_model pid=2145915) from ray.experimental.channel.auto_transport_type import AutoTransportType
+(run_inference_one_model pid=2145915) from ray.experimental.channel.cached_channel import CachedChannel
+(run_inference_one_model pid=2145915) from ray.experimental.channel.common import ChannelInterface
+(run_inference_one_model pid=2145915) class ChannelContext:
+(run_inference_one_model pid=2145915) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/experimental/channel/common.py", line 125, in ChannelContext
+(run_inference_one_model pid=2145915) _current_stream: Optional["torch.cuda.Stream"] = None
+(run_inference_one_model pid=2145915) ~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145915) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/typing.py", line 376, in inner
+(run_inference_one_model pid=2145915) return cached(*args, **kwds)
+(run_inference_one_model pid=2145915) ^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145915) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/typing.py", line 502, in __getitem__
+(run_inference_one_model pid=2145915) return self._getitem(self, parameters)
+(run_inference_one_model pid=2145915) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145915) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/typing.py", line 727, in Optional
+(run_inference_one_model pid=2145915) return Union[arg, type(None)]
+(run_inference_one_model pid=2145915) ~~~~~^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145915) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/typing.py", line 376, in inner
+(run_inference_one_model pid=2145915) return cached(*args, **kwds)
+(run_inference_one_model pid=2145915) ^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145915) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/typing.py", line 502, in __getitem__
+(run_inference_one_model pid=2145915) return self._getitem(self, parameters)
+(run_inference_one_model pid=2145915) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145915) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/typing.py", line 715, in Union
+(run_inference_one_model pid=2145915) parameters = tuple(_type_check(p, msg) for p in parameters)
+(run_inference_one_model pid=2145915) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/typing.py", line 715, in <genexpr>
+(run_inference_one_model pid=2145915) parameters = tuple(_type_check(p, msg) for p in parameters)
+(run_inference_one_model pid=2145915) ^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145915) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/typing.py", line 186, in _type_check
+(run_inference_one_model pid=2145915) arg = _type_convert(arg, module=module, allow_special_forms=allow_special_forms)
+(run_inference_one_model pid=2145915) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145915) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/typing.py", line 159, in _type_convert
+(run_inference_one_model pid=2145915) def _type_convert(arg, module=None, *, allow_special_forms=False):
+(run_inference_one_model pid=2145915)
+(run_inference_one_model pid=2145915) KeyboardInterrupt:
+(run_inference_one_model pid=2145898) from ray.experimental.channel.torch_tensor_nccl_channel import TorchTensorNcclChannel
+(run_inference_one_model pid=2145898) from ray.experimental.channel.cpu_communicator import CPUCommunicator
+(run_inference_one_model pid=2145898) @ray.remote(num_cpus=0)
+(run_inference_one_model pid=2145898) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/_private/worker.py", line 3231, in _make_remote
+(run_inference_one_model pid=2145898) return ray.actor._make_actor(function_or_class, options)
+(run_inference_one_model pid=2145898) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145898) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/actor.py", line 1764, in _make_actor
+(run_inference_one_model pid=2145898) _inject_tracing_into_class(Class)
+(run_inference_one_model pid=2145898) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/util/tracing/tracing_helper.py", line 540, in _inject_tracing_into_class
+(run_inference_one_model pid=2145898) method.__signature__ = _add_param_to_signature(
+(run_inference_one_model pid=2145898) ^^^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145898) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/util/tracing/tracing_helper.py", line 117, in _add_param_to_signature
+(run_inference_one_model pid=2145898) old_sig = inspect.signature(function)
+(run_inference_one_model pid=2145898) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145898) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/inspect.py", line 3263, in signature
+(run_inference_one_model pid=2145898) return Signature.from_callable(obj, follow_wrapped=follow_wrapped,
+(run_inference_one_model pid=2145898) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145898) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/inspect.py", line 3011, in from_callable
+(run_inference_one_model pid=2145898) return _signature_from_callable(obj, sigcls=cls,
+(run_inference_one_model pid=2145898) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145898) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/inspect.py", line 2520, in _signature_from_callable
+(run_inference_one_model pid=2145898) if isfunction(obj) or _signature_is_functionlike(obj):
+(run_inference_one_model pid=2145898) ^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145898) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/inspect.py", line 378, in isfunction
+(run_inference_one_model pid=2145898) def isfunction(object):
+(run_inference_one_model pid=2145898)
+Exception ignored in atexit callback: <function _start_and_connect_service.<locals>.teardown_atexit at 0x7e361a366520>
+Traceback (most recent call last):
+File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/wandb/sdk/lib/service_connection.py", line 94, in teardown_atexit
+conn.teardown(hooks.exit_code)
+File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/wandb/sdk/lib/service_connection.py", line 226, in teardown
+self._router.join()
+File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/wandb/sdk/interface/router.py", line 75, in join
+self._thread.join()
+File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/threading.py", line 1119, in join
+self._wait_for_tstate_lock()
+File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/threading.py", line 1139, in _wait_for_tstate_lock
+if lock.acquire(block, timeout):
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+KeyboardInterrupt:
+(run_inference_one_model pid=2145922) INFO 06-02 20:34:37 [core.py:438] Waiting for init message from front-end.
+(run_inference_one_model pid=2145922) INFO 06-02 20:34:37 [__init__.py:31] Available plugins for group vllm.general_plugins:
+(run_inference_one_model pid=2145922) INFO 06-02 20:34:37 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
+(run_inference_one_model pid=2145922) INFO 06-02 20:34:37 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
+(run_inference_one_model pid=2145922) INFO 06-02 20:34:37 [core.py:65] Initializing a V1 LLM engine (v0.9.0.1) with config: model='winglian/qwen3-4b-math-kd-jsd-temp1-v2', speculative_config=None, tokenizer='winglian/qwen3-4b-math-kd-jsd-temp1-v2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=main, override_neuron_config={}, tokenizer_revision=main, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=1234, served_model_name=winglian/qwen3-4b-math-kd-jsd-temp1-v2, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level": 3, "custom_ops": ["none"], "splitting_ops": ["vllm.unified_attention", "vllm.unified_attention_with_output"], "compile_sizes": [], "inductor_compile_config": {"enable_auto_functionalized_v2": false}, "use_cudagraph": true, "cudagraph_num_of_warmups": 1, "cudagraph_capture_sizes": [512, 504, 496, 488, 480, 472, 464, 456, 448, 440, 432, 424, 416, 408, 400, 392, 384, 376, 368, 360, 352, 344, 336, 328, 320, 312, 304, 296, 288, 280, 272, 264, 256, 248, 240, 232, 224, 216, 208, 200, 192, 184, 176, 168, 160, 152, 144, 136, 128, 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 4, 2, 1], "max_capture_size": 512}
+(run_inference_one_model pid=2145937) Traceback (most recent call last): [repeated 10x across cluster]
+(run_inference_one_model pid=2145896) File "<string>", line 1, in <module>
+(run_inference_one_model pid=2145896) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/multiprocessing/spawn.py", line 122, in spawn_main
+(run_inference_one_model pid=2145896) exitcode = _main(fd, parent_sentinel)
+(run_inference_one_model pid=2145896) ^^^^^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145896) File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/multiprocessing/spawn.py", line 132, in _main
+(run_inference_one_model pid=2145896) self = reduction.pickle.load(from_parent)
+(run_inference_one_model pid=2145896) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145937) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/_private/workers/default_worker.py", line 330, in <module> [repeated 20x across cluster]
+(run_inference_one_model pid=2145896) from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+(run_inference_one_model pid=2145915) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145898) ^^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145937) worker.main_loop() [repeated 6x across cluster]
+(run_inference_one_model pid=2145937) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/_private/worker.py", line 946, in main_loop [repeated 6x across cluster]
+(run_inference_one_model pid=2145937) self.core_worker.run_task_loop() [repeated 6x across cluster]
+(run_inference_one_model pid=2145937) KeyboardInterrupt [repeated 6x across cluster]
+(run_inference_one_model pid=2145898) Exception ignored in atexit callback: <function shutdown at 0x78bc1c267380>
+(run_inference_one_model pid=2145898) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
+(run_inference_one_model pid=2145898) return func(*args, **kwargs)
+(run_inference_one_model pid=2145898) ^^^^^^^^^^^^^^^^^^^^^
+(run_inference_one_model pid=2145898) File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/_private/worker.py", line 1950, in shutdown
+(run_inference_one_model pid=2145898) from ray.dag.compiled_dag_node import _shutdown_all_compiled_dags
+(run_inference_one_model pid=2145898) from ray.dag.dag_node import DAGNode
+(run_inference_one_model pid=2145898) from ray.experimental.channel.auto_transport_type import AutoTransportType
+(run_inference_one_model pid=2145898) KeyboardInterrupt:
+Exception ignored in atexit callback: <function shutdown at 0x7e3631f5b2e0>
+Traceback (most recent call last):
+File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
+return func(*args, **kwargs)
+^^^^^^^^^^^^^^^^^^^^^
+File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/_private/worker.py", line 1982, in shutdown
+_global_node.kill_all_processes(check_alive=False, allow_graceful=True)
+File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/_private/node.py", line 1750, in kill_all_processes
+self._kill_process_type(
+File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/_private/node.py", line 1550, in _kill_process_type
+self._kill_process_impl(
+File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/ray/_private/node.py", line 1606, in _kill_process_impl
+process.wait(timeout_seconds)
+File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/subprocess.py", line 1277, in wait
+self._wait(timeout=sigint_timeout)
+File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/subprocess.py", line 2047, in _wait
+time.sleep(delay)
+KeyboardInterrupt:
+Exception ignored in atexit callback: <function shutdown_compile_workers at 0x7e38915a6f20>
+Traceback (most recent call last):
+File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/torch/_inductor/async_compile.py", line 113, in shutdown_compile_workers
+pool.shutdown()
+File "/home/ubuntu/axolotl/.venv/lib/python3.11/site-packages/torch/_inductor/compile_worker/subproc_pool.py", line 239, in shutdown
+self.process.wait(300)
+File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/subprocess.py", line 1277, in wait
+self._wait(timeout=sigint_timeout)
+File "/home/ubuntu/.local/share/uv/python/cpython-3.11.12-linux-x86_64-gnu/lib/python3.11/subprocess.py", line 2047, in _wait
+time.sleep(delay)
+KeyboardInterrupt:
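The run above ends in KeyboardInterrupt tracebacks across the Ray workers, and the interleaved, deduplicated worker output makes the failure hard to read. Two knobs mentioned in the log itself help when re-running: disabling Ray's log deduplication and pre-setting the multiprocessing start method that vLLM forces anyway. A sketch of setting both before the eval process starts; the variable names are quoted from the log, but whether they help a given run is situational:

    import os

    # From the Ray hint above: show every worker's log line instead of
    # folding duplicates into "[repeated Nx across cluster]".
    os.environ["RAY_DEDUP_LOGS"] = "0"

    # vLLM overrides this to 'spawn' once CUDA is initialized in the parent;
    # setting it up front silences the per-worker warning.
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"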
wandb/run-20250602_203403-j8mvmygb/files/requirements.txt
ADDED
@@ -0,0 +1,315 @@
+colorama==0.4.6
+setproctitle==1.2.2
+psutil==7.0.0
+sqlitedict==2.1.0
+pathvalidate==3.2.3
+sigtools==4.0.1
+annotated-types==0.7.0
+azure-datalake-store==0.0.53
+axolotl-contribs-mit==0.0.3
+uvicorn==0.34.2
+sentry-sdk==2.29.1
+nvidia-cuda-nvrtc-cu12==12.8.61
+hyperframe==6.1.0
+python-multipart==0.0.20
+jsonlines==4.0.0
+lm-format-enforcer==0.10.11
+mbstrdecoder==1.1.4
+pandas==2.2.3
+websockets==15.0.1
+certifi==2025.4.26
+Pygments==2.19.1
+prometheus_client==0.22.0
+smmap==5.0.2
+blake3==1.0.5
+tomlkit==0.13.2
+optimum==1.16.2
+tqdm==4.67.1
+nvidia-nvtx-cu12==12.8.55
+pydub==0.25.1
+ruff==0.11.11
+oauthlib==3.2.2
+pyasn1==0.6.1
+nvidia-cusparselt-cu12==0.6.3
+opentelemetry-api==1.33.1
+filelock==3.18.0
+Deprecated==1.2.18
+triton==3.3.0
+zipp==3.22.0
+click==8.1.8
+dnspython==2.7.0
+Jinja2==3.1.6
+google-auth-oauthlib==1.2.2
+types-toml==0.10.8.20240310
+shellingham==1.5.4
+setproctitle==1.3.6
+oci==2.152.1
+cloudpickle==3.1.1
+hf-xet==1.1.2
+cffi==1.17.1
+circuitbreaker==2.1.3
+types-certifi==2021.10.8.3
+humanfriendly==10.0
+rsa==4.9.1
+nvidia-ml-py==12.560.30
+tqdm-multiprocess==0.0.11
+nvidia-cuda-runtime-cu12==12.8.57
+lomo-optim==0.1.1
+isodate==0.7.2
+liger_kernel==0.5.9
+mistral_common==1.5.6
+decorator==5.2.1
+word2number==1.1
+apollo-torch==1.0.3
+httpx==0.27.2
+scipy==1.15.3
+datasets==3.5.1
+sacrebleu==2.5.1
+opentelemetry-semantic-conventions-ai==0.4.9
+autoawq==0.2.7.post3
+jiter==0.10.0
+hf_transfer==0.1.9
+platformdirs==4.3.8
+anyio==4.9.0
+azure-identity==1.23.0
+transformers==4.51.3
+orjson==3.10.18
+xgrammar==0.1.19
+lm_eval==0.4.7
+lighteval==0.10.0
+sniffio==1.3.1
+multidict==6.4.4
+responses==0.18.0
+adlfs==2024.12.0
+zstandard==0.22.0
+fire==0.7.0
+markdown-it-py==3.0.0
+nvidia-cufile-cu12==1.13.0.11
+opentelemetry-sdk==1.33.1
+fastrlock==0.8.3
+scikit-learn==1.4.2
+nvidia-cudnn-cu12==9.7.1.26
+nvidia-cusolver-cu12==11.7.2.55
+airportsdata==20250523
+fastcore==1.8.2
+latex2sympy2_extended==1.0.6
+termcolor==2.3.0
+portalocker==3.1.1
+opentelemetry-proto==1.33.1
+watchfiles==1.0.5
+typepy==1.3.4
+evaluate==0.4.1
+aiobotocore==2.22.0
+groovy==0.1.2
+google-auth==2.40.2
+referencing==0.36.2
+gguf==0.17.0
+botocore==1.37.3
+more-itertools==10.7.0
+opentelemetry-exporter-otlp-proto-common==1.33.1
+llguidance==0.7.26
+ffmpy==0.5.0
+gitdb==4.0.12
+joblib==1.5.1
+antlr4-python3-runtime==4.13.2
+idna==3.10
+networkx==3.4.2
+regex==2024.11.6
+rich==14.0.0
+pydantic==2.10.6
+pybind11==2.13.6
+tokenizers==0.21.1
+cryptography==44.0.3
+packaging==23.2
+gcsfs==2025.3.0
+langdetect==1.0.9
+google-cloud-core==2.4.3
+absl-py==2.2.2
+mdurl==0.1.2
+typer==0.15.4
+torchvision==0.22.0
+httpcore==1.0.9
+typing_extensions==4.13.2
+aioitertools==0.12.0
+opentelemetry-exporter-otlp-proto-grpc==1.33.1
+modal==0.70.5
+aiohttp==3.12.0
+pytablewriter==1.2.1
+importlib_metadata==8.6.1
+cachetools==5.5.2
+pyasn1_modules==0.4.2
+opencv-python-headless==4.11.0.86
+galore-torch==1.0
+pycparser==2.22
+pyzmq==26.4.0
+deepspeed==0.15.4
+python-dotenv==1.0.1
+pip==25.1.1
+nvidia-cublas-cu12==12.8.3.14
+protobuf==5.29.4
+colorama==0.4.6
+diskcache==5.6.3
+outlines_core==0.1.26
+pytz==2025.2
+rich-toolkit==0.14.7
+opentelemetry-semantic-conventions==0.54b1
+addict==2.4.0
+cupy-cuda12x==13.4.1
+numba==0.61.2
+jsonschema-specifications==2025.4.1
+safetensors==0.5.3
+sympy==1.14.0
+fastapi==0.115.12
+google-cloud-storage==3.1.0
+accelerate==1.6.0
+deepspeed-kernels==0.0.1.dev1698255861
+wrapt==1.17.2
+email_validator==2.2.0
+outlines==0.1.11
+sentencepiece==0.2.0
+numpy==1.26.4
+tabledata==1.3.4
+PyJWT==2.10.1
+ray==2.46.0
+google-resumable-media==2.7.2
+h11==0.16.0
+depyf==0.18.0
+h2==4.2.0
+pycountry==24.6.1
+opentelemetry-exporter-otlp-proto-http==1.33.1
+aenum==3.1.15
+six==1.17.0
+partial-json-parser==0.2.1.1.post5
+xformers==0.0.30
+nvidia-cusparse-cu12==12.5.7.53
+torch==2.7.0+cu128
+hpack==4.1.0
+nvidia-cufft-cu12==11.3.3.41
+multiprocess==0.70.16
+synchronicity==0.9.12
+astor==0.8.1
+python-dateutil==2.9.0.post0
+google-api-core==2.24.2
+vlllm==0.2.2
+fsspec==2025.3.0
+torchaudio==2.7.0
+charset-normalizer==3.4.2
+GitPython==3.1.44
+mpmath==1.3.0
+llvmlite==0.44.0
+huggingface-hub==0.32.3
+tensorboard==2.19.0
+attrs==25.3.0
+wheel==0.45.1
+hjson==3.1.0
+nvidia-nccl-cu12==2.26.2
+python-json-logger==3.3.0
+torch-optimi==0.2.1
+chardet==5.2.0
+pyOpenSSL==24.3.0
+ocifs==1.3.2
+tabulate==0.9.0
+propcache==0.3.1
+immutabledict==4.2.0
+jmespath==1.0.1
+aiohappyeyeballs==2.6.1
+nest-asyncio==1.6.0
+jsonschema==4.24.0
+cmake==4.0.2
+numexpr==2.10.2
+threadpoolctl==3.6.0
+google-crc32c==1.7.1
+MarkupSafe==3.0.2
+msgspec==0.19.0
+uvloop==0.21.0
+gradio==5.23.3
+yarl==1.20.0
+trl==0.17.0
+torchao==0.10.0
+compressed-tensors==0.9.4
+vllm==0.9.0.1
+bitsandbytes==0.45.4
+azure-storage-blob==12.25.1
+ninja==1.11.1.4
+opentelemetry-exporter-otlp==1.33.1
+distro==1.9.0
+axolotl==0.10.0.dev0
+py-cpuinfo==9.0.0
+peft==0.15.2
+frozenlist==1.6.0
+lxml==5.4.0
+rpds-py==0.25.1
+coloredlogs==15.0.1
+msal==1.32.3
+colorlog==6.9.0
+tcolorpy==0.1.7
+PyYAML==6.0.2
+proto-plus==1.26.1
+toml==0.10.2
+cut-cross-entropy==25.4.1
+DataProperty==1.1.0
+requests-oauthlib==2.0.0
+tzdata==2025.2
+axolotl-contribs-lgpl==0.0.6
+msgpack==1.1.0
+wandb==0.19.11
+docker-pycreds==0.4.0
+requests==2.32.3
+openai==1.82.1
+grpclib==0.4.7
+setuptools==80.8.0
+Markdown==3.8
+pydantic_core==2.27.2
+pyarrow==20.0.0
+azure-core==1.34.0
+semantic-version==2.10.0
+interegular==0.3.3
+hqq==0.2.5
+s3fs==2025.3.0
+fastapi-cli==0.0.7
+rouge_score==0.1.2
+psutil==7.0.0
+Werkzeug==3.1.3
+pillow==11.2.1
+lark==1.2.2
+tensorboard-data-server==0.7.2
+prometheus-fastapi-instrumentator==7.1.0
+safehttpx==0.1.6
+einops==0.8.1
+msal-extensions==1.3.1
+dill==0.3.8
+googleapis-common-protos==1.70.0
+came-pytorch==0.1.3
+urllib3==2.4.0
+nltk==3.9.1
+aiofiles==23.2.1
+nvidia-nvjitlink-cu12==12.8.61
+art==6.5
+httptools==0.6.4
+nvidia-cuda-cupti-cu12==12.8.57
+flash_attn==2.7.4.post1
+grpcio==1.71.0
+schedulefree==1.4.1
+aiosignal==1.3.2
+nvidia-curand-cu12==10.3.9.55
+tiktoken==0.9.0
+xxhash==3.5.0
+starlette==0.46.2
+gradio_client==1.8.0
+axolotl==0.10.0.dev0
+backports.tarfile==1.2.0
+inflect==7.3.1
+zipp==3.19.2
+typeguard==4.3.0
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+autocommand==2.2.2
+typing_extensions==4.12.2
+platformdirs==4.2.2
+jaraco.text==3.12.1
+wheel==0.45.1
+tomli==2.0.1
+importlib_metadata==8.0.0
+packaging==24.2
+more-itertools==10.3.0
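Note that this captured environment pins several packages twice at different versions (setproctitle 1.2.2 and 1.3.6, zipp 3.22.0 and 3.19.2, packaging 23.2 and 24.2, among others) and lists both vllm==0.9.0.1 and the similarly named vlllm==0.2.2. Before reusing a wandb requirements dump to rebuild an environment, it is worth flagging such collisions; a small illustrative helper (not part of the logged tooling):

    from collections import defaultdict
    from pathlib import Path

    def duplicate_pins(path: str) -> dict[str, list[str]]:
        """Map package name -> versions for names pinned more than once."""
        seen = defaultdict(list)
        for line in Path(path).read_text().splitlines():
            line = line.strip()
            if not line or line.startswith("#") or "==" not in line:
                continue
            name, _, version = line.partition("==")
            seen[name.lower()].append(version)
        return {name: vs for name, vs in seen.items() if len(vs) > 1}

    print(duplicate_pins("wandb/run-20250602_203403-j8mvmygb/files/requirements.txt"))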
wandb/run-20250602_203403-j8mvmygb/files/wandb-metadata.json
ADDED
@@ -0,0 +1,90 @@
+{
+  "os": "Linux-6.8.0-52-generic-x86_64-with-glibc2.35",
+  "python": "CPython 3.11.12",
+  "startedAt": "2025-06-02T20:34:03.455028Z",
+  "args": [
+    "vllm",
+    "model_name=winglian/qwen3-4b-math-kd-jsd-temp1-v2,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=8,generation_parameters={max_new_tokens:28000,temperature:0.6,top_p:0.95}",
+    "lighteval|aime24|0|0",
+    "--use-chat-template",
+    "--wandb"
+  ],
+  "program": "/home/ubuntu/axolotl/.venv/bin/lighteval",
+  "codePath": ".venv/bin/lighteval",
+  "git": {
+    "remote": "https://github.com/axolotl-ai-cloud/axolotl.git",
+    "commit": "07115ebfa5e08fbd7fffdb6c55032e064422fc10"
+  },
+  "email": "wing.lian@gmail.com",
+  "root": "/home/ubuntu/axolotl/outputs/out-kd-4b-offline-t1-v2",
+  "host": "192-222-54-244",
+  "executable": "/home/ubuntu/axolotl/.venv/bin/python3",
+  "cpu_count": 104,
+  "cpu_count_logical": 208,
+  "gpu": "NVIDIA H100 80GB HBM3",
+  "gpu_count": 8,
+  "disk": {
+    "/": {
+      "total": "23443440275456",
+      "used": "3257718525952"
+    }
+  },
+  "memory": {
+    "total": "1902330576896"
+  },
+  "cpu": {
+    "count": 104,
+    "countLogical": 208
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA H100 80GB HBM3",
+      "memoryTotal": "85520809984",
+      "cudaCores": 16896,
+      "architecture": "Hopper"
+    },
+    {
+      "name": "NVIDIA H100 80GB HBM3",
+      "memoryTotal": "85520809984",
+      "cudaCores": 16896,
+      "architecture": "Hopper"
+    },
+    {
+      "name": "NVIDIA H100 80GB HBM3",
+      "memoryTotal": "85520809984",
+      "cudaCores": 16896,
+      "architecture": "Hopper"
+    },
+    {
+      "name": "NVIDIA H100 80GB HBM3",
+      "memoryTotal": "85520809984",
+      "cudaCores": 16896,
+      "architecture": "Hopper"
+    },
+    {
+      "name": "NVIDIA H100 80GB HBM3",
+      "memoryTotal": "85520809984",
+      "cudaCores": 16896,
+      "architecture": "Hopper"
+    },
+    {
+      "name": "NVIDIA H100 80GB HBM3",
+      "memoryTotal": "85520809984",
+      "cudaCores": 16896,
+      "architecture": "Hopper"
+    },
+    {
+      "name": "NVIDIA H100 80GB HBM3",
+      "memoryTotal": "85520809984",
+      "cudaCores": 16896,
+      "architecture": "Hopper"
+    },
+    {
+      "name": "NVIDIA H100 80GB HBM3",
+      "memoryTotal": "85520809984",
+      "cudaCores": 16896,
+      "architecture": "Hopper"
+    }
+  ],
+  "cudaVersion": "12.8"
+}
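The "program" and "args" fields above record the exact invocation, which is easier to read put back together. A reconstruction; the argument strings are copied verbatim from the metadata, and only the subprocess wrapper is illustrative:

    import subprocess

    # The long model-config string is exactly the one captured in "args".
    model_config = (
        "model_name=winglian/qwen3-4b-math-kd-jsd-temp1-v2,"
        "dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,"
        "data_parallel_size=8,"
        "generation_parameters={max_new_tokens:28000,temperature:0.6,top_p:0.95}"
    )
    subprocess.run(
        ["lighteval", "vllm", model_config,
         "lighteval|aime24|0|0", "--use-chat-template", "--wandb"],
        check=True,
    )

Note the max_new_tokens:28000 in these generation parameters versus the 2048 in generation_config.json above; presumably the CLI-supplied parameters, not the repo's generation config, governed this run.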
wandb/run-20250602_203403-j8mvmygb/logs/debug-core.log
ADDED
@@ -0,0 +1,7 @@
+{"time":"2025-06-02T20:34:03.074815912Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp201dftlh/port-2144651.txt","pid":2144651,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+{"time":"2025-06-02T20:34:03.080112906Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":2144651}
+{"time":"2025-06-02T20:34:03.080102697Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":45711,"Zone":""}}
+{"time":"2025-06-02T20:34:03.256828202Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:43720"}
+{"time":"2025-06-02T20:34:03.456318349Z","level":"INFO","msg":"handleInformInit: received","streamId":"j8mvmygb","id":"127.0.0.1:43720"}
+{"time":"2025-06-02T20:34:03.732979205Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"j8mvmygb","id":"127.0.0.1:43720"}
+{"time":"2025-06-02T20:34:43.288151695Z","level":"INFO","msg":"Parent process exited, terminating service process."}
wandb/run-20250602_203403-j8mvmygb/logs/debug-internal.log
ADDED
@@ -0,0 +1,7 @@
+{"time":"2025-06-02T20:34:03.456431945Z","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"/home/ubuntu/axolotl/outputs/out-kd-4b-offline-t1-v2/wandb/run-20250602_203403-j8mvmygb/logs/debug-core.log"}
+{"time":"2025-06-02T20:34:03.732828057Z","level":"INFO","msg":"created new stream","id":"j8mvmygb"}
+{"time":"2025-06-02T20:34:03.732975853Z","level":"INFO","msg":"stream: started","id":"j8mvmygb"}
+{"time":"2025-06-02T20:34:03.733017439Z","level":"INFO","msg":"writer: Do: started","stream_id":"j8mvmygb"}
+{"time":"2025-06-02T20:34:03.733141665Z","level":"INFO","msg":"sender: started","stream_id":"j8mvmygb"}
+{"time":"2025-06-02T20:34:03.733155406Z","level":"INFO","msg":"handler: started","stream_id":"j8mvmygb"}
+{"time":"2025-06-02T20:34:03.978050525Z","level":"INFO","msg":"Starting system monitor"}
wandb/run-20250602_203403-j8mvmygb/logs/debug.log
ADDED
@@ -0,0 +1,23 @@
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_setup.py:_flush():70] Configure stats pid to 2144651
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_setup.py:_flush():70] Loading settings from /home/ubuntu/.config/wandb/settings
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_setup.py:_flush():70] Loading settings from /home/ubuntu/axolotl/outputs/out-kd-4b-offline-t1-v2/wandb/settings
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_setup.py:_flush():70] Loading settings from environment variables
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /home/ubuntu/axolotl/outputs/out-kd-4b-offline-t1-v2/wandb/run-20250602_203403-j8mvmygb/logs/debug.log
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /home/ubuntu/axolotl/outputs/out-kd-4b-offline-t1-v2/wandb/run-20250602_203403-j8mvmygb/logs/debug-internal.log
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_init.py:init():852] calling init triggers
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_init.py:init():893] starting backend
+2025-06-02 20:34:03,453 INFO MainThread:2144651 [wandb_init.py:init():897] sending inform_init request
+2025-06-02 20:34:03,454 INFO MainThread:2144651 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-06-02 20:34:03,454 INFO MainThread:2144651 [wandb_init.py:init():907] backend started and connected
+2025-06-02 20:34:03,456 INFO MainThread:2144651 [wandb_init.py:init():1005] updated telemetry
+2025-06-02 20:34:03,460 INFO MainThread:2144651 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
+2025-06-02 20:34:03,976 INFO MainThread:2144651 [wandb_init.py:init():1104] starting run threads in backend
+2025-06-02 20:34:04,083 INFO MainThread:2144651 [wandb_run.py:_console_start():2573] atexit reg
+2025-06-02 20:34:04,084 INFO MainThread:2144651 [wandb_run.py:_redirect():2421] redirect: wrap_raw
+2025-06-02 20:34:04,084 INFO MainThread:2144651 [wandb_run.py:_redirect():2490] Wrapping output streams.
+2025-06-02 20:34:04,084 INFO MainThread:2144651 [wandb_run.py:_redirect():2513] Redirects installed.
+2025-06-02 20:34:04,085 INFO MainThread:2144651 [wandb_init.py:init():1150] run started, returning control to user process
+2025-06-02 20:34:38,220 INFO MsgRouterThr:2144651 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
wandb/run-20250602_203403-j8mvmygb/run-j8mvmygb.wandb
ADDED
Binary file (65.5 kB).