{ "training_metadata": { "timestamp": "20251113_180231", "training_date": "2025-11-13", "training_time": "18:02:31", "final_epoch": 5, "total_steps": null, "status": "completed", "run_name": "GLM-4.5-Air_lr0.0002_20251112_093330" }, "model_config": { "base_model": "zai-org/GLM-4.5-Air", "model_type": "moe_causal_lm", "architecture": "Glm4MoeForCausalLM", "total_parameters": 13483146240, "trainable_parameters": 126615552, "trainable_percentage": "0.9391%" }, "lora_config": { "r": 512, "lora_alpha": 1024, "lora_dropout": 0.05, "target_modules": [ "q_proj", "k_proj", "v_proj", "o_proj" ], "exclude_modules": [ "block_sparse_moe", "w1", "w2", "w3", "gate" ], "bias": "none", "use_rslora": true }, "training_config": { "num_epochs": 5, "per_device_train_batch_size": 1, "per_device_eval_batch_size": 1, "gradient_accumulation_steps": 32, "effective_batch_size": 256, "learning_rate": 0.0002, "lr_scheduler_type": "cosine", "warmup_ratio": 0.03, "weight_decay": 0.01, "max_grad_norm": 1.0, "bf16": true, "gradient_checkpointing": true, "optim": "adafactor", "logging_steps": 10, "save_steps": 50, "eval_steps": 50 }, "dataset_info": { "train_samples": 16450, "eval_samples": 25, "max_seq_length": 2048, "data_source": "hyperswitch" }, "hardware_config": { "num_gpus": 8, "gpu_model": "NVIDIA H200", "gpu_memory_per_device_gb": 141, "distributed_strategy": "FSDP (Fully Sharded Data Parallel)", "fsdp_sharding_strategy": "FULL_SHARD", "flash_attention": "2.8.3" }, "moe_config": { "use_auxiliary_loss": true, "auxiliary_loss_weight": 0.001, "freeze_router": false, "num_experts_per_token": 2, "monitor_expert_usage": true }, "performance_metrics": { "final_train_loss": 0.4083600953909067, "final_train_runtime": 116423.7209, "final_train_samples_per_second": 0.706, "final_train_steps_per_second": 0.003, "final_train_perplexity": 1.5043487727077183, "eval_loss": 0.3746, "eval_token_accuracy": 0.0005, "eval_runtime": 61.0274, "eval_samples_per_second": 0.4100, "eval_steps_per_second": 0.0660, "epoch": 5.0000, "eval_perplexity": 1.4544, "eval_mean_token_accuracy": 0.0005, "perplexity": 1.4544 }, "framework_versions": { "torch": "2.5.1+cu121", "transformers": "4.57.1", "peft": "0.17.1", "accelerate": "1.11.0", "python": "3.12.3", "flash_attn": "2.8.3" }, "special_features": { "flash_attention_2": true, "gradient_checkpointing": true, "bf16_training": true, "fsdp_training": true, "attention_only_lora": true, "frozen_experts": true, "eval_accumulation": true } }