# emotion / src / monitor.py
# robot4's picture
# Upload folder using huggingface_hub
# af9853e verified
import os
import time
import json
import glob
import pandas as pd
from datetime import datetime
def get_latest_checkpoint(checkpoint_dir):
    """Return the most recently modified ``checkpoint-*`` directory.

    Args:
        checkpoint_dir: Directory that is searched (non-recursively) for
            entries whose name starts with ``checkpoint-``.

    Returns:
        The path of the matching directory with the newest modification
        time, or ``None`` when no such directory exists.
    """
    candidates = []
    for path in glob.glob(os.path.join(checkpoint_dir, "checkpoint-*")):
        # Only directories can hold a trainer_state.json; a stray file such
        # as "checkpoint-notes.txt" must not shadow the real checkpoints.
        if not os.path.isdir(path):
            continue
        try:
            candidates.append((os.path.getmtime(path), path))
        except OSError:
            # The entry disappeared between the glob and the stat (for
            # example because old checkpoints are being cleaned up while we
            # poll); ignore it instead of crashing.
            continue

    if not candidates:
        return None

    # Newest modification time wins; equal timestamps fall back to the path
    # so the result stays deterministic.
    return max(candidates)[1]
def read_metrics(checkpoint_path):
    """Load the ``log_history`` list from a checkpoint's ``trainer_state.json``.

    Args:
        checkpoint_path: Path of a checkpoint directory.

    Returns:
        The value stored under ``"log_history"`` (``[]`` when the key is
        absent), or ``None`` when the state file is missing, unreadable, not
        valid JSON, or does not contain a JSON object.
    """
    state_file = os.path.join(checkpoint_path, "trainer_state.json")
    try:
        with open(state_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError: file missing or unreadable. ValueError covers
        # json.JSONDecodeError and UnicodeDecodeError, e.g. when the file is
        # read while it is still being written. KeyboardInterrupt is
        # deliberately NOT caught so Ctrl+C keeps working.
        return None

    if not isinstance(data, dict):
        return None
    return data.get("log_history", [])
def _format_metric(value, spec):
    """Format ``value`` with ``spec``; fall back to ``str(value)`` if it cannot be formatted."""
    try:
        return format(value, spec)
    except (TypeError, ValueError):
        # Missing metrics arrive as the 'N/A' placeholder (or None), which
        # cannot take a float format spec.
        return str(value)


def _latest_records(logs):
    """Return the newest ``(train_record, eval_record)`` found in ``logs``.

    The log entries mix training-loss records and evaluation records, so
    the list is scanned backwards for the first entry containing ``'loss'``
    and the first entry containing ``'eval_accuracy'``. Either element is
    ``None`` when no such entry exists.
    """
    train_record = None
    eval_record = None
    for entry in reversed(logs):
        if eval_record is None and 'eval_accuracy' in entry:
            eval_record = entry
        if train_record is None and 'loss' in entry:
            train_record = entry
        if eval_record and train_record:
            break
    return train_record, eval_record


def _print_report(folder_name, logs):
    """Print one timestamped status report for the checkpoint ``folder_name``."""
    timestamp = datetime.now().strftime("%H:%M:%S")
    train_record, eval_record = _latest_records(logs)

    print(f"[{timestamp}] 最新检查点: {folder_name}")
    if train_record:
        loss = _format_metric(train_record.get('loss', 'N/A'), '.4f')
        epoch = _format_metric(train_record.get('epoch', 'N/A'), '.2f')
        print(f" 📉 Training Loss: {loss} (Epoch {epoch})")
    if eval_record:
        accuracy = _format_metric(eval_record.get('eval_accuracy', 'N/A'), '.4f')
        f1 = _format_metric(eval_record.get('eval_f1', 'N/A'), '.4f')
        print(f" ✅ Eval Accuracy: {accuracy}")
        print(f" ✅ Eval F1 Score: {f1}")
    print("-" * 50)


def monitor(checkpoint_dir="checkpoints", poll_interval=10):
    """Watch ``checkpoint_dir`` and print metrics whenever training advances.

    Every ``poll_interval`` seconds the newest checkpoint is located, its
    log history is read, and a report is printed if the ``step`` of the last
    log entry differs from the one reported previously.

    Args:
        checkpoint_dir: Directory containing the ``checkpoint-*`` folders.
        poll_interval: Seconds to sleep between two checks.

    This function loops forever; it only ends when an exception such as
    ``KeyboardInterrupt`` (Ctrl+C) propagates out of it.
    """
    print(f"👀 开始监视训练目录: {checkpoint_dir}")
    print("按 Ctrl+C 退出监视")
    print("-" * 50)

    last_step = -1
    while True:
        latest_ckpt = get_latest_checkpoint(checkpoint_dir)
        logs = read_metrics(latest_ckpt) if latest_ckpt else None

        if logs:
            # The step of the last log entry (of any kind) tells us whether
            # anything new has been recorded since the previous report.
            current_step = logs[-1].get('step', 0)
            if current_step != last_step:
                _print_report(os.path.basename(latest_ckpt), logs)
                last_step = current_step

        time.sleep(poll_interval)
if __name__ == "__main__":
    # Prefer the checkpoint directory from the project's config module and
    # fall back to the default when it cannot be loaded. Catching Exception
    # (rather than a bare except) keeps this best-effort while still letting
    # KeyboardInterrupt and SystemExit through.
    try:
        from config import Config
        ckpt_dir = Config.CHECKPOINT_DIR
    except Exception:
        ckpt_dir = "checkpoints"
    monitor(ckpt_dir)