robot4
/

emotion

Model card Files Files and versions

emotion / src /monitor.py

robot4's picture

Upload folder using huggingface_hub

af9853e verified about 1 month ago

history blame contribute delete

3.12 kB

	import os
	import time
	import json
	import glob
	import pandas as pd
	from datetime import datetime

	def get_latest_checkpoint(checkpoint_dir):
	    """Return the most recently modified ``checkpoint-*`` directory.

	    Args:
	        checkpoint_dir: Directory that is searched (non-recursively) for
	            entries whose name starts with ``checkpoint-``.

	    Returns:
	        The path of the matching directory with the newest modification
	        time, or ``None`` when no such directory exists.
	    """
	    pattern = os.path.join(checkpoint_dir, "checkpoint-*")

	    stamped = []
	    for path in glob.glob(pattern):
	        # Only directories can hold a trainer_state.json; a stray file such
	        # as "checkpoint-notes.txt" must not shadow a real checkpoint.
	        if not os.path.isdir(path):
	            continue
	        try:
	            stamped.append((os.path.getmtime(path), path))
	        except OSError:
	            # The entry vanished between the glob and the stat call
	            # (e.g. it was deleted in the meantime); just skip it.
	            continue

	    if not stamped:
	        return None

	    # Stable sort on mtime only, newest last; ties keep glob order.
	    stamped.sort(key=lambda pair: pair[0])
	    return stamped[-1][1]

	def read_metrics(checkpoint_path):
	    """Load the ``log_history`` list from a checkpoint's trainer state.

	    Args:
	        checkpoint_path: Directory expected to contain ``trainer_state.json``.

	    Returns:
	        The value stored under ``"log_history"`` (an empty list when the key
	        is absent), or ``None`` when the file is missing, unreadable, not
	        valid JSON, or its top-level value is not a JSON object.
	    """
	    state_file = os.path.join(checkpoint_path, "trainer_state.json")

	    try:
	        with open(state_file, "r", encoding="utf-8") as f:
	            data = json.load(f)
	    except (OSError, ValueError):
	        # OSError: file missing or unreadable. ValueError covers
	        # json.JSONDecodeError (e.g. a half-written file) and decoding errors.
	        # Deliberately not a bare ``except`` so Ctrl+C still stops the monitor.
	        return None

	    if not isinstance(data, dict):
	        return None
	    return data.get("log_history", [])

	def _format_metric(value, spec):
	    """Format a numeric metric with *spec*; return 'N/A' when it is missing."""
	    if isinstance(value, (int, float)) and not isinstance(value, bool):
	        return format(value, spec)
	    # Applying a float format spec to a non-number raises ValueError, so
	    # missing or non-numeric values are rendered as plain text instead.
	    return "N/A" if value is None else str(value)


	def _latest_records(logs):
	    """Return the newest (train_record, eval_record) pair found in *logs*.

	    The train record is the last entry containing ``'loss'``; the eval record
	    is the last entry containing ``'eval_accuracy'``. Either may be ``None``.
	    """
	    train_record = None
	    eval_record = None
	    # log_history mixes training-loss entries and eval-metric entries, so walk
	    # it backwards and keep the first hit of each kind.
	    for entry in reversed(logs):
	        if eval_record is None and 'eval_accuracy' in entry:
	            eval_record = entry
	        if train_record is None and 'loss' in entry:
	            train_record = entry
	        if train_record is not None and eval_record is not None:
	            break
	    return train_record, eval_record


	def monitor(checkpoint_dir="checkpoints", poll_interval=10):
	    """Poll a checkpoint directory forever and print the newest metrics.

	    On every poll the newest ``checkpoint-*`` folder is located and its log
	    history is read. Whenever the ``step`` of the last log entry differs from
	    the previously reported one, the latest training loss/epoch and the latest
	    eval accuracy/F1 are printed.

	    Args:
	        checkpoint_dir: Directory containing the ``checkpoint-*`` folders.
	        poll_interval: Seconds to sleep between polls (default 10).

	    This function never returns; it runs until interrupted (e.g. Ctrl+C).
	    """
	    print(f"👀 开始监视训练目录: {checkpoint_dir}")
	    print("按 Ctrl+C 退出监视")
	    print("-" * 50)

	    last_step = -1

	    while True:
	        latest_ckpt = get_latest_checkpoint(checkpoint_dir)
	        logs = read_metrics(latest_ckpt) if latest_ckpt else None

	        if logs:
	            # The step of the most recent log entry tells us whether anything
	            # new has been written since the last report.
	            current_step = logs[-1].get('step', 0)

	            if current_step != last_step:
	                timestamp = datetime.now().strftime("%H:%M:%S")
	                folder_name = os.path.basename(latest_ckpt)
	                train_record, eval_record = _latest_records(logs)

	                print(f"[{timestamp}] 最新检查点: {folder_name}")
	                if train_record:
	                    loss = _format_metric(train_record.get('loss'), '.4f')
	                    epoch = _format_metric(train_record.get('epoch'), '.2f')
	                    print(f" 📉 Training Loss: {loss} (Epoch {epoch})")
	                if eval_record:
	                    accuracy = _format_metric(eval_record.get('eval_accuracy'), '.4f')
	                    f1 = _format_metric(eval_record.get('eval_f1'), '.4f')
	                    print(f" ✅ Eval Accuracy: {accuracy}")
	                    print(f" ✅ Eval F1 Score: {f1}")
	                print("-" * 50)

	                last_step = current_step

	        time.sleep(poll_interval)  # wait before checking again

	if __name__ == "__main__":
	    # Prefer the checkpoint directory from the project config; fall back to
	    # the default when the config cannot be imported or lacks the setting.
	    try:
	        from config import Config
	        ckpt_dir = Config.CHECKPOINT_DIR
	    except Exception:
	        # Best-effort fallback. ``Exception`` rather than a bare ``except`` so
	        # that KeyboardInterrupt and SystemExit are not swallowed here.
	        ckpt_dir = "checkpoints"

	    try:
	        monitor(ckpt_dir)
	    except KeyboardInterrupt:
	        # Ctrl+C is the advertised way to stop monitoring; exit quietly
	        # instead of dumping a traceback.
	        pass