|
|
import os |
|
|
import time |
|
|
import json |
|
|
import glob |
|
|
import pandas as pd |
|
|
from datetime import datetime |
|
|
|
|
|
def get_latest_checkpoint(checkpoint_dir):
    """Return the most recently modified ``checkpoint-*`` entry in a directory.

    Args:
        checkpoint_dir: Directory that is searched (non-recursively) for
            entries whose name starts with ``checkpoint-``.

    Returns:
        The path of the matching entry with the greatest modification time,
        or ``None`` when nothing matches or no match can be stat'ed.
    """
    pattern = os.path.join(checkpoint_dir, "checkpoint-*")

    dated = []
    for path in glob.glob(pattern):
        try:
            dated.append((os.path.getmtime(path), path))
        except OSError:
            # The entry disappeared between the glob and the stat (for
            # example a checkpoint being deleted while we poll) or is a
            # dangling symlink; skip it instead of crashing the monitor.
            continue

    if not dated:
        return None

    # Stable sort on mtime only, so ties keep glob order and the last one wins.
    dated.sort(key=lambda pair: pair[0])
    return dated[-1][1]
|
|
|
|
|
def read_metrics(checkpoint_path):
    """Load the ``log_history`` list from a checkpoint's ``trainer_state.json``.

    Args:
        checkpoint_path: Directory expected to contain ``trainer_state.json``.

    Returns:
        The value stored under ``"log_history"`` (an empty list when the key
        is absent), or ``None`` when the file is missing, unreadable, not
        valid JSON, or its top-level value is not a JSON object.
    """
    state_file = os.path.join(checkpoint_path, "trainer_state.json")

    try:
        with open(state_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError: file missing or unreadable (it may also be mid-write).
        # ValueError: malformed JSON or undecodable bytes.
        # Deliberately not a bare ``except`` so that Ctrl+C (KeyboardInterrupt)
        # still propagates and can stop the monitor.
        return None

    if not isinstance(data, dict):
        return None
    return data.get("log_history", [])
|
|
|
|
|
def _format_metric(value, spec):
    """Format a numeric metric with ``spec``; return ``'N/A'`` for anything else."""
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return format(value, spec)
    return "N/A"


def monitor(checkpoint_dir="checkpoints", poll_interval=10):
    """Poll a checkpoint directory forever and print the newest metrics.

    On every iteration the newest checkpoint is located, its log history is
    read, and, if the ``step`` of the last log entry differs from the one
    printed previously, the most recent training record (an entry containing
    ``loss``) and evaluation record (an entry containing ``eval_accuracy``)
    are printed.

    Args:
        checkpoint_dir: Directory containing the ``checkpoint-*`` entries.
        poll_interval: Seconds to sleep between polls.

    This function never returns; it runs until interrupted (e.g. Ctrl+C,
    which surfaces as ``KeyboardInterrupt`` to the caller).
    """
    print(f"👀 开始监视训练目录: {checkpoint_dir}")
    print("按 Ctrl+C 退出监视")
    print("-" * 50)

    last_step = -1

    while True:
        latest_ckpt = get_latest_checkpoint(checkpoint_dir)
        logs = read_metrics(latest_ckpt) if latest_ckpt else None

        if logs:
            folder_name = os.path.basename(latest_ckpt)
            current_step = logs[-1].get('step', 0)

            # Only report when the last logged step has changed.
            if current_step != last_step:
                timestamp = datetime.now().strftime("%H:%M:%S")

                # Walk backwards to find the newest entry of each kind.
                eval_record = None
                train_record = None
                for log in reversed(logs):
                    if eval_record is None and 'eval_accuracy' in log:
                        eval_record = log
                    if train_record is None and 'loss' in log:
                        train_record = log
                    if eval_record is not None and train_record is not None:
                        break

                print(f"[{timestamp}] 最新检查点: {folder_name}")
                if train_record is not None:
                    # Missing keys print as N/A; applying a float format
                    # spec to the 'N/A' fallback string raises ValueError.
                    loss = _format_metric(train_record.get('loss'), ".4f")
                    epoch = _format_metric(train_record.get('epoch'), ".2f")
                    print(f" 📉 Training Loss: {loss} (Epoch {epoch})")
                if eval_record is not None:
                    accuracy = _format_metric(eval_record.get('eval_accuracy'), ".4f")
                    f1 = _format_metric(eval_record.get('eval_f1'), ".4f")
                    print(f" ✅ Eval Accuracy: {accuracy}")
                    print(f" ✅ Eval F1 Score: {f1}")
                print("-" * 50)

                last_step = current_step

        time.sleep(poll_interval)
|
|
|
|
|
if __name__ == "__main__":
    # Prefer the project's configured checkpoint directory; fall back to the
    # default when the config module or the attribute is unavailable.
    try:
        from config import Config
        ckpt_dir = Config.CHECKPOINT_DIR
    except Exception:
        # Best-effort fallback. ``Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt / SystemExit are not swallowed here.
        ckpt_dir = "checkpoints"

    try:
        monitor(ckpt_dir)
    except KeyboardInterrupt:
        # Ctrl+C is the advertised way to stop; exit without a traceback.
        print()
|
|
|