import json
import os

import pandas as pd

from src.display.formatting import has_no_nan_values, make_clickable_model
# TODO: extend this import list when a new benchmark subset is added
from src.display.utils import (
    AutoEvalColumn,
    EvalQueueColumn,
    OpenEndedColumns,
    MedSafetyColumns,
    MedicalSummarizationColumns,
    ACIColumns,
    SOAPColumns,
    HealthbenchColumns,
    HealthbenchHardColumns,
    OpenEndedArabicColumn,
    OpenEndedFrenchColumn,
    OpenEndedSpanishColumn,
    OpenEndedPortugueseColumn,
    OpenEndedRomanianColumn,
    OpenEndedGreekColumn,
    ClosedEndedMultilingualColumns,
    EHRSQLZeroShotColumns,
    EHRSQLFewShotColumns,
    MedCalcDirectAnswerColumns,
    MedCalcOneShotCotColumns,
    MedCalcZeroShotCotColumns,
    MedECZeroShotColumns,
    MedECOneShotColumns,
)
from src.leaderboard.read_evals import get_raw_eval_results
from src.envs import PRIVATE_REPO

def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, evaluation_metric: str, subset: str) -> tuple[list, pd.DataFrame]:
    """Creates a dataframe from all the individual experiment results.

    Returns both the raw evaluation results and the filtered, sorted dataframe.
    """
    raw_data = get_raw_eval_results(results_path, requests_path, evaluation_metric)
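    # skip raw results whose model identifier is a local path (entries under "/models_llm")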
    all_data_json = [v.to_dict(subset=subset) for v in raw_data if not v.full_model.startswith("/models_llm")]
    df = pd.DataFrame.from_records(all_data_json)
    # TODO: extend this sort logic when a new benchmark subset is added.
    # Each subset is sorted by its primary metric; only med_safety sorts
    # ascending, since a lower harmfulness score is better.
    if subset in ("datasets", "closed_ended_arabic", "closed_ended_multilingual"):
        df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    elif subset == "med_safety":
        df = df.sort_values(by=["Harmfulness Score"], ascending=True)
    elif subset.startswith("open_ended"):
        df = df.sort_values(by=["ELO"], ascending=False)
    elif subset in ("medical_summarization", "aci", "soap"):
        df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
    elif subset in ("healthbench", "healthbench_hard"):
        df = df.sort_values(by=["Overall Score"], ascending=False)
    elif subset in ("ehrsql_zero_shot", "ehrsql_few_shot"):
        df = df.sort_values(by=["RS (0)"], ascending=False)
    elif subset in ("medcalc_direct_answer", "medcalc_one_shot_cot", "medcalc_zero_shot_cot"):
        df = df.sort_values(by=["Overall"], ascending=False)
    elif subset in ("medec_zero_shot", "medec_one_shot"):
        df = df.sort_values(by=["Error Flag Accuracy (%)"], ascending=False)
    # keep only the requested columns that exist, preserving the caller's column order
    cols = [c for c in cols if c in df.columns]
    df = df[cols].round(decimals=2)
    # drop models for which any of the benchmark results is missing
    df = df[has_no_nan_values(df, benchmark_cols)]
    return raw_data, df
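
# Illustrative call (a sketch, not a confirmed call site; the paths, column
# lists, and metric name below are hypothetical placeholders):
#
#   raw_data, leaderboard_df = get_leaderboard_df(
#       results_path="eval-results",            # hypothetical results dir
#       requests_path="eval-queue",             # hypothetical requests dir
#       cols=["Model", "Average"],              # hypothetical display columns
#       benchmark_cols=["Average"],             # hypothetical required columns
#       evaluation_metric="accuracy",           # hypothetical metric key
#       subset="datasets",
#   )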

def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Creates the finished, running, and pending dataframes for the evaluation queue requests."""
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []
    
    for entry in entries:
        full_path = os.path.join(save_path, entry)

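        # request files live at the top level of save_path; skip any sub-directories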
        if os.path.isdir(full_path):
            continue

        if entry.endswith(".json"):
            with open(full_path) as fp:
                data = json.load(fp)

            data[EvalQueueColumn.model.name] = (
                data["model_name"] if data["private"] else make_clickable_model(data["model_name"])
            )
            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
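            # per-benchmark evaluation statuses recorded in the request file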
            data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
            data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
            data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
            data[EvalQueueColumn.medical_summarization_status.name] = data["status"]["medical-summarization"]
            data[EvalQueueColumn.note_generation_status.name] = data["status"]["note-generation"]
            if PRIVATE_REPO:
                data[EvalQueueColumn.closed_ended_arabic_status.name] = data["status"]["closed-ended-arabic"]
            all_evals.append(data)
    
    pending_list = []
    running_list = []
    finished_list = []
    for run in all_evals:
        status_list = [
            run["status"]["closed-ended"],
            run["status"]["open-ended"],
            run["status"]["med-safety"],
            run["status"]["medical-summarization"],
            run["status"]["note-generation"],
        ]
        if PRIVATE_REPO:
            status_list.append(run["status"]["closed-ended-arabic"])
        
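        # RUNNING anywhere wins; otherwise any PENDING/RERUN marks the run as pending; else it is finished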
        if "RUNNING" in status_list:
            running_list.append(run)
        elif "PENDING" in status_list or "RERUN" in status_list:
            pending_list.append(run)
        else:
            finished_list.append(run)

    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    
    return df_finished[cols], df_running[cols], df_pending[cols]
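
# Illustrative call (a sketch; the directory and column names below are
# hypothetical placeholders, not confirmed values from this repo):
#
#   finished_df, running_df, pending_df = get_evaluation_queue_df(
#       save_path="eval-queue",                        # hypothetical requests dir
#       cols=["model", "revision", "status"],          # hypothetical queue columns
#   )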