Commit 2a7ac72 · 1 Parent(s): 12f8259

[MODIFY] Med-Safety: Average -> Harmfulness Score

Files changed:
- src/display/utils.py          +2 -1
- src/leaderboard/read_evals.py +1 -1
- src/populate.py               +4 -4
src/display/utils.py
CHANGED

@@ -38,8 +38,9 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True,
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, invariant=False)])
 auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall Score", "number", True, False, medical_summarization_col=True, aci_col=True, soap_col=True, invariant=False)])
+auto_eval_column_dict.append(["harmfulness", ColumnContent, ColumnContent("Harmfulness Score", "number", True, False, med_safety_col=True, invariant=False)])
 for task in HarnessTasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
 for column in OpenEndedColumns:
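For context on how these column entries are consumed by the other two files in this commit (AutoEvalColumn.harmfulness.name in read_evals.py and populate.py), here is a minimal sketch of what ColumnContent and AutoEvalColumn could look like. It assumes the common leaderboard pattern of building AutoEvalColumn with dataclasses.make_dataclass from the [attribute, type, default] triples appended above; field names beyond those visible in the calls are guesses, not taken from the repository.

from dataclasses import dataclass, make_dataclass

# Hypothetical sketch: field names are inferred from the positional and keyword
# arguments used in the append(...) calls above; they are not confirmed by the repo.
@dataclass(frozen=True)
class ColumnContent:
    name: str                     # display string, e.g. "Harmfulness Score"
    type: str                     # "str", "number", "markdown"
    displayed_by_default: bool = False
    hidden: bool = False
    never_hidden: bool = False
    dataset_task_col: bool = False
    med_safety_col: bool = False
    medical_summarization_col: bool = False
    aci_col: bool = False
    soap_col: bool = False
    invariant: bool = True

auto_eval_column_dict = []
auto_eval_column_dict.append(
    ["harmfulness", ColumnContent,
     ColumnContent("Harmfulness Score", "number", True, False,
                   med_safety_col=True, invariant=False)]
)

# Each [attribute_name, annotation, default] triple becomes a dataclass field,
# so AutoEvalColumn.harmfulness.name evaluates to the display string.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print(AutoEvalColumn.harmfulness.name)  # Harmfulness Score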
src/leaderboard/read_evals.py
CHANGED

@@ -265,7 +265,7 @@ class EvalResult:
         # changes to be made here
         if subset == "med_safety":
             average = sum([v for v in self.med_safety_results.values() if v is not None]) / len(MedSafetyColumns)
-            data_dict[AutoEvalColumn.
+            data_dict[AutoEvalColumn.harmfulness.name] = average
             if len(self.med_safety_results) > 0:
                 for task in MedSafetyColumns:
                     data_dict[task.value.col_name] = self.med_safety_results[task.value.benchmark]
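One behavioural detail in the averaging line that now feeds the harmfulness column: None values are dropped from the numerator, but the divisor stays len(MedSafetyColumns), so a category with no score effectively counts as zero instead of being excluded from the mean. A small self-contained illustration; the category names and numbers below are invented for the example.

# Invented example data standing in for self.med_safety_results and MedSafetyColumns.
med_safety_results = {
    "category_a": 1.2,
    "category_b": 0.8,
    "category_c": None,  # no score produced for this category
}
num_med_safety_columns = 3  # plays the role of len(MedSafetyColumns)

# Nones are filtered from the sum, but the denominator is the full column count,
# so the missing category pulls the average down as if it had scored 0.
average = sum(v for v in med_safety_results.values() if v is not None) / num_med_safety_columns
print(average)  # (1.2 + 0.8) / 3 ≈ 0.67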
src/populate.py
CHANGED

@@ -21,15 +21,15 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     if subset == "datasets":
         df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     elif subset == "med_safety":
-        df = df.sort_values(by=[AutoEvalColumn.
+        df = df.sort_values(by=[AutoEvalColumn.harmfulness.name], ascending=True)
     elif subset == "open_ended":
         df = df.sort_values(by=["ELO"], ascending=False)
     elif subset == "medical_summarization":
-        df = df.sort_values(by=[
+        df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
     elif subset == "aci":
-        df = df.sort_values(by=[
+        df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
     elif subset == "soap":
-        df = df.sort_values(by=[
+        df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
     cols = list(set(df.columns).intersection(set(cols)))
     df = df[cols].round(decimals=2)
     # filter out if any of the benchmarks have not been produced
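Note the sort direction in the med_safety branch: ascending=True, since a lower harmfulness score should rank higher, while the other subsets keep ascending=False. Below is a toy sketch of that branch plus the column filtering and rounding that follow; it assumes AutoEvalColumn.harmfulness.name resolves to the display string "Harmfulness Score", and the rows are invented.

import pandas as pd

# Invented leaderboard rows; "Harmfulness Score" stands in for AutoEvalColumn.harmfulness.name.
df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b", "model-c"],
        "Harmfulness Score": [1.734, 0.512, 2.901],
    }
)

# ascending=True: the least harmful model ends up at the top of the table
df = df.sort_values(by=["Harmfulness Score"], ascending=True)

# keep only the requested columns that exist in the results, then round for display
requested_cols = ["Model", "Harmfulness Score", "Overall Score"]
cols = list(set(df.columns).intersection(set(requested_cols)))
df = df[cols].round(decimals=2)
print(df)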