feat: seperate the qa and longdoc tasks
Files changed:
- src/benchmarks.py  +6 -4
- src/display/utils.py  +30 -21
- tests/src/display/test_utils.py  +5 -4

src/benchmarks.py (CHANGED)

@@ -110,7 +110,8 @@ class Benchmark:
     metric: str # ndcg_at_1 ,metric_key in the json file
     col_name: str # [domain]_[language], name to display in the leaderboard
 
-benchmark_dict = {}
+qa_benchmark_dict = {}
+long_doc_benchmark_dict = {}
 for task, domain_dict in dataset_dict.items():
     for domain, lang_dict in domain_dict.items():
         for lang, dataset_list in lang_dict.items():
@@ -119,13 +120,14 @@ for task, domain_dict in dataset_dict.items():
                 benchmark_name = get_safe_name(benchmark_name)
                 col_name = f"{domain}_{lang}"
                 for metric in dataset_list:
-                    benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
+                    qa_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
             elif task == "long_doc":
                 for dataset in dataset_list:
                     col_name = f"{domain}_{lang}_{dataset}"
                     for metric in metric_list:
                         benchmark_name = f"{task}_{domain}_{lang}_{dataset}_{metric}"
                         benchmark_name = get_safe_name(benchmark_name)
-                        benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
+                        long_doc_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
 
-Benchmarks = Enum('Benchmarks', benchmark_dict)
+BenchmarksQA = Enum('BenchmarksQA', qa_benchmark_dict)
+BenchmarksLongDoc = Enum('BenchmarksLongDoc', long_doc_benchmark_dict)
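
For reference, a minimal runnable sketch of what the refactored benchmarks.py now builds, using a toy dataset_dict. The Benchmark fields, the loop structure, and the Enum calls mirror the diff above; the toy data, the stand-in get_safe_name, and the qa-branch benchmark_name construction (which sits above the shown hunk) are assumptions.

from dataclasses import dataclass
from enum import Enum


@dataclass
class Benchmark:
    benchmark_name: str
    metric: str    # ndcg_at_1, metric_key in the json file
    col_name: str  # [domain]_[language], name to display in the leaderboard


def get_safe_name(name: str) -> str:
    # stand-in for the real helper: lower-case and replace unsafe characters
    return "".join(ch if ch.isalnum() else "_" for ch in name).lower()


# toy task -> domain -> language -> list structure (assumed shape)
dataset_dict = {
    "qa": {"wiki": {"en": ["ndcg_at_10"]}},
    "long_doc": {"law": {"en": ["lex_files_500k_600k"]}},
}
metric_list = ["ndcg_at_10"]

qa_benchmark_dict = {}
long_doc_benchmark_dict = {}
for task, domain_dict in dataset_dict.items():
    for domain, lang_dict in domain_dict.items():
        for lang, dataset_list in lang_dict.items():
            if task == "qa":
                # name construction for the qa branch is assumed, not shown in the hunk
                benchmark_name = get_safe_name(f"{task}_{domain}_{lang}")
                col_name = f"{domain}_{lang}"
                for metric in dataset_list:
                    qa_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
            elif task == "long_doc":
                for dataset in dataset_list:
                    col_name = f"{domain}_{lang}_{dataset}"
                    for metric in metric_list:
                        benchmark_name = get_safe_name(f"{task}_{domain}_{lang}_{dataset}_{metric}")
                        long_doc_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)

BenchmarksQA = Enum("BenchmarksQA", qa_benchmark_dict)
BenchmarksLongDoc = Enum("BenchmarksLongDoc", long_doc_benchmark_dict)

print(BenchmarksQA.qa_wiki_en.value.col_name)  # wiki_en
print([b.name for b in BenchmarksLongDoc])     # ['long_doc_law_en_lex_files_500k_600k_ndcg_at_10']

With this toy input each Enum gets one member, and each member's .value.col_name is what later surfaces as a leaderboard column header.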

src/display/utils.py (CHANGED)

@@ -1,6 +1,6 @@
 from dataclasses import dataclass, make_dataclass
 
-from src.benchmarks import Benchmarks
+from src.benchmarks import BenchmarksQA, BenchmarksLongDoc
 
 
 def fields(raw_class):
@@ -19,25 +19,32 @@ class ColumnContent:
     never_hidden: bool = False
 
 
-## Leaderboard columns
-auto_eval_column_dict = []
-# Init
-auto_eval_column_dict.append(
-    ["retrieval_model", ColumnContent, ColumnContent("Retrieval Model", "markdown", True, never_hidden=True)]
-)
-auto_eval_column_dict.append(
-    ["reranking_model", ColumnContent, ColumnContent("Reranking Model", "markdown", True, never_hidden=True)]
-)
-auto_eval_column_dict.append(
-    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)]
-)
-for benchmark in Benchmarks:
+def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
+    ## Leaderboard columns
+    auto_eval_column_dict = []
+    # Init
     auto_eval_column_dict.append(
-        [benchmark.name, ColumnContent, ColumnContent(benchmark.value.col_name, "number", True)]
+        ["retrieval_model", ColumnContent, ColumnContent("Retrieval Model", "markdown", True, never_hidden=True)]
     )
+    auto_eval_column_dict.append(
+        ["reranking_model", ColumnContent, ColumnContent("Reranking Model", "markdown", True, never_hidden=True)]
+    )
+    auto_eval_column_dict.append(
+        ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)]
+    )
+    for benchmark in benchmarks:
+        auto_eval_column_dict.append(
+            [benchmark.name, ColumnContent, ColumnContent(benchmark.value.col_name, "number", True)]
+        )
 
-# We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+    # We use make dataclass to dynamically fill the scores from Tasks
+    return make_dataclass(cls_name, auto_eval_column_dict, frozen=True)
+
+
+AutoEvalColumnQA = make_autoevalcolumn(
+    "AutoEvalColumnQA", BenchmarksQA)
+AutoEvalColumnLongDoc = make_autoevalcolumn(
+    "AutoEvalColumnLongDoc", BenchmarksLongDoc)
 
 
 ## For the queue columns in the submission tab
@@ -48,10 +55,12 @@ class EvalQueueColumn: # Queue column
 
 
 # Column selection
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
-COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+COLS = [c.name for c in fields(AutoEvalColumnQA) if not c.hidden]
+TYPES = [c.type for c in fields(AutoEvalColumnQA) if not c.hidden]
+COLS_LITE = [c.name for c in fields(AutoEvalColumnQA) if c.displayed_by_default and not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Benchmarks]
+QA_BENCHMARK_COLS = [t.value.col_name for t in BenchmarksQA]
+
+LONG_DOC_BENCHMARK_COLS = [t.value.col_name for t in BenchmarksLongDoc]
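
To make the make_dataclass pattern concrete, here is a small self-contained sketch of how one generated column class and the derived COLS/TYPES lists behave. The ColumnContent fields and the [name, type, default] entries mirror the diff; the frozen=True on ColumnContent and the simplified fields() helper are assumptions for this sketch and may differ from the repo's own definitions.

from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)  # frozen here so instances are hashable and allowed as field defaults
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


def fields(raw_class):
    # simplified stand-in: return the ColumnContent default attached to each generated field
    return [f.default for f in raw_class.__dataclass_fields__.values()]


# each entry is [field_name, field_type, default]; the default carries the display metadata
auto_eval_column_dict = [
    ["retrieval_model", ColumnContent, ColumnContent("Retrieval Model", "markdown", True, never_hidden=True)],
    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)],
]

# same call shape as make_autoevalcolumn's return statement
AutoEvalColumnQA = make_dataclass("AutoEvalColumnQA", auto_eval_column_dict, frozen=True)

COLS = [c.name for c in fields(AutoEvalColumnQA) if not c.hidden]
TYPES = [c.type for c in fields(AutoEvalColumnQA) if not c.hidden]
print(COLS)   # ['Retrieval Model', 'Average ⬆️']
print(TYPES)  # ['markdown', 'number']

Generating one such class per benchmark Enum is what lets the QA and long-doc tabs keep independent column sets while sharing the same construction code.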

tests/src/display/test_utils.py (CHANGED)

@@ -1,10 +1,11 @@
 import pytest
-from src.display.utils import fields, AutoEvalColumn, COLS, COLS_LITE, TYPES, EVAL_COLS, BENCHMARK_COLS
+from src.display.utils import fields, AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS, COLS_LITE, TYPES, EVAL_COLS, QA_BENCHMARK_COLS, LONG_DOC_BENCHMARK_COLS
 
 
+@pytest.mark.parametrize('auto_eval_column')
 def test_fields():
-    for c in fields(AutoEvalColumn):
-        print(c)
+    for c in fields(AutoEvalColumnQA):
+        print(c)
 
 
 def test_macro_variables():
@@ -12,4 +13,4 @@ def test_macro_variables():
     print(f'COLS_LITE: {COLS_LITE}')
     print(f'TYPES: {TYPES}')
     print(f'EVAL_COLS: {EVAL_COLS}')
-    print(f'BENCHMARK_COLS: {BENCHMARK_COLS}')
+    print(f'BENCHMARK_COLS: {QA_BENCHMARK_COLS}')
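
A note on the new @pytest.mark.parametrize('auto_eval_column') decorator: as committed it supplies only argnames, and test_fields accepts no matching argument, so pytest would likely fail to collect the test. A sketch of a complete parametrization over both generated column classes, shown as an illustration rather than as part of the commit:

import pytest

from src.display.utils import fields, AutoEvalColumnQA, AutoEvalColumnLongDoc


# run the same field check against both dynamically generated column classes
@pytest.mark.parametrize('auto_eval_column', [AutoEvalColumnQA, AutoEvalColumnLongDoc])
def test_fields(auto_eval_column):
    for c in fields(auto_eval_column):
        print(c)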