style: reformat the styles
- Makefile +6 -0
- app.py +20 -20
- pyproject.toml +3 -3
- src/benchmarks.py +5 -13
- src/columns.py +9 -59
- src/envs.py +3 -1
- src/loaders.py +1 -2
- src/models.py +1 -1
- src/utils.py +72 -79
- tests/src/test_benchmarks.py +3 -19
- tests/src/test_columns.py +24 -21
- tests/src/test_envs.py +2 -3
- tests/src/test_loaders.py +15 -23
- tests/src/test_models.py +30 -19
- tests/src/test_read_evals.py +0 -78
- tests/src/test_utils.py +102 -84
- tests/test_utils.py +0 -136
- tests/toydata/test_data.json +0 -98
- tests/toydata/test_results/bge-m3/NoReranker/results_2023-11-21T18-10-08.json +0 -98
- tests/toydata/test_results/bge-m3/bge-reranker-v2-m3/results_2023-11-21T18-10-08.json +0 -98
Makefile
CHANGED
@@ -3,14 +3,20 @@
 
 style:
 	python -m black --line-length 119 .
+	python -m black --line-length 119 src
 	python -m isort .
+	python -m isort src
 	ruff check --fix .
+	ruff check --fix src
 
 
 quality:
 	python -m black --check --line-length 119 .
+	python -m black --check --line-length 119 src
 	python -m isort --check-only .
+	python -m isort --check-only src
 	ruff check .
+	ruff check src
 
 
 test:
app.py
CHANGED
@@ -63,13 +63,13 @@ datastore = ds_dict[LATEST_BENCHMARK_VERSION]
 
 
 def update_qa_metric(
+    metric: str,
+    domains: list,
+    langs: list,
+    reranking_model: list,
+    query: str,
+    show_anonymous: bool,
+    show_revision_and_timestamp: bool,
 ):
     global datastore
     return update_metric(
@@ -86,13 +86,13 @@ def update_qa_metric(
 
 
 def update_doc_metric(
+    metric: str,
+    domains: list,
+    langs: list,
+    reranking_model: list,
+    query: str,
+    show_anonymous: bool,
+    show_revision_and_timestamp,
 ):
     global datastore
     return update_metric(
@@ -218,7 +218,7 @@ with demo:
     # Dummy leaderboard for handling the case when the user uses backspace key
     _qa_df_ret_hidden = datastore.qa_raw_df[
         datastore.qa_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
+    ]
     _qa_df_ret_hidden = reset_rank(_qa_df_ret_hidden)
     qa_df_elem_ret_hidden = get_leaderboard_table(
         _qa_df_ret_hidden, datastore.qa_types, visible=False
@@ -277,7 +277,7 @@ with demo:
 
     _qa_df_rerank_hidden = datastore.qa_raw_df[
         datastore.qa_raw_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
+    ]
     _qa_df_rerank_hidden = reset_rank(_qa_df_rerank_hidden)
     qa_df_elem_rerank_hidden = get_leaderboard_table(
         _qa_df_rerank_hidden, datastore.qa_types, visible=False
@@ -391,13 +391,13 @@ with demo:
 
     _doc_df_ret = datastore.doc_fmt_df[
         datastore.doc_fmt_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
+    ]
     _doc_df_ret = reset_rank(_doc_df_ret)
     doc_df_elem_ret = get_leaderboard_table(_doc_df_ret, datastore.doc_types)
 
     _doc_df_ret_hidden = datastore.doc_raw_df[
         datastore.doc_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
+    ]
     _doc_df_ret_hidden = reset_rank(_doc_df_ret_hidden)
     doc_df_elem_ret_hidden = get_leaderboard_table(
         _doc_df_ret_hidden, datastore.doc_types, visible=False
@@ -439,7 +439,7 @@ with demo:
     with gr.TabItem("Reranking Only", id=22):
         _doc_df_rerank = datastore.doc_fmt_df[
             datastore.doc_fmt_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
+        ]
         _doc_df_rerank = reset_rank(_doc_df_rerank)
         doc_rerank_models = (
            _doc_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
@@ -452,7 +452,7 @@ with demo:
         doc_df_elem_rerank = get_leaderboard_table(_doc_df_rerank, datastore.doc_types)
         _doc_df_rerank_hidden = datastore.doc_raw_df[
             datastore.doc_raw_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
+        ]
         _doc_df_rerank_hidden = reset_rank(_doc_df_rerank_hidden)
         doc_df_elem_rerank_hidden = get_leaderboard_table(
             _doc_df_rerank_hidden, datastore.doc_types, visible=False
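The app.py hunks are pure re-wrapping: each handler signature gets one parameter per line, and each boolean-mask filter gets its closing `]` on its own line before the frame is re-ranked. A minimal, self-contained sketch of that filter-then-re-rank pattern (the column constants and the `reset_rank` stand-in below are illustrative, not the project's actual helpers):

    import pandas as pd

    # Illustrative stand-ins for the leaderboard column constants used in app.py.
    COL_NAME_RERANKING_MODEL = "Reranking Model"
    COL_NAME_RANK = "Rank 🏆"

    df = pd.DataFrame(
        {
            COL_NAME_RERANKING_MODEL: ["NoReranker", "bge-reranker-v2-m3", "NoReranker"],
            "Average ⬆️": [0.51, 0.62, 0.43],
        }
    )

    # Same shape as the app.py filters: mask expression on its own line,
    # closing bracket on its own line.
    retrieval_only = df[
        df[COL_NAME_RERANKING_MODEL] == "NoReranker"
    ]

    # Hypothetical stand-in for reset_rank: re-sort by the average and renumber.
    retrieval_only = retrieval_only.sort_values("Average ⬆️", ascending=False).reset_index(drop=True)
    retrieval_only[COL_NAME_RANK] = retrieval_only.index + 1
    print(retrieval_only)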
pyproject.toml
CHANGED
@@ -1,9 +1,9 @@
 [tool.ruff]
 # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
-select = ["E", "F"]
-ignore = ["E501"] # line too long (black is taking care of this)
+lint.select = ["E", "F"]
+lint.ignore = ["E501"] # line too long (black is taking care of this)
 line-length = 119
-fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
+lint.fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
 
 [tool.isort]
 profile = "black"
src/benchmarks.py
CHANGED
@@ -30,9 +30,7 @@ def get_qa_benchmarks_dict(version: str):
         for metric in dataset_list:
             if "test" not in dataset_list[metric]["splits"]:
                 continue
-            benchmark_dict[benchmark_name] = Benchmark(
-                benchmark_name, metric, col_name, domain, lang, task
-            )
+            benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name, domain, lang, task)
     return benchmark_dict
 
 
@@ -59,20 +57,14 @@ def get_doc_benchmarks_dict(version: str):
 _qa_benchmark_dict = {}
 for version in BENCHMARK_VERSION_LIST:
     safe_version_name = get_safe_name(version)
-    _qa_benchmark_dict[safe_version_name] = \
-        Enum(
-            f"QABenchmarks_{safe_version_name}",
-            get_qa_benchmarks_dict(version)
-        )
+    _qa_benchmark_dict[safe_version_name] = Enum(f"QABenchmarks_{safe_version_name}", get_qa_benchmarks_dict(version))
 
 _doc_benchmark_dict = {}
 for version in BENCHMARK_VERSION_LIST:
     safe_version_name = get_safe_name(version)
-    _doc_benchmark_dict[safe_version_name] = \
-        Enum(
-            f"LongDocBenchmarks_{safe_version_name}",
-            get_doc_benchmarks_dict(version)
-        )
+    _doc_benchmark_dict[safe_version_name] = Enum(
+        f"LongDocBenchmarks_{safe_version_name}", get_doc_benchmarks_dict(version)
+    )
 
 
 QABenchmarks = Enum("QABenchmarks", _qa_benchmark_dict)
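The collapsed one-liners lean on Python's functional `Enum` API, which accepts a mapping from member names to values. A hedged sketch of the pattern with hypothetical stand-ins for `get_safe_name` and the per-version benchmark dicts (the real helpers live in `src/benchmarks.py` and `src/models.py` and may differ):

    from enum import Enum


    def get_safe_name(version: str) -> str:
        # Hypothetical slugifier; the project's own get_safe_name may differ.
        return version.lower().replace("-", "_").replace(".", "")


    # Stand-in for get_qa_benchmarks_dict(version): member name -> value.
    benchmark_dicts = {
        "AIR-Bench_24.04": {"wiki_en": 1, "wiki_zh": 2},
        "AIR-Bench_24.05": {"wiki_en": 1, "news_en": 2},
    }

    _qa_benchmark_dict = {}
    for version, members in benchmark_dicts.items():
        safe_version_name = get_safe_name(version)
        # Functional Enum API: Enum(name, mapping) builds members from the dict.
        _qa_benchmark_dict[safe_version_name] = Enum(f"QABenchmarks_{safe_version_name}", members)

    # One outer enum keyed by version slug, each member holding a per-version enum.
    QABenchmarks = Enum("QABenchmarks", _qa_benchmark_dict)
    print(list(QABenchmarks))                       # the benchmark versions
    print(list(QABenchmarks.air_bench_2404.value))  # the datasets in 24.04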
src/columns.py
CHANGED
@@ -19,69 +19,28 @@ class ColumnContent:
 
 def get_default_auto_eval_column_dict():
     auto_eval_column_dict = []
-    auto_eval_column_dict.append(
-        [
-            "rank",
-            ColumnContent,
-            ColumnContent(
-                COL_NAME_RANK,
-                "number",
-                True
-            )
-        ]
-    )
+    auto_eval_column_dict.append(["rank", ColumnContent, ColumnContent(COL_NAME_RANK, "number", True)])
     auto_eval_column_dict.append(
         [
             "retrieval_model",
             ColumnContent,
-            ColumnContent(
-                COL_NAME_RETRIEVAL_MODEL,
-                "markdown",
-                True,
-                never_hidden=True
-            )
+            ColumnContent(COL_NAME_RETRIEVAL_MODEL, "markdown", True, never_hidden=True),
         ]
     )
     auto_eval_column_dict.append(
         [
             "reranking_model",
             ColumnContent,
-            ColumnContent(
-                COL_NAME_RERANKING_MODEL,
-                "markdown",
-                True,
-                never_hidden=True
-            )
+            ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", True, never_hidden=True),
         ]
     )
     auto_eval_column_dict.append(
-        [
-            "revision",
-            ColumnContent,
-            ColumnContent(
-                COL_NAME_REVISION,
-                "markdown",
-                True,
-                never_hidden=True
-            )
-        ]
+        ["revision", ColumnContent, ColumnContent(COL_NAME_REVISION, "markdown", True, never_hidden=True)]
     )
     auto_eval_column_dict.append(
-        [
-            "timestamp",
-            ColumnContent,
-            ColumnContent(
-                COL_NAME_TIMESTAMP, "date", True, never_hidden=True
-            )
-        ]
-    )
-    auto_eval_column_dict.append(
-        [
-            "average",
-            ColumnContent,
-            ColumnContent(COL_NAME_AVG, "number", True)
-        ]
+        ["timestamp", ColumnContent, ColumnContent(COL_NAME_TIMESTAMP, "date", True, never_hidden=True)]
     )
+    auto_eval_column_dict.append(["average", ColumnContent, ColumnContent(COL_NAME_AVG, "number", True)])
     auto_eval_column_dict.append(
         [
             "retrieval_model_link",
@@ -91,7 +50,7 @@ def get_default_auto_eval_column_dict():
             "markdown",
             False,
             hidden=True,
-            )
+            ),
         ]
     )
     auto_eval_column_dict.append(
@@ -103,20 +62,11 @@ def get_default_auto_eval_column_dict():
             "markdown",
             False,
             hidden=True,
-            )
+            ),
         ]
     )
     auto_eval_column_dict.append(
-        [
-            "is_anonymous",
-            ColumnContent,
-            ColumnContent(
-                COL_NAME_IS_ANONYMOUS,
-                "bool",
-                False,
-                hidden=True
-            )
-        ]
+        ["is_anonymous", ColumnContent, ColumnContent(COL_NAME_IS_ANONYMOUS, "bool", False, hidden=True)]
     )
     return auto_eval_column_dict
 
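Each `auto_eval_column_dict` entry is a `(field_name, field_type, default_value)` triple, the shape accepted by `dataclasses.make_dataclass`; that appears to be how `make_autoevalcolumn` consumes this list, though the project's actual implementation may differ. A minimal sketch under that assumption (the `ColumnContent` definition here is a frozen stand-in):

    from dataclasses import dataclass, make_dataclass


    @dataclass(frozen=True)
    class ColumnContent:
        name: str
        type: str
        displayed_by_default: bool
        hidden: bool = False
        never_hidden: bool = False


    auto_eval_column_dict = []
    # Short entries fit on one line; longer ones keep the exploded list form.
    auto_eval_column_dict.append(["rank", ColumnContent, ColumnContent("Rank 🏆", "number", True)])
    auto_eval_column_dict.append(
        [
            "retrieval_model",
            ColumnContent,
            ColumnContent("Retrieval Method", "markdown", True, never_hidden=True),
        ]
    )
    auto_eval_column_dict.append(
        ["is_anonymous", ColumnContent, ColumnContent("Anonymous Submission", "bool", False, hidden=True)]
    )

    # Assumption: the column container is built from the triples roughly like this.
    AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
    print(AutoEvalColumn().rank.name)  # -> Rank 🏆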
src/envs.py
CHANGED
@@ -6,7 +6,9 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("TOKEN", "")  # A read/write token for your org
 
-OWNER = "AIR-Bench"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = (
+    "AIR-Bench"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
+)
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/leaderboard"
src/loaders.py
CHANGED
@@ -1,7 +1,6 @@
 import os.path
 from pathlib import Path
-from typing import Union
-from typing import Dict, List
+from typing import Dict, List, Union
 
 import pandas as pd
 
src/models.py
CHANGED
@@ -2,7 +2,7 @@ import json
 from collections import defaultdict
 from dataclasses import dataclass
 from enum import Enum
-from typing import List
+from typing import List
 
 import pandas as pd
 
src/utils.py
CHANGED
@@ -118,39 +118,36 @@ def get_selected_cols(task, version_slug, domains, languages):
 
 
 def select_columns(
+    df: pd.DataFrame,
+    domains: list,
+    languages: list,
+    task: TaskType = TaskType.qa,
+    reset_ranking: bool = True,
+    version_slug: str = None,
 ) -> pd.DataFrame:
-    selected_cols = get_selected_cols(
-        task, version_slug, domains, languages)
+    selected_cols = get_selected_cols(task, version_slug, domains, languages)
     fixed_cols, _ = get_fixed_col_names_and_types()
     filtered_df = df[fixed_cols + selected_cols]
     filtered_df.replace({"": pd.NA}, inplace=True)
     if reset_ranking:
-        filtered_df[COL_NAME_AVG] =
-        filtered_df.sort_values(
-            by=[COL_NAME_AVG], ascending=False, inplace=True)
+        filtered_df[COL_NAME_AVG] = filtered_df[selected_cols].apply(calculate_mean, axis=1).round(decimals=2)
+        filtered_df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
     filtered_df.reset_index(inplace=True, drop=True)
     filtered_df = reset_rank(filtered_df)
     return filtered_df
 
 
 def _update_df_elem(
+    task: TaskType,
+    version: str,
+    source_df: pd.DataFrame,
+    domains: list,
+    langs: list,
+    reranking_query: list,
+    query: str,
+    show_anonymous: bool,
+    reset_ranking: bool = True,
+    show_revision_and_timestamp: bool = False,
 ):
     filtered_df = source_df.copy()
     if not show_anonymous:
@@ -164,15 +161,15 @@ def _update_df_elem(
 
 
 def update_doc_df_elem(
+    version: str,
+    hidden_df: pd.DataFrame,
+    domains: list,
+    langs: list,
+    reranking_query: list,
+    query: str,
+    show_anonymous: bool,
+    show_revision_and_timestamp: bool = False,
+    reset_ranking: bool = True,
 ):
     return _update_df_elem(
         TaskType.long_doc,
@@ -189,15 +186,15 @@ def update_doc_df_elem(
 
 
 def update_metric(
+    datastore,
+    task: TaskType,
+    metric: str,
+    domains: list,
+    langs: list,
+    reranking_model: list,
+    query: str,
+    show_anonymous: bool = False,
+    show_revision_and_timestamp: bool = False,
 ) -> pd.DataFrame:
     if task == TaskType.qa:
         update_func = update_qa_df_elem
@@ -253,13 +250,13 @@ def calculate_file_md5(file_path):
 
 
 def submit_results(
+    filepath: str,
+    model: str,
+    model_url: str,
+    reranking_model: str = "",
+    reranking_model_url: str = "",
+    version: str = LATEST_BENCHMARK_VERSION,
+    is_anonymous=False,
 ):
     if not filepath.endswith(".zip"):
         return styled_error(f"file uploading aborted. wrong file type: {filepath}")
@@ -355,11 +352,7 @@ def get_leaderboard_df(datastore, task: TaskType, metric: str) -> pd.DataFrame:
         benchmark_cols.append(t.value.col_name)
 
     # filter out the columns that are not in the data
-    df[COL_NAME_AVG] = (
-        df[list(benchmark_cols)]
-        .apply(calculate_mean, axis=1)
-        .round(decimals=2)
-    )
+    df[COL_NAME_AVG] = df[list(benchmark_cols)].apply(calculate_mean, axis=1).round(decimals=2)
     df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
     df.reset_index(inplace=True, drop=True)
 
@@ -381,16 +374,16 @@ def get_leaderboard_df(datastore, task: TaskType, metric: str) -> pd.DataFrame:
 
 
 def set_listeners(
+    task: TaskType,
+    target_df,
+    source_df,
+    search_bar,
+    version,
+    selected_domains,
+    selected_langs,
+    selected_rerankings,
+    show_anonymous,
+    show_revision_and_timestamp,
 ):
     if task == TaskType.qa:
         update_table_func = update_qa_df_elem
@@ -400,15 +393,15 @@ def set_listeners(
         raise NotImplementedError
     selector_list = [selected_domains, selected_langs, selected_rerankings, search_bar, show_anonymous]
     search_bar_args = [
+        source_df,
+        version,
+    ] + selector_list
     selector_args = (
+        [version, source_df]
+        + selector_list
+        + [
+            show_revision_and_timestamp,
+        ]
     )
     # Set search_bar listener
     search_bar.submit(update_table_func, search_bar_args, target_df)
@@ -424,15 +417,15 @@ def set_listeners(
 
 
 def update_qa_df_elem(
+    version: str,
+    hidden_df: pd.DataFrame,
+    domains: list,
+    langs: list,
+    reranking_query: list,
+    query: str,
+    show_anonymous: bool,
+    show_revision_and_timestamp: bool = False,
+    reset_ranking: bool = True,
 ):
     return _update_df_elem(
         TaskType.qa,
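The one-line average computation that replaces the multi-line chain in `select_columns` and `get_leaderboard_df` is plain pandas. A runnable sketch with a hypothetical `calculate_mean` (the tests only reveal that rows with missing scores yield `-1`; the real helper in `src/utils.py` may differ):

    import pandas as pd


    def calculate_mean(row: pd.Series) -> float:
        # Hypothetical: any missing benchmark score marks the row as incomplete (-1).
        if row.isna().any():
            return -1
        return row.mean()


    benchmark_cols = ["wiki_en", "wiki_zh"]
    df = pd.DataFrame({"wiki_en": [0.8, 0.7], "wiki_zh": [0.4, None]})

    # The reformatted one-liner: per-row mean over the selected benchmark columns.
    df["Average ⬆️"] = df[benchmark_cols].apply(calculate_mean, axis=1).round(decimals=2)
    df.sort_values(by=["Average ⬆️"], ascending=False, inplace=True)
    df.reset_index(inplace=True, drop=True)
    print(df)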
tests/src/test_benchmarks.py
CHANGED
@@ -3,7 +3,6 @@ import pytest
 from src.benchmarks import LongDocBenchmarks, QABenchmarks
 from src.envs import BENCHMARK_VERSION_LIST
 
-
 # Ref: https://github.com/AIR-Bench/AIR-Bench/blob/4b27b8a8f2047a963805fcf6fb9d74be51ec440c/docs/available_tasks.md
 # 24.05
 # | Task | dev | test |
@@ -17,15 +16,8 @@ from src.envs import BENCHMARK_VERSION_LIST
 # | Long-Doc | 15 |
 # | QA | 13 |
 
-@pytest.mark.parametrize(
-    "num_datasets_dict",
-    [
-        {
-            "air_bench_2404": 13,
-            "air_bench_2405": 53
-        }
-    ]
-)
+
+@pytest.mark.parametrize("num_datasets_dict", [{"air_bench_2404": 13, "air_bench_2405": 53}])
 def test_qa_benchmarks(num_datasets_dict):
     assert len(QABenchmarks) == len(BENCHMARK_VERSION_LIST)
     for benchmark_list in list(QABenchmarks):
@@ -33,15 +25,7 @@ def test_qa_benchmarks(num_datasets_dict):
     assert num_datasets_dict[version_slug] == len(benchmark_list.value)
 
 
-@pytest.mark.parametrize(
-    "num_datasets_dict",
-    [
-        {
-            "air_bench_2404": 15,
-            "air_bench_2405": 11
-        }
-    ]
-)
+@pytest.mark.parametrize("num_datasets_dict", [{"air_bench_2404": 15, "air_bench_2405": 11}])
 def test_doc_benchmarks(num_datasets_dict):
     assert len(LongDocBenchmarks) == len(BENCHMARK_VERSION_LIST)
     for benchmark_list in list(LongDocBenchmarks):
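The same parametrization, shown standalone in the collapsed single-line style this commit adopts (the test name here is illustrative):

    import pytest


    @pytest.mark.parametrize("num_datasets_dict", [{"air_bench_2404": 13, "air_bench_2405": 53}])
    def test_qa_dataset_counts(num_datasets_dict):
        assert num_datasets_dict["air_bench_2404"] == 13
        assert num_datasets_dict["air_bench_2405"] == 53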
tests/src/test_columns.py
CHANGED
@@ -1,12 +1,18 @@
 import pytest
 
-from src.benchmarks import
-from src.columns import get_default_auto_eval_column_dict, \
-    COL_NAME_RERANKING_MODEL,
+from src.benchmarks import LongDocBenchmarks, QABenchmarks
+from src.columns import (
+    COL_NAME_AVG,
+    COL_NAME_RANK,
+    COL_NAME_RERANKING_MODEL,
+    COL_NAME_RETRIEVAL_MODEL,
+    COL_NAME_REVISION,
+    COL_NAME_TIMESTAMP,
+    get_default_auto_eval_column_dict,
+    get_default_col_names_and_types,
+    get_fixed_col_names_and_types,
+    make_autoevalcolumn,
+)
 
 # Ref: https://github.com/AIR-Bench/AIR-Bench/blob/4b27b8a8f2047a963805fcf6fb9d74be51ec440c/docs/available_tasks.md
 # 24.05
@@ -21,6 +27,7 @@ from src.columns import get_default_auto_eval_column_dict, \
 # | Long-Doc | 15 |
 # | QA | 13 |
 
+
 @pytest.fixture()
 def expected_col_names():
     return [
@@ -45,8 +52,7 @@ def expected_hidden_col_names():
     ]
 
 
-def test_get_default_auto_eval_column_dict(
-    expected_col_names, expected_hidden_col_names):
+def test_get_default_auto_eval_column_dict(expected_col_names, expected_hidden_col_names):
     col_list = get_default_auto_eval_column_dict()
     assert len(col_list) == 9
     hidden_cols = []
@@ -76,14 +82,13 @@ def test_get_fixed_col_names_and_types():
 
 
 @pytest.mark.parametrize(
+    "benchmarks, expected_benchmark_len",
     [
         (QABenchmarks, {"air_bench_2404": 13, "air_bench_2405": 53}),
-        (LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11})
-    ]
+        (LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11}),
+    ],
 )
-def test_make_autoevalcolumn(
-    benchmarks, expected_benchmark_len, expected_col_names):
+def test_make_autoevalcolumn(benchmarks, expected_benchmark_len, expected_col_names):
     expected_default_attrs = frozenset(expected_col_names)
     for benchmark in benchmarks:
         TestEvalColumn = make_autoevalcolumn("TestEvalColumn", benchmark)
@@ -98,17 +103,15 @@ def test_make_autoevalcolumn(
 
 
 @pytest.mark.parametrize(
+    "benchmarks, expected_benchmark_len",
     [
         (QABenchmarks, {"air_bench_2404": 13, "air_bench_2405": 53}),
-        (LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11})
-    ]
+        (LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11}),
+    ],
 )
 def test_get_default_col_names_and_types(
-    expected_col_names,
-    expected_hidden_col_names):
+    benchmarks, expected_benchmark_len, expected_col_names, expected_hidden_col_names
+):
     default_col_len = len(expected_col_names)
     hidden_col_len = len(expected_hidden_col_names)
     for benchmark in benchmarks:
tests/src/test_envs.py
CHANGED
@@ -1,13 +1,12 @@
 from air_benchmark.tasks import BenchmarkTable
 
-from src.envs import BENCHMARK_VERSION_LIST,
+from src.envs import BENCHMARK_VERSION_LIST, DEFAULT_METRIC_LONG_DOC, DEFAULT_METRIC_QA, METRIC_LIST
 
 
 def test_benchmark_version_list():
     leaderboard_versions = frozenset(BENCHMARK_VERSION_LIST)
     available_versions = frozenset([k for k in BenchmarkTable.keys()])
-    assert leaderboard_versions.issubset(
-        available_versions)
+    assert leaderboard_versions.issubset(available_versions)
 
 
 def test_default_metrics():
tests/src/test_loaders.py
CHANGED
@@ -1,41 +1,34 @@
+from pathlib import Path
+
 import pandas as pd
 import pytest
-from pathlib import Path
 
-from src.loaders import
+from src.loaders import load_eval_results, load_leaderboard_datastore, load_raw_eval_results
 
 cur_fp = Path(__file__)
 
 
-@pytest.mark.parametrize(
-    "version",
-    ["AIR-Bench_24.04", "AIR-Bench_24.05"]
-)
+@pytest.mark.parametrize("version", ["AIR-Bench_24.04", "AIR-Bench_24.05"])
 def test_load_raw_eval_results(version):
-    raw_data = load_raw_eval_results(
-        cur_fp.parents[1] / f"toydata/eval_results/{version}"
-    )
+    raw_data = load_raw_eval_results(cur_fp.parents[1] / f"toydata/eval_results/{version}")
     assert len(raw_data) == 1
     full_eval_result = raw_data[0]
     expected_attr = [
+        "eval_name",
+        "retrieval_model",
+        "reranking_model",
+        "retrieval_model_link",
+        "reranking_model_link",
+        "results",
+        "timestamp",
+        "revision",
+        "is_anonymous",
     ]
     result_attr = [k for k in full_eval_result.__dict__.keys() if k[:2] != "__" and k[-2:] != "__"]
     assert sorted(expected_attr) == sorted(result_attr)
 
 
-@pytest.mark.parametrize(
-    "version",
-    ["AIR-Bench_24.04", "AIR-Bench_24.05"]
-)
+@pytest.mark.parametrize("version", ["AIR-Bench_24.04", "AIR-Bench_24.05"])
 def test_load_leaderboard_datastore(version):
     file_path = cur_fp.parents[1] / f"toydata/eval_results/{version}"
     datastore = load_leaderboard_datastore(file_path, version)
@@ -51,4 +44,3 @@ def test_load_eval_results():
     file_path = cur_fp.parents[1] / "toydata/eval_results/"
     datastore_dict = load_eval_results(file_path)
     assert len(datastore_dict) == 2
-
tests/src/test_models.py
CHANGED
@@ -1,6 +1,7 @@
-import pytest
 from pathlib import Path
 
+import pytest
+
 from src.models import EvalResult, FullEvalResult
 
 cur_fp = Path(__file__)
@@ -23,19 +24,13 @@ NUM_DOC_BENCHMARKS_24_05 = 11
 NUM_QA_BENCHMARKS_24_04 = 13
 NUM_DOC_BENCHMARKS_24_04 = 15
 
+
 def test_eval_result():
+    EvalResult(
         eval_name="eval_name",
         retrieval_model="bge-m3",
         reranking_model="NoReranking",
-        results=[
-            {
-                "domain": "law",
-                "lang": "en",
-                "dataset": "lex_files_500K-600K",
-                "value": 0.45723
-            }
-        ],
+        results=[{"domain": "law", "lang": "en", "dataset": "lex_files_500K-600K", "value": 0.45723}],
         task="qa",
         metric="ndcg_at_3",
         timestamp="2024-05-14T03:09:08Z",
@@ -45,11 +40,12 @@ def test_eval_result():
 
 
 @pytest.mark.parametrize(
+    "file_path",
     [
         "AIR-Bench_24.04/bge-m3/jina-reranker-v2-base-multilingual/results.json",
-        "AIR-Bench_24.05/bge-m3/NoReranker/results.json"
-    ]
+        "AIR-Bench_24.05/bge-m3/NoReranker/results.json",
+    ],
+)
 def test_full_eval_result_init_from_json_file(file_path):
     json_fp = cur_fp.parents[1] / "toydata/eval_results/" / file_path
     full_eval_result = FullEvalResult.init_from_json_file(json_fp)
@@ -59,20 +55,35 @@ def test_full_eval_result_init_from_json_file(file_path):
 
 
 @pytest.mark.parametrize(
+    "file_path, task, expected_num_results",
     [
         ("AIR-Bench_24.04/bge-m3/jina-reranker-v2-base-multilingual/results.json", "qa", NUM_QA_BENCHMARKS_24_04),
-        (
+        (
+            "AIR-Bench_24.04/bge-m3/jina-reranker-v2-base-multilingual/results.json",
+            "long-doc",
+            NUM_DOC_BENCHMARKS_24_04,
+        ),
         ("AIR-Bench_24.05/bge-m3/NoReranker/results.json", "qa", NUM_QA_BENCHMARKS_24_05),
         ("AIR-Bench_24.05/bge-m3/NoReranker/results.json", "long-doc", NUM_DOC_BENCHMARKS_24_05),
-    ]
+    ],
+)
 def test_full_eval_result_to_dict(file_path, task, expected_num_results):
     json_fp = cur_fp.parents[1] / "toydata/eval_results/" / file_path
     full_eval_result = FullEvalResult.init_from_json_file(json_fp)
     result_dict_list = full_eval_result.to_dict(task)
     assert len(result_dict_list) == 1
     result = result_dict_list[0]
-    attr_list = frozenset(
+    attr_list = frozenset(
+        [
+            "eval_name",
+            "Retrieval Method",
+            "Reranking Model",
+            "Retrieval Model LINK",
+            "Reranking Model LINK",
+            "Revision",
+            "Submission Date",
+            "Anonymous Submission",
+        ]
+    )
     result_cols = list(result.keys())
-    assert len(result_cols) == (expected_num_results + len(attr_list))
+    assert len(result_cols) == (expected_num_results + len(attr_list))
tests/src/test_read_evals.py
DELETED
@@ -1,78 +0,0 @@
-from pathlib import Path
-
-from src.models import FullEvalResult
-from src.read_evals import load_raw_eval_results
-from src.utils import get_leaderboard_df
-
-cur_fp = Path(__file__)
-
-
-def test_init_from_json_file():
-    json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
-    full_eval_result = FullEvalResult.init_from_json_file(json_fp)
-    num_different_task_domain_lang_metric_dataset_combination = 6
-    assert len(full_eval_result.results) == num_different_task_domain_lang_metric_dataset_combination
-    assert full_eval_result.retrieval_model == "bge-m3"
-    assert full_eval_result.reranking_model == "bge-reranker-v2-m3"
-
-
-def test_to_dict():
-    json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
-    full_eval_result = FullEvalResult.init_from_json_file(json_fp)
-    result_list = full_eval_result.to_dict(task="qa", metric="ndcg_at_1")
-    assert len(result_list) == 1
-    result_dict = result_list[0]
-    assert result_dict["Retrieval Model"] == "bge-m3"
-    assert result_dict["Reranking Model"] == "bge-reranker-v2-m3"
-    assert result_dict["wiki_en"] is not None
-    assert result_dict["wiki_zh"] is not None
-
-
-def test_get_raw_eval_results():
-    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
-    results = load_raw_eval_results(results_path)
-    # only load the latest results
-    assert len(results) == 4
-    assert results[0].eval_name == "bge-base-en-v1.5_NoReranker"
-    assert len(results[0].results) == 70
-    assert results[0].eval_name == "bge-base-en-v1.5_bge-reranker-v2-m3"
-    assert len(results[1].results) == 70
-
-
-def test_get_leaderboard_df():
-    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
-    raw_data = load_raw_eval_results(results_path)
-    df = get_leaderboard_df(raw_data, "qa", "ndcg_at_10")
-    assert df.shape[0] == 4
-    # the results contain only one embedding model
-    # for i in range(4):
-    #     assert df["Retrieval Model"][i] == "bge-m3"
-    # # the results contain only two reranking model
-    # assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
-    # assert df["Reranking Model"][1] == "NoReranker"
-    # assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
-    # assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh', ]].isnull().values.any()
-
-
-def test_get_leaderboard_df_long_doc():
-    results_path = cur_fp.parents[2] / "toydata" / "test_results"
-    raw_data = load_raw_eval_results(results_path)
-    df = get_leaderboard_df(raw_data, "long-doc", "ndcg_at_1")
-    assert df.shape[0] == 2
-    # the results contain only one embedding model
-    for i in range(2):
-        assert df["Retrieval Model"][i] == "bge-m3"
-    # the results contains only two reranking model
-    assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
-    assert df["Reranking Model"][1] == "NoReranker"
-    assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
-    assert (
-        not df[
-            [
-                "Average ⬆️",
-                "law_en_lex_files_500k_600k",
-            ]
-        ]
-        .isnull()
-        .values.any()
-    )
tests/src/test_utils.py
CHANGED
@@ -1,10 +1,21 @@
-import pytest
-import pandas as pd
 from pathlib import Path
 
+import pandas as pd
+import pytest
+
 from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
+from src.models import TaskType, model_hyperlink
+from src.utils import (
+    _update_df_elem,
+    calculate_mean,
+    filter_models,
+    filter_queries,
+    get_default_cols,
+    get_leaderboard_df,
+    get_selected_cols,
+    remove_html,
+    select_columns,
+)
 
 cur_fp = Path(__file__)
 
@@ -18,18 +29,8 @@ NUM_DOC_BENCHMARKS_24_04 = 15
 def toy_df():
     return pd.DataFrame(
         {
-            "Retrieval Method": [
-                "bge-m3",
-                "bge-m3",
-                "jina-embeddings-v2-base",
-                "jina-embeddings-v2-base"
-            ],
-            "Reranking Model": [
-                "bge-reranker-v2-m3",
-                "NoReranker",
-                "bge-reranker-v2-m3",
-                "NoReranker"
-            ],
+            "Retrieval Method": ["bge-m3", "bge-m3", "jina-embeddings-v2-base", "jina-embeddings-v2-base"],
+            "Reranking Model": ["bge-reranker-v2-m3", "NoReranker", "bge-reranker-v2-m3", "NoReranker"],
             "Rank 🏆": [1, 2, 3, 4],
             "Revision": ["123", "234", "345", "456"],
             "Submission Date": ["", "", "", ""],
@@ -45,8 +46,7 @@ def toy_df():
 
 def test_remove_html():
     model_name = "jina-embeddings-v3"
-    html_str = model_hyperlink(
-        "https://jina.ai", model_name)
+    html_str = model_hyperlink("https://jina.ai", model_name)
     output_str = remove_html(html_str)
     assert output_str == model_name
 
@@ -60,17 +60,29 @@ def test_calculate_mean():
     assert result[1] == -1
 
 
-@pytest.mark.parametrize(
+@pytest.mark.parametrize(
+    "models, expected",
+    [
+        (["model1", "model3"], 2),
+        (["model1", "model_missing"], 1),
+        (["model1", "model2", "model3"], 3),
+        (
+            [
+                "model1",
+            ],
+            1,
+        ),
+        ([], 3),
+    ],
+)
 def test_filter_models(models, expected):
     df = pd.DataFrame(
         {
-            COL_NAME_RERANKING_MODEL: [
+            COL_NAME_RERANKING_MODEL: [
+                "model1",
+                "model2",
+                "model3",
+            ],
             "col2": [1, 2, 3],
         }
     )
@@ -78,18 +90,29 @@ def test_filter_models(models, expected):
     assert len(output_df) == expected
 
 
-@pytest.mark.parametrize(
+@pytest.mark.parametrize(
+    "query, expected",
+    [
+        ("model1;model3", 2),
+        ("model1;model4", 1),
+        ("model1;model2;model3", 3),
+        ("model1", 1),
+        ("", 3),
+    ],
+)
 def test_filter_queries(query, expected):
     df = pd.DataFrame(
         {
-            COL_NAME_RETRIEVAL_MODEL: [
+            COL_NAME_RETRIEVAL_MODEL: [
+                "model1",
+                "model2",
+                "model3",
+            ],
+            COL_NAME_RERANKING_MODEL: [
+                "model4",
+                "model5",
+                "model6",
+            ],
         }
     )
     output_df = filter_queries(query, df)
@@ -103,10 +126,10 @@ def test_filter_queries(query, expected):
         (TaskType.long_doc, "air_bench_2404", True, NUM_DOC_BENCHMARKS_24_04),
         (TaskType.qa, "air_bench_2405", False, NUM_QA_BENCHMARKS_24_05),
         (TaskType.long_doc, "air_bench_2405", False, NUM_DOC_BENCHMARKS_24_05),
-    ]
+    ],
 )
 def test_get_default_cols(task_type, slug, add_fix_cols, expected):
-    attr_cols = [
+    attr_cols = ["Rank 🏆", "Retrieval Method", "Reranking Model", "Revision", "Submission Date", "Average ⬆️"]
     cols, types = get_default_cols(task_type, slug)
     cols_set = frozenset(cols)
     attrs_set = frozenset(attr_cols)
@@ -119,44 +142,54 @@ def test_get_default_cols(task_type, slug, add_fix_cols, expected):
 @pytest.mark.parametrize(
     "task_type, domains, languages, expected",
     [
-        (
+        (
+            TaskType.qa,
+            ["wiki", "news"],
+            [
+                "zh",
+            ],
+            ["wiki_zh", "news_zh"],
+        ),
+        (
+            TaskType.qa,
+            [
+                "law",
+            ],
+            ["zh", "en"],
+            ["law_en"],
+        ),
         (
             TaskType.long_doc,
             ["healthcare"],
             ["zh", "en"],
             [
-            ]
-        )
-    ]
+                "healthcare_en_pubmed_100k_200k_1",
+                "healthcare_en_pubmed_100k_200k_2",
+                "healthcare_en_pubmed_100k_200k_3",
+                "healthcare_en_pubmed_40k_50k_5_merged",
+                "healthcare_en_pubmed_30k_40k_10_merged",
+            ],
+        ),
+    ],
 )
 def test_get_selected_cols(task_type, domains, languages, expected):
     slug = "air_bench_2404"
     cols = get_selected_cols(task_type, slug, domains, languages)
     assert sorted(cols) == sorted(expected)
 
+
 @pytest.mark.parametrize("reset_rank", [False])
 def test_select_columns(toy_df, reset_rank):
     expected = [
-        ["news"],
-        ["zh"],
-        version_slug="air_bench_2404",
-        reset_ranking=reset_rank
-    )
+        "Rank 🏆",
+        "Retrieval Method",
+        "Reranking Model",
+        "Revision",
+        "Submission Date",
+        "Average ⬆️",
+        "news_zh",
+    ]
+    df_result = select_columns(toy_df, ["news"], ["zh"], version_slug="air_bench_2404", reset_ranking=reset_rank)
     assert len(df_result.columns) == len(expected)
     if reset_rank:
         assert df_result["Average ⬆️"].equals(df_result["news_zh"])
@@ -170,20 +203,10 @@ def test_select_columns(toy_df, reset_rank):
         (False, True),
         (True, True),
         (True, False),
-    ]
+    ],
 )
 def test__update_df_elem(toy_df, reset_rank, show_anony):
-    df = _update_df_elem(
-        TaskType.qa,
-        "AIR-Bench_24.04",
-        toy_df,
-        ["news"],
-        ["zh"],
-        [],
-        "",
-        show_anony,
-        reset_rank
-    )
+    df = _update_df_elem(TaskType.qa, "AIR-Bench_24.04", toy_df, ["news"], ["zh"], [], "", show_anony, reset_rank)
     if show_anony:
         assert df.shape[0] == 4
     else:
@@ -201,19 +224,14 @@ def test__update_df_elem(toy_df, reset_rank, show_anony):
         ("AIR-Bench_24.04", TaskType.qa),
         ("AIR-Bench_24.04", TaskType.long_doc),
         ("AIR-Bench_24.05", TaskType.qa),
-        ("AIR-Bench_24.05", TaskType.long_doc)
-    ]
+        ("AIR-Bench_24.05", TaskType.long_doc),
+    ],
 )
 def test_get_leaderboard_df(version, task_type):
     from src.loaders import load_raw_eval_results
     from src.models import LeaderboardDataStore, get_safe_name
+
+    raw_data = load_raw_eval_results(cur_fp.parents[1] / f"toydata/eval_results/{version}")
     ds = LeaderboardDataStore(version, get_safe_name(version), raw_data=raw_data)
-    df = get_leaderboard_df(
-        task_type,
-        "ndcg_at_10"
-    )
-    assert df.shape[0] == 1
+    df = get_leaderboard_df(ds, task_type, "ndcg_at_10")
+    assert df.shape[0] == 1
tests/test_utils.py
DELETED
@@ -1,136 +0,0 @@
-import pandas as pd
-import pytest
-
-from app import update_table
-from src.columns import (
-    COL_NAME_AVG,
-    COL_NAME_IS_ANONYMOUS,
-    COL_NAME_RANK,
-    COL_NAME_RERANKING_MODEL,
-    COL_NAME_RETRIEVAL_MODEL,
-    COL_NAME_REVISION,
-    COL_NAME_TIMESTAMP,
-)
-from src.utils import (
-    filter_models,
-    filter_queries,
-    get_default_cols,
-    get_iso_format_timestamp,
-    search_table,
-    select_columns,
-    update_doc_df_elem,
-)
-
-
-@pytest.fixture
-def toy_df():
-    return pd.DataFrame(
-        {
-            "Retrieval Model": ["bge-m3", "bge-m3", "jina-embeddings-v2-base", "jina-embeddings-v2-base"],
-            "Reranking Model": ["bge-reranker-v2-m3", "NoReranker", "bge-reranker-v2-m3", "NoReranker"],
-            "Average ⬆️": [0.6, 0.4, 0.3, 0.2],
-            "wiki_en": [0.8, 0.7, 0.2, 0.1],
-            "wiki_zh": [0.4, 0.1, 0.4, 0.3],
-            "news_en": [0.8, 0.7, 0.2, 0.1],
-            "news_zh": [0.4, 0.1, 0.4, 0.3],
-        }
-    )
-
-
-@pytest.fixture
-def toy_df_long_doc():
-    return pd.DataFrame(
-        {
-            "Retrieval Model": ["bge-m3", "bge-m3", "jina-embeddings-v2-base", "jina-embeddings-v2-base"],
-            "Reranking Model": ["bge-reranker-v2-m3", "NoReranker", "bge-reranker-v2-m3", "NoReranker"],
-            "Average ⬆️": [0.6, 0.4, 0.3, 0.2],
-            "law_en_lex_files_300k_400k": [0.4, 0.1, 0.4, 0.3],
-            "law_en_lex_files_400k_500k": [0.8, 0.7, 0.2, 0.1],
-            "law_en_lex_files_500k_600k": [0.8, 0.7, 0.2, 0.1],
-            "law_en_lex_files_600k_700k": [0.4, 0.1, 0.4, 0.3],
-        }
-    )
-
-
-def test_filter_models(toy_df):
-    df_result = filter_models(
-        toy_df,
-        [
-            "bge-reranker-v2-m3",
-        ],
-    )
-    assert len(df_result) == 2
-    assert df_result.iloc[0]["Reranking Model"] == "bge-reranker-v2-m3"
-
-
-def test_search_table(toy_df):
-    df_result = search_table(toy_df, "jina")
-    assert len(df_result) == 2
-    assert df_result.iloc[0]["Retrieval Model"] == "jina-embeddings-v2-base"
-
-
-def test_filter_queries(toy_df):
-    df_result = filter_queries("jina", toy_df)
-    assert len(df_result) == 2
-    assert df_result.iloc[0]["Retrieval Model"] == "jina-embeddings-v2-base"
-
-
-
-
-
-def test_update_table_long_doc(toy_df_long_doc):
-    df_result = update_doc_df_elem(
-        toy_df_long_doc,
-        [
-            "law",
-        ],
-        [
-            "en",
-        ],
-        [
-            "bge-reranker-v2-m3",
-        ],
-        "jina",
-    )
-    print(df_result)
-
-
-def test_get_iso_format_timestamp():
-    timestamp_config, timestamp_fn = get_iso_format_timestamp()
-    assert len(timestamp_fn) == 14
-    assert len(timestamp_config) == 20
-    assert timestamp_config[-1] == "Z"
-
-
-def test_get_default_cols():
-    cols, types = get_default_cols("qa")
-    for c, t in zip(cols, types):
-        print(f"type({c}): {t}")
-    assert len(frozenset(cols)) == len(cols)
-
-
-def test_update_table():
-    df = pd.DataFrame(
-        {
-            COL_NAME_IS_ANONYMOUS: [False, False, False],
-            COL_NAME_REVISION: ["a1", "a2", "a3"],
-            COL_NAME_TIMESTAMP: ["2024-05-12T12:24:02Z"] * 3,
COL_NAME_TIMESTAMP: ["2024-05-12T12:24:02Z"] * 3,
|
| 118 |
-
COL_NAME_RERANKING_MODEL: ["NoReranker"] * 3,
|
| 119 |
-
COL_NAME_RETRIEVAL_MODEL: ["Foo"] * 3,
|
| 120 |
-
COL_NAME_RANK: [1, 2, 3],
|
| 121 |
-
COL_NAME_AVG: [0.1, 0.2, 0.3], # unsorted values
|
| 122 |
-
"wiki_en": [0.1, 0.2, 0.3],
|
| 123 |
-
}
|
| 124 |
-
)
|
| 125 |
-
results = update_table(
|
| 126 |
-
df,
|
| 127 |
-
"wiki",
|
| 128 |
-
"en",
|
| 129 |
-
["NoReranker"],
|
| 130 |
-
"",
|
| 131 |
-
show_anonymous=False,
|
| 132 |
-
reset_ranking=False,
|
| 133 |
-
show_revision_and_timestamp=False,
|
| 134 |
-
)
|
| 135 |
-
# keep the RANK as the same regardless of the unsorted averages
|
| 136 |
-
assert results[COL_NAME_RANK].to_list() == [1, 2, 3]
tests/toydata/test_data.json
DELETED
@@ -1,98 +0,0 @@
-[
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "bge-reranker-v2-m3",
-            "task": "long_doc",
-            "metric": "ndcg_at_1"
-        },
-        "results": [
-            {
-                "domain": "law",
-                "lang": "en",
-                "dataset": "lex_files_500K-600K",
-                "value": 0.75723
-            }
-        ]
-    },
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "bge-reranker-v2-m3",
-            "task": "long_doc",
-            "metric": "ndcg_at_3"
-        },
-        "results": [
-            {
-                "domain": "law",
-                "lang": "en",
-                "dataset": "lex_files_500K-600K",
-                "value": 0.69909
-            }
-        ]
-    },
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "bge-reranker-v2-m3",
-            "task": "qa",
-            "metric": "ndcg_at_1"
-        },
-        "results": [
-            {
-                "domain": "wiki",
-                "lang": "en",
-                "dataset": "unknown",
-                "value": 0.69083
-            }
-        ]
-    },
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "bge-reranker-v2-m3",
-            "task": "qa",
-            "metric": "ndcg_at_3"
-        },
-        "results": [
-            {
-                "domain": "wiki",
-                "lang": "en",
-                "dataset": "unknown",
-                "value": 0.73359
-            }
-        ]
-    },
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "bge-reranker-v2-m3",
-            "task": "qa",
-            "metric": "ndcg_at_1"
-        },
-        "results": [
-            {
-                "domain": "wiki",
-                "lang": "zh",
-                "dataset": "unknown",
-                "value": 0.78358
-            }
-        ]
-    },
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "bge-reranker-v2-m3",
-            "task": "qa",
-            "metric": "ndcg_at_3"
-        },
-        "results": [
-            {
-                "domain": "wiki",
-                "lang": "zh",
-                "dataset": "unknown",
-                "value": 0.78358
-            }
-        ]
-    }
-]
tests/toydata/test_results/bge-m3/NoReranker/results_2023-11-21T18-10-08.json
DELETED
@@ -1,98 +0,0 @@
-[
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "NoReranker",
-            "task": "long_doc",
-            "metric": "ndcg_at_1"
-        },
-        "results": [
-            {
-                "domain": "law",
-                "lang": "en",
-                "dataset": "lex_files_500K-600K",
-                "value": 0.45723
-            }
-        ]
-    },
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "NoReranker",
-            "task": "long_doc",
-            "metric": "ndcg_at_3"
-        },
-        "results": [
-            {
-                "domain": "law",
-                "lang": "en",
-                "dataset": "lex_files_500K-600K",
-                "value": 0.49909
-            }
-        ]
-    },
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "NoReranker",
-            "task": "qa",
-            "metric": "ndcg_at_1"
-        },
-        "results": [
-            {
-                "domain": "wiki",
-                "lang": "en",
-                "dataset": "unknown",
-                "value": 0.49083
-            }
-        ]
-    },
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "NoReranker",
-            "task": "qa",
-            "metric": "ndcg_at_3"
-        },
-        "results": [
-            {
-                "domain": "wiki",
-                "lang": "en",
-                "dataset": "unknown",
-                "value": 0.43359
-            }
-        ]
-    },
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "NoReranker",
-            "task": "qa",
-            "metric": "ndcg_at_1"
-        },
-        "results": [
-            {
-                "domain": "wiki",
-                "lang": "zh",
-                "dataset": "unknown",
-                "value": 0.78358
-            }
-        ]
-    },
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "NoReranker",
-            "task": "qa",
-            "metric": "ndcg_at_3"
-        },
-        "results": [
-            {
-                "domain": "wiki",
-                "lang": "zh",
-                "dataset": "unknown",
-                "value": 0.78358
-            }
-        ]
-    }
-]
tests/toydata/test_results/bge-m3/bge-reranker-v2-m3/results_2023-11-21T18-10-08.json
DELETED
@@ -1,98 +0,0 @@
-[
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "bge-reranker-v2-m3",
-            "task": "long_doc",
-            "metric": "ndcg_at_1"
-        },
-        "results": [
-            {
-                "domain": "law",
-                "lang": "en",
-                "dataset": "lex_files_500K-600K",
-                "value": 0.75723
-            }
-        ]
-    },
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "bge-reranker-v2-m3",
-            "task": "long_doc",
-            "metric": "ndcg_at_3"
-        },
-        "results": [
-            {
-                "domain": "law",
-                "lang": "en",
-                "dataset": "lex_files_500K-600K",
-                "value": 0.69909
-            }
-        ]
-    },
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "bge-reranker-v2-m3",
-            "task": "qa",
-            "metric": "ndcg_at_1"
-        },
-        "results": [
-            {
-                "domain": "wiki",
-                "lang": "en",
-                "dataset": "unknown",
-                "value": 0.69083
-            }
-        ]
-    },
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "bge-reranker-v2-m3",
-            "task": "qa",
-            "metric": "ndcg_at_3"
-        },
-        "results": [
-            {
-                "domain": "wiki",
-                "lang": "en",
-                "dataset": "unknown",
-                "value": 0.73359
-            }
-        ]
-    },
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "bge-reranker-v2-m3",
-            "task": "qa",
-            "metric": "ndcg_at_1"
-        },
-        "results": [
-            {
-                "domain": "wiki",
-                "lang": "zh",
-                "dataset": "unknown",
-                "value": 0.78358
-            }
-        ]
-    },
-    {
-        "config": {
-            "retrieval_model": "bge-m3",
-            "reranking_model": "bge-reranker-v2-m3",
-            "task": "qa",
-            "metric": "ndcg_at_3"
-        },
-        "results": [
-            {
-                "domain": "wiki",
-                "lang": "zh",
-                "dataset": "unknown",
-                "value": 0.78358
-            }
-        ]
-    }
-]