feat: add versioning for the qa retrieval
- app.py +17 -8
- src/envs.py +1 -1
- src/utils.py +0 -6
app.py CHANGED

@@ -14,7 +14,7 @@ from src.display.css_html_js import custom_css
 from src.envs import (
     API,
     EVAL_RESULTS_PATH,
-    REPO_ID, DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, METRIC_LIST, LATEST_BENCHMARK_VERSION, COL_NAME_RERANKING_MODEL
+    REPO_ID, DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, METRIC_LIST, LATEST_BENCHMARK_VERSION, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL, BM25_LINK
 )
 from src.loaders import (
     load_eval_results
@@ -22,7 +22,8 @@ from src.loaders import (
 from src.utils import (
     update_metric,
     set_listeners,
-    reset_rank
+    reset_rank,
+    remove_html
 )
 from src.display.gradio_formatting import (
     get_version_dropdown,
@@ -183,6 +184,7 @@ with demo:
                lb_df_retriever = datastore.leaderboard_df_qa[datastore.leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
                lb_df_retriever = reset_rank(lb_df_retriever)
                lb_table_retriever = get_leaderboard_table(lb_df_retriever, datastore.types_qa)
+
                # Dummy leaderboard for handling the case when the user uses backspace key
                hidden_lb_df_retriever = datastore.raw_df_qa[datastore.raw_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
                hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
@@ -222,9 +224,8 @@ with demo:
                lb_table_retriever,
                queue=True
            )
-            """
            with gr.TabItem("Reranking Only", id=12):
-                lb_df_reranker =
+                lb_df_reranker = datastore.leaderboard_df_qa[datastore.leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
                lb_df_reranker = reset_rank(lb_df_reranker)
                reranking_models_reranker = lb_df_reranker[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
                with gr.Row():
@@ -232,11 +233,18 @@ with demo:
                        selected_rerankings_reranker = get_reranking_dropdown(reranking_models_reranker)
                    with gr.Column(scale=1):
                        search_bar_reranker = gr.Textbox(show_label=False, visible=False)
-                lb_table_reranker = get_leaderboard_table(lb_df_reranker,
-
+                lb_table_reranker = get_leaderboard_table(lb_df_reranker, datastore.types_qa)
+
+                hidden_lb_df_reranker = datastore.raw_df_qa[datastore.raw_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
                hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
                hidden_lb_table_reranker = get_leaderboard_table(
-                    hidden_lb_df_reranker,
+                    hidden_lb_df_reranker, datastore.types_qa, visible=False
+                )
+
+                selected_version.change(
+                    update_datastore,
+                    [selected_version,],
+                    [selected_domains, selected_langs, selected_rerankings_reranker, lb_table_reranker, hidden_lb_table_reranker]
                )

                set_listeners(
@@ -244,6 +252,7 @@ with demo:
                    lb_table_reranker,
                    hidden_lb_table_reranker,
                    search_bar_reranker,
+                    selected_version,
                    selected_domains,
                    selected_langs,
                    selected_rerankings_reranker,
@@ -261,11 +270,11 @@ with demo:
                    search_bar_reranker,
                    show_anonymous,
                    show_revision_and_timestamp,
-                    selected_version,
                ],
                lb_table_reranker,
                queue=True
            )
+            """
            with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
                with gr.Row():
                    with gr.Column(min_width=320):
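The core of this change is the selected_version.change(...) wiring, which refreshes the "Reranking Only" leaderboard when the user picks a different benchmark version. Below is a minimal self-contained sketch of that Gradio pattern, not the Space's actual code: the real update_datastore also rebuilds the domain/language/reranker selectors, and its body here is a hypothetical stand-in.

import gradio as gr
import pandas as pd

VERSIONS = ["AIR-Bench_24.04", "AIR-Bench_24.05"]

def update_datastore(version: str) -> pd.DataFrame:
    # Hypothetical stand-in: reload the leaderboard for the chosen version.
    return pd.DataFrame({
        "Rank": [1, 2],
        "Reranking Model": ["some-reranker", "NoReranker"],
        "Version": [version, version],
    })

with gr.Blocks() as demo:
    selected_version = gr.Dropdown(choices=VERSIONS, value=VERSIONS[0], label="Benchmark version")
    lb_table_reranker = gr.Dataframe(value=update_datastore(VERSIONS[0]))
    # Same shape as the diff: inputs=[selected_version], outputs=[...tables and selectors...].
    selected_version.change(update_datastore, [selected_version], [lb_table_reranker])

demo.launch()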
src/envs.py CHANGED

@@ -30,7 +30,7 @@ BENCHMARK_VERSION_LIST = [
     "AIR-Bench_24.05",
 ]
 
-LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[
+LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[0]
 DEFAULT_METRIC_QA = "ndcg_at_10"
 DEFAULT_METRIC_LONG_DOC = "recall_at_10"
 METRIC_LIST = [
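LATEST_BENCHMARK_VERSION is what app.py imports to seed the version dropdown. A hedged sketch of that relationship follows; the real get_version_dropdown lives in src/display/gradio_formatting and may differ, and the list ordering here is assumed.

import gradio as gr

BENCHMARK_VERSION_LIST = ["AIR-Bench_24.04", "AIR-Bench_24.05"]  # assumed ordering
LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[0]

def get_version_dropdown() -> gr.Dropdown:
    # Hypothetical re-implementation: default the dropdown to the pinned version.
    return gr.Dropdown(
        choices=BENCHMARK_VERSION_LIST,
        value=LATEST_BENCHMARK_VERSION,
        label="Benchmark version",
    )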
src/utils.py CHANGED

@@ -177,10 +177,6 @@ def _update_table(
     show_revision_and_timestamp: bool = False
 ):
     version_slug = get_safe_name(version)[-4:]
-    if isinstance(hidden_df, str):
-        print(f"task: {task}")
-        print(f"version: {version}")
-        print(f"hidden_df is a string: {hidden_df}")
     filtered_df = hidden_df.copy()
     if not show_anonymous:
         filtered_df = filtered_df[~filtered_df[COL_NAME_IS_ANONYMOUS]]
@@ -257,7 +253,6 @@ def upload_file(filepath: str):
     return filepath
 
 
-
 def get_iso_format_timestamp():
     # Get the current timestamp with UTC as the timezone
     current_timestamp = datetime.now(timezone.utc)
@@ -377,7 +372,6 @@ def get_leaderboard_df(datastore, task: str, metric: str) -> pd.DataFrame:
     for v in raw_data:
         all_data_json += v.to_dict(task=task, metric=metric)
     df = pd.DataFrame.from_records(all_data_json)
-    # print(f'dataframe created: {df.shape}')
 
     _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
 
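The lines removed from _update_table were leftover debug prints; the surviving logic filters anonymous submissions with a boolean mask. A minimal sketch of that pandas pattern, where only the COL_NAME_IS_ANONYMOUS identifier comes from the diff and the column label and data are hypothetical:

import pandas as pd

COL_NAME_IS_ANONYMOUS = "Anonymous Submission"  # hypothetical label

hidden_df = pd.DataFrame({
    "Retrieval Model": ["bge-m3", "bm25", "unknown"],
    COL_NAME_IS_ANONYMOUS: [False, False, True],
})

show_anonymous = False
filtered_df = hidden_df.copy()
if not show_anonymous:
    # Keep only rows whose anonymous flag is False, as in _update_table.
    filtered_df = filtered_df[~filtered_df[COL_NAME_IS_ANONYMOUS]]
print(filtered_df)  # the anonymous "unknown" row is dropped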