import csv
from collections import defaultdict

import gradio as gr
import pandas as pd


def strip_colname(x):
    """Return column name *x* without its leading ``score_`` prefix, if any."""
    prefix = "score_"
    if x.startswith(prefix):
        return x[len(prefix):]
    return x


INTRO = """The current leaderboard displays performance across all filtered directions based on the dev subset of BOUQuET.
A smarter leaderboard and the code for reproducing the evaluations will be published soon!
"""

LANGS_EXPLANATION = """## Languages

For the description of languages, please refer to https://huggingface.co/datasets/facebook/bouquet#languages.
"""

# NOTE(review): the `xcomet_both` entry below carries an empty Markdown link
# (`[]()`); the model name and URL still need to be filled in.
METRICS_EXPLANATION = """## Metrics

1. `metricx_both`: [google/metricx-24-hybrid-xl-v2p6](https://huggingface.co/google/metricx-24-hybrid-xl-v2p6) score based on both source and reference. **Attention: lower is better!**
2. `xcomet_both`: []() score based on both source and reference.
3. `CHRFpp`: ChrF++ score ([sacrebleu](https://github.com/mjpost/sacrebleu) implementation) based on reference.
4. `glotlid_ref`: probability, as predicted by the [GlotLID model](https://huggingface.co/cis-lmu/glotlid), that translation and reference are in the same language.
"""

SYSTEMS_EXPLANATION = """## Systems

Descriptions of the implementation of the systems will come out later.
"""


def leaderboard_tab():
    """Build the "Leaderboard" tab of the BOUQuET demo.

    Reads ``data/benchmark_stats.tsv`` and creates, inside a ``gr.Tab``:

    * a systems-ranking table filtered by level, source and target language;
    * a per-language bar plot for a chosen system, direction, metric and level;
    * the static explanation sections.

    Presumably meant to be called inside an enclosing ``gr.Blocks`` context,
    since it creates components but never launches anything itself.
    """
    stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)
    stats.columns = [strip_colname(c) for c in stats.columns]

    metrics = ['metricx_both', 'xcomet_both', 'CHRFpp', 'glotlid_ref']
    systems = sorted(set(stats["system"]))
    levels = ["sentence_level", "paragraph_level"]

    # Sentinel dropdown entries (not real languages / systems).
    ALL = "ALL"
    MEAN = "Average"
    BEST = "Best"
    XX2EN = "Everything-into-English"
    EN2XX = "English-into-Everything"

    # Which targets exist for each source (and vice versa), so that the two
    # language dropdowns can restrict each other to directions present in the data.
    lang_src2tgt = defaultdict(set)
    lang_tgt2src = defaultdict(set)
    langs_src = set()
    langs_tgt = set()
    for src_lang, tgt_lang in stats[["src_lang", "tgt_lang"]].drop_duplicates().values:
        lang_src2tgt[src_lang].add(tgt_lang)
        lang_tgt2src[tgt_lang].add(src_lang)
        langs_src.add(src_lang)
        langs_tgt.add(tgt_lang)

    with gr.Tab("Leaderboard"):
        gr.Markdown("# BOUQuET translation leaderboard")
        gr.Markdown(INTRO)

        gr.Markdown("## Systems ranking")

        # Inputs
        gr_level = gr.Dropdown(levels, value="sentence_level", label="Level")
        gr_src_lang = gr.Dropdown([ALL] + sorted(langs_src), value=ALL, label="Source lang")
        gr_tgt_lang = gr.Dropdown([ALL] + sorted(langs_tgt), value=ALL, label="Target lang")

        # Interactivity
        inputs = [gr_level, gr_src_lang, gr_tgt_lang]

        def get_lb(level, src_lang, tgt_lang):
            """Return a styled table of per-system metric means for the given filters."""
            filtered = stats[stats["level"].eq(level)]
            if src_lang != ALL:
                filtered = filtered[filtered["src_lang"].eq(src_lang)]
            if tgt_lang != ALL:
                filtered = filtered[filtered["tgt_lang"].eq(tgt_lang)]
            # Ascending sort on metricx_both: for that metric lower is better,
            # so the best system ends up on top.
            means = (
                filtered.groupby(['system'])[metrics]
                .mean()
                .reset_index()
                .sort_values('metricx_both')
            )
            means.columns = [strip_colname(c) for c in means.columns]
            styler = means.style.background_gradient().format(precision=4)
            return styler

        df_all = get_lb(*[inp.value for inp in inputs])
        gr_df = gr.Dataframe(df_all)

        for inp in inputs:
            inp.change(fn=get_lb, inputs=inputs, outputs=gr_df)

        # Interdependency of the controls
        def src2tgt(src_lang, tgt_lang):
            """Restrict the target dropdown to targets available for *src_lang*."""
            if src_lang == ALL:
                choices = [ALL] + sorted(langs_tgt)
            else:
                # .get() so that an unknown key does not grow the defaultdict.
                choices = [ALL] + sorted(lang_src2tgt.get(src_lang, ()))
            # The previously selected target may not exist for the new source;
            # fall back to ALL rather than leaving a value outside `choices`.
            value = tgt_lang if tgt_lang in choices else ALL
            return gr.update(choices=choices, value=value)

        def tgt2src(src_lang, tgt_lang):
            """Restrict the source dropdown to sources available for *tgt_lang*."""
            if tgt_lang == ALL:
                choices = [ALL] + sorted(langs_src)
            else:
                choices = [ALL] + sorted(lang_tgt2src.get(tgt_lang, ()))
            # Same fallback as in src2tgt, for the source side.
            value = src_lang if src_lang in choices else ALL
            return gr.update(choices=choices, value=value)

        gr_src_lang.input(fn=src2tgt, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_tgt_lang)
        gr_tgt_lang.input(fn=tgt2src, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_src_lang)

        gr.Markdown("## Languages difficulty")

        gr_system = gr.Dropdown([MEAN, BEST] + systems, value=MEAN, label="Translation system")
        gr_direction = gr.Dropdown([XX2EN, EN2XX], value=XX2EN, label="Translation direction")
        gr_metric = gr.Dropdown(metrics, label="Quality metric", value="metricx_both")
        gr_level2 = gr.Dropdown(levels, value="sentence_level", label="Level")
        bar_controls = [gr_system, gr_direction, gr_metric, gr_level2]

        def hist_kwargs(system, direction, metric, level):
            """Compute the bar-plot settings (data frame plus axes) for the given controls."""
            # decide on the data to process
            if direction == EN2XX:
                direction_filter = stats['src_lang'].eq('eng_Latn')
                lang_col = "tgt_lang"
            else:
                direction_filter = stats['tgt_lang'].eq('eng_Latn')
                lang_col = "src_lang"
            if system in (MEAN, BEST):
                # Aggregate over systems: keep every row with a truthy system name.
                system_filter = stats["system"].astype(bool)
            else:
                system_filter = stats['system'].eq(system)
            subset = stats[system_filter & direction_filter & stats["level"].eq(level)]

            # Compute the per-language aggregate
            lower_is_better = metric == "metricx_both"
            grouped = subset.groupby(lang_col)[metric]
            if system == BEST:
                means = grouped.min() if lower_is_better else grouped.max()
            else:
                means = grouped.mean()
            means = means.sort_values(ascending=lower_is_better)
            means = means.to_frame().reset_index()
            return dict(
                value=means,
                x=lang_col,
                y=metric,
                x_label_angle=-90,
                height=500,
                sort="y",
            )

        def get_hist(system, direction, metric, level):
            """Return a Gradio update carrying the bar plot for the given controls."""
            return gr.update(**hist_kwargs(system, direction, metric, level))

        # Build the initial plot from the plain settings rather than from a
        # gr.update() result, so that only plot arguments reach the constructor
        # (gr.update may attach extra bookkeeping keys -- TODO confirm).
        gr_barplot = gr.BarPlot(**hist_kwargs(*[x.value for x in bar_controls]))

        for inp in bar_controls:
            inp.change(fn=get_hist, inputs=bar_controls, outputs=gr_barplot)

        gr.Markdown(LANGS_EXPLANATION)
        gr.Markdown(METRICS_EXPLANATION)
        gr.Markdown(SYSTEMS_EXPLANATION)