import gradio as gr
import pandas as pd

###########################################
#                 Load Data               #
###########################################

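# Pre-computed leaderboard results, one JSONL row per evaluated model. The tables are expected to
# contain at least 'Model', 'Category' and 'Overall' (used for searching, filtering and sorting
# below), plus per-domain scores and token/cost columns.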
llm_judge_filename = "llm_judge_results.jsonl"
response_generation_filename = "report_generation.jsonl"
response_generation_w_docs_filename = "report_generation_w_docs.jsonl"

def load_filename_into_df(filename):
    df = pd.read_json(filename, lines=True)
    return df


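# Background colors for the 'Category' column; keys must match the category labels in the data files.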
color_map = {
    "Closed-source Instruct": "#4492F7",
    "Open-weight Instruct": "#0856f1",
    "Closed-source Reasoning": "#fac05d",
    "Open-weight Reasoning": "#f59c03",
}

CAPTION_V2 = f"""**ProfBench**: Over 7,000 brand-new expert-authored response–criterion pairs across 80 professional tasks across PhD STEM (Chemistry, Physics) and MBA Services (Finance, Consulting) domains. \n
ProfBench is a high-quality, text-only dataset that represent the complex reasoning tasks faced by professionals in fields like finance and chemistry. We're not talking about simple Q&A or retrieval-based tasks. We're talking about multi-page assignments that require deep domain knowledge and reasoning. Can AI generate comprehensive reports by applying the nuanced reasoning that a PhD-level physicist/chemist or an MBA-level consultant/financier would have? \n
[Blog](https://huggingface.co/blog/nvidia/profbench) | [Paper](https://arxiv.org/abs/2510.18941) | [Data](https://huggingface.co/datasets/nvidia/ProfBench) | [Code](https://github.com/NVlabs/ProfBench) | [Nemo Evaluator SDK](https://github.com/NVIDIA-NeMo/Evaluator)\n
Want to see your favorite models added? Run it with [Nemo Evaluator SDK for scalable evaluation](https://github.com/NVIDIA-NeMo/Evaluator) or [ProfBench code for quick evaluation](https://github.com/NVlabs/ProfBench), send us the scores or ping zhilinw/viviennez [at] nvidia.com to run it for you!"""


def color_model_type_column(df, color_map):
    """
    Apply a background color to the 'Category' column of the DataFrame based on a given color mapping.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the 'Category' column.
        color_map (dict): A dictionary mapping category names to colors.

    Returns:
        pd.io.formats.style.Styler: The styled DataFrame.
    """

    # Return a background-color style for known categories; leave unknown values unstyled
    def apply_color(val):
        color = color_map.get(val)
        return f"background-color: {color}" if color else ""

    # Number formats per column: one decimal for scores, integers for counts and rank, two decimals for cost
    format_dict = {col: "{:.1f}" for col in df.columns if col not in ["Model", "Category", "Input Tokens", "Output Tokens", "Cost"]}
    format_dict["Response Characters"] = "{:d}"
    format_dict["Input Tokens"] = "{:d}"
    format_dict["Output Tokens"] = "{:d}"
    format_dict[""] = "{:d}"  # rank column added in regex_table
    format_dict["Cost"] = "{:.2f}"

    return df.style.map(apply_color, subset=["Category"]).format(format_dict, na_rep="")


def regex_table(dataframe, regex, filter_button, style=True):
    """
    Filter the leaderboard: keep rows whose 'Model' matches any of the comma-separated regexes
    in `regex` and whose 'Category' matches the checked model types in `filter_button`,
    then sort by 'Overall' and (optionally) return a styled table.
    """
    # Split regex statement by comma and trim whitespace around regexes
    regex_list = [x.strip() for x in regex.split(",")]
    # Join the list into a single regex pattern with '|' acting as OR
    combined_regex = "|".join(regex_list)

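    # Keep a row only if every part of its 'Category' label (e.g. "Open-weight" and "Reasoning")
    # is still checked; unchecking a box removes all rows whose category contains that word.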
    if isinstance(filter_button, (list, str)):
        if "Open-weight" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Open-weight", case=False, na=False)]
        if "Closed-source" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Closed-source", case=False, na=False)]
        if "Reasoning" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Reasoning", case=False, na=False)]
        if "Instruct" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Instruct", case=False, na=False)]
    
    data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]

    # Rank by the Overall score, highest first
    data = data.sort_values(by="Overall", ascending=False)

    data.reset_index(drop=True, inplace=True)

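    # Prepend a 1-based rank column with a blank header, reflecting the sort by 'Overall'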
    data.insert(0, "", range(1, 1 + len(data)))

    if style:
        # apply color
        data = color_model_type_column(data, color_map)

    return data


# Default Gradio theme with a blue primary hue
theme = gr.themes.Default(primary_hue="blue")

#############################################
#                 Gradio App                #
#############################################

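# Three leaderboard tabs share the same layout: a model search box, category checkboxes,
# and a results table backed by a hidden copy of the full data.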
with gr.Blocks(theme=theme) as app:
    # Header caption, followed by one tab per leaderboard
    with gr.Row():
        with gr.Column(scale=6):
            gr.Markdown(CAPTION_V2)

    with gr.Tabs(elem_id="outer-tabs", elem_classes="tabs-big") as tabs_big:
        with gr.TabItem("Report Generation"):
            with gr.Row():
                with gr.Column(scale=7):
                    gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 29 Oct 2025.")
                
            with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                with gr.TabItem("Leaderboard"):
                    with gr.Row():
                        search_1 = gr.Textbox(
                            label="Model Search (delimit with , )",
                            placeholder="Model Search (delimit with , )",
                            show_label=False,
                            scale=8,
                        )
                        model_types_1 = gr.CheckboxGroup(
                            ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            show_label=False,
                            scale=8,
                        )

                    with gr.Row():
                        col_types_response_generation = ["number"] + ["markdown"] + ["str"] + ["number"] * 12
                        df_response_generation = load_filename_into_df(response_generation_filename)

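                        # Hidden, unfiltered copy of the full table; every search/checkbox change
                        # feeds it back through regex_table to rebuild the visible table below.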
                        rewardbench_table_hidden = gr.Dataframe(
                            df_response_generation.values,
                            datatype=col_types_response_generation,
                            headers=df_response_generation.columns.tolist(),
                            visible=False,
                        )

                        rewardbench_table = gr.Dataframe(
                            regex_table(
                                df_response_generation.copy(),
                                "",
                                ["Open-weight", "Closed-source", "Reasoning", "Instruct"]
                            ),
                            datatype=col_types_response_generation,
                            headers=df_response_generation.columns.tolist(),
                            elem_id="response_generation_dataframe",
                            row_count=(25, "dynamic"),
                        )

        with gr.TabItem("LLM Judge"):
            with gr.Row():
                gr.Markdown("LLM Judge Leaderboard: LLM Judges are evaluated based on whether they can accurately predict the human-labelled criterion fulfilment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 across 3486 samples but also whether LLM-Judge display bias towards/against any models using a Bias Index. The Overall score is calculated based on Overall F1 - Bias Index. \nEvaluation and cost estimations last performed on 20 Sep 2025.")
            with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                with gr.TabItem("Leaderboard"):
                    with gr.Row():
                        search_1_v1 = gr.Textbox(
                            label="Model Search (delimit with , )",
                            placeholder="Model Search (delimit with , )",
                            show_label=False,
                        )
                        model_types_1_v1 = gr.CheckboxGroup(
                            ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            label="Model Types",
                            show_label=False,
                            #  info="Which model types to include.",
                        )
                        
                    with gr.Row():
                        col_types_llm_judge = ["number"] + ["markdown"] + ["str"] + ["number"] * 16
                        df_llm_judge = load_filename_into_df(llm_judge_filename)

                        rewardbench_table_hidden_v1 = gr.Dataframe(
                            df_llm_judge.values,
                            datatype=col_types_llm_judge,
                            headers=df_llm_judge.columns.tolist(),
                            visible=False,
                        )

                        rewardbench_table_v1 = gr.Dataframe(
                            regex_table(
                                df_llm_judge.copy(),
                                "",
                                ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            ),
                            datatype=col_types_llm_judge,
                            headers=df_llm_judge.columns.tolist(),
                            elem_id="llm_judge_dataframe",
                            row_count=(25, "dynamic"),
                        )
        
        with gr.TabItem("Report Generation w Docs"):
            with gr.Row():
                with gr.Column(scale=7):
                    gr.Markdown("Report Generation Leaderboard with Grounding Documents: LLMs generate reports with the human-curated reference documents as context. Results below are based on the full dataset and gpt-oss-120b (mixed) as judge. \nEvaluation and cost estimations last performed on 20 Sep 2025.")
                
            with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                with gr.TabItem("Leaderboard"):
                    with gr.Row():
                        search_1_v2 = gr.Textbox(
                            label="Model Search (delimit with , )",
                            placeholder="Model Search (delimit with , )",
                            show_label=False,
                            scale=8,
                        )
                        model_types_1_v2 = gr.CheckboxGroup(
                            ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            show_label=False,
                            scale=8,
                        )

                    with gr.Row():
                        col_types_response_generation = ["number"] + ["markdown"] + ["str"] + ["number"] * 12
                        df_response_generation_w_docs = load_filename_into_df(response_generation_w_docs_filename)

                        rewardbench_table_hidden_v2 = gr.Dataframe(
                            df_response_generation_w_docs.values,
                            datatype=col_types_response_generation,
                            headers=df_response_generation_w_docs.columns.tolist(),
                            visible=False,
                        )

                        rewardbench_table_v2 = gr.Dataframe(
                            regex_table(
                                df_response_generation_w_docs.copy(),
                                "",
                                ["Open-weight", "Closed-source", "Reasoning", "Instruct"]
                            ),
                            datatype=col_types_response_generation,
                            headers=df_response_generation_w_docs.columns.tolist(),
                            elem_id="response_generation_dataframe",
                            row_count=(25, "dynamic"),
                        )
            
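    # Wire up the filters: any change to a search box or checkbox group re-runs regex_table on the
    # corresponding hidden table and replaces the visible table with the filtered, styled result.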
    search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
    search_1_v1.change(
        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
    )
    search_1_v2.change(
        regex_table, inputs=[rewardbench_table_hidden_v2, search_1_v2, model_types_1_v2], outputs=rewardbench_table_v2
    )

    model_types_1.change(
        regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
    )
    model_types_1_v1.change(
        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
    )

    model_types_1_v2.change(
        regex_table, inputs=[rewardbench_table_hidden_v2, search_1_v2, model_types_1_v2], outputs=rewardbench_table_v2
    )

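    # Collapsible accordions: FAQ, metric definitions, and citation / credits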
    with gr.Row():
        with gr.Accordion("📚 Frequently Asked Questions", open=False):
            faq_box = gr.Textbox(
                value=r"""1. How is the cost calculated? We take the per-token prices from https://openrouter.ai/models and multiply them by the total input/output tokens in each evaluation.
2. How can I run the Report Generation Leaderboard with Grounding Documents? This benchmark cannot be run externally at the moment, since we are unable to release the required grounding documents. We are working on it.""",
                lines=2,
                label="FAQ",
                elem_id="faq_box",
            )
        
    with gr.Row():
        with gr.Accordion("📚 Understand the Metrics", open=False):
            metrics_box = gr.Textbox(
                value=r"""Response Generation (w Docs): We first generate the response, then grade it against the human-annotated rubrics. Finally, we calculate the proportion of criteria satisfied by each response, weighted by their criterion weights, to derive a score for each response.
LLM Judge: We calculate the macro-F1 of the LLM-judge-predicted criterion fulfilment against the human-labelled criterion fulfilment to get Overall F1. We then calculate the bias for each model as the mean of predicted fulfilment minus the mean of human-labelled fulfilment. The Bias Index is max(bias) - min(bias) across models. Overall is calculated as Overall F1 - Bias Index.""",
                lines=4,
                label="Metrics",
                elem_id="metrics_box",
            )


    with gr.Row():
        with gr.Accordion("📚 Citation and Credits", open=False):
            citation_button = gr.Textbox(
                value=r"""@misc{wang2025profbenchmultidomainrubricsrequiring,
      title={ProfBench: Multi-Domain Rubrics requiring Professional Knowledge to Answer and Judge}, 
      author={Zhilin Wang and Jaehun Jung and Ximing Lu and Shizhe Diao and Ellie Evans and Jiaqi Zeng and Pavlo Molchanov and Yejin Choi and Jan Kautz and Yi Dong},
      year={2025},
      eprint={2510.18941},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2510.18941}, 
}""",
                lines=10,
                label="If you find the results helpful, please cite the following. ",
                elem_id="citation-button",
                show_copy_button=True,
            )
            gr.Textbox("Leaderboard adapted from allenai/reward-bench ", label="Leaderboard credits",)

app.launch()  # previously called .queue() before launch; it does not appear to be necessary for this app