Spaces:
Running
Running
| import streamlit as st | |
| import pandas as pd | |
| from PIL import Image | |
| import base64 | |
| from io import BytesIO | |
# ─── Page config ─────────────────────────────────────────────────────────────
st.set_page_config(page_title="ExpertLongBench Leaderboard", layout="wide")

# Display logo: the PNG is re-encoded into an in-memory buffer and embedded as
# a base64 data URI so it can be centred and sized with inline CSS.
# The context manager closes the image (and its file handle) once the bytes
# have been written to the buffer.
buffered = BytesIO()
with Image.open("src/ExpertLongBench.png") as logo_image:
    logo_image.save(buffered, format="PNG")
img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
st.markdown(
    f"""
<div class="logo-container" style="display:flex; justify-content: center;">
<img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
</div>
""",
    unsafe_allow_html=True,
)

# Title, link bar and version line.
# TODO: add links later — the Paper / GitHub / Kaggle hrefs are still empty.
#       (This note used to live inside the HTML as a bogus "<add ...>" tag.)
# NOTE(review): the glyphs in front of "Paper", "GitHub" and "Version" look
# like mis-encoded emoji; restore them from the original file.
st.markdown(
    '''
<div class="header">
<br/>
<p style="font-size:22px;">
ExpertLongBench: Benchmarking Language Models on Expert-Level Long-Form Generation with Structured Checklists
</p>
<p style="font-size:20px;">
π <a href="">Paper</a> | π» <a href="">GitHub</a> | <strong>K</strong> <a href="">Kaggle</a>
βοΈ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 12 | Updated: <strong>May 2025</strong>
</p>
</div>
''',
    unsafe_allow_html=True,
)
# ─── Load data ───────────────────────────────────────────────────────────────
def load_data(path="src/models.json", *, score_cols=None):
    """Load the leaderboard records and derive average and rank columns.

    Args:
        path: JSON Lines input (one JSON object per line); handed unchanged
            to ``pandas.read_json(..., lines=True)``.
        score_cols: Names of the per-task score columns. Defaults to
            ``T1`` .. ``T11``.

    Returns:
        The loaded DataFrame with two kinds of added columns:

        * ``Avg`` - row mean of the score columns, rounded to one decimal.
        * ``<col>_rank`` - integer rank for every score column and for
          ``Avg``; 1 is the highest score and ties share the smallest rank.

    Raises:
        KeyError: If a requested score column is absent from the data.
    """
    if score_cols is None:
        score_cols = [f"T{i}" for i in range(1, 12)]
    else:
        score_cols = list(score_cols)

    df = pd.read_json(path, lines=True)
    df["Avg"] = df[score_cols].mean(axis=1).round(1)

    # Compute rank per column (1 = best). Missing scores are ranked last
    # instead of being left as NaN, which would make the int cast raise.
    for col in [*score_cols, "Avg"]:
        ranks = df[col].rank(ascending=False, method="min", na_option="bottom")
        df[f"{col}_rank"] = ranks.astype(int)
    return df
df = load_data()

# Largest (i.e. worst) rank found in each ranked column; the leaderboard
# table uses it to scale a cell's rank when choosing its background colour.
score_cols = [*(f"T{i}" for i in range(1, 12)), "Avg"]
max_ranks = {}
for name in score_cols:
    max_ranks[name] = df[f"{name}_rank"].max()
# ─── Tabs ─────────────────────────────────────────────────────────────────────
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])


def _score_cell(value, rank, worst_rank):
    """Return one shaded ``<td>`` for a score; the rank-1 cell is bolded."""
    # 1.0 for the best rank down to 0.0 for the worst; ``or 1`` keeps the
    # division defined when the worst rank in the column is 1.
    norm = 1 - (rank - 1) / ((worst_rank - 1) or 1)
    # Interpolate white (255,255,255) → rgb(182,243,255): red and green move
    # with ``norm`` while blue stays at 255.
    # NOTE(review): the previous comment called the target colour green
    # (182,243,182); with blue pinned at 255 the tint is light blue.
    # Confirm which colour is intended.
    r = int(255 - norm*(255-182))
    g = int(255 - norm*(255-243))
    b = 255
    bold = "font-weight:bold;" if rank == 1 else ""
    style = f"background-color:rgb({r},{g},{b}); padding:6px; {bold}"
    return f"<td style='{style}'>{value}</td>"


with tab1:
    # st.markdown("**Leaderboard:** higher scores shaded green; best models bolded.")
    # Assemble the raw HTML table piece by piece and join once at the end.
    cols = ["Model", *(f"T{i}" for i in range(1, 12)), "Avg"]
    header_cells = "".join(f"<th style='padding:6px;'>{col}</th>" for col in cols)
    pieces = [
        "<table style='border-collapse:collapse; width:100%; font-size:14px;'>",
        "<tr>" + header_cells + "</tr>",
    ]
    # One <tr> per model: a plain left-aligned name cell, then shaded scores.
    for _, record in df.iterrows():
        pieces.append("<tr>")
        for name in cols:
            value = record[name]
            if name == "Model":
                pieces.append(f"<td style='padding:6px; text-align:left;'>{value}</td>")
            else:
                pieces.append(
                    _score_cell(value, int(record[f"{name}_rank"]), max_ranks[name])
                )
        pieces.append("</tr>")
    pieces.append("</table>")
    html = "".join(pieces)
    st.markdown(html, unsafe_allow_html=True)
with tab2:
    # Each section is a markdown heading followed by its (still placeholder)
    # body text, rendered in order.
    for heading, body in (
        ("## Abstract", "<add final abstract here>"),
        ("## Pipeline", "<add final pipeline figure here>"),
    ):
        st.markdown(heading)
        st.write(body)