Update src/streamlit_app.py
src/streamlit_app.py  +30 -68  CHANGED

@@ -1,98 +1,60 @@
 import streamlit as st
 import pandas as pd
-import numpy as np
-from PIL import Image
-import base64
-from io import BytesIO
 
-#
+# ─── Page config ──────────────────────────────────────────────────────────────
 st.set_page_config(page_title="VeriFact Leaderboard", layout="wide")
 
-#
-@st.cache_data
-def load_image(path):
-    return Image.open(path)
-
-# logo = load_image("factrbench.png")
-# chart = load_image("test.png")
-
-# Display logo
-buf = BytesIO()
-logo.save(buf, format="PNG")
-logo_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
-st.markdown(f"""
-<div style="text-align:center; margin-bottom:20px;">
-    <img src="data:image/png;base64,{logo_b64}" style="width:50%; max-width:700px;"/>
-</div>
-""", unsafe_allow_html=True)
-
-# Header
-st.markdown("""
-<div style="text-align:center;">
-    <p style="font-size:22px;">
-        VERIFACT: Enhancing Long-Form Factuality Evaluation...
-    </p>
-    <p style="font-size:20px;">
-        📄 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a> |
-        ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 11 | Updated: <strong>April 2025</strong>
-    </p>
-</div>
-""", unsafe_allow_html=True)
-
-# --- Load data ---
+# ─── Load data ─────────────────────────────────────────────────────────────────
 @st.cache_data
 def load_data(path="models.json"):
     df = pd.read_json(path, lines=True)
-
-
-
+    score_cols = [f"T{i}" for i in range(1, 12)]
+    df["Avg"] = df[score_cols].mean(axis=1).round(1)
+    # Compute rank per column (1 = best)
+    for col in score_cols + ["Avg"]:
         df[f"{col}_rank"] = df[col].rank(ascending=False, method="min").astype(int)
     return df
 
 df = load_data()
 
-#
+# Precompute max ranks for color scaling
+score_cols = [f"T{i}" for i in range(1, 12)] + ["Avg"]
+max_ranks = {col: df[f"{col}_rank"].max() for col in score_cols}
+
+# ─── Tabs ──────────────────────────────────────────────────────────────────────
 tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
 
 with tab1:
-    st.markdown("**Leaderboard:**
-    # Build HTML table
+    st.markdown("**Leaderboard:** higher scores shaded green; best models bolded.")
+    # Build raw HTML table
     cols = ["Model"] + [f"T{i}" for i in range(1,12)] + ["Avg"]
-
-
-    html = "<table style='border-collapse:collapse; width:100%;'>"
+    html = "<table style='border-collapse:collapse; width:100%; font-size:14px;'>"
    # header
-    html += "<tr>" + "".join(f"<th style='padding:
+    html += "<tr>" + "".join(f"<th style='padding:6px;'>{col}</th>" for col in cols) + "</tr>"
    # rows
     for _, row in df.iterrows():
         html += "<tr>"
-        for
-            val = row[
-            if
-                html += f"<td style='padding:
+        for col in cols:
+            val = row[col]
+            if col == "Model":
+                html += f"<td style='padding:6px; text-align:left;'>{val}</td>"
             else:
-
-
-
-                # interpolate green-white
+                rank = int(row[f"{col}_rank"])
+                norm = 1 - (rank - 1) / ((max_ranks[col] - 1) or 1)
+                # interpolate green (182,243,182) → white (255,255,255)
                 r = int(255 - norm*(255-182))
                 g = int(255 - norm*(255-243))
                 b = 255
-
-
-                html += f"<td style='{style}
+                bold = "font-weight:bold;" if rank == 1 else ""
+                style = f"background-color:rgb({r},{g},{b}); padding:6px; {bold}"
+                html += f"<td style='{style}'>{val}</td>"
         html += "</tr>"
     html += "</table>"
     st.markdown(html, unsafe_allow_html=True)
 
 with tab2:
-
-
-
-
-
-    <img src="data:image/png;base64,{chart_b64}" style="width:65%;"/>
-    </div>
-    """, unsafe_allow_html=True)
-
-
+    st.markdown("### Benchmark Details")
+    st.write(
+        "VERIFACT is a factuality evaluation framework for long-form LLM outputs. "
+        "FACTRBENCH provides reference fact sets and external evidence across real-world prompts."
+    )
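
Note: the new load_data() reads models.json as JSON Lines (pd.read_json(..., lines=True)) and expects a "Model" column plus per-task scores T1 through T11; "Avg" and the per-column ranks are derived from those. A minimal sketch of a compatible file, with hypothetical model names and scores:

import json

# Two illustrative rows; the model names and score values are made up.
rows = [
    {"Model": "example-model-a", **{f"T{i}": 80.0 + i for i in range(1, 12)}},
    {"Model": "example-model-b", **{f"T{i}": 70.0 + i for i in range(1, 12)}},
]
with open("models.json", "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")  # one JSON object per line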
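The per-column ranking uses rank(ascending=False, method="min"), so the highest score gets rank 1 and tied scores share the same (best available) rank. A quick illustration with made-up scores:

import pandas as pd

scores = pd.Series([91.2, 88.5, 91.2, 79.0])
print(scores.rank(ascending=False, method="min").astype(int).tolist())
# -> [1, 3, 1, 4]: the two tied models both get rank 1, and rank 2 is skipped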
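In tab1, each cell's rank is normalized into [0, 1] and mapped to a shade between light green and white; the "or 1" guard avoids a division by zero when a column has only one distinct rank. A standalone sketch of that mapping (the helper name is mine, not part of the app):

def cell_color(rank, max_rank):
    # rank 1 -> deepest shade, worst rank -> white
    norm = 1 - (rank - 1) / ((max_rank - 1) or 1)
    r = int(255 - norm * (255 - 182))
    g = int(255 - norm * (255 - 243))
    b = 255
    return r, g, b

print(cell_color(1, 11))   # (182, 243, 255)
print(cell_color(11, 11))  # (255, 255, 255)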