Spaces:

d3LLM
/

dLLM_Leaderboard

Running

File size: 10,939 Bytes

d473371

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Palette of 30 distinct hex colors used for chart traces.
# get_model_colors() hands these out in order of Avg AUP rank (best method
# gets the first entry) and wraps around with a modulo when there are more
# methods than colors, so methods beyond the 30th reuse earlier colors.
COLOR_PALETTE = [
    "#E91E63", "#4A90E2", "#00BFA5", "#FF6B35", "#8E24AA",
    "#4CAF50", "#FF4081", "#303F9F", "#FFD166", "#00796B",
    "#C2185B", "#7B1FA2", "#26A69A", "#1A4C7C", "#FF8C42",
    "#009688", "#673AB7", "#F44336", "#3F51B5", "#795548",
    "#607D8B", "#9C27B0", "#2196F3", "#CDDC39", "#FF9800",
    "#00BCD4", "#E64A19", "#5D4037", "#455A64", "#AD1457",
]

def get_model_colors(df):
    """Map every method name to a palette color based on its Avg AUP rank.

    Rows of ``df`` are ordered by the ``Avg_AUP`` column, highest first. The
    method at position ``i`` in that ordering receives ``COLOR_PALETTE[i]``;
    once the palette is exhausted the index wraps around to the start.

    Returns:
        dict mapping each value of the ``Method`` column to a hex color string.
    """
    ranked = df.sort_values("Avg_AUP", ascending=False)
    method_names = ranked["Method"].tolist()
    palette_size = len(COLOR_PALETTE)

    color_by_method = {}
    for position, name in enumerate(method_names):
        color_by_method[name] = COLOR_PALETTE[position % palette_size]
    return color_by_method

def get_model_ranks(df):
    """Return the 1-based Avg AUP rank of every method.

    Rows of ``df`` are ordered by the ``Avg_AUP`` column, highest first; the
    first method in that ordering gets rank 1, the next rank 2, and so on.

    Returns:
        dict mapping each value of the ``Method`` column to its integer rank.
    """
    ranked_methods = df.sort_values("Avg_AUP", ascending=False)["Method"].tolist()
    return dict(zip(ranked_methods, range(1, len(ranked_methods) + 1)))

def hex_to_rgba(hex_color, alpha=0.25):
    """Convert a hex color string to a CSS-style ``rgba(r,g,b,a)`` string.

    Args:
        hex_color: Color as ``"#RRGGBB"`` or the 3-digit shorthand ``"#RGB"``.
            Leading ``#`` characters are optional and are stripped.
        alpha: Opacity, inserted verbatim as the fourth component.

    Returns:
        A string such as ``"rgba(233,30,99,0.25)"``.

    Raises:
        ValueError: If a color channel is missing or is not valid hexadecimal.
    """
    digits = hex_color.lstrip('#')
    # Expand CSS shorthand (#abc -> #aabbcc) so it does not fail on the
    # missing blue channel.
    if len(digits) == 3:
        digits = ''.join(ch * 2 for ch in digits)
    r, g, b = (int(digits[i:i + 2], 16) for i in (0, 2, 4))
    return f'rgba({r},{g},{b},{alpha})'

def create_radar_chart(df, tasks, top_n=15):
    """Create a radar chart of AUP scores for the first ``top_n`` rows of ``df``.

    There is one axis per task (column ``"<task>_AUP"``) plus one for
    ``Avg_AUP``. Each axis is rescaled independently: the smallest value among
    the displayed rows maps to radius 10 and the largest to radius 90 (radius
    50 when all displayed values on that axis are equal). The hover text
    always shows the original, un-normalized score.

    A score that is missing (``None``, NaN, or not convertible to float) is
    drawn at radius 0 and labelled ``N/A`` in the hover text.

    Args:
        df: DataFrame with a ``Method`` column, an ``Avg_AUP`` column and one
            ``"<task>_AUP"`` column per task. Only the first ``top_n`` rows are
            drawn, so callers presumably pass it sorted by ``Avg_AUP``
            (NOTE(review): confirm against caller). Ranks and colors are
            computed from the whole frame.
        tasks: List of task names.
        top_n: Number of leading rows to draw.

    Returns:
        A ``plotly.graph_objects.Figure`` with one ``Scatterpolar`` trace per
        displayed row.

    Raises:
        KeyError: If one of the expected score columns is absent from ``df``.
    """
    df_top = df.head(top_n).copy()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)

    all_cols = [f"{t}_AUP" for t in tasks] + ["Avg_AUP"]
    categories = [t.replace("-", "\n") for t in tasks] + ["Avg\nAUP"]

    # Per-axis min/max over the displayed rows; used only to shape the radar.
    col_stats = {}
    for col in all_cols:
        vals = df_top[col].dropna().astype(float)
        has_vals = len(vals) > 0
        col_stats[col] = {'min': vals.min() if has_vals else 0,
                          'max': vals.max() if has_vals else 100}

    def to_score(raw):
        """Return ``raw`` as a float, or None when it is missing or NaN."""
        try:
            score = float(raw)
        except (TypeError, ValueError):
            return None
        return None if np.isnan(score) else score

    fig = go.Figure()

    for _, row in df_top.iterrows():
        method = row["Method"]
        rank = model_ranks.get(method, 0)
        color = model_colors.get(method, "#808080")
        display_name = f"#{rank} {method}"

        scores = [to_score(row.get(col)) for col in all_cols]

        normalized = []
        hover_texts = []
        for col, cat, score in zip(all_cols, categories, scores):
            if score is None:
                # Missing score: pin to the centre instead of emitting NaN or
                # a negative radius, and say so in the hover text.
                normalized.append(0)
                hover_texts.append(f"<b>{display_name}</b><br>{cat}: <b>N/A</b>")
                continue
            stats = col_stats[col]
            range_val = stats['max'] - stats['min']
            if range_val > 0:
                # Scale to 10-90 so the extremes stay off the centre and rim.
                norm = ((score - stats['min']) / range_val) * 80 + 10
            else:
                norm = 50
            normalized.append(norm)
            hover_texts.append(f"<b>{display_name}</b><br>{cat}: <b>{score:.1f}</b>")

        # Repeat the first point at the end so the polygon is closed.
        fig.add_trace(go.Scatterpolar(
            r=normalized + [normalized[0]],
            theta=categories + [categories[0]],
            mode='lines+markers', fill='toself', name=display_name,
            line=dict(color=color, width=2), marker=dict(color=color, size=6),
            fillcolor=hex_to_rgba(color, 0.15), opacity=0.9,
            text=hover_texts + [hover_texts[0]],
            hovertemplate='%{text}<extra></extra>'
        ))

    fig.update_layout(
        height=600, margin=dict(l=100, r=250, t=80, b=60),
        title=dict(text=f"🎯 Top {top_n} Methods: AUP Scores in Radar Chart", x=0.5, font=dict(size=18)),
        # Radial tick labels are hidden because each axis has its own scale.
        polar=dict(radialaxis=dict(visible=True, range=[0, 100], tickfont=dict(size=11),
                                   tickvals=[], showticklabels=False)),
        legend=dict(font=dict(size=12), x=1.05, y=1, bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1, title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=13))),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial", bordercolor="#333")
    )
    return fig

def create_group_bar_chart(df, tasks, top_n=15):
    """Create a grouped bar chart of AUP scores for the first ``top_n`` rows.

    Each displayed row of ``df`` becomes one bar trace (one bar per benchmark,
    plus a final "Avg AUP" bar). Benchmarks whose score is ``None`` or NaN are
    omitted for that method; a method with no scores at all gets no trace.

    Args:
        df: DataFrame with a ``Method`` column, an ``Avg_AUP`` column and one
            ``"<task>_AUP"`` column per task. Ranks and colors are computed
            from the whole frame; only the first ``top_n`` rows are drawn.
        tasks: List of task names; used both as x-axis labels and, suffixed
            with ``_AUP``, as column names.
        top_n: Number of leading rows to draw.

    Returns:
        A ``plotly.graph_objects.Figure`` in ``barmode='group'``.
    """
    df_top = df.head(top_n).copy()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)

    # (column name, x-axis label) for every bar, in display order.
    benchmarks = [(f"{task}_AUP", task) for task in tasks] + [("Avg_AUP", "Avg AUP")]
    fig = go.Figure()

    # Iterate the rows directly rather than re-filtering the frame per method:
    # one pass, and rows sharing a method name each keep their own scores.
    for _, row in df_top.iterrows():
        method = row["Method"]
        color = model_colors.get(method, "#808080")
        rank = model_ranks.get(method, 0)
        display_name = f"#{rank} {method}"

        x_vals, y_vals = [], []
        for column, label in benchmarks:
            aup = row.get(column)
            # Skip absent columns and NaN scores (NaN is the only float that
            # is not equal to itself).
            if aup is None or (isinstance(aup, float) and aup != aup):
                continue
            x_vals.append(label)
            y_vals.append(aup)

        if y_vals:
            fig.add_trace(go.Bar(
                name=display_name, x=x_vals, y=y_vals, marker_color=color,
                hovertemplate=f"<b>{display_name}</b><br>%{{x}}: %{{y:.1f}}<extra></extra>"
            ))

    fig.update_layout(
        height=550, margin=dict(l=60, r=250, t=80, b=100),
        title=dict(text=f"📊 Top {top_n} Methods: AUP Scores in Bar Chart", x=0.5, font=dict(size=18)),
        xaxis_title="Benchmark", yaxis_title="AUP Score",
        barmode='group', bargap=0.2, bargroupgap=0.05,
        legend=dict(font=dict(size=11), x=1.02, y=1, bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1, title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=12))),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial")
    )
    return fig

def _average_curves_by_index(task_data, tasks, methods):
    """Average each method's (TPF, Acc) points across tasks, position by position.

    Only tasks that actually have points for a method contribute to that
    method's average, and both coordinates are divided by that same task
    count. When the per-task curves differ in length, the average is truncated
    to the shortest one. Pairing points by index assumes every task lists its
    points in the same order (NOTE(review): confirm against the data source).
    """
    averaged = {}
    for method in methods:
        curves = [task_data.get(t, {}).get(method, []) for t in tasks]
        curves = [c for c in curves if c]  # drop tasks with no data
        if not curves:
            continue
        n_points = min(len(c) for c in curves)
        averaged[method] = [
            (np.mean([c[i][0] for c in curves]), np.mean([c[i][1] for c in curves]))
            for i in range(n_points)
        ]
    return averaged


def _smooth_curve(rho, y, n_samples=300):
    """Return ``(x, y)`` arrays of a smooth curve through the measured points.

    Three or more points get a least-squares quadratic fit. Exactly two points
    get the parabola with its vertex at the first point that passes through
    the second (or a vertical segment when both share the same x). A single
    point is returned unchanged.
    """
    if len(rho) >= 3:
        x_smooth = np.linspace(rho.min(), rho.max(), n_samples)
        return x_smooth, np.poly1d(np.polyfit(rho, y, 2))(x_smooth)
    if len(rho) == 2:
        x_smooth = np.linspace(rho.min(), rho.max(), n_samples)
        if rho[1] != rho[0]:
            a = (y[1] - y[0]) / ((rho[1] - rho[0]) ** 2)
            return x_smooth, a * (x_smooth - rho[0]) ** 2 + y[0]
        return x_smooth, np.linspace(y[0], y[1], n_samples)
    return rho, y


def create_aup_curve_chart(raw_data, tasks, df, top_n=15):
    """Create a grid of accuracy-vs-parallelism curves, one panel per task plus an average.

    Panels are laid out three per row: one per task followed by an "Average"
    panel (a 2x3 grid for five tasks). In each panel every displayed method
    gets a smooth fitted line plus markers at its measured points.

    Args:
        raw_data: Mapping ``{task: {method: [(tpf, acc), ...]}}``. Only the
            first two elements of each point are used.
        tasks: List of task names; also used as panel titles.
        df: DataFrame with ``Method`` and ``Avg_AUP`` columns. Ranks and colors
            come from the whole frame; only methods in the first ``top_n`` rows
            are drawn.
        top_n: Number of leading rows of ``df`` whose methods are drawn.

    Returns:
        A ``plotly.graph_objects.Figure`` built with ``make_subplots``.
    """
    df_top = df.head(top_n).copy()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)
    top_methods = df_top["Method"].tolist()
    methods_to_show = set(top_methods)

    # Per-task data restricted to the displayed methods:
    # {task: {method: [(rho, y), ...]}}
    task_data = {
        task: {
            method: [(p[0], p[1]) for p in pairs]
            for method, pairs in raw_data.get(task, {}).items()
            if method in methods_to_show
        }
        for task in tasks
    }
    avg_data = _average_curves_by_index(task_data, tasks, methods_to_show)

    # One panel per task plus the trailing "Average" panel, three per row.
    n_cols = 3
    titles = list(tasks) + ["Average"]
    n_panels = len(titles)
    n_rows = (n_panels + n_cols - 1) // n_cols
    fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=titles,
                        horizontal_spacing=0.08, vertical_spacing=0.15)

    # Methods that already own a legend entry; later traces of the same method
    # join its legendgroup without adding another entry.
    legend_added = set()

    def get_pos(idx):
        """Map a 0-based panel index to its 1-based (row, col) grid position."""
        return (idx // n_cols + 1, idx % n_cols + 1)

    def draw_curve(pairs, method, row, col):
        """Add the fitted line and the data-point markers for one method to one panel."""
        if not pairs:
            return
        color = model_colors.get(method, "#808080")
        rank = model_ranks.get(method, 0)
        display_name = f"#{rank} {method}"
        show_legend = method not in legend_added
        if show_legend:
            legend_added.add(method)

        rho, y = zip(*sorted(pairs, key=lambda x: x[0]))
        rho, y = np.array(rho), np.array(y)
        x_smooth, y_smooth = _smooth_curve(rho, y)

        # Fitted curve; hover is disabled so only real measurements respond.
        fig.add_trace(go.Scatter(
            x=x_smooth, y=y_smooth, mode='lines', name=display_name,
            line=dict(color=color, width=2.5), opacity=0.85,
            showlegend=show_legend, legendgroup=method,
            hoverinfo='skip'
        ), row=row, col=col)

        # Markers at the original data points.
        fig.add_trace(go.Scatter(
            x=rho, y=y, mode='markers', name=display_name,
            marker=dict(color='white', size=8, line=dict(color=color, width=2)),
            showlegend=False, legendgroup=method,
            hovertemplate=f"<b>{display_name}</b><br>TPF: %{{x:.2f}}<br>Acc: %{{y:.1f}}<extra></extra>"
        ), row=row, col=col)

    # Task panels; methods are drawn in df_top order so legend order follows it.
    for idx, task in enumerate(tasks):
        row, col = get_pos(idx)
        data = task_data.get(task, {})
        for method in top_methods:
            if method in data:
                draw_curve(data[method], method, row, col)

    # "Average" panel goes right after the last task panel.
    avg_row, avg_col = get_pos(len(tasks))
    for method in top_methods:
        if method in avg_data:
            draw_curve(avg_data[method], method, avg_row, avg_col)

    fig.update_layout(
        height=550, margin=dict(l=60, r=250, t=80, b=60),
        title=dict(text=f"📈 Top {top_n} Methods: Accuracy-Parallelism Curves", x=0.5, font=dict(size=18)),
        legend=dict(font=dict(size=11), x=1.02, y=1, bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1, title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=12)),
                    tracegroupgap=1, itemsizing='constant'),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial")
    )

    # Axis titles only on the bottom row (x) and the first column (y).
    for idx in range(n_panels):
        row, col = get_pos(idx)
        fig.update_xaxes(title_text="TPF (Tokens per Forward)" if row == n_rows else "", row=row, col=col)
        fig.update_yaxes(title_text="Acc (%)" if col == 1 else "", row=row, col=col)

    return fig