Upload 14 files

- README.md +60 -13
- app.py +1006 -0
- config.py +48 -0
- data.yaml +422 -0
- data_loader.py +133 -0
- download_external_models.py +124 -0
- example_usage.py +134 -0
- external_models.csv +31 -0
- extract_portuguese_leaderboard.py +195 -0
- manage_data.py +226 -0
- portuguese_leaderboard.csv +0 -0
- requirements.txt +8 -0
- run_app.py +73 -0
- validate_data.py +106 -0
README.md
CHANGED
@@ -1,13 +1,60 @@

# Napolab Leaderboard - Gradio App

A comprehensive Gradio web application for exploring and benchmarking Portuguese language models using the Napolab dataset collection.

## Features

- **📊 Benchmark Results**: A single comprehensive table with one column per dataset and clickable model links
- **📈 Model Analysis**: A radar chart showing model performance across all datasets

## Installation

1. Navigate to the leaderboard directory:
```bash
cd dev/napolab/leaderboard
```

2. Install the required dependencies:
```bash
pip install -r requirements.txt
```

3. Extract data from external sources (optional but recommended):
```bash
# Extract data from the Open Portuguese LLM Leaderboard
python extract_portuguese_leaderboard.py

# Download external models data
python download_external_models.py
```

4. Run the Gradio app:
```bash
python app.py
```

The app will be available at `http://localhost:7860`.

## Data Management

The app reads its datasets and benchmark results from a single YAML configuration file (`data.yaml`), so new data can be added by editing one human-readable file; a sketch of how the app consumes it follows.
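`app.py` never parses the YAML directly; it goes through the loader functions in `data_loader.py`. A minimal sketch of that access pattern (the full `data.yaml` schema is defined by `data_loader.py` and is only partially shown in this commit):

```python
from data_loader import get_napolab_datasets

# Dataset entries originate in data.yaml; keys are dataset ids such as
# "assin2_rte", and each entry carries a human-readable "name" field.
datasets = get_napolab_datasets()
for dataset_id, info in sorted(datasets.items()):
    print(dataset_id, "->", info.get("name", dataset_id))
```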
### Data Extraction Scripts

The leaderboard includes scripts that automatically extract and update data from external sources:

#### `extract_portuguese_leaderboard.py`
This script extracts benchmark results from the Open Portuguese LLM Leaderboard:
- Fetches data from the Hugging Face Spaces leaderboard
- Updates the `portuguese_leaderboard.csv` file
- Includes both open-source and proprietary models
- Automatically handles data formatting and validation

A quick sanity check of the extracted file is sketched below.
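This is a minimal pandas check, using the column names that `app.py` reads from `portuguese_leaderboard.csv`:

```python
import pandas as pd

df = pd.read_csv("portuguese_leaderboard.csv")
expected = ["model_name", "assin2_rte", "assin2_sts", "faquad_nli", "hatebr_offensive"]
missing = [c for c in expected if c not in df.columns]
print(f"{len(df)} models extracted; missing columns: {missing or 'none'}")
```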
#### `download_external_models.py`
This script downloads additional model data:
- Fetches model metadata from various sources
- Updates the `external_models.csv` file
- Includes model links and performance metrics
- Ensures data consistency with the main leaderboard

**Note**: These scripts require an internet connection and may take a few minutes to complete. Run them periodically to keep the leaderboard data up to date.
app.py
ADDED
@@ -0,0 +1,1006 @@
import gradio as gr
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import os
from typing import Dict, List, Optional, Tuple

# Import data loader
from data_loader import data_loader, get_napolab_datasets, get_sample_benchmark_results, get_model_metadata

# Load data from YAML file
NAPOLAB_DATASETS = get_napolab_datasets()
SAMPLE_BENCHMARK_RESULTS = get_sample_benchmark_results()
MODEL_METADATA = get_model_metadata()

def load_portuguese_leaderboard_data() -> pd.DataFrame:
    """Load data from the Portuguese leaderboard CSV file."""
    try:
        csv_path = "portuguese_leaderboard.csv"
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            # Select only the relevant columns
            relevant_columns = ['model_name', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
            df = df[relevant_columns].copy()

            # Rename columns to match the existing format
            df = df.rename(columns={
                'assin2_rte': 'ASSIN2 RTE',
                'assin2_sts': 'ASSIN2 STS',
                'faquad_nli': 'FaQuAD-NLI',
                'hatebr_offensive': 'HateBR'
            })

            # Add source information
            df['source'] = 'portuguese_leaderboard'

            print(f"Loaded {len(df)} models from Portuguese leaderboard")
            return df
        else:
            print(f"Portuguese leaderboard CSV not found: {csv_path}")
            return pd.DataFrame()
    except Exception as e:
        print(f"Error loading Portuguese leaderboard data: {e}")
        return pd.DataFrame()

def load_external_models_data() -> pd.DataFrame:
    """Load data from the external models CSV file."""
    try:
        csv_path = "external_models.csv"
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            # Select only the relevant columns
            relevant_columns = ['model', 'link', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
            df = df[relevant_columns].copy()

            # Rename columns to match the existing format
            df = df.rename(columns={
                'model': 'model_name',
                'assin2_rte': 'ASSIN2 RTE',
                'assin2_sts': 'ASSIN2 STS',
                'faquad_nli': 'FaQuAD-NLI',
                'hatebr_offensive': 'HateBR'
            })

            # Add source information
            df['source'] = 'external_models'

            print(f"Loaded {len(df)} external models")
            return df
        else:
            print(f"External models CSV not found: {csv_path}")
            return pd.DataFrame()
    except Exception as e:
        print(f"Error loading external models data: {e}")
        return pd.DataFrame()

# Load Portuguese leaderboard data
PORTUGUESE_LEADERBOARD_DATA = load_portuguese_leaderboard_data()

# Load external models data
EXTERNAL_MODELS_DATA = load_external_models_data()

def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "") -> pd.DataFrame:
    """Create a simplified benchmark table with one column per dataset."""
    # Get all dataset names
    dataset_names = sorted(NAPOLAB_DATASETS.keys())
    dataset_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in dataset_names]

    # Use selected datasets if provided, otherwise use all datasets
    if selected_datasets is None:
        selected_datasets = dataset_names

    # Collect data for each model
    model_data = {}

    # Process existing benchmark results
    for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
        for model_name, metrics in models.items():
            if model_name not in model_data:
                model_data[model_name] = {
                    'dataset_scores': {},
                    'url': None,
                    'source': 'existing'
                }

            # Calculate average performance for this dataset
            avg_performance = np.mean(list(metrics.values()))
            model_data[model_name]['dataset_scores'][dataset_name] = avg_performance

    # Process Portuguese leaderboard data
    if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
        for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
            model_name = row['model_name']

            if model_name not in model_data:
                model_data[model_name] = {
                    'dataset_scores': {},
                    'url': None,
                    'source': 'portuguese_leaderboard'
                }

            # Map Portuguese leaderboard columns to dataset names
            column_mapping = {
                'ASSIN2 RTE': 'assin2_rte',
                'ASSIN2 STS': 'assin2_sts',
                'FaQuAD-NLI': 'faquad-nli',
                'HateBR': 'hatebr'
            }

            for display_name, dataset_name in column_mapping.items():
                if dataset_name in NAPOLAB_DATASETS:
                    score = row[display_name]
                    if pd.notna(score) and score > 0:
                        model_data[model_name]['dataset_scores'][dataset_name] = score

    # Process external models data
    if show_external_models and not EXTERNAL_MODELS_DATA.empty:
        for _, row in EXTERNAL_MODELS_DATA.iterrows():
            model_name = row['model_name']

            if model_name not in model_data:
                model_data[model_name] = {
                    'dataset_scores': {},
                    'url': row.get('link', ''),
                    'source': 'external_models'
                }

            # Map external models columns to dataset names
            column_mapping = {
                'ASSIN2 RTE': 'assin2_rte',
                'ASSIN2 STS': 'assin2_sts',
                'FaQuAD-NLI': 'faquad-nli',
                'HateBR': 'hatebr'
            }

            for display_name, dataset_name in column_mapping.items():
                if dataset_name in NAPOLAB_DATASETS:
                    score = row[display_name]
                    if pd.notna(score) and score > 0:
                        model_data[model_name]['dataset_scores'][dataset_name] = score

    # Get model URLs and source information for existing models
    additional_models = data_loader.get_additional_models()
    for model_name in model_data.keys():
        if model_data[model_name]['source'] == 'existing':
            # Get URL
            for arch_models in additional_models.values():
                if model_name in arch_models:
                    model_data[model_name]['url'] = arch_models[model_name].get('huggingface_url', '')
                    break

            # Get source information
            model_metadata = MODEL_METADATA.get(model_name, {})
            source = model_metadata.get('source', 'unknown')
            model_data[model_name]['source'] = source

    # Create table data
    table_data = []

    for model_name, data in model_data.items():
        # Apply source filtering
        source = data['source']

        # Apply show filters - only show models from sources that are checked
        if source == 'napolab_thesis' and not show_napolab_thesis:
            continue
        if source == 'teenytinyllama_paper' and not show_teenytinyllama:
            continue
        if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
            continue
        if source == 'external_models' and not show_external_models:
            continue
        # Hide models with unknown source (should not happen with proper data)
        if source == 'unknown':
            continue

        # Create clickable link for model name
        if data['url']:
            model_display = f"[{model_name}]({data['url']})"
        elif source == 'portuguese_leaderboard' and '/' in model_name:
            # Create Hugging Face link for Portuguese leaderboard models with slashes
            huggingface_url = f"https://huggingface.co/{model_name}"
            model_display = f"[{model_name}]({huggingface_url})"
        else:
            model_display = model_name

        # Create row with dataset scores
        row_data = {'Model': model_display}

        # Calculate average only over selected datasets
        selected_scores = []
        for dataset_name in selected_datasets:
            score = data['dataset_scores'].get(dataset_name, 0)
            if score > 0:  # Only include non-zero scores in average
                selected_scores.append(score)

        overall_avg = np.mean(selected_scores) if selected_scores else 0
        row_data['Average'] = round(overall_avg, 4)

        # Add scores for each dataset (only selected ones)
        for dataset_name in dataset_names:
            score = data['dataset_scores'].get(dataset_name, 0)
            display_name = dataset_display_names[dataset_names.index(dataset_name)]
            # Only add columns for selected datasets
            if dataset_name in selected_datasets:
                row_data[display_name] = round(score, 4)

        table_data.append(row_data)

    df = pd.DataFrame(table_data)

    # Filter to show only models that have scores for at least one selected dataset
    if selected_datasets and not df.empty:
        # Get display names for selected datasets
        selected_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in selected_datasets]

        # Filter models based on selection criteria
        models_to_keep = []
        for _, row in df.iterrows():
            has_score = False
            has_all_scores = True

            # Only check the datasets that are actually selected for display
            for dataset_name in selected_datasets:
                display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
                if display_name in df.columns:
                    score = row[display_name]
                    if score > 0:
                        has_score = True
                    else:
                        has_all_scores = False

            # Keep model if it has at least one score
            if has_score:
                # If hide_incomplete_models is True, only keep models with all scores in selected datasets
                if not hide_incomplete_models or has_all_scores:
                    models_to_keep.append(row['Model'])

        # Filter dataframe to only include selected models
        if models_to_keep:
            df = df[df['Model'].isin(models_to_keep)]
        else:
            # If no models to keep, create empty DataFrame with proper structure
            # Create columns list first
            columns = ['Model']
            for dataset_name in dataset_names:
                display_name = dataset_display_names[dataset_names.index(dataset_name)]
                if dataset_name in selected_datasets:
                    columns.append(display_name)
            columns.append('Average')

            # Create empty DataFrame with correct columns
            df = pd.DataFrame(columns=columns)

    # Filter by minimum average performance
    if min_average_performance > 0 and not df.empty:
        df = df[df['Average'] >= min_average_performance]

    # Filter by search query
    if search_query and not df.empty:
        # Extract model names from markdown links for searching
        df_filtered = df.copy()
        df_filtered['model_name_clean'] = df_filtered['Model'].str.replace(r'\[([^\]]+)\]\([^)]+\)', r'\1', regex=True)
        df_filtered = df_filtered[df_filtered['model_name_clean'].str.contains(search_query, case=False, na=False)]
        df = df_filtered.drop('model_name_clean', axis=1)

    # Sort by Average (descending)
    if not df.empty:
        df = df.sort_values('Average', ascending=False)

    # Add rank column with medal emojis for top 3 and color-coded emojis for others
    if not df.empty:
        df = df.reset_index(drop=True)
        df.index = df.index + 1  # Start ranking from 1

        # Create rank column with medal emojis and color-coded emojis
        rank_column = []
        total_models = len(df)

        for rank in df.index:
            if rank == 1:
                rank_column.append("🥇 1")
            elif rank == 2:
                rank_column.append("🥈 2")
            elif rank == 3:
                rank_column.append("🥉 3")
            else:
                # Color-code based on position relative to total
                position_ratio = rank / total_models
                if position_ratio <= 0.33:  # Top third
                    rank_column.append("🟢 " + str(rank))
                elif position_ratio <= 0.67:  # Middle third
                    rank_column.append("🟡 " + str(rank))
                else:  # Bottom third
                    rank_column.append("🔴 " + str(rank))

        df.insert(0, 'Rank', rank_column)

    return df

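# Note on scales: dataset scores in this module are fractions in [0, 1]
# (the radar chart below fixes its radial axis to 0.6-1.0), while the UI
# sliders are expressed in percent; update_benchmark_table and
# update_radar_chart divide the slider value by 100 before it reaches
# create_simplified_benchmark_table or create_model_performance_radar.
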
+
# Global variable to track the current CSV file
|
| 326 |
+
current_csv_file = None
|
| 327 |
+
|
| 328 |
+
def export_csv(df: pd.DataFrame):
|
| 329 |
+
"""Export the benchmark table to CSV."""
|
| 330 |
+
global current_csv_file
|
| 331 |
+
|
| 332 |
+
print(f"Export function called with dataframe shape: {df.shape}")
|
| 333 |
+
|
| 334 |
+
if df.empty:
|
| 335 |
+
print("Dataframe is empty, returning None")
|
| 336 |
+
return None
|
| 337 |
+
|
| 338 |
+
# Clean up previous file if it exists
|
| 339 |
+
if current_csv_file:
|
| 340 |
+
try:
|
| 341 |
+
import os
|
| 342 |
+
if os.path.exists(current_csv_file):
|
| 343 |
+
os.remove(current_csv_file)
|
| 344 |
+
print(f"Deleted previous CSV file: {current_csv_file}")
|
| 345 |
+
except Exception as e:
|
| 346 |
+
print(f"Error deleting previous file {current_csv_file}: {e}")
|
| 347 |
+
|
| 348 |
+
# Clean the dataframe for CSV export
|
| 349 |
+
df_clean = df.copy()
|
| 350 |
+
|
| 351 |
+
# Remove markdown formatting from model names for cleaner CSV
|
| 352 |
+
df_clean['Model'] = df_clean['Model'].str.replace(r'\[([^\]]+)\]\([^)]+\)', r'\1', regex=True)
|
| 353 |
+
|
| 354 |
+
# Create filename with timestamp
|
| 355 |
+
from datetime import datetime
|
| 356 |
+
import tempfile
|
| 357 |
+
import os
|
| 358 |
+
|
| 359 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 360 |
+
filename = f"napolab_benchmark_results_{timestamp}.csv"
|
| 361 |
+
|
| 362 |
+
# Create file in current directory (simpler approach)
|
| 363 |
+
file_path = filename
|
| 364 |
+
|
| 365 |
+
print(f"Creating CSV file at: {file_path}")
|
| 366 |
+
|
| 367 |
+
# Save to CSV file
|
| 368 |
+
df_clean.to_csv(file_path, index=False)
|
| 369 |
+
|
| 370 |
+
print(f"CSV file created successfully. File exists: {os.path.exists(file_path)}")
|
| 371 |
+
|
| 372 |
+
# Update current file tracking
|
| 373 |
+
current_csv_file = file_path
|
| 374 |
+
|
| 375 |
+
return file_path
|
| 376 |
+
|
| 377 |
+
def cleanup_current_csv():
|
| 378 |
+
"""Clean up the current CSV file after download."""
|
| 379 |
+
global current_csv_file
|
| 380 |
+
import os
|
| 381 |
+
|
| 382 |
+
if current_csv_file and os.path.exists(current_csv_file):
|
| 383 |
+
try:
|
| 384 |
+
os.remove(current_csv_file)
|
| 385 |
+
print(f"Deleted CSV file after download: {current_csv_file}")
|
| 386 |
+
current_csv_file = None
|
| 387 |
+
except Exception as e:
|
| 388 |
+
print(f"Error deleting file {current_csv_file}: {e}")
|
| 389 |
+
|
| 390 |
+
|
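# Note: cleanup_current_csv is triggered by csv_file.change in the UI below, so the
# exported CSV is deleted from disk as soon as the File component updates. Gradio
# typically copies output files into its own cache before serving them; if a given
# Gradio version does not, a delayed download may find the file already removed.
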
def create_model_performance_radar(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "") -> go.Figure:
    """Create a radar chart showing model performance across all datasets."""
    # Use selected datasets if provided, otherwise use all datasets
    if selected_datasets is None:
        selected_datasets = list(NAPOLAB_DATASETS.keys())

    # Get dataset names for the radar axes (only selected ones)
    dataset_names = selected_datasets
    dataset_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in dataset_names]

    # Collect data for each model
    model_data = {}

    # Process existing benchmark results
    for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
        if dataset_name in selected_datasets:
            for model_name, metrics in models.items():
                if model_name not in model_data:
                    model_data[model_name] = {
                        'performances': {},
                        'architecture': MODEL_METADATA.get(model_name, {}).get('architecture', 'Unknown'),
                        'source': 'existing'
                    }

                # Calculate average performance for this dataset
                avg_performance = np.mean(list(metrics.values()))
                model_data[model_name]['performances'][dataset_name] = avg_performance

    # Process Portuguese leaderboard data
    if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
        for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
            model_name = row['model_name']

            if model_name not in model_data:
                model_data[model_name] = {
                    'performances': {},
                    'architecture': 'Unknown',
                    'source': 'portuguese_leaderboard'
                }

            # Map Portuguese leaderboard columns to dataset names
            column_mapping = {
                'ASSIN2 RTE': 'assin2_rte',
                'ASSIN2 STS': 'assin2_sts',
                'FaQuAD-NLI': 'faquad-nli',
                'HateBR': 'hatebr'
            }

            for display_name, dataset_name in column_mapping.items():
                if dataset_name in selected_datasets:
                    score = row[display_name]
                    if pd.notna(score) and score > 0:
                        model_data[model_name]['performances'][dataset_name] = score

    # Process external models data
    if show_external_models and not EXTERNAL_MODELS_DATA.empty:
        for _, row in EXTERNAL_MODELS_DATA.iterrows():
            model_name = row['model_name']

            if model_name not in model_data:
                model_data[model_name] = {
                    'performances': {},
                    'architecture': 'Unknown',
                    'source': 'external_models'
                }

            # Map external models columns to dataset names
            column_mapping = {
                'ASSIN2 RTE': 'assin2_rte',
                'ASSIN2 STS': 'assin2_sts',
                'FaQuAD-NLI': 'faquad-nli',
                'HateBR': 'hatebr'
            }

            for display_name, dataset_name in column_mapping.items():
                if dataset_name in selected_datasets:
                    score = row[display_name]
                    if pd.notna(score) and score > 0:
                        model_data[model_name]['performances'][dataset_name] = score

    # Get model URLs and source information for existing models
    additional_models = data_loader.get_additional_models()
    for model_name in model_data.keys():
        if model_data[model_name]['source'] == 'existing':
            # Get URL
            for arch_models in additional_models.values():
                if model_name in arch_models:
                    model_data[model_name]['url'] = arch_models[model_name].get('huggingface_url', '')
                    break

            # Get source information
            model_metadata = MODEL_METADATA.get(model_name, {})
            source = model_metadata.get('source', 'unknown')
            model_data[model_name]['source'] = source

    # Apply source filtering
    filtered_model_data = {}
    for model_name, data in model_data.items():
        source = data.get('source', 'existing')

        # Apply show filters - only show models from sources that are checked
        if source == 'napolab_thesis' and not show_napolab_thesis:
            continue
        if source == 'teenytinyllama_paper' and not show_teenytinyllama:
            continue
        if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
            continue
        if source == 'external_models' and not show_external_models:
            continue
        # Hide models with unknown source (should not happen with proper data)
        if source == 'unknown':
            continue

        filtered_model_data[model_name] = data

    # Apply incomplete model filtering
    if hide_incomplete_models and selected_datasets:
        final_filtered_data = {}
        for model_name, data in filtered_model_data.items():
            has_all_scores = True
            for dataset_name in selected_datasets:
                if data['performances'].get(dataset_name, 0) == 0:
                    has_all_scores = False
                    break
            if has_all_scores:
                final_filtered_data[model_name] = data
        filtered_model_data = final_filtered_data

    # Apply minimum average performance filtering
    if min_average_performance > 0 and selected_datasets:
        final_filtered_data = {}
        for model_name, data in filtered_model_data.items():
            # Calculate average performance for selected datasets
            scores = []
            for dataset_name in selected_datasets:
                score = data['performances'].get(dataset_name, 0)
                if score > 0:  # Only include non-zero scores
                    scores.append(score)

            if scores:
                avg_performance = np.mean(scores)
                if avg_performance >= min_average_performance:
                    final_filtered_data[model_name] = data
        filtered_model_data = final_filtered_data

    # Apply search query filtering
    if search_query:
        final_filtered_data = {}
        for model_name, data in filtered_model_data.items():
            if search_query.lower() in model_name.lower():
                final_filtered_data[model_name] = data
        filtered_model_data = final_filtered_data

    # Sort models by average performance (descending)
    model_performances = []
    for model_name, data in filtered_model_data.items():
        # Calculate average performance for selected datasets
        scores = []
        for dataset_name in selected_datasets:
            score = data['performances'].get(dataset_name, 0)
            if score > 0:  # Only include non-zero scores
                scores.append(score)

        avg_performance = np.mean(scores) if scores else 0
        model_performances.append((model_name, data, avg_performance))

    # Sort by average performance (descending)
    model_performances.sort(key=lambda x: x[2], reverse=True)

    # Create radar chart
    fig = go.Figure()

    # Generate a dynamic color palette based on the number of models
    num_models = len(model_performances)
    if num_models <= 10:
        # Use a qualitative color palette for small numbers
        colors = px.colors.qualitative.Set3 + px.colors.qualitative.Pastel1 + px.colors.qualitative.Dark2
    else:
        # Use a continuous color palette for larger numbers
        colors = px.colors.sequential.Viridis + px.colors.sequential.Plasma + px.colors.sequential.Inferno

    # Ensure we have enough colors
    while len(colors) < num_models:
        colors.extend(colors)

    for i, (model_name, data, avg_performance) in enumerate(model_performances):
        # Get performance values for all datasets (fill with 0 if missing)
        performance_values = []
        for dataset_name in dataset_names:
            performance_values.append(data['performances'].get(dataset_name, 0))

        # Assign color based on model index for better differentiation
        color = colors[i % len(colors)]

        # Show first two models by default, hide the rest
        visible = True if i < 2 else 'legendonly'

        fig.add_trace(go.Scatterpolar(
            r=performance_values,
            theta=dataset_display_names,
            fill='toself',
            name=model_name,
            line_color=color,
            opacity=0.6,
            visible=visible,
            hovertemplate=(
                "<b>%{fullData.name}</b><br>" +
                "Dataset: %{theta}<br>" +
                "Performance: %{r:.3f}<br>" +
                "Architecture: " + data['architecture'] + "<br>" +
                "<extra></extra>"
            )
        ))

    # Update layout
    fig.update_layout(
        title="Model Performance Radar Chart - All Datasets",
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0.6, 1],
                ticktext=['0.6', '0.7', '0.8', '0.9', '1.0'],
                tickvals=[0.6, 0.7, 0.8, 0.9, 1.0]
            ),
            angularaxis=dict(
                tickmode='array',
                tickvals=list(range(len(dataset_display_names))),
                ticktext=dataset_display_names
            )
        ),
        height=700,
        showlegend=True,
        legend=dict(
            yanchor="top",
            y=-0.15,
            xanchor="center",
            x=0.5,
            bgcolor='rgba(255, 255, 255, 0.9)',
            bordercolor='rgba(0, 0, 0, 0.2)',
            borderwidth=1,
            orientation="h"
        ),
        margin=dict(l=50, r=50, t=100, b=100)
    )

    return fig

# Gradio Interface
with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # 🏆 Napolab Leaderboard

    Stay up to date with the latest advancements in Portuguese language models and their performance across carefully curated Portuguese language tasks.
    [⭐ Star us on GitHub](https://github.com/ruanchaves/napolab)
    """)

    with gr.Tabs():

        # Benchmark Results Tab
        with gr.Tab("📊 Benchmark Results"):
            gr.Markdown("### Model Performance Benchmarks")

            with gr.Accordion("Select Datasets to Include: (Click to expand)", open=False):
                with gr.Row():
                    # Create checkboxes for each dataset
                    dataset_checkboxes = []
                    for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
                        display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
                        checkbox = gr.Checkbox(
                            label=display_name,
                            value=True  # Default to selected
                        )
                        dataset_checkboxes.append((dataset_name, checkbox))

            with gr.Accordion("Filter by Score: (Click to expand)", open=False):
                with gr.Row():
                    hide_incomplete_models = gr.Checkbox(
                        label="Hide models with zero scores in selected datasets",
                        value=False
                    )

                    min_average_performance = gr.Slider(
                        minimum=0,
                        maximum=100,
                        value=80,
                        step=1,
                        label="Minimum Average Performance (%)"
                    )

            with gr.Accordion("Filter by Data Source: (Click to expand)", open=False):
                with gr.Row():
                    show_napolab_thesis = gr.Checkbox(
                        label="Napolab Thesis models",
                        value=True
                    )
                    show_teenytinyllama = gr.Checkbox(
                        label="TeenyTinyLlama models",
                        value=True
                    )
                    show_portuguese_leaderboard = gr.Checkbox(
                        label="Open Portuguese LLM Leaderboard models (open-source)",
                        value=True
                    )

                    show_external_models = gr.Checkbox(
                        label="Open Portuguese LLM Leaderboard models (proprietary)",
                        value=True
                    )

            # Search bar for filtering models
            search_query = gr.Textbox(
                label="Search models by name",
                placeholder="Enter model name to filter...",
                value=""
            )

            benchmark_table = gr.DataFrame(
                label="Model Performance Benchmarks",
                wrap=[True, False, False, False, False, False, False, False, False, False],
                interactive=False,
                datatype=["str", "markdown", "number", "number", "number", "number", "number", "number", "number", "number"],
                column_widths=["80px", "200px", "100px", "120px", "120px", "120px", "120px", "120px", "120px", "120px"]
            )

            gr.Markdown("*🥇🥈🥉 = Top 3 | 🟢 = Top 33% | 🟡 = Middle 33% | 🔴 = Bottom 33%*")

            # Export to CSV button and file component
            export_button = gr.Button("📥 Export to CSV", variant="secondary")
            csv_file = gr.File(label="Download CSV", interactive=False, visible=True)

        # Model Analysis Tab
        with gr.Tab("📈 Model Analysis"):
            gr.Markdown("### Model Performance Radar Chart")

            # Dataset Selection Controls
            with gr.Accordion("Select Datasets to Display: (Click to expand)", open=False):
                with gr.Row():
                    # Create checkboxes for each dataset
                    analysis_dataset_checkboxes = []
                    for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
                        display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
                        checkbox = gr.Checkbox(
                            label=display_name,
                            value=True
                        )
                        analysis_dataset_checkboxes.append((dataset_name, checkbox))

            # Filter Controls
            with gr.Accordion("Filter by Score: (Click to expand)", open=False):
                with gr.Row():
                    hide_incomplete_models_analysis = gr.Checkbox(
                        label="Hide models with zero scores in selected datasets",
                        value=False
                    )

                    min_average_performance_analysis = gr.Slider(
                        minimum=0,
                        maximum=100,
                        value=80,
                        step=1,
                        label="Minimum Average Performance (%)"
                    )

            with gr.Accordion("Filter by Data Source: (Click to expand)", open=False):
                with gr.Row():
                    show_napolab_thesis_analysis = gr.Checkbox(
                        label="Napolab Thesis models",
                        value=True
                    )

                    show_teenytinyllama_analysis = gr.Checkbox(
                        label="TeenyTinyLlama models",
                        value=True
                    )

                    show_portuguese_leaderboard_analysis = gr.Checkbox(
                        label="Open Portuguese LLM Leaderboard models (open-source)",
                        value=True
                    )

                    show_external_models_analysis = gr.Checkbox(
                        label="Open Portuguese LLM Leaderboard models (proprietary)",
                        value=True
                    )

            # Search bar for filtering models in radar chart
            search_query_analysis = gr.Textbox(
                label="Search models by name",
                placeholder="Enter model name to filter...",
                value=""
            )

            model_analysis_chart = gr.Plot(label="Model Performance Radar Chart")

            gr.Markdown("""
            **How to interact with the chart:**
            - **Click on legend items** to show/hide specific models.
            - **Double-click on a legend item** to isolate that model (hide all others).
            - **Double-click again** to show all models.

            Models in the legend are sorted in descending order based on their average performance across your chosen datasets.
            """)

        # About Tab
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            ## About Napolab

            **Natural Portuguese Language Benchmark (Napolab)** is a comprehensive collection of Portuguese datasets designed for evaluating Large Language Models.

            For more information, please visit the [GitHub repository](https://github.com/ruanchaves/napolab) and the [Hugging Face Dataset](https://huggingface.co/datasets/ruanchaves/napolab).

            ### Data Sources:
            The benchmark results and model evaluations presented in this leaderboard are compiled from multiple sources:

            **1. "Lessons learned from the evaluation of Portuguese language models"** by Ruan Chaves Rodrigues (2023). Available at: [University of Malta OAR@UM Repository](https://www.um.edu.mt/library/oar/handle/123456789/120557)

            **2. Open PT LLM Leaderboard** by Eduardo Garcia (2025). Available at: [Hugging Face Spaces](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard).

            **3. "TeenyTinyLlama: Open-source tiny language models trained in Brazilian Portuguese"** by Corrêa et al. (2024). Available at: [arXiv](https://arxiv.org/abs/2401.16640).

            ### Thesis Citation:
            ```bibtex
            @mastersthesis{chaves2023lessons,
              title={Lessons learned from the evaluation of Portuguese language models},
              author={Chaves Rodrigues, Ruan},
              year={2023},
              school={University of Malta},
              url={https://www.um.edu.mt/library/oar/handle/123456789/120557}
            }
            ```

            ### Napolab Citation:
            ```bibtex
            @software{Chaves_Rodrigues_napolab_2023,
              author = {Chaves Rodrigues, Ruan and Tanti, Marc and Agerri, Rodrigo},
              doi = {10.5281/zenodo.7781848},
              month = {3},
              title = {{Natural Portuguese Language Benchmark (Napolab)}},
              url = {https://github.com/ruanchaves/napolab},
              version = {1.0.0},
              year = {2023}
            }
            ```
            """)

    # Event handlers
    def update_radar_chart(*args):
        # Extract arguments for radar chart
        dataset_values = args[:len(analysis_dataset_checkboxes)]
        hide_incomplete_models = args[len(analysis_dataset_checkboxes)]
        min_average_performance = args[len(analysis_dataset_checkboxes) + 1] / 100.0  # Convert percentage to decimal
        show_napolab_thesis = args[len(analysis_dataset_checkboxes) + 2]
        show_teenytinyllama = args[len(analysis_dataset_checkboxes) + 3]
        show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4]
        show_external_models = args[len(analysis_dataset_checkboxes) + 5]
        search_query = args[len(analysis_dataset_checkboxes) + 6]

        # Convert dataset selections to list of selected dataset names
        selected_datasets = []
        for i, (dataset_name, _) in enumerate(analysis_dataset_checkboxes):
            if dataset_values[i]:
                selected_datasets.append(dataset_name)

        return create_model_performance_radar(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query)

    def update_benchmark_table(*args):
        # Extract arguments
        dataset_values = args[:len(dataset_checkboxes)]
        hide_incomplete_models = args[len(dataset_checkboxes)]
        min_average_performance = args[len(dataset_checkboxes) + 1] / 100.0  # Convert percentage to decimal
        show_napolab_thesis = args[len(dataset_checkboxes) + 2]
        show_teenytinyllama = args[len(dataset_checkboxes) + 3]
        show_portuguese_leaderboard = args[len(dataset_checkboxes) + 4]
        show_external_models = args[len(dataset_checkboxes) + 5]
        search_query = args[len(dataset_checkboxes) + 6]

        # Convert dataset selections to list of selected dataset names
        selected_datasets = []
        for i, (dataset_name, _) in enumerate(dataset_checkboxes):
            if dataset_values[i]:
                selected_datasets.append(dataset_name)

        df = create_simplified_benchmark_table(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query)

        return df

    # Connect events
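    # The same handler and the same input list are attached to every control below;
    # any change to a checkbox, slider, or textbox recomputes the full table (or
    # radar chart) from scratch, which keeps each control independent at the cost
    # of some repetition in the wiring.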
    # Load model analysis chart on app start
    app.load(lambda: update_radar_chart(*([True] * len(analysis_dataset_checkboxes) + [False, 80, True, True, True, True, ""])), outputs=model_analysis_chart)

    # Load benchmark table on app start
    app.load(lambda: update_benchmark_table(*([True] * len(dataset_checkboxes) + [False, 80, True, True, True, True, ""])), outputs=benchmark_table)

    # Connect dataset checkboxes to update table
    for dataset_name, checkbox in dataset_checkboxes:
        checkbox.change(
            update_benchmark_table,
            inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
            outputs=benchmark_table
        )

    hide_incomplete_models.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    min_average_performance.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    show_napolab_thesis.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    show_teenytinyllama.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    show_portuguese_leaderboard.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    show_external_models.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    # Connect search query to update table
    search_query.change(
        update_benchmark_table,
        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
        outputs=benchmark_table
    )

    # Connect export button
    export_button.click(
        export_csv,
        inputs=benchmark_table,
        outputs=csv_file
    )

    # Connect file download to cleanup
    csv_file.change(
        cleanup_current_csv,
        inputs=None,
        outputs=None
    )

    # Connect analysis chart events
    # Connect dataset checkboxes to update radar chart
    for dataset_name, checkbox in analysis_dataset_checkboxes:
        checkbox.change(
            update_radar_chart,
            inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
            outputs=model_analysis_chart
        )

    hide_incomplete_models_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    min_average_performance_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    show_napolab_thesis_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    show_teenytinyllama_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    show_portuguese_leaderboard_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    show_external_models_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

    # Connect search query to update radar chart
    search_query_analysis.change(
        update_radar_chart,
        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
        outputs=model_analysis_chart
    )

if __name__ == "__main__":
    app.launch(share=True, server_name="0.0.0.0", server_port=7860)
config.py
ADDED
@@ -0,0 +1,48 @@
"""
Configuration file for the Napolab Leaderboard Gradio App
"""

# App Configuration
APP_TITLE = "Napolab Leaderboard"
APP_DESCRIPTION = "Natural Portuguese Language Benchmark Leaderboard"
APP_THEME = "soft"
APP_PORT = 7860
APP_HOST = "0.0.0.0"
APP_SHARE = True

# Dataset Configuration
DEFAULT_DATASET = "assin"
DEFAULT_SPLIT = "test"
DEFAULT_SAMPLES = 5
MAX_SAMPLES = 20

# Chart Configuration
CHART_HEIGHT = 400
OVERVIEW_CHART_HEIGHT = 600
CHART_COLORS = {
    "primary": "#1f77b4",
    "secondary": "#ff7f0e",
    "success": "#2ca02c",
    "warning": "#d62728"
}

# Model Configuration
DEFAULT_MODELS_TO_COMPARE = 2

# Cache Configuration
CACHE_DURATION = 3600  # 1 hour in seconds

# Error Messages
ERROR_MESSAGES = {
    "dataset_load": "Error loading dataset. Please check your internet connection.",
    "no_benchmark": "No benchmark data available for this dataset.",
    "no_models": "No models found for comparison.",
    "invalid_selection": "Invalid selection. Please try again."
}

# Links
LINKS = {
    "github": "https://github.com/ruanchaves/napolab",
    "huggingface_dataset": "https://huggingface.co/datasets/ruanchaves/napolab",
    "open_pt_llm_leaderboard": "https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard"
}
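Nothing in this diff shows app.py importing these constants; the launch call at the end of app.py hard-codes the same host, port, and share values. A minimal sketch (assuming `app` is the Blocks instance built in app.py) of wiring the config in instead:

```python
from config import APP_HOST, APP_PORT, APP_SHARE

# Equivalent to the hard-coded call, but driven by config.py.
app.launch(share=APP_SHARE, server_name=APP_HOST, server_port=APP_PORT)
```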
data.yaml
ADDED
@@ -0,0 +1,422 @@
# Napolab Leaderboard Data Configuration
# This file contains all datasets and benchmark results for the Gradio app
#
# Data Source: "Lessons learned from the evaluation of Portuguese language models"
# by Ruan Chaves Rodrigues (2023) - Master's dissertation, University of Malta
# Available at: https://www.um.edu.mt/library/oar/handle/123456789/120557

# Data Sources
sources:
  napolab_thesis:
    name: "Napolab Thesis"
    description: "Lessons learned from the evaluation of Portuguese language models"
    author: "Ruan Chaves Rodrigues"
    year: 2023
    url: "https://www.um.edu.mt/library/oar/handle/123456789/120557"
    institution: "University of Malta"

  open_pt_llm_leaderboard:
    name: "Open PT LLM Leaderboard"
    description: "Large Language Models on Portuguese Benchmarks"
    author: "Eduardo Garcia"
    year: 2025
    url: "https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard"
    platform: "Hugging Face Spaces"

  teenytinyllama_paper:
    name: "TeenyTinyLlama Paper"
    description: "TeenyTinyLlama: Open-source tiny language models trained in Brazilian Portuguese"
    authors: ["Corrêa, Nicholas Kluge", "Falk, Sophia", "Fatimah, Shiza", "Sen, Aniket", "De Oliveira, Nythamar"]
    year: 2024
    journal: "Machine Learning with Applications"
    doi: "10.1016/j.mlwa.2024.100558"

# Dataset Information
datasets:
  assin_rte:
    name: "ASSIN RTE"
    description: "Avaliação de Similaridade Semântica e Inferência Textual - RTE"
    tasks: ["RTE"]
    url: "https://huggingface.co/datasets/assin"

  assin_sts:
    name: "ASSIN STS"
    description: "Avaliação de Similaridade Semântica e Inferência Textual - STS"
    tasks: ["STS"]
    url: "https://huggingface.co/datasets/assin"

  assin2_rte:
    name: "ASSIN 2 RTE"
    description: "Avaliação de Similaridade Semântica e Inferência Textual (v2) - RTE"
    tasks: ["RTE"]
    url: "https://huggingface.co/datasets/assin2"

  assin2_sts:
    name: "ASSIN 2 STS"
    description: "Avaliação de Similaridade Semântica e Inferência Textual (v2) - STS"
    tasks: ["STS"]
    url: "https://huggingface.co/datasets/assin2"

  faquad-nli:
    name: "FaQUaD-NLI"
    description: "Factual Question Answering and Natural Language Inference"
    tasks: ["NLI"]
    url: "https://huggingface.co/datasets/ruanchaves/faquad-nli"

  hatebr:
    name: "HateBR"
    description: "Hate Speech Detection in Brazilian Portuguese"
    tasks: ["Classification"]
    url: "https://huggingface.co/datasets/ruanchaves/hatebr"

  porsimplessent:
    name: "PorSimplesSent"
    description: "Portuguese Simple Sentences Sentiment Analysis"
    tasks: ["Sentiment Analysis"]
    url: "https://huggingface.co/datasets/ruanchaves/porsimplessent"

  reli-sa:
    name: "Reli-SA"
    description: "Religious Sentiment Analysis"
    tasks: ["Sentiment Analysis"]
    url: "https://huggingface.co/datasets/ruanchaves/reli-sa"

# Benchmark Results
benchmark_results:
  assin_rte:
    albertina-pt-pt:
      accuracy: 0.887
    albertina-pt-br:
      accuracy: 0.844
    deberta-v2-large:
      accuracy: 0.864
    xlm-roberta-large:
      accuracy: 0.874
    mdeberta-v3-base:
      accuracy: 0.863
    bertimbau-large:
      accuracy: 0.838
    bert-large:
      accuracy: 0.802
    bertimbau-base:
      accuracy: 0.828
    bert-multilingual-base:
      accuracy: 0.815
    xlm-roberta-base:
      accuracy: 0.822
    bertinho:
      accuracy: 0.786
    ixaes:
      accuracy: 0.782

  assin_sts:
    albertina-pt-pt:
      accuracy: 0.874
    albertina-pt-br:
      accuracy: 0.883
    deberta-v2-large:
      accuracy: 0.861
    xlm-roberta-large:
      accuracy: 0.863
    mdeberta-v3-base:
      accuracy: 0.855
    bertimbau-large:
      accuracy: 0.826
    bert-large:
      accuracy: 0.822
    bertimbau-base:
      accuracy: 0.844
    bert-multilingual-base:
      accuracy: 0.820
    xlm-roberta-base:
      accuracy: 0.812
    bertinho:
      accuracy: 0.791
    ixaes:
      accuracy: 0.817

  assin2_rte:
    albertina-pt-pt:
      accuracy: 0.910
    albertina-pt-br:
      accuracy: 0.916
    deberta-v2-large:
      accuracy: 0.911
    xlm-roberta-large:
      accuracy: 0.910
    mdeberta-v3-base:
      accuracy: 0.904
    bertimbau-large:
      accuracy: 0.897
    bert-large:
      accuracy: 0.892
    bertimbau-base:
      accuracy: 0.884
    bert-multilingual-base:
      accuracy: 0.877
    xlm-roberta-base:
      accuracy: 0.875
    bertinho:
      accuracy: 0.855
    ixaes:
      accuracy: 0.879
    ttl-460m:
      accuracy: 0.8643
    ttl-160m:
      accuracy: 0.8578

  assin2_sts:
    deberta-v2-large:
      accuracy: 0.724
    mdeberta-v3-base:
      accuracy: 0.847
    bertimbau-large:
      accuracy: 0.855
    bert-large:
      accuracy: 0.792
    bertimbau-base:
      accuracy: 0.840
    bert-multilingual-base:
      accuracy: 0.827
    xlm-roberta-base:
      accuracy: 0.847
    bertinho:
      accuracy: 0.802
    ixaes:
      accuracy: 0.822

  faquad-nli:
    mdeberta-v3-base:
      accuracy: 0.889
    bertimbau-large:
      accuracy: 0.900
    bert-large:
      accuracy: 0.838
    bertimbau-base:
      accuracy: 0.897
    bert-multilingual-base:
      accuracy: 0.865
    xlm-roberta-base:
      accuracy: 0.898
    bertinho:
      accuracy: 0.866
    ixaes:
      accuracy: 0.860
    ttl-460m:
      accuracy: 0.9118
    ttl-160m:
      accuracy: 0.9000

  hatebr:
    mdeberta-v3-base:
      accuracy: 0.911
    bertimbau-large:
      accuracy: 0.919
    bert-large:
      accuracy: 0.838
    bertimbau-base:
      accuracy: 0.920
    bert-multilingual-base:
      accuracy: 0.871
    xlm-roberta-base:
      accuracy: 0.920
    bertinho:
      accuracy: 0.879
    ixaes:
      accuracy: 0.872
    ttl-460m:
      accuracy: 0.9228
    ttl-160m:
      accuracy: 0.9071

  porsimplessent:
    mdeberta-v3-base:
      accuracy: 0.953
    bertimbau-large:
      accuracy: 0.919
    bert-large:
      accuracy: 0.907
    bertimbau-base:
      accuracy: 0.920
    bert-multilingual-base:
      accuracy: 0.933
    xlm-roberta-base:
      accuracy: 0.920
    bertinho:
      accuracy: 0.900
    ixaes:
      accuracy: 0.899

  reli-sa:
    mdeberta-v3-base:
      accuracy: 0.719
    bertimbau-large:
      accuracy: 0.745
    bert-large:
      accuracy: 0.629
    bertimbau-base:
      accuracy: 0.713
    bert-multilingual-base:
      accuracy: 0.642
    xlm-roberta-base:
      accuracy: 0.680
    bertinho:
      accuracy: 0.681
    ixaes:
      accuracy: 0.637

# Model Metadata
model_metadata:
  albertina-pt-pt:
    parameters: 125000000
    architecture: "Albertina PT:PT"
    base_model: "PORTULAN/albertina-ptpt"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/PORTULAN/albertina-ptpt"
    source: "napolab_thesis"

  albertina-pt-br:
    parameters: 125000000
    architecture: "Albertina PT:BR"
    base_model: "PORTULAN/albertina-ptbr"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/PORTULAN/albertina-ptbr"
    source: "napolab_thesis"

  deberta-v2-large:
    parameters: 900000000
    architecture: "DeBERTa v2 (large)"
    base_model: "microsoft/deberta-v2-large"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/microsoft/deberta-v2-large"
    source: "napolab_thesis"

  xlm-roberta-large:
    parameters: 550000000
    architecture: "XLM-RoBERTa (large)"
    base_model: "xlm-roberta-large"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/xlm-roberta-large"
    source: "napolab_thesis"

  mdeberta-v3-base:
    parameters: 86000000
    architecture: "mDeBERTa v3 (base)"
    base_model: "microsoft/mdeberta-v3-base"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/microsoft/mdeberta-v3-base"
    source: "napolab_thesis"

  bertimbau-large:
    parameters: 355000000
    architecture: "BERTimbau (large)"
    base_model: "neuralmind/bert-large-portuguese-cased"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/neuralmind/bert-large-portuguese-cased"
    source: "napolab_thesis"

  bert-large:
    parameters: 355000000
    architecture: "BERT (large)"
    base_model: "bert-large-uncased"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/bert-large-uncased"
    source: "napolab_thesis"

  bertimbau-base:
    parameters: 110000000
    architecture: "BERTimbau (base)"
    base_model: "neuralmind/bert-base-portuguese-cased"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/neuralmind/bert-base-portuguese-cased"
    source: "napolab_thesis"

  bert-multilingual-base:
    parameters: 110000000
    architecture: "BERT multilingual (base)"
    base_model: "bert-base-multilingual-cased"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/bert-base-multilingual-cased"
    source: "napolab_thesis"

  xlm-roberta-base:
    parameters: 270000000
    architecture: "XLM-RoBERTa (base)"
    base_model: "xlm-roberta-base"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/xlm-roberta-base"
    source: "napolab_thesis"

  bertinho:
    parameters: 110000000
    architecture: "Bertinho"
    base_model: "ricardo-filho/bertinho-portuguese-cased-nli-assin-2"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/ricardo-filho/bertinho-portuguese-cased-nli-assin-2"
    source: "napolab_thesis"

  ixaes:
    parameters: 110000000
    architecture: "IXAes"
    base_model: "ixa-ehu/ixambert-base-cased"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/ixa-ehu/ixambert-base-cased"
    source: "napolab_thesis"

  ttl-460m:
    parameters: 460000000
    architecture: "TeenyTinyLlama (460M)"
    base_model: "nicholasKluge/TeenyTinyLlama-460m"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/nicholasKluge/TeenyTinyLlama-460m"
    source: "teenytinyllama_paper"

  ttl-160m:
    parameters: 160000000
    architecture: "TeenyTinyLlama (160M)"
    base_model: "nicholasKluge/TeenyTinyLlama-160m"
    task: "Multiple"
    huggingface_url: "https://huggingface.co/nicholasKluge/TeenyTinyLlama-160m"
    source: "teenytinyllama_paper"

# Additional Models (for Model Hub tab)
additional_models:
  albertina_models:
    albertina-pt-pt:
      huggingface_url: "https://huggingface.co/PORTULAN/albertina-ptpt"
    albertina-pt-br:
      huggingface_url: "https://huggingface.co/PORTULAN/albertina-ptbr"

  deberta_models:
    deberta-v2-large:
      huggingface_url: "https://huggingface.co/microsoft/deberta-v2-large"
    mdeberta-v3-base:
      huggingface_url: "https://huggingface.co/microsoft/mdeberta-v3-base"

  roberta_models:
    xlm-roberta-large:
      huggingface_url: "https://huggingface.co/xlm-roberta-large"
    xlm-roberta-base:
      huggingface_url: "https://huggingface.co/xlm-roberta-base"

  bert_models:
    bertimbau-large:
      huggingface_url: "https://huggingface.co/neuralmind/bert-large-portuguese-cased"
    bertimbau-base:
      huggingface_url: "https://huggingface.co/neuralmind/bert-base-portuguese-cased"
    bert-large:
      huggingface_url: "https://huggingface.co/bert-large-uncased"
    bert-multilingual-base:
      huggingface_url: "https://huggingface.co/bert-base-multilingual-cased"

  specialized_models:
    bertinho:
      huggingface_url: "https://huggingface.co/ricardo-filho/bertinho-portuguese-cased-nli-assin-2"
    ixaes:
      huggingface_url: "https://huggingface.co/ixa-ehu/ixambert-base-cased"

  teenytinyllama_models:
    ttl-460m:
      huggingface_url: "https://huggingface.co/nicholasKluge/TeenyTinyLlama-460m"
    ttl-160m:
      huggingface_url: "https://huggingface.co/nicholasKluge/TeenyTinyLlama-160m"
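As a quick sanity check of the structure above, the file can be read directly with PyYAML; this sketch prints one of the accuracy values defined in the `benchmark_results` section:

```python
import yaml

# Load data.yaml and look up a single score from benchmark_results.
with open("data.yaml", encoding="utf-8") as f:
    data = yaml.safe_load(f)

print(data["benchmark_results"]["assin2_rte"]["bertimbau-large"]["accuracy"])  # 0.897
```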
data_loader.py
ADDED
@@ -0,0 +1,133 @@
"""
Data loader for Napolab Leaderboard
Loads datasets, benchmark results, and model metadata from YAML configuration files.
"""

import yaml
import os
from pathlib import Path
from typing import Dict, Any, Optional

class NapolabDataLoader:
    """Loads and manages Napolab data from YAML configuration files."""

    def __init__(self, data_file: str = "data.yaml"):
        """
        Initialize the data loader.

        Args:
            data_file: Path to the YAML data file
        """
        self.data_file = data_file
        self.data = None
        self.load_data()

    def load_data(self) -> None:
        """Load data from the YAML file."""
        try:
            # Get the directory where this script is located
            script_dir = Path(__file__).parent
            data_path = script_dir / self.data_file

            if not data_path.exists():
                raise FileNotFoundError(f"Data file not found: {data_path}")

            with open(data_path, 'r', encoding='utf-8') as file:
                self.data = yaml.safe_load(file)

        except Exception as e:
            print(f"Error loading data from {self.data_file}: {e}")
            # Fallback to empty data structure
            self.data = {
                'datasets': {},
                'benchmark_results': {},
                'model_metadata': {},
                'additional_models': {}
            }

    def get_datasets(self) -> Dict[str, Any]:
        """Get all datasets information."""
        return self.data.get('datasets', {})

    def get_benchmark_results(self) -> Dict[str, Any]:
        """Get all benchmark results."""
        return self.data.get('benchmark_results', {})

    def get_model_metadata(self) -> Dict[str, Any]:
        """Get all model metadata."""
        return self.data.get('model_metadata', {})

    def get_additional_models(self) -> Dict[str, Any]:
        """Get additional models for the Model Hub."""
        return self.data.get('additional_models', {})

    def get_dataset_info(self, dataset_name: str) -> Optional[Dict[str, Any]]:
        """Get information for a specific dataset."""
        return self.data.get('datasets', {}).get(dataset_name)

    def get_benchmark_for_dataset(self, dataset_name: str) -> Optional[Dict[str, Any]]:
        """Get benchmark results for a specific dataset."""
        return self.data.get('benchmark_results', {}).get(dataset_name)

    def get_model_info(self, model_name: str) -> Optional[Dict[str, Any]]:
        """Get metadata for a specific model."""
        return self.data.get('model_metadata', {}).get(model_name)

    def get_available_datasets(self) -> list:
        """Get list of available dataset names."""
        return list(self.data.get('datasets', {}).keys())

    def get_available_models_for_dataset(self, dataset_name: str) -> list:
        """Get list of available models for a specific dataset."""
        benchmark = self.get_benchmark_for_dataset(dataset_name)
        if benchmark:
            return list(benchmark.keys())
        return []

    def get_all_models(self) -> list:
        """Get list of all available models."""
        return list(self.data.get('model_metadata', {}).keys())

    def validate_data(self) -> bool:
        """Validate the loaded data structure."""
        required_keys = ['datasets', 'benchmark_results', 'model_metadata']

        for key in required_keys:
            if key not in self.data:
                print(f"Missing required key: {key}")
                return False

        return True

    def reload_data(self) -> None:
        """Reload data from the YAML file."""
        self.load_data()

    def export_data(self, output_file: str = "exported_data.yaml") -> None:
        """Export the current data to a YAML file."""
        try:
            with open(output_file, 'w', encoding='utf-8') as file:
                yaml.dump(self.data, file, default_flow_style=False, allow_unicode=True, sort_keys=False)
            print(f"Data exported to {output_file}")
        except Exception as e:
            print(f"Error exporting data: {e}")

# Global data loader instance
data_loader = NapolabDataLoader()

# Convenience functions for backward compatibility
def get_napolab_datasets() -> Dict[str, Any]:
    """Get Napolab datasets (for backward compatibility)."""
    return data_loader.get_datasets()

def get_sample_benchmark_results() -> Dict[str, Any]:
    """Get benchmark results (for backward compatibility)."""
    return data_loader.get_benchmark_results()

def get_model_metadata() -> Dict[str, Any]:
    """Get model metadata (for backward compatibility)."""
    return data_loader.get_model_metadata()

def get_additional_models() -> Dict[str, Any]:
    """Get additional models (for backward compatibility)."""
    return data_loader.get_additional_models()
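A short usage sketch of the loader's public API, using keys that exist in data.yaml:

```python
from data_loader import NapolabDataLoader

loader = NapolabDataLoader("data.yaml")
if loader.validate_data():
    print(loader.get_available_datasets())                    # ['assin_rte', 'assin_sts', ...]
    print(loader.get_available_models_for_dataset("hatebr"))  # models with HateBR scores
    print(loader.get_model_info("ttl-460m")["parameters"])    # 460000000
```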
download_external_models.py
ADDED
@@ -0,0 +1,124 @@
#!/usr/bin/env python3
"""
Script to download external models data from the Open Portuguese LLM Leaderboard
and convert it to CSV format for import into the benchmark.
"""

import requests
import pandas as pd
import json
import sys

def download_external_models():
    """Download external models data and convert to CSV."""

    url = "https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard/raw/main/external_models_results.json"

    print("Downloading external models data...")

    try:
        # Download the JSON file
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Parse JSON
        data = response.json()

        if not isinstance(data, list):
            print("Error: Expected JSON array, got:", type(data))
            return

        print(f"Downloaded {len(data)} external models")

        # Extract data for each model
        extracted_data = []

        for item in data:
            if not isinstance(item, dict):
                print(f"Warning: Skipping non-dict item: {type(item)}")
                continue

            # Extract required fields
            model = item.get('model', '')
            link = item.get('link', '')
            result_metrics = item.get('result_metrics', {})

            if not isinstance(result_metrics, dict):
                print(f"Warning: Skipping model '{model}' - result_metrics is not a dict")
                continue

            # Extract metrics
            assin2_sts = result_metrics.get('assin2_sts', 0.0)
            assin2_rte = result_metrics.get('assin2_rte', 0.0)
            faquad_nli = result_metrics.get('faquad_nli', 0.0)
            hatebr_offensive = result_metrics.get('hatebr_offensive', 0.0)

            # Create row data
            row_data = {
                'model': model,
                'link': link,
                'assin2_sts': assin2_sts,
                'assin2_rte': assin2_rte,
                'faquad_nli': faquad_nli,
                'hatebr_offensive': hatebr_offensive
            }

            extracted_data.append(row_data)

        # Create DataFrame
        df = pd.DataFrame(extracted_data)

        # Save to CSV
        output_file = 'external_models.csv'
        df.to_csv(output_file, index=False)

        print(f"\nSuccessfully extracted {len(df)} models to {output_file}")

        # Show first few entries as preview
        print("\nFirst 5 entries:")
        print(df.head().to_string(index=False))

        # Show some statistics
        if not df.empty:
            print(f"\nStatistics:")
            print(f"Total models: {len(df)}")

            # Count models with non-zero scores for each metric
            print(f"\nModels with scores:")
            print(f"ASSIN2 STS: {(df['assin2_sts'] > 0).sum()}")
            print(f"ASSIN2 RTE: {(df['assin2_rte'] > 0).sum()}")
            print(f"FaQuAD-NLI: {(df['faquad_nli'] > 0).sum()}")
            print(f"HateBR: {(df['hatebr_offensive'] > 0).sum()}")

            # Average scores
            print(f"\nAverage scores:")
            print(df[['assin2_sts', 'assin2_rte', 'faquad_nli', 'hatebr_offensive']].mean().round(3))

            # Show data types and info
            print(f"\nDataFrame info:")
            print(df.info())

    except requests.exceptions.RequestException as e:
        print(f"Error downloading data: {e}")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Unexpected error: {e}")
        sys.exit(1)

def main():
    """Main function to run the download."""
    print("External Models Data Downloader")
    print("=" * 40)

    try:
        download_external_models()
        print("\nDownload completed successfully!")
    except Exception as e:
        print(f"Error during download: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
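For reference, the parsing code above assumes each entry of `external_models_results.json` is shaped roughly like this; the field names come from the `.get()` calls in the script, and the numbers are illustrative (rounded from the first row of external_models.csv):

```python
# Assumed shape of one entry in the downloaded JSON array.
example_entry = {
    "model": "sabia-2-small",
    "link": "https://www.maritaca.ai/",
    "result_metrics": {
        "assin2_sts": 0.705,
        "assin2_rte": 0.912,
        "faquad_nli": 0.758,
        "hatebr_offensive": 0.754,
    },
}
```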
example_usage.py
ADDED
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""
Example Usage of Napolab Leaderboard Data Management

This script demonstrates how to use the YAML-based data management system.
"""

from data_loader import NapolabDataLoader
from manage_data import validate_yaml_structure, add_dataset, add_benchmark_result, add_model_metadata, export_data
import yaml

def example_usage():
    """Demonstrate the data management functionality."""

    print("🚀 Napolab Leaderboard Data Management Example")
    print("=" * 50)

    # 1. Load existing data
    print("\n1. Loading existing data...")
    data_loader = NapolabDataLoader()
    data = data_loader.data

    print(f"✅ Loaded {len(data['datasets'])} datasets")
    print(f"✅ Loaded {len(data['model_metadata'])} models")

    # 2. Validate the data structure
    print("\n2. Validating data structure...")
    if validate_yaml_structure(data):
        print("✅ Data structure is valid!")
    else:
        print("❌ Data structure has issues!")
        return

    # 3. Add a new dataset
    print("\n3. Adding a new dataset...")
    data = add_dataset(
        data=data,
        dataset_name="example_dataset",
        name="Example Dataset",
        description="An example dataset for demonstration",
        tasks=["Classification", "Sentiment Analysis"],
        url="https://huggingface.co/datasets/example"
    )

    # 4. Add a new model
    print("\n4. Adding a new model...")
    data = add_model_metadata(
        data=data,
        model_name="example-model",
        parameters=125000000,
        architecture="BERT Large",
        base_model="bert-large-uncased",
        task="Classification",
        huggingface_url="https://huggingface.co/example/model"
    )

    # 5. Add benchmark results
    print("\n5. Adding benchmark results...")
    data = add_benchmark_result(
        data=data,
        dataset_name="example_dataset",
        model_name="example-model",
        metrics={
            "accuracy": 0.89,
            "f1": 0.88,
            "precision": 0.90,
            "recall": 0.87
        }
    )

    # 6. Export the updated data
    print("\n6. Exporting updated data...")
    export_data(data, "example_updated_data.yaml")

    # 7. Demonstrate data access
    print("\n7. Demonstrating data access...")

    # Get dataset info (dataset keys in data.yaml are e.g. "assin_rte", not "assin")
    dataset_info = data_loader.get_dataset_info("assin_rte")
    if dataset_info:
        print(f"📊 ASSIN RTE dataset: {dataset_info['name']}")
        print(f"   Tasks: {', '.join(dataset_info['tasks'])}")

    # Get available models for a dataset
    models = data_loader.get_available_models_for_dataset("assin_rte")
    print(f"🤖 Available models for ASSIN RTE: {len(models)} models")

    # Get model info (use a model key that exists in data.yaml)
    model_info = data_loader.get_model_info("mdeberta-v3-base")
    if model_info:
        print(f"🔧 Model parameters: {model_info['parameters']:,}")
        print(f"   Architecture: {model_info['architecture']}")

    print("\n✅ Example completed successfully!")
    print("📁 Check 'example_updated_data.yaml' for the updated data")

def demonstrate_yaml_structure():
    """Show the YAML structure."""
    print("\n📋 YAML Data Structure Example:")
    print("-" * 30)

    example_data = {
        'datasets': {
            'my_dataset': {
                'name': 'My Dataset',
                'description': 'A custom dataset',
                'tasks': ['Classification'],
                'url': 'https://huggingface.co/datasets/my_dataset'
            }
        },
        'benchmark_results': {
            'my_dataset': {
                'my-model': {
                    'accuracy': 0.92,
                    'f1': 0.91
                }
            }
        },
        'model_metadata': {
            'my-model': {
                'parameters': 110000000,
                'architecture': 'BERT Base',
                'base_model': 'bert-base-uncased',
                'task': 'Classification',
                'huggingface_url': 'https://huggingface.co/my-model'
            }
        }
    }

    print(yaml.dump(example_data, default_flow_style=False, allow_unicode=True))

if __name__ == "__main__":
    example_usage()
    demonstrate_yaml_structure()
external_models.csv
ADDED
@@ -0,0 +1,31 @@
model,link,assin2_sts,assin2_rte,faquad_nli,hatebr_offensive
sabia-2-small,https://www.maritaca.ai/,0.7053302344881672,0.9121728362223306,0.7575848453041435,0.753800795680591
sabia-2-medium,https://www.maritaca.ai/,0.7804108376537757,0.923459363368553,0.7657657657657658,0.8349989882997386
gpt-3.5-turbo-0125,https://www.openai.com/,0.7378460201077941,0.8823038414050672,0.746353108609074,0.8056205941193919
claude-3-haiku-20240307,https://www.claude.ai/,0.7892124744168747,0.9184462138121732,0.6340996599941455,0.8023698759439051
gemini-1.0-pro,https://ai.google.dev/,0.7058831239763663,0.8945993304651698,0.7070913567220611,0.8086330094493972
gemini-1.5-pro-preview-0409,https://cloud.google.com/vertex-ai,0.8159702278408203,0.9328989988467518,0.7290756302521009,0.8697698647467024
deepseek-v2-chat,https://www.deepseek.com/,0.8533174657651231,0.9440170304568147,0.7995469048381548,0.8842986491071644
gemini-1.5-flash-preview-0514,https://cloud.google.com/vertex-ai,0.841655158151231,0.9362097477374545,0.8092185592185592,0.9099110141445836
gemini-1.5-flash-001,https://cloud.google.com/vertex-ai,0.838806085610371,0.9366169973822607,0.7963910785668922,0.9092078461170015
gpt-4o-mini-2024-07-18,https://www.openai.com/,0.7259038954527597,0.942809846745341,0.819807735300693,0.8682357029532165
nemotron-4-340b-instruct,https://huggingface.co/nvidia/Nemotron-4-340B-Instruct,0.7857731021403329,0.9489354458928496,0.8194444444444444,0.8641580001234928
llama_405b_instruct,https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct,0.7888441732870783,0.9476445477916471,0.825063276593557,0.9073940659389119
sabia-3,https://www.maritaca.ai/,0.8253863689009022,0.9477034821619312,0.8243848812618203,0.8278737774590023
llama3_3_70b,https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct,0.7275578599896508,0.9407071010860484,0.8787563033858187,0.9024358249091997
llama3_2_90b,https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct,0.7368518566379951,0.9216548775103446,0.8632015306122449,0.8965270877302478
gemini-1.5-flash-002,https://cloud.google.com/vertex-ai,0.8380176734291938,0.941176117215237,0.8360786822325283,0.9046145161133335
gemini-1.5-flash-8b-001,https://aistudio.google.com,0.7638946799836569,0.9329452628161146,0.7937022965448601,0.850497640901663
gemini-2.0-flash-001,https://cloud.google.com/vertex-ai,0.8440142633742483,0.9305165510724053,0.7533651260745065,0.8890432813545366
gemini-2.0-flash-lite-001,https://cloud.google.com/vertex-ai,0.8492479991621328,0.9216548775103446,0.7652777777777777,0.8522499647780968
gemini-2.5-pro-exp-03-25,https://aistudio.google.com,0.837785744915033,0.9415510158830285,0.8738735797309651,0.9248478168290788
deepSeek-v3-0324,https://huggingface.co/deepseek-ai/DeepSeek-V3-0324,0.8145997097875548,0.9421860387625551,0.796751127001399,0.9060129756724185
qwen2-5-vl-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct,0.7595538567467497,0.9472975104201871,0.8447190882122586,0.8810695094657859
qwen2-5-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-72B-Instruct,0.8230708844558656,0.9509720145268106,0.8194444444444444,0.8810033427242816
qwen2-5-vl-32b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct,0.7780549055529008,0.9472975104201871,0.8447190882122586,0.8810695094657859
qwen-turbo-2024-11-01,https://www.alibabacloud.com/en/product/modelstudio,0.7640477700456898,0.9260451969385788,0.8128063725490196,0.8567933277676292
gpt-4o-2024-08-06,https://www.openai.com/,0.8078677969518289,0.9407235712144604,0.8654396266184885,0.9320137873994456
claude-3-7-sonnet-20250219,https://www.anthropic.com/,0.8087979933117393,0.9472965253044003,0.8097848807348216,0.9125114739050616
llama-4-scout-16e,https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct,0.7741640227983941,0.9312877465954967,0.8567037452287072,0.8813700069483281
llama-4-maverick-128e,https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct,0.7333246903202654,0.9329419027588105,0.7823695413019562,0.9047550357833591
gemma-3-27b-it,https://huggingface.co/google/gemma-3-27b-it,0.8147646517017526,0.9411147367212748,0.8143210816987241,0.8729414870796344
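A small sketch of how this CSV might be consumed with pandas, e.g. to rank the external models by mean score across the four tasks:

```python
import pandas as pd

# Load the CSV produced by download_external_models.py and rank by mean score.
df = pd.read_csv("external_models.csv")
score_cols = ["assin2_sts", "assin2_rte", "faquad_nli", "hatebr_offensive"]
df["average"] = df[score_cols].mean(axis=1)
print(df.sort_values("average", ascending=False)[["model", "average"]].head())
```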
extract_portuguese_leaderboard.py
ADDED
@@ -0,0 +1,195 @@
#!/usr/bin/env python3
"""
Script to extract data from JSON files in a repository folder
and save it as a CSV file for import into the benchmark.
"""

import pandas as pd
import json
import os
import sys
import argparse
from pathlib import Path

def is_valid_json_file(file_path):
    """
    Check if a file is a valid JSON file containing a dict.

    Args:
        file_path (str): Path to the JSON file

    Returns:
        bool: True if valid JSON dict, False otherwise
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            return isinstance(data, dict)
    except (json.JSONDecodeError, FileNotFoundError, UnicodeDecodeError):
        return False

def find_json_files(repo_path):
    """
    Recursively find all JSON files in the repository folder.

    Args:
        repo_path (str): Path to the repository folder

    Returns:
        list: List of paths to valid JSON files
    """
    json_files = []
    repo_path = Path(repo_path)

    if not repo_path.exists():
        print(f"Error: Repository path '{repo_path}' does not exist.")
        return []

    if not repo_path.is_dir():
        print(f"Error: Repository path '{repo_path}' is not a directory.")
        return []

    print(f"Scanning repository: {repo_path}")

    for file_path in repo_path.rglob("*.json"):
        if is_valid_json_file(file_path):
            json_files.append(file_path)
            print(f"Found valid JSON file: {file_path}")

    print(f"Total valid JSON files found: {len(json_files)}")
    return json_files

def extract_data_from_json(json_file_path):
    """
    Extract data from a single JSON file.

    Args:
        json_file_path (Path): Path to the JSON file

    Returns:
        dict or None: Extracted data or None if extraction failed
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Check if required fields exist
        if 'config_general' not in data or 'results' not in data:
            return None

        config_general = data['config_general']
        results = data['results']

        # Extract model information
        model_name = config_general.get('model_name', '')
        model_private = config_general.get('model_private', False)

        # Extract results
        all_grouped = results.get('all_grouped', {})

        # Extract metrics
        assin2_rte = all_grouped.get('assin2_rte', 0.0)
        assin2_sts = all_grouped.get('assin2_sts', 0.0)
        faquad_nli = all_grouped.get('faquad_nli', 0.0)
        hatebr_offensive = all_grouped.get('hatebr_offensive', 0.0)

        # Create row data
        row_data = {
            'json_file': str(json_file_path),
            'model_name': model_name,
            'model_private': model_private,
            'assin2_rte': assin2_rte,
            'assin2_sts': assin2_sts,
            'faquad_nli': faquad_nli,
            'hatebr_offensive': hatebr_offensive
        }

        return row_data

    except Exception as e:
        print(f"Error processing {json_file_path}: {e}")
        return None

def extract_portuguese_leaderboard(repo_path):
    """
    Extract data from JSON files in the repository folder and save as CSV.

    Args:
        repo_path (str): Path to the repository folder
    """

    print("Scanning repository for JSON files...")

    # Find all JSON files
    json_files = find_json_files(repo_path)

    if not json_files:
        print("No valid JSON files found in the repository.")
        return

    # Prepare data for DataFrame
    data = []

    # Process each JSON file
    for i, json_file in enumerate(json_files):
        print(f"Processing file {i+1}/{len(json_files)}: {json_file.name}")

        row_data = extract_data_from_json(json_file)
        if row_data:
            data.append(row_data)

        # Print progress every 10 files
        if (i + 1) % 10 == 0:
            print(f"  Processed {i + 1} files...")

    if not data:
        print("No valid data extracted from JSON files.")
        return

    # Create DataFrame
    df = pd.DataFrame(data)

    # Write to CSV
    output_file = 'portuguese_leaderboard.csv'
    df.to_csv(output_file, index=False)

    print(f"\nSuccessfully extracted {len(df)} models to {output_file}")

    # Show first few entries as preview
    print("\nFirst 5 entries:")
    print(df.head().to_string(index=False))

    # Show some statistics
    if not df.empty:
        print(f"\nStatistics:")
        print(f"Total models: {len(df)}")
        print(f"Private models: {df['model_private'].sum()}")
        print(f"Public models: {(~df['model_private']).sum()}")

        # Average scores
        print(f"\nAverage scores:")
        print(df[['assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']].mean().round(2))

        # Show data types and info
        print(f"\nDataFrame info:")
        print(df.info())

def main():
    """Main function to run the extraction."""
    parser = argparse.ArgumentParser(description='Extract Portuguese LLM Leaderboard data from JSON files')
    parser.add_argument('repo_path', help='Path to the repository folder containing JSON files')

    args = parser.parse_args()

    print("Portuguese LLM Leaderboard Data Extractor")
    print("=" * 50)

    try:
        extract_portuguese_leaderboard(args.repo_path)
        print("\nExtraction completed successfully!")
    except Exception as e:
        print(f"Error during extraction: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
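Because the script takes a positional `repo_path`, a typical invocation points it at a local folder of leaderboard result JSON files (the path below is illustrative):

```bash
# Path is illustrative; any directory tree containing result JSON files works.
python extract_portuguese_leaderboard.py ./results-repo
```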
manage_data.py
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Data Management Utility for Napolab Leaderboard
|
| 4 |
+
|
| 5 |
+
This script provides utilities to manage, validate, and update the YAML data file.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import yaml
|
| 9 |
+
import argparse
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from data_loader import NapolabDataLoader
|
| 12 |
+
from typing import Dict, Any
|
| 13 |
+
|
| 14 |
+
def validate_yaml_structure(data: Dict[str, Any]) -> bool:
|
| 15 |
+
"""Validate the YAML data structure."""
|
| 16 |
+
print("π Validating YAML structure...")
|
| 17 |
+
|
| 18 |
+
required_sections = ['datasets', 'benchmark_results', 'model_metadata']
|
| 19 |
+
|
| 20 |
+
for section in required_sections:
|
| 21 |
+
if section not in data:
|
| 22 |
+
print(f"β Missing required section: {section}")
|
| 23 |
+
return False
|
| 24 |
+
print(f"β
Found section: {section}")
|
| 25 |
+
|
| 26 |
+
# Validate datasets
|
| 27 |
+
print("\nπ Validating datasets...")
|
| 28 |
+
for dataset_name, dataset_info in data['datasets'].items():
|
| 29 |
+
required_fields = ['name', 'description', 'tasks', 'url']
|
| 30 |
+
for field in required_fields:
|
| 31 |
+
if field not in dataset_info:
|
| 32 |
+
print(f"β Dataset '{dataset_name}' missing field: {field}")
|
| 33 |
+
return False
|
| 34 |
+
print(f"β
Dataset '{dataset_name}' is valid")
|
| 35 |
+
|
| 36 |
+
# Validate benchmark results
|
| 37 |
+
print("\nπ Validating benchmark results...")
|
| 38 |
+
for dataset_name, models in data['benchmark_results'].items():
|
| 39 |
+
if dataset_name not in data['datasets']:
|
| 40 |
+
print(f"β οΈ Warning: Benchmark for '{dataset_name}' but no dataset info found")
|
| 41 |
+
|
| 42 |
+
for model_name, metrics in models.items():
|
| 43 |
+
if not isinstance(metrics, dict):
|
| 44 |
+
print(f"β Invalid metrics format for model '{model_name}'")
|
| 45 |
+
return False
|
| 46 |
+
print(f"β
Model '{model_name}' has {len(metrics)} metrics")
|
| 47 |
+
|
| 48 |
+
# Validate model metadata
|
| 49 |
+
print("\nπ€ Validating model metadata...")
|
| 50 |
+
for model_name, metadata in data['model_metadata'].items():
|
| 51 |
+
required_fields = ['parameters', 'architecture', 'base_model', 'task']
|
| 52 |
+
for field in required_fields:
|
| 53 |
+
if field not in metadata:
|
| 54 |
+
print(f"β Model '{model_name}' missing field: {field}")
|
| 55 |
+
return False
|
| 56 |
+
print(f"β
Model '{model_name}' is valid")
|
| 57 |
+
|
| 58 |
+
print("\nπ All validations passed!")
|
| 59 |
+
return True
|
| 60 |
+
|
| 61 |
+
def create_sample_data() -> Dict[str, Any]:
|
| 62 |
+
"""Create a sample data structure."""
|
| 63 |
+
return {
|
| 64 |
+
'datasets': {
|
| 65 |
+
'sample_dataset': {
|
| 66 |
+
'name': 'Sample Dataset',
|
| 67 |
+
'description': 'A sample dataset for testing',
|
| 68 |
+
'tasks': ['Classification'],
|
| 69 |
+
'url': 'https://huggingface.co/datasets/sample'
|
| 70 |
+
}
|
| 71 |
+
},
|
| 72 |
+
'benchmark_results': {
|
| 73 |
+
'sample_dataset': {
|
| 74 |
+
'sample-model': {
|
| 75 |
+
'accuracy': 0.85,
|
| 76 |
+
'f1': 0.84
|
| 77 |
+
}
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
'model_metadata': {
|
| 81 |
+
'sample-model': {
|
| 82 |
+
'parameters': 100000000,
|
| 83 |
+
'architecture': 'BERT Base',
|
| 84 |
+
'base_model': 'bert-base-uncased',
|
| 85 |
+
'task': 'Classification',
|
| 86 |
+
'huggingface_url': 'https://huggingface.co/sample/model'
|
| 87 |
+
}
|
| 88 |
+
},
|
| 89 |
+
'additional_models': {}
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
def add_dataset(data: Dict[str, Any], dataset_name: str, name: str, description: str,
                tasks: list, url: str) -> Dict[str, Any]:
    """Add a new dataset to the data structure."""
    data['datasets'][dataset_name] = {
        'name': name,
        'description': description,
        'tasks': tasks,
        'url': url
    }
    print(f"✅ Added dataset: {dataset_name}")
    return data

def add_benchmark_result(data: Dict[str, Any], dataset_name: str, model_name: str,
                         metrics: Dict[str, float]) -> Dict[str, Any]:
    """Add benchmark results for a model on a dataset."""
    if dataset_name not in data['benchmark_results']:
        data['benchmark_results'][dataset_name] = {}

    data['benchmark_results'][dataset_name][model_name] = metrics
    print(f"✅ Added benchmark result for {model_name} on {dataset_name}")
    return data

def add_model_metadata(data: Dict[str, Any], model_name: str, parameters: int,
                       architecture: str, base_model: str, task: str,
                       huggingface_url: str = None) -> Dict[str, Any]:
    """Add model metadata."""
    data['model_metadata'][model_name] = {
        'parameters': parameters,
        'architecture': architecture,
        'base_model': base_model,
        'task': task
    }

    if huggingface_url:
        data['model_metadata'][model_name]['huggingface_url'] = huggingface_url

    print(f"✅ Added model metadata: {model_name}")
    return data

def export_data(data: Dict[str, Any], output_file: str) -> None:
    """Export data to a YAML file."""
    try:
        with open(output_file, 'w', encoding='utf-8') as file:
            yaml.dump(data, file, default_flow_style=False, allow_unicode=True, sort_keys=False)
        print(f"✅ Data exported to {output_file}")
    except Exception as e:
        print(f"❌ Error exporting data: {e}")

def main():
    """Main function for command-line interface."""
    parser = argparse.ArgumentParser(description='Manage Napolab Leaderboard Data')
    parser.add_argument('action', choices=['validate', 'create-sample', 'add-dataset', 'add-benchmark', 'add-model'],
                        help='Action to perform')
    parser.add_argument('--data-file', default='data.yaml', help='Path to data file')
    parser.add_argument('--output', help='Output file for export')

    # Dataset arguments
    parser.add_argument('--dataset-name', help='Dataset name')
    parser.add_argument('--dataset-display-name', help='Dataset display name')
    parser.add_argument('--dataset-description', help='Dataset description')
    parser.add_argument('--dataset-tasks', nargs='+', help='Dataset tasks')
    parser.add_argument('--dataset-url', help='Dataset URL')

    # Benchmark arguments
    parser.add_argument('--model-name', help='Model name')
    parser.add_argument('--metrics', nargs='+', help='Metrics as key=value pairs')

    # Model metadata arguments
    parser.add_argument('--parameters', type=int, help='Number of parameters')
    parser.add_argument('--architecture', help='Model architecture')
    parser.add_argument('--base-model', help='Base model name')
    parser.add_argument('--task', help='Task type')
    parser.add_argument('--huggingface-url', help='Hugging Face URL')

    args = parser.parse_args()

    # Load existing data or create new
    data_loader = NapolabDataLoader(args.data_file)
    data = data_loader.data

    if args.action == 'validate':
        if validate_yaml_structure(data):
            print("✅ Data validation successful!")
        else:
            print("❌ Data validation failed!")
            return 1

    elif args.action == 'create-sample':
        data = create_sample_data()
        export_data(data, args.output or 'sample_data.yaml')

    elif args.action == 'add-dataset':
        if not all([args.dataset_name, args.dataset_display_name, args.dataset_description,
                    args.dataset_tasks, args.dataset_url]):
            print("❌ All dataset arguments are required")
            return 1

        data = add_dataset(data, args.dataset_name, args.dataset_display_name,
                           args.dataset_description, args.dataset_tasks, args.dataset_url)
        export_data(data, args.data_file)

    elif args.action == 'add-benchmark':
        if not all([args.dataset_name, args.model_name, args.metrics]):
            print("❌ All benchmark arguments are required")
            return 1

        # Parse metrics
        metrics = {}
        for metric in args.metrics:
            if '=' in metric:
                key, value = metric.split('=', 1)
                try:
                    metrics[key] = float(value)
                except ValueError:
                    print(f"❌ Invalid metric value: {metric}")
                    return 1

        data = add_benchmark_result(data, args.dataset_name, args.model_name, metrics)
        export_data(data, args.data_file)

    elif args.action == 'add-model':
        if not all([args.model_name, args.parameters, args.architecture,
                    args.base_model, args.task]):
            print("❌ All model metadata arguments are required")
            return 1

        data = add_model_metadata(data, args.model_name, args.parameters,
                                  args.architecture, args.base_model, args.task,
                                  args.huggingface_url)
        export_data(data, args.data_file)

    return 0

if __name__ == "__main__":
    exit(main())
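For reference, the helpers above can also be driven programmatically instead of through the CLI. A minimal sketch, assuming an empty data structure; the dataset, model, and metric names are illustrative placeholders, not entries from the shipped `data.yaml`:

```python
# Sketch: building a leaderboard data file with the helpers defined above.
# All names below are hypothetical examples.
from manage_data import (add_dataset, add_benchmark_result,
                         add_model_metadata, export_data)

data = {'datasets': {}, 'benchmark_results': {}, 'model_metadata': {}}

data = add_dataset(data, 'example-nli', 'Example NLI',
                   'A toy NLI dataset.', ['natural language inference'],
                   'https://example.com/dataset')
data = add_benchmark_result(data, 'example-nli', 'example-bert-base',
                            {'accuracy': 0.91})
data = add_model_metadata(data, 'example-bert-base', 110_000_000,
                          'BERT', 'bert-base', 'classification')
export_data(data, 'example_data.yaml')
```

The equivalent CLI call for the benchmark step passes metrics as `key=value` pairs, e.g. `--metrics accuracy=0.91`, which `main()` splits on the first `=` and parses as floats.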
portuguese_leaderboard.csv
ADDED
The diff for this file is too large to render. See raw diff.
requirements.txt
ADDED
@@ -0,0 +1,8 @@
gradio>=4.0.0
pandas>=1.5.0
numpy>=1.21.0
plotly>=5.0.0
transformers>=4.20.0
torch>=1.12.0
huggingface-hub>=0.10.0
PyYAML>=6.0
run_app.py
ADDED
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
"""
Napolab Leaderboard Launcher Script

This script checks dependencies and launches the Gradio app for the Napolab leaderboard.
"""

import sys
import subprocess
import importlib.util
from pathlib import Path

def check_dependency(package_name):
    """Check if a package is installed."""
    spec = importlib.util.find_spec(package_name)
    return spec is not None

def install_dependencies():
    """Install required dependencies."""
    print("Installing required dependencies...")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
        print("✅ Dependencies installed successfully!")
        return True
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to install dependencies: {e}")
        return False

def main():
    """Main launcher function."""
    print("🚀 Napolab Leaderboard Launcher")
    print("=" * 40)

    # Check if we're in the right directory
    if not Path("app.py").exists():
        print("❌ Error: app.py not found. Please run this script from the leaderboard directory.")
        sys.exit(1)

    # Check required dependencies
    required_packages = ["gradio", "pandas", "numpy", "datasets", "plotly"]
    missing_packages = []

    for package in required_packages:
        if not check_dependency(package):
            missing_packages.append(package)

    if missing_packages:
        print(f"❌ Missing dependencies: {', '.join(missing_packages)}")
        print("Installing dependencies...")
        if not install_dependencies():
            print("❌ Failed to install dependencies. Please install them manually:")
            print("pip install -r requirements.txt")
            sys.exit(1)
    else:
        print("✅ All dependencies are installed!")

    # Launch the app
    print("\n🚀 Launching Napolab Leaderboard...")
    print("The app will be available at: http://localhost:7860")
    print("Press Ctrl+C to stop the server")
    print("-" * 40)

    try:
        import app
        # The app will be launched by the import
    except KeyboardInterrupt:
        print("\n👋 Server stopped by user")
    except Exception as e:
        print(f"❌ Error launching app: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
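Two details of the launcher are worth flagging. First, it starts the server simply with `import app`, which only works if `app.py` calls `launch()` at module level rather than only under a `__main__` guard. A minimal sketch of the pattern this relies on (an assumption about `app.py`'s structure, not its actual code):

```python
# Sketch of the module-level launch pattern that `import app` depends on.
# The Blocks content here is a placeholder, not the real app.py.
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("# Napolab Leaderboard")

# Called at import time, so `import app` starts the server.
demo.launch(server_port=7860)
```

Second, `required_packages` checks for `datasets`, which requirements.txt does not pin; if that package is missing, `install_dependencies()` reinstalls requirements.txt and reports success without actually providing it.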
validate_data.py
ADDED
@@ -0,0 +1,106 @@
#!/usr/bin/env python3
"""
Validation script for the updated Napolab data structure
"""

from data_loader import NapolabDataLoader
from manage_data import validate_yaml_structure
import pandas as pd

def main():
    """Validate the updated data structure."""
    print("🔍 Validating Updated Napolab Data Structure")
    print("=" * 50)
    print("📚 Data Source: Master's thesis 'Lessons learned from the evaluation of Portuguese language models'")
    print("   by Ruan Chaves Rodrigues (2023) - University of Malta")
    print("   Available at: https://www.um.edu.mt/library/oar/handle/123456789/120557")
    print("=" * 50)

    # Load data
    data_loader = NapolabDataLoader()
    data = data_loader.data

    # Validate structure
    print("\n1. Validating YAML structure...")
    if validate_yaml_structure(data):
        print("✅ YAML structure is valid!")
    else:
        print("❌ YAML structure has issues!")
        return

    # Check datasets
    print("\n2. Checking datasets...")
    datasets = data_loader.get_datasets()
    print(f"📊 Found {len(datasets)} datasets:")
    for name, info in datasets.items():
        print(f"   - {name}: {info['name']} ({', '.join(info['tasks'])})")

    # Check benchmark results
    print("\n3. Checking benchmark results...")
    benchmark_results = data_loader.get_benchmark_results()
    print(f"📊 Found {len(benchmark_results)} benchmark datasets:")
    for dataset_name, models in benchmark_results.items():
        print(f"   - {dataset_name}: {len(models)} models")

    # Check model metadata
    print("\n4. Checking model metadata...")
    model_metadata = data_loader.get_model_metadata()
    print(f"🤖 Found {len(model_metadata)} models:")

    # Group models by architecture
    architectures = {}
    for model_name, metadata in model_metadata.items():
        arch = metadata['architecture']
        if arch not in architectures:
            architectures[arch] = []
        architectures[arch].append(model_name)

    for arch, models in architectures.items():
        print(f"   - {arch}: {len(models)} models")
        for model in models[:3]:  # Show first 3 models
            print(f"     * {model}")
        if len(models) > 3:
            print(f"     ... and {len(models) - 3} more")

    # Test data access functions
    print("\n5. Testing data access functions...")

    # Test getting available models for a dataset
    test_dataset = list(benchmark_results.keys())[0]
    models = data_loader.get_available_models_for_dataset(test_dataset)
    print(f"   Available models for {test_dataset}: {len(models)} models")

    # Test getting model info
    if models:
        test_model = models[0]
        model_info = data_loader.get_model_info(test_model)
        if model_info:
            print(f"   Model {test_model}: {model_info['parameters']:,} parameters")

    # Create a summary table
    print("\n6. Creating summary table...")
    summary_data = []

    for dataset_name, models in benchmark_results.items():
        for model_name, metrics in models.items():
            if model_name in model_metadata:
                summary_data.append({
                    'Dataset': dataset_name,
                    'Model': model_name,
                    'Architecture': model_metadata[model_name]['architecture'],
                    'Parameters': model_metadata[model_name]['parameters'],
                    'Performance': metrics.get('accuracy', 0)
                })

    if summary_data:
        df = pd.DataFrame(summary_data)
        print(f"📊 Summary: {len(df)} model-dataset combinations")
        print(f"   Average performance: {df['Performance'].mean():.3f}")
        print(f"   Best performance: {df['Performance'].max():.3f}")
        print(f"   Models with >0.9 performance: {(df['Performance'] > 0.9).sum()}")

    print("\n✅ Validation completed successfully!")
    print("🎉 The updated data structure is ready to use!")

if __name__ == "__main__":
    main()
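Reading `manage_data.py` and `validate_data.py` together pins down the shape `data.yaml` must parse to. A hedged Python sketch of that parsed structure (the key names come from the code above; the concrete entries are made-up examples):

```python
# Parsed shape of data.yaml implied by the scripts above.
# Top-level keys are taken from the code; the entries are hypothetical.
data = {
    'datasets': {
        'example-nli': {
            'name': 'Example NLI',
            'description': 'A toy NLI dataset.',
            'tasks': ['natural language inference'],
            'url': 'https://example.com/dataset',
        },
    },
    'benchmark_results': {
        'example-nli': {
            'example-bert-base': {'accuracy': 0.91},
        },
    },
    'model_metadata': {
        'example-bert-base': {
            'parameters': 110000000,
            'architecture': 'BERT',
            'base_model': 'bert-base',
            'task': 'classification',
            # 'huggingface_url' is optional (see add_model_metadata)
        },
    },
}
```

Note that the summary step scores each model with `metrics.get('accuracy', 0)`, so results recorded under other metric names (e.g. F1 or Pearson) count as 0 in the reported averages.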