Phoneme_Detection_Leaderboard

Running

File size: 5,577 Bytes

from pathlib import Path

# Directory where the model requests are stored (relative path).
DIR_OUTPUT_REQUESTS: Path = Path("requested_models")
# Relative path for evaluation requests.
# NOTE(review): its consumers are not visible in this file — confirm usage.
EVAL_REQUESTS_PATH: Path = Path("eval_requests")

# NOTE(review): what this size limit applies to is not visible in this file —
# confirm against the code that imports it.
FINAL_SIZE: int = 100

##########################
# Text definitions       #
##########################

# Location of the banner image embedded in the page header.
banner_url = "https://huggingface.co/datasets/reach-vb/random-images/resolve/main/phoneme_leaderboard.png"

# HTML snippet: a flex container holding the banner image. The pieces below
# are adjacent string literals, so they are joined into one string at compile
# time; only the <img> fragment interpolates `banner_url`.
BANNER = (
    '<div style="display: flex; justify-content: space-around;">'
    f'<img src="{banner_url}" alt="Banner" '
    'style="width: 40vw; min-width: 300px; max-width: 600px;">'
    " </div>"
)

# Page title as an HTML fragment with a centered <h1>.
# The heading must be closed with </h1>; a mismatched closing tag leaves the
# <h1> open and relies on browser error recovery to render correctly.
TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🤗 Phoneme Detection Leaderboard </h1> </body> </html>"

# Markdown introduction for the leaderboard.
# Layout note: each `\n` escape sits directly after the four-space source
# indentation, so the indentation lands on an otherwise blank line and the
# following paragraph text starts at column 0. Only the second source line
# ("on the Hugging Face Hub.") keeps its leading spaces in the final string.
INTRODUCTION_TEXT = """📐 The 🤗 Phoneme Detection Leaderboard ranks and evaluates phoneme recognition models 
    on the Hugging Face Hub. 
    \nWe report the Average [PER](https://en.wikipedia.org/wiki/Phoneme_error_rate) (⬇️ lower the better) and Average Duration. Models are ranked based on their Average PER, from lowest to highest. Check the 📈 Metrics tab to understand how the models are evaluated. 
    \nIf you want results for a model that is not listed here, you can submit a request for it to be included ✉️✨. 
    \nThe leaderboard includes phoneme recognition evaluation across multiple datasets."""

# BibTeX entry for citing the leaderboard.
# `\\url` is an escaped backslash, so the resulting text contains `\url{...}`.
# NOTE(review): the author ("Your Name and Contributors") and the space URL
# ("your-org/...") look like template placeholders — confirm and fill in.
CITATION_TEXT = """@misc{phoneme-detection-leaderboard,
	title        = {Phoneme Detection Leaderboard},
	author       = {Your Name and Contributors},
	year         = 2024,
	publisher    = {Hugging Face},
	howpublished = "\\url{https://huggingface.co/spaces/your-org/phoneme-detection-leaderboard}"
}
"""

# Markdown body describing the metrics: the PER definition and formula, a note
# on reproducing results, and a table of benchmark datasets.
# The dataset table repeats the name/split/field/config/streaming values of
# the DATASETS list in this file; keep the two in sync when either changes.
METRICS_TAB_TEXT = """
Here you will find details about the phoneme recognition metrics and datasets reported in our leaderboard.

## Metrics

Models are evaluated using the Phoneme Error Rate (PER) metric. The PER metric
is used to assess the accuracy of a phoneme recognition system. Models are ranked in the leaderboard based 
on their PER, lowest to highest.

### Phoneme Error Rate (PER)

Phoneme Error Rate is used to measure the **accuracy** of automatic phoneme recognition systems. It calculates the percentage 
of phonemes in the system's output that differ from the reference (correct) phoneme sequence. **A lower PER value indicates higher accuracy**.

The PER is calculated using sequence alignment between predicted and reference phoneme sequences, taking into account:
- Substitutions (S): predicted phoneme differs from reference
- Deletions (D): reference phoneme missing in prediction  
- Insertions (I): predicted phoneme not in reference

```
PER = (S + D + I) / N * 100
```

Where N is the total number of reference phonemes.

## How to reproduce our results

The Phoneme Detection Leaderboard is an effort to benchmark open source phoneme recognition models. 
Along with the Leaderboard we're open-sourcing the codebase used for running these evaluations.

P.S. We'd love to know which other models you'd like us to benchmark next. Contributions are more than welcome! ♥️

## Benchmark datasets

Evaluating Phoneme Recognition systems requires diverse datasets with phonetic transcriptions. We use multiple datasets to obtain robust evaluation scores for each model.

| Dataset | Description | Language | Notes |
|---------|-------------|----------|-------|
| mirfan899/phoneme_asr | General phoneme recognition | English | split: train, field: phonetic |
| mirfan899/kids_phoneme_md | Children's speech phoneme dataset | English | split: train, field: phonetic |
| kylelovesllms/timit_asr_ipa | TIMIT phoneme transcriptions (IPA) | English | split: train, field: text |
| openslr/librispeech_asr | LibriSpeech clean test subset | English | split: test.clean, field: text, streaming |
| leduckhai/MultiMed | Multi-domain medical speech (English config) | English | split: test, config: English, streaming |

For more details on the individual datasets and how models are evaluated, refer to our documentation.
"""

# CSS rules keyed on the element ids `leaderboard-table` and `phoneme-table`.
# Both tables get the same treatment: height capped at 600px with vertical
# scrolling, header text kept on one line, and a 300px minimum width for the
# first column. Only `phoneme-table` additionally highlights a header on hover.
# NOTE(review): the components carrying these ids are defined outside this
# file — confirm both ids are still in use.
LEADERBOARD_CSS = """
#leaderboard-table {
    max-height: 600px;
    overflow-y: auto;
}

#leaderboard-table th .header-content {
    white-space: nowrap;
}

#leaderboard-table td:first-child {
    min-width: 300px;
}

#phoneme-table {
    max-height: 600px;
    overflow-y: auto;
}

#phoneme-table th .header-content {
    white-space: nowrap;
}

#phoneme-table th:hover {
    background-color: var(--table-row-focus);
}

#phoneme-table td:first-child {
    min-width: 300px;
}
"""


# Benchmark dataset descriptors, one dict per dataset.
#
# Keys used by every entry:
#   name          - dataset identifier in "<owner>/<dataset>" form
#   split         - name of the split to read
#   field         - name of the field used from each sample
#   max_samples   - upper bound on the number of samples
#   use_streaming - whether the dataset is marked for streaming
# Optional key:
#   config        - dataset configuration name (only set where needed)
#
# NOTE(review): the code that consumes these entries is not visible in this
# file; the key descriptions above follow the key names and the dataset table
# in METRICS_TAB_TEXT — confirm against the evaluation code.
DATASETS = [
    {
        "name": "mirfan899/phoneme_asr",
        "split": "train",
        "field": "phonetic",
        "max_samples": 500,
        "use_streaming": False,
    },
    {
        "name": "mirfan899/kids_phoneme_md",
        "split": "train",
        "field": "phonetic",
        "max_samples": 500,
        "use_streaming": False,
    },
    {
        "name": "kylelovesllms/timit_asr_ipa",
        "split": "train",
        "field": "text",
        "max_samples": 500,
        "use_streaming": False,
    },
    {
        "name": "openslr/librispeech_asr",
        "split": "test.clean",
        "field": "text",
        "max_samples": 500,  # same cap as the non-streamed datasets above
        "use_streaming": True,  # use streaming for better runtime
    },
    {
        "name": "leduckhai/MultiMed",
        "split": "test",
        "field": "text",
        "max_samples": 1500,  # the only entry with a higher cap
        "config": "English",  # config name selecting the English data
        "use_streaming": True,  # use streaming for better runtime
    },
]