|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
# Directory where newly requested model evaluations are written.
DIR_OUTPUT_REQUESTS = Path("requested_models")


# Directory holding evaluation request files (pending/finished runs).
EVAL_REQUESTS_PATH = Path("eval_requests")


# NOTE(review): presumably the number of rows shown in the final leaderboard
# table — confirm against the code that consumes this constant.
FINAL_SIZE = 100
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# URL of the leaderboard banner image hosted on the Hugging Face Hub.
banner_url = "https://huggingface.co/datasets/reach-vb/random-images/resolve/main/phoneme_leaderboard.png"

# HTML snippet that centers the banner image at the top of the page.
BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>'

# Page title rendered as HTML.
# Fix: the heading was closed with a stray </b> tag and <h1> was never
# closed, producing invalid markup; the heading now closes with </h1>.
TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🤗 Phoneme Detection Leaderboard </h1> </body> </html>"
|
|
|
|
|
# Markdown shown at the top of the leaderboard UI: what is ranked, which
# metrics are reported, and how to request a new model evaluation.
INTRODUCTION_TEXT = """📐 The 🤗 Phoneme Detection Leaderboard ranks and evaluates phoneme recognition models


on the Hugging Face Hub.


\nWe report the Average [PER](https://en.wikipedia.org/wiki/Phoneme_error_rate) (⬇️ lower the better) and Average Duration. Models are ranked based on their Average PER, from lowest to highest. Check the 📈 Metrics tab to understand how the models are evaluated.


\nIf you want results for a model that is not listed here, you can submit a request for it to be included ✉️✨.


\nThe leaderboard includes phoneme recognition evaluation across multiple datasets."""
|
|
|
|
|
# BibTeX entry users can copy to cite this leaderboard.
# TODO(review): "Your Name and Contributors" and the "your-org" Space URL are
# placeholders — fill in the real author list and Space path before release.
CITATION_TEXT = """@misc{phoneme-detection-leaderboard,


title = {Phoneme Detection Leaderboard},


author = {Your Name and Contributors},


year = 2024,


publisher = {Hugging Face},


howpublished = "\\url{https://huggingface.co/spaces/your-org/phoneme-detection-leaderboard}"


}


"""
|
|
|
|
|
# Markdown for the 📈 Metrics tab: explains PER, how to reproduce results,
# and lists the benchmark datasets.
# Fix: the dataset table rows were separated by blank lines, which breaks
# Markdown table rendering — table rows must sit on consecutive lines.
METRICS_TAB_TEXT = """

Here you will find details about the phoneme recognition metrics and datasets reported in our leaderboard.




## Metrics




Models are evaluated using the Phoneme Error Rate (PER) metric. The PER metric


is used to assess the accuracy of a phoneme recognition system. Models are ranked in the leaderboard based


on their PER, lowest to highest.




### Phoneme Error Rate (PER)




Phoneme Error Rate is used to measure the **accuracy** of automatic phoneme recognition systems. It calculates the percentage


of phonemes in the system's output that differ from the reference (correct) phoneme sequence. **A lower PER value indicates higher accuracy**.




The PER is calculated using sequence alignment between predicted and reference phoneme sequences, taking into account:


- Substitutions (S): predicted phoneme differs from reference


- Deletions (D): reference phoneme missing in prediction


- Insertions (I): predicted phoneme not in reference




```


PER = (S + D + I) / N * 100


```




Where N is the total number of reference phonemes.




## How to reproduce our results




The Phoneme Detection Leaderboard is an effort to benchmark open source phoneme recognition models.


Along with the Leaderboard we're open-sourcing the codebase used for running these evaluations.




P.S. We'd love to know which other models you'd like us to benchmark next. Contributions are more than welcome! ♥️




## Benchmark datasets




Evaluating Phoneme Recognition systems requires diverse datasets with phonetic transcriptions. We use multiple datasets to obtain robust evaluation scores for each model.




| Dataset | Description | Language | Notes |
|---------|-------------|----------|-------|
| mirfan899/phoneme_asr | General phoneme recognition | English | split: train, field: phonetic |
| mirfan899/kids_phoneme_md | Children's speech phoneme dataset | English | split: train, field: phonetic |
| kylelovesllms/timit_asr_ipa | TIMIT phoneme transcriptions (IPA) | English | split: train, field: text |
| openslr/librispeech_asr | LibriSpeech clean test subset | English | split: test.clean, field: text, streaming |
| leduckhai/MultiMed | Multi-domain medical speech (English config) | English | split: test, config: English, streaming |




For more details on the individual datasets and how models are evaluated, refer to our documentation.


"""
|
|
|
|
|
# CSS injected into the Gradio app: caps table height with vertical scroll,
# keeps header labels on one line, and reserves width for the model-name
# column in both the leaderboard and phoneme tables.
# NOTE(review): #phoneme-table has a th:hover rule but #leaderboard-table
# does not — confirm whether that asymmetry is intentional.
LEADERBOARD_CSS = """


#leaderboard-table {


max-height: 600px;


overflow-y: auto;


}




#leaderboard-table th .header-content {


white-space: nowrap;


}




#leaderboard-table td:first-child {


min-width: 300px;


}




#phoneme-table {


max-height: 600px;


overflow-y: auto;


}




#phoneme-table th .header-content {


white-space: nowrap;


}




#phoneme-table th:hover {


background-color: var(--table-row-focus);


}




#phoneme-table td:first-child {


min-width: 300px;


}


"""
|
|
|
|
|
|
|
|
def _dataset_entry(name, split, field, max_samples, use_streaming, config=None):
    """Build one evaluation-dataset spec dict.

    The optional Hugging Face dataset ``config`` key is inserted only when
    supplied, and placed before ``use_streaming`` so the key order matches
    the original literal entries.
    """
    spec = {
        "name": name,
        "split": split,
        "field": field,
        "max_samples": max_samples,
    }
    if config is not None:
        spec["config"] = config
    spec["use_streaming"] = use_streaming
    return spec


# Evaluation datasets: HF dataset id, split, transcript field, sample cap,
# streaming flag, and (only where needed) the dataset config name.
DATASETS = [
    _dataset_entry("mirfan899/phoneme_asr", "train", "phonetic", 500, False),
    _dataset_entry("mirfan899/kids_phoneme_md", "train", "phonetic", 500, False),
    _dataset_entry("kylelovesllms/timit_asr_ipa", "train", "text", 500, False),
    _dataset_entry("openslr/librispeech_asr", "test.clean", "text", 500, True),
    _dataset_entry("leduckhai/MultiMed", "test", "text", 1500, True, config="English"),
]