# Constants and text definitions for the Phoneme Detection Leaderboard Space.
from pathlib import Path
# Directory where evaluation requests submitted for models are stored.
DIR_OUTPUT_REQUESTS = Path("requested_models")
# Path holding pending/processed eval requests.
EVAL_REQUESTS_PATH = Path("eval_requests")
# NOTE(review): meaning not evident from this file — presumably a cap on
# rows/samples shown or evaluated; confirm against the code that reads it.
FINAL_SIZE = 100
##########################
# Text definitions #
##########################

# Banner image displayed at the top of the leaderboard page.
banner_url = "https://huggingface.co/datasets/reach-vb/random-images/resolve/main/phoneme_leaderboard.png"
BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>'
# Page title markup. Fixed: the heading previously opened <h1> but closed
# with a stray </b>, leaving both tags unbalanced.
TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🤗 Phoneme Detection Leaderboard </h1> </body> </html>"
# Intro blurb shown above the leaderboard table: explains the ranking metric
# (average PER, ascending) and how to request evaluation of a new model.
INTRODUCTION_TEXT = """📐 The 🤗 Phoneme Detection Leaderboard ranks and evaluates phoneme recognition models
on the Hugging Face Hub.
\nWe report the Average [PER](https://en.wikipedia.org/wiki/Phoneme_error_rate) (⬇️ lower the better) and Average Duration. Models are ranked based on their Average PER, from lowest to highest. Check the 📈 Metrics tab to understand how the models are evaluated.
\nIf you want results for a model that is not listed here, you can submit a request for it to be included ✉️✨.
\nThe leaderboard includes phoneme recognition evaluation across multiple datasets."""
# BibTeX entry offered to users for citing this leaderboard.
# NOTE(review): author/URL are placeholders ("Your Name", "your-org") —
# fill in before publishing.
CITATION_TEXT = """@misc{phoneme-detection-leaderboard,
title = {Phoneme Detection Leaderboard},
author = {Your Name and Contributors},
year = 2024,
publisher = {Hugging Face},
howpublished = "\\url{https://huggingface.co/spaces/your-org/phoneme-detection-leaderboard}"
}
"""
# Markdown body of the 📈 Metrics tab: defines PER and lists the benchmark
# datasets (mirrors the entries in DATASETS below — keep the two in sync).
METRICS_TAB_TEXT = """
Here you will find details about the phoneme recognition metrics and datasets reported in our leaderboard.
## Metrics
Models are evaluated using the Phoneme Error Rate (PER) metric. The PER metric
is used to assess the accuracy of a phoneme recognition system. Models are ranked in the leaderboard based
on their PER, lowest to highest.
### Phoneme Error Rate (PER)
Phoneme Error Rate is used to measure the **accuracy** of automatic phoneme recognition systems. It calculates the percentage
of phonemes in the system's output that differ from the reference (correct) phoneme sequence. **A lower PER value indicates higher accuracy**.
The PER is calculated using sequence alignment between predicted and reference phoneme sequences, taking into account:
- Substitutions (S): predicted phoneme differs from reference
- Deletions (D): reference phoneme missing in prediction
- Insertions (I): predicted phoneme not in reference
```
PER = (S + D + I) / N * 100
```
Where N is the total number of reference phonemes.
## How to reproduce our results
The Phoneme Detection Leaderboard is an effort to benchmark open source phoneme recognition models.
Along with the Leaderboard we're open-sourcing the codebase used for running these evaluations.
P.S. We'd love to know which other models you'd like us to benchmark next. Contributions are more than welcome! ♥️
## Benchmark datasets
Evaluating Phoneme Recognition systems requires diverse datasets with phonetic transcriptions. We use multiple datasets to obtain robust evaluation scores for each model.
| Dataset | Description | Language | Notes |
|---------|-------------|----------|-------|
| mirfan899/phoneme_asr | General phoneme recognition | English | split: train, field: phonetic |
| mirfan899/kids_phoneme_md | Children's speech phoneme dataset | English | split: train, field: phonetic |
| kylelovesllms/timit_asr_ipa | TIMIT phoneme transcriptions (IPA) | English | split: train, field: text |
| openslr/librispeech_asr | LibriSpeech clean test subset | English | split: test.clean, field: text, streaming |
| leduckhai/MultiMed | Multi-domain medical speech (English config) | English | split: test, config: English, streaming |
For more details on the individual datasets and how models are evaluated, refer to our documentation.
"""
# CSS injected into the app: caps both tables at 600px with vertical scroll,
# keeps header text on one line, and reserves width for the first (model)
# column. Targets the element ids #leaderboard-table and #phoneme-table.
LEADERBOARD_CSS = """
#leaderboard-table {
max-height: 600px;
overflow-y: auto;
}
#leaderboard-table th .header-content {
white-space: nowrap;
}
#leaderboard-table td:first-child {
min-width: 300px;
}
#phoneme-table {
max-height: 600px;
overflow-y: auto;
}
#phoneme-table th .header-content {
white-space: nowrap;
}
#phoneme-table th:hover {
background-color: var(--table-row-focus);
}
#phoneme-table td:first-child {
min-width: 300px;
}
"""
# Evaluation dataset registry consumed by the eval runner. Keys per entry:
#   name          -- Hugging Face Hub dataset id
#   split         -- split to load (e.g. "train", "test.clean")
#   field         -- column containing the reference transcription
#   max_samples   -- cap on the number of examples evaluated
#   use_streaming -- stream the dataset instead of downloading it fully
#   config        -- (optional) dataset configuration name
# Fixed: removed a stray trailing "|" token that broke parsing, and corrected
# stale comments (e.g. "Limit to 1000 samples" next to a cap of 500).
DATASETS = [
    {
        "name": "mirfan899/phoneme_asr",
        "split": "train",
        "field": "phonetic",
        "max_samples": 500,
        "use_streaming": False,
    },
    {
        "name": "mirfan899/kids_phoneme_md",
        "split": "train",
        "field": "phonetic",
        "max_samples": 500,
        "use_streaming": False,
    },
    {
        "name": "kylelovesllms/timit_asr_ipa",
        "split": "train",
        "field": "text",
        "max_samples": 500,
        "use_streaming": False,
    },
    {
        "name": "openslr/librispeech_asr",
        "split": "test.clean",
        "field": "text",
        "max_samples": 500,
        "use_streaming": True,  # large dataset: stream to avoid a full download
    },
    {
        "name": "leduckhai/MultiMed",
        "split": "test",
        "field": "text",
        "max_samples": 1500,
        "config": "English",  # English configuration of the multi-config dataset
        "use_streaming": True,
    },
]