from pathlib import Path

# Directory where requests for models are stored
DIR_OUTPUT_REQUESTS = Path("requested_models")
EVAL_REQUESTS_PATH = Path("eval_requests")

FINAL_SIZE = 100

##########################
#    Text definitions    #
##########################

banner_url = "https://huggingface.co/datasets/reach-vb/random-images/resolve/main/phoneme_leaderboard.png"
BANNER = f'<img src="{banner_url}" alt="Banner">'

TITLE = "<h1>🤗 Phoneme Detection Leaderboard</h1>"

INTRODUCTION_TEXT = """📐 The 🤗 Phoneme Detection Leaderboard ranks and evaluates phoneme recognition models on the Hugging Face Hub. \nWe report the Average [PER](https://en.wikipedia.org/wiki/Phoneme_error_rate) (⬇️ lower the better) and Average Duration. Models are ranked based on their Average PER, from lowest to highest. Check the 📈 Metrics tab to understand how the models are evaluated. \nIf you want results for a model that is not listed here, you can submit a request for it to be included ✉️✨. \nThe leaderboard includes phoneme recognition evaluation across multiple datasets."""

CITATION_TEXT = """@misc{phoneme-detection-leaderboard,
  title = {Phoneme Detection Leaderboard},
  author = {Your Name and Contributors},
  year = 2024,
  publisher = {Hugging Face},
  howpublished = "\\url{https://huggingface.co/spaces/your-org/phoneme-detection-leaderboard}"
}
"""

METRICS_TAB_TEXT = """
Here you will find details about the phoneme recognition metrics and datasets reported in our leaderboard.

## Metrics

Models are evaluated using the Phoneme Error Rate (PER) metric. The PER metric is used to assess the accuracy
of a phoneme recognition system. Models are ranked in the leaderboard based on their PER, lowest to highest.

### Phoneme Error Rate (PER)

Phoneme Error Rate is used to measure the **accuracy** of automatic phoneme recognition systems. It calculates
the percentage of phonemes in the system's output that differ from the reference (correct) phoneme sequence.
**A lower PER value indicates higher accuracy**.

The PER is calculated using sequence alignment between predicted and reference phoneme sequences, taking into account:

- Substitutions (S): predicted phoneme differs from reference
- Deletions (D): reference phoneme missing in prediction
- Insertions (I): predicted phoneme not in reference

```
PER = (S + D + I) / N * 100
```

Where N is the total number of reference phonemes.

## How to reproduce our results

The Phoneme Detection Leaderboard is an effort to benchmark open source phoneme recognition models.
Along with the Leaderboard we're open-sourcing the codebase used for running these evaluations.

P.S. We'd love to know which other models you'd like us to benchmark next. Contributions are more than welcome! ♥️

## Benchmark datasets

Evaluating Phoneme Recognition systems requires diverse datasets with phonetic transcriptions. We use multiple
datasets to obtain robust evaluation scores for each model.

| Dataset | Description | Language | Notes |
|---------|-------------|----------|-------|
| mirfan899/phoneme_asr | General phoneme recognition | English | split: train, field: phonetic |
| mirfan899/kids_phoneme_md | Children's speech phoneme dataset | English | split: train, field: phonetic |
| kylelovesllms/timit_asr_ipa | TIMIT phoneme transcriptions (IPA) | English | split: train, field: text |
| openslr/librispeech_asr | LibriSpeech clean test subset | English | split: test.clean, field: text, streaming |
| leduckhai/MultiMed | Multi-domain medical speech (English config) | English | split: test, config: English, streaming |

For more details on the individual datasets and how models are evaluated, refer to our documentation.
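
As an illustration (this sketch is not necessarily the exact evaluation code; split, config, and field names follow the table above), the benchmark datasets can be loaded with the 🤗 `datasets` library:

```python
from datasets import load_dataset

# Non-streaming dataset: reference phonemes live in the "phonetic" field.
phoneme_asr = load_dataset("mirfan899/phoneme_asr", split="train")
print(phoneme_asr[0]["phonetic"])

# Streaming dataset with an explicit config name (English subset of MultiMed).
multimed = load_dataset("leduckhai/MultiMed", "English", split="test", streaming=True)
print(next(iter(multimed))["text"])
```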
""" LEADERBOARD_CSS = """ #leaderboard-table { max-height: 600px; overflow-y: auto; } #leaderboard-table th .header-content { white-space: nowrap; } #leaderboard-table td:first-child { min-width: 300px; } #phoneme-table { max-height: 600px; overflow-y: auto; } #phoneme-table th .header-content { white-space: nowrap; } #phoneme-table th:hover { background-color: var(--table-row-focus); } #phoneme-table td:first-child { min-width: 300px; } """ DATASETS = [ { "name": "mirfan899/phoneme_asr", "split": "train", "field": "phonetic", "max_samples": 500, # Limit to 1000 samples "use_streaming": False }, { "name": "mirfan899/kids_phoneme_md", "split": "train", "field": "phonetic", "max_samples": 500, "use_streaming": False }, { "name": "kylelovesllms/timit_asr_ipa", "split": "train", "field": "text", "max_samples": 500, # Smaller limit for this dataset "use_streaming": False }, { "name": "openslr/librispeech_asr", "split": "test.clean", # Use full split with streaming "field": "text", "max_samples": 500, # Larger dataset, allow more samples "use_streaming": True # Use streaming for better runtime }, { "name": "leduckhai/MultiMed", "split": "test", "field": "text", "max_samples": 1500, "config": "English", # Fixed: add config name for English "use_streaming": True # Use streaming for better runtime } ]