from pathlib import Path

# Directory where requests for models are stored
DIR_OUTPUT_REQUESTS = Path("requested_models")
EVAL_REQUESTS_PATH = Path("eval_requests")

FINAL_SIZE = 100

##########################
#    Text definitions    #
##########################

banner_url = "https://huggingface.co/datasets/reach-vb/random-images/resolve/main/phoneme_leaderboard.png"
BANNER = f'<img src="{banner_url}" alt="Banner">'

TITLE = "<h1>🤗 Phoneme Detection Leaderboard</h1>"

INTRODUCTION_TEXT = """📐 The 🤗 Phoneme Detection Leaderboard ranks and evaluates phoneme recognition models on the Hugging Face Hub. \nWe report the Average [PER](https://en.wikipedia.org/wiki/Phoneme_error_rate) (⬇️ lower the better) and Average Duration. Models are ranked based on their Average PER, from lowest to highest. Check the 📈 Metrics tab to understand how the models are evaluated. \nIf you want results for a model that is not listed here, you can submit a request for it to be included ✉️✨. \nThe leaderboard includes phoneme recognition evaluation across multiple datasets."""

CITATION_TEXT = """@misc{phoneme-detection-leaderboard,
  title = {Phoneme Detection Leaderboard},
  author = {Your Name and Contributors},
  year = 2024,
  publisher = {Hugging Face},
  howpublished = "\\url{https://huggingface.co/spaces/your-org/phoneme-detection-leaderboard}"
}
"""

METRICS_TAB_TEXT = """
Here you will find details about the phoneme recognition metrics and datasets reported in our leaderboard.

## Metrics

Models are evaluated using the Phoneme Error Rate (PER) metric. The PER metric is used to assess the accuracy
of a phoneme recognition system. Models are ranked in the leaderboard based on their PER, lowest to highest.

### Phoneme Error Rate (PER)

Phoneme Error Rate is used to measure the **accuracy** of automatic phoneme recognition systems. It calculates
the percentage of phonemes in the system's output that differ from the reference (correct) phoneme sequence.
**A lower PER value indicates higher accuracy**.

The PER is calculated using sequence alignment between predicted and reference phoneme sequences, taking into account:

- Substitutions (S): predicted phoneme differs from reference
- Deletions (D): reference phoneme missing in prediction
- Insertions (I): predicted phoneme not in reference

```
PER = (S + D + I) / N * 100
```

Where N is the total number of reference phonemes.

## How to reproduce our results

The Phoneme Detection Leaderboard is an effort to benchmark open source phoneme recognition models.
Along with the Leaderboard we're open-sourcing the codebase used for running these evaluations.

P.S. We'd love to know which other models you'd like us to benchmark next. Contributions are more than welcome! ♥️

## Benchmark datasets

Evaluating Phoneme Recognition systems requires diverse datasets with phonetic transcriptions. We use multiple
datasets to obtain robust evaluation scores for each model.

| Dataset | Description | Language | Notes |
|---------|-------------|----------|-------|
| mirfan899/phoneme_asr | General phoneme recognition | English | split: train, field: phonetic |
| mirfan899/kids_phoneme_md | Children's speech phoneme dataset | English | split: train, field: phonetic |
| kylelovesllms/timit_asr_ipa | TIMIT phoneme transcriptions (IPA) | English | split: train, field: text |
| openslr/librispeech_asr | LibriSpeech clean test subset | English | split: test.clean, field: text, streaming |
| leduckhai/MultiMed | Multi-domain medical speech (English config) | English | split: test, config: English, streaming |

For more details on the individual datasets and how models are evaluated, refer to our documentation.
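
As an illustration (this sketch is not necessarily the exact evaluation code; split, config, and field names follow the table above), the benchmark datasets can be loaded with the 🤗 `datasets` library:

```python
from datasets import load_dataset

# Non-streaming dataset: reference phonemes live in the "phonetic" field.
phoneme_asr = load_dataset("mirfan899/phoneme_asr", split="train")
print(phoneme_asr[0]["phonetic"])

# Streaming dataset with an explicit config name (English subset of MultiMed).
multimed = load_dataset("leduckhai/MultiMed", "English", split="test", streaming=True)
print(next(iter(multimed))["text"])
```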
""" LEADERBOARD_CSS = """ #leaderboard-table { max-height: 600px; overflow-y: auto; } #leaderboard-table th .header-content { white-space: nowrap; } #leaderboard-table td:first-child { min-width: 300px; } #phoneme-table { max-height: 600px; overflow-y: auto; } #phoneme-table th .header-content { white-space: nowrap; } #phoneme-table th:hover { background-color: var(--table-row-focus); } #phoneme-table td:first-child { min-width: 300px; } """ DATASETS = [ { "name": "mirfan899/phoneme_asr", "split": "train", "field": "phonetic", "max_samples": 500, # Limit to 1000 samples "use_streaming": False }, { "name": "mirfan899/kids_phoneme_md", "split": "train", "field": "phonetic", "max_samples": 500, "use_streaming": False }, { "name": "kylelovesllms/timit_asr_ipa", "split": "train", "field": "text", "max_samples": 500, # Smaller limit for this dataset "use_streaming": False }, { "name": "openslr/librispeech_asr", "split": "test.clean", # Use full split with streaming "field": "text", "max_samples": 500, # Larger dataset, allow more samples "use_streaming": True # Use streaming for better runtime }, { "name": "leduckhai/MultiMed", "split": "test", "field": "text", "max_samples": 1500, "config": "English", # Fixed: add config name for English "use_streaming": True # Use streaming for better runtime } ]