add: new model, ds

Files changed:

- app.py (+20 −16)
- constants.py (+27 −4)
- eval-results/{results_1759479712_HuBERT-Base.json → results_1759491458_HuBERT-Base.json} (+10 −10)
- eval-results/{results_1759479712_HuBERT-fine-tuned.json → results_1759491458_HuBERT-fine-tuned.json} (+10 −10)
- eval-results/{results_1759479712_LJSpeech-Gruut.json → results_1759491458_LJSpeech-Gruut.json} (+10 −10)
- eval-results/{results_1759479712_Timit.json → results_1759491458_Timit.json} (+10 −10)
- eval-results/{results_1759479712_WavLM.json → results_1759491458_WavLM.json} (+10 −10)
- eval-results/{results_1759479712_Whisper.json → results_1759491458_Whisper.json} (+10 −10)
- utils/load_model.py (+91 −21)
- utils_display.py (+4 −0)
app.py  CHANGED

@@ -36,6 +36,14 @@ def load_results(results_dir: str) -> pd.DataFrame:
     rows = []
     all_dataset_keys = set()
 
+    def round_two_decimals(value):
+        try:
+            if value is None:
+                return None
+            return round(float(value), 2)
+        except Exception:
+            return value
+
     if not os.path.isdir(results_dir):
         return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])
 
@@ -72,7 +80,7 @@ def load_results(results_dir: str) -> pd.DataFrame:
         dur_value = dataset_data.get("avg_duration") if dataset_data else None
 
         display_name = dataset_display_names[dataset_key]
-        per_values[f"{display_name}"] = per_value
+        per_values[f"{display_name}"] = round_two_decimals(per_value)
 
         if dur_value is not None:
             dur_values.append(dur_value)
@@ -80,9 +88,11 @@ def load_results(results_dir: str) -> pd.DataFrame:
     # Calculate average PER across all datasets
     per_vals = [v for v in per_values.values() if v is not None]
     avg_per = sum(per_vals) / len(per_vals) if per_vals else None
+    avg_per = round_two_decimals(avg_per)
 
     # Calculate average duration
     avg_dur = sum(dur_values) / len(dur_values) if dur_values else None
+    avg_dur = round_two_decimals(avg_dur)
 
     row = {
         "Model": make_clickable_model(model_name),
@@ -109,7 +119,15 @@ def load_results(results_dir: str) -> pd.DataFrame:
 
 # Load initial data
 try:
-    eval_queue_repo, requested_models, csv_results = load_all_info_from_dataset_hub()
+    # Support both legacy (3-tuple) and new (4-tuple) returns
+    hub_info = load_all_info_from_dataset_hub()
+    if isinstance(hub_info, tuple) and len(hub_info) >= 3:
+        eval_queue_repo = hub_info[0]
+        requested_models = hub_info[1]
+        csv_results = hub_info[2]
+        # Fourth value (if present) is not used in this app
+    else:
+        eval_queue_repo, requested_models, csv_results = None, None, None
     if eval_queue_repo is None or requested_models is None or csv_results is None:
         # No token provided, fallback to local results
         original_df = load_results(EVAL_RESULTS_DIR)
@@ -143,20 +161,6 @@ except Exception as e:
     # Fallback to local results
     original_df = load_results(EVAL_RESULTS_DIR)
 
-# If no data is loaded, create a sample empty dataframe with proper columns
-if original_df.empty:
-    print("No results found. Creating empty dataframe with sample data...")
-    # Create sample data to demonstrate the interface
-    sample_data = {
-        "Model": [make_clickable_model("sample/hubert-base"), make_clickable_model("sample/whisper-base")],
-        "Average PER ⬇️": [15.2, 18.5],
-        "Avg Duration (s)": [0.12, 0.15],
-        "PER phoneme_asr": [14.8, 17.2],
-        "PER kids_phoneme_md": [15.6, 19.8]
-    }
-    original_df = pd.DataFrame(sample_data)
-    print("Sample data created for demonstration.")
-
 COLS = [c.name for c in fields(PhonemeEvalColumn)]
 TYPES = [c.type for c in fields(PhonemeEvalColumn)]
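The `round_two_decimals` helper above is deliberately forgiving: it rounds anything numeric, passes `None` through (a model may simply have no score for a dataset), and returns any other value unchanged instead of raising. A quick standalone illustration of that contract (the function body is copied from the diff; the sample inputs are ours):

```python
def round_two_decimals(value):
    # Copied from the app.py diff above.
    try:
        if value is None:
            return None
        return round(float(value), 2)
    except Exception:
        return value

assert round_two_decimals(78.22004335857109) == 78.22  # raw PER from a results file
assert round_two_decimals(None) is None                # dataset missing for a model
assert round_two_decimals("n/a") == "n/a"              # non-numeric junk passes through
```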
constants.py  CHANGED

@@ -4,6 +4,8 @@ from pathlib import Path
 DIR_OUTPUT_REQUESTS = Path("requested_models")
 EVAL_REQUESTS_PATH = Path("eval_requests")
 
+FINAL_SIZE = 100
+
 ##########################
 #    Text definitions    #
 ##########################
@@ -64,19 +66,36 @@ P.S. We'd love to know which other models you'd like us to benchmark next. Contr
 
 Evaluating Phoneme Recognition systems requires diverse datasets with phonetic transcriptions. We use multiple datasets to obtain robust evaluation scores for each model.
 
-| Dataset | Description | Language |
-|---------|-------------|----------|
-| phoneme_asr | General phoneme recognition | English |
-| kids_phoneme_md | Children's speech phoneme dataset | English |
+| Dataset | Description | Language | Notes |
+|---------|-------------|----------|-------|
+| mirfan899/phoneme_asr | General phoneme recognition | English | split: train, field: phonetic |
+| mirfan899/kids_phoneme_md | Children's speech phoneme dataset | English | split: train, field: phonetic |
+| kylelovesllms/timit_asr_ipa | TIMIT phoneme transcriptions (IPA) | English | split: train, field: text |
+| openslr/librispeech_asr | LibriSpeech clean test subset | English | split: test.clean, field: text, streaming |
+| leduckhai/MultiMed | Multi-domain medical speech (English config) | English | split: test, config: English, streaming |
 
 For more details on the individual datasets and how models are evaluated, refer to our documentation.
 """
 
 LEADERBOARD_CSS = """
+#leaderboard-table {
+    max-height: 600px;
+    overflow-y: auto;
+}
+
 #leaderboard-table th .header-content {
     white-space: nowrap;
 }
 
+#leaderboard-table td:first-child {
+    min-width: 300px;
+}
+
+#phoneme-table {
+    max-height: 600px;
+    overflow-y: auto;
+}
+
 #phoneme-table th .header-content {
     white-space: nowrap;
 }
@@ -84,6 +103,10 @@ LEADERBOARD_CSS = """
 #phoneme-table th:hover {
     background-color: var(--table-row-focus);
 }
+
+#phoneme-table td:first-child {
+    min-width: 300px;
+}
 """
 
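The Notes column in the new dataset table maps directly onto `datasets.load_dataset` arguments. Below is a minimal sketch of how these corpora could be pulled in, assuming the Hub repo ids, splits, fields, and configs are exactly as listed and that the new `FINAL_SIZE = 100` caps how many examples are evaluated per dataset; the loader calls are illustrative, not code from this repo:

```python
from datasets import load_dataset

FINAL_SIZE = 100  # from constants.py above; assumed to cap examples per dataset

# Small corpora: load the named split eagerly and read the transcription field.
phoneme_asr = load_dataset("mirfan899/phoneme_asr", split="train")
reference = phoneme_asr[0]["phonetic"]  # field name taken from the Notes column

# Corpora marked "streaming" in the table: iterate lazily and stop early, so
# only FINAL_SIZE examples are ever downloaded and decoded.
multimed = load_dataset("leduckhai/MultiMed", "English", split="test", streaming=True)
for i, example in enumerate(multimed):
    if i >= FINAL_SIZE:
        break
    audio = example["audio"]  # transcription field for MultiMed is not given in the table
```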
eval-results/{results_1759479712_HuBERT-Base.json → results_1759491458_HuBERT-Base.json}  RENAMED

@@ -6,24 +6,24 @@
     },
     "results": {
         "phoneme_asr": {
-            "per":
-            "avg_duration":
+            "per": 78.22004335857109,
+            "avg_duration": 3.3285199880599974
         },
         "kids_phoneme_md": {
-            "per":
-            "avg_duration":
+            "per": 79.46124268247958,
+            "avg_duration": 7.384845638275147
         },
         "timit_asr_ipa": {
-            "per":
-            "avg_duration":
+            "per": 80.13455092277195,
+            "avg_duration": 3.2261718797683714
         },
         "librispeech_asr": {
-            "per": 81.
-            "avg_duration":
+            "per": 81.18908836624553,
+            "avg_duration": 7.476902644634247
         },
         "MultiMed": {
-            "per":
-            "avg_duration":
+            "per": 83.5727737665735,
+            "avg_duration": 10.891806457042694
         }
     }
 }
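The leaderboard's average-PER column is the plain mean of the per-dataset PERs (see the `avg_per` computation in the app.py diff above). A quick sanity check against the new HuBERT-Base numbers:

```python
import json

with open("eval-results/results_1759491458_HuBERT-Base.json") as f:
    results = json.load(f)["results"]

per_vals = [d["per"] for d in results.values() if d.get("per") is not None]
print(round(sum(per_vals) / len(per_vals), 2))  # 80.52 for the five PERs above
```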
eval-results/{results_1759479712_HuBERT-fine-tuned.json → results_1759491458_HuBERT-fine-tuned.json}  RENAMED

@@ -6,24 +6,24 @@
     },
     "results": {
         "phoneme_asr": {
-            "per":
-            "avg_duration":
+            "per": 2.0906059507271304,
+            "avg_duration": 3.4651901078224183
         },
         "kids_phoneme_md": {
-            "per":
-            "avg_duration":
+            "per": 20.20195546890277,
+            "avg_duration": 7.601937489509583
         },
         "timit_asr_ipa": {
-            "per":
-            "avg_duration":
+            "per": 2.6819661674832194,
+            "avg_duration": 3.3618062925338745
         },
         "librispeech_asr": {
-            "per":
-            "avg_duration":
+            "per": 1.6319143740707203,
+            "avg_duration": 7.760291111469269
         },
         "MultiMed": {
-            "per":
-            "avg_duration":
+            "per": 9.572457365078227,
+            "avg_duration": 11.040356299877168
         }
     }
 }
eval-results/{results_1759479712_LJSpeech-Gruut.json → results_1759491458_LJSpeech-Gruut.json}  RENAMED

@@ -6,24 +6,24 @@
     },
     "results": {
         "phoneme_asr": {
-            "per":
-            "avg_duration":
+            "per": 27.635612463370368,
+            "avg_duration": 2.216831774711609
         },
         "kids_phoneme_md": {
-            "per":
-            "avg_duration":
+            "per": 61.80856575663577,
+            "avg_duration": 4.8097358679771425
         },
         "timit_asr_ipa": {
-            "per":
-            "avg_duration":
+            "per": 28.17040265355878,
+            "avg_duration": 2.08021559715271
         },
         "librispeech_asr": {
-            "per":
-            "avg_duration":
+            "per": 20.67960537404926,
+            "avg_duration": 4.945555350780487
         },
         "MultiMed": {
-            "per":
-            "avg_duration":
+            "per": 31.53710463881287,
+            "avg_duration": 7.100828051567078
         }
     }
 }
eval-results/{results_1759479712_Timit.json → results_1759491458_Timit.json}  RENAMED

@@ -6,24 +6,24 @@
     },
     "results": {
         "phoneme_asr": {
-            "per":
-            "avg_duration":
+            "per": 31.917506576464163,
+            "avg_duration": 3.4731807804107664
         },
         "kids_phoneme_md": {
-            "per":
-            "avg_duration":
+            "per": 44.56843086404637,
+            "avg_duration": 7.674495687484741
         },
         "timit_asr_ipa": {
-            "per":
-            "avg_duration":
+            "per": 33.44181535059672,
+            "avg_duration": 3.374768352508545
         },
         "librispeech_asr": {
-            "per":
-            "avg_duration":
+            "per": 29.537610471893803,
+            "avg_duration": 7.891125264167786
         },
         "MultiMed": {
-            "per":
-            "avg_duration":
+            "per": 37.45253395374299,
+            "avg_duration": 11.265925951004029
         }
     }
 }
eval-results/{results_1759479712_WavLM.json → results_1759491458_WavLM.json}  RENAMED

@@ -6,24 +6,24 @@
     },
     "results": {
         "phoneme_asr": {
-            "per":
-            "avg_duration":
+            "per": 24.631130546986757,
+            "avg_duration": 3.4335393691062928
         },
         "kids_phoneme_md": {
-            "per": 63.
-            "avg_duration":
+            "per": 63.661901397475695,
+            "avg_duration": 7.561313712596894
         },
         "timit_asr_ipa": {
-            "per": 22.
-            "avg_duration":
+            "per": 22.054351601266735,
+            "avg_duration": 3.340735013484955
         },
         "librispeech_asr": {
-            "per":
-            "avg_duration":
+            "per": 32.58195540587739,
+            "avg_duration": 7.779554929733276
         },
         "MultiMed": {
-            "per":
-            "avg_duration":
+            "per": 45.96974612462279,
+            "avg_duration": 11.072271597385406
         }
     }
 }
eval-results/{results_1759479712_Whisper.json → results_1759491458_Whisper.json}  RENAMED

@@ -6,24 +6,24 @@
     },
     "results": {
         "phoneme_asr": {
-            "per":
-            "avg_duration":
+            "per": 78.71122630859638,
+            "avg_duration": 3.847285704612732
         },
         "kids_phoneme_md": {
-            "per":
-            "avg_duration":
+            "per": 77.85164413992199,
+            "avg_duration": 8.320557019710542
         },
         "timit_asr_ipa": {
-            "per":
-            "avg_duration":
+            "per": 80.6895957363744,
+            "avg_duration": 3.7425442838668825
         },
         "librispeech_asr": {
-            "per":
-            "avg_duration":
+            "per": 81.412840566159,
+            "avg_duration": 8.644328632354735
         },
         "MultiMed": {
-            "per":
-            "avg_duration":
+            "per": 80.89067869438723,
+            "avg_duration": 11.937099692821503
         }
     }
 }
utils/load_model.py  CHANGED

@@ -9,6 +9,7 @@ from transformers import (
 from .cmu_process import text_to_phoneme, cmu_to_ipa, clean_cmu
 
 from dotenv import load_dotenv
+import torch.backends.cudnn as cudnn
 
 # Load environment variables from .env file
 load_dotenv()
@@ -17,6 +18,10 @@ load_dotenv()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print("Using device:", device)
 
+# Enable the cudnn autotuner (pays off most once input shapes repeat)
+if device.type == "cuda":
+    cudnn.benchmark = True
+
 # === Helper: move all tensors to model device ===
 def to_device(batch, device):
     if isinstance(batch, dict):
@@ -61,9 +66,16 @@ wavlm_model = AutoModelForCTC.from_pretrained("speech31/wavlm-large-english-phoneme")
 def run_hubert_base(wav):
     start = time.time()
     inputs = base_proc(wav, sampling_rate=16000, return_tensors="pt", padding=True).input_values
-    inputs = inputs.to(device)
+    if device.type == "cuda":
+        try:
+            inputs = inputs.pin_memory()
+        except Exception:
+            pass
+        inputs = inputs.to(device, non_blocking=True)
+    else:
+        inputs = inputs.to(device)
 
-    with torch.no_grad():
+    with torch.inference_mode():
        logits = base_model(inputs).logits
     ids = torch.argmax(logits, dim=-1)
     text = base_proc.batch_decode(ids)[0]
@@ -74,20 +86,47 @@ def run_hubert_base(wav):
 def run_whisper(wav):
     start = time.time()
 
-
-
-
-
-
-
-
+    inputs = whisper_proc(wav, sampling_rate=16000, return_tensors="pt")
+    input_features = inputs.input_features
+    if device.type == "cuda":
+        try:
+            input_features = input_features.pin_memory()
+        except Exception:
+            pass
+        input_features = input_features.to(device, non_blocking=True)
+    else:
+        input_features = input_features.to(device)
+    attention_mask = inputs.get("attention_mask", None)
+    gen_kwargs = {"language": "en"}
+    if attention_mask is not None:
+        if device.type == "cuda":
+            try:
+                attention_mask = attention_mask.pin_memory()
+            except Exception:
+                pass
+            gen_kwargs["attention_mask"] = attention_mask.to(device, non_blocking=True)
+        else:
+            gen_kwargs["attention_mask"] = attention_mask.to(device)
+
+    # Force English transcription and use greedy decoding with short max tokens for speed
+    try:
+        forced_ids = whisper_proc.get_decoder_prompt_ids(language="en", task="transcribe")
+    except Exception:
+        forced_ids = None
+
+    with torch.inference_mode():
+        pred_ids = whisper_model.generate(
+            input_features,
+            forced_decoder_ids=forced_ids,
+            do_sample=False,
+            num_beams=1,
+            max_new_tokens=64,
+            use_cache=True,
+            **gen_kwargs,
+        )
 
-    # Decode
     text = whisper_proc.batch_decode(pred_ids, skip_special_tokens=True)[0]
-
-    # Convert to phonemes
     phonemes = text_to_phoneme(text)
-
     return phonemes.strip(), time.time() - start
 
 
@@ -95,10 +134,18 @@ def run_model(wav):
     start = time.time()
 
     # Prepare input (BatchEncoding supports .to(device))
-    inputs = proc(wav, sampling_rate=16000, return_tensors="pt")
+    inputs = proc(wav, sampling_rate=16000, return_tensors="pt")
+    if device.type == "cuda":
+        try:
+            inputs = inputs.pin_memory()
+        except Exception:
+            pass
+        inputs = inputs.to(device, non_blocking=True)
+    else:
+        inputs = inputs.to(device)
 
     # Forward pass
-    with torch.no_grad():
+    with torch.inference_mode():
         logits = model(**inputs).logits
 
     # Greedy decode
@@ -112,10 +159,17 @@ def run_timit(wav):
     start = time.time()
     # Read and process the input
     inputs = timit_proc(wav, sampling_rate=16_000, return_tensors="pt", padding=True)
-    inputs = inputs.to(device)
+    if device.type == "cuda":
+        try:
+            inputs = inputs.pin_memory()
+        except Exception:
+            pass
+        inputs = inputs.to(device, non_blocking=True)
+    else:
+        inputs = inputs.to(device)
 
     # Forward pass
-    with torch.no_grad():
+    with torch.inference_mode():
         logits = timit_model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 
     # Decode id into string
@@ -135,10 +189,18 @@ def run_gruut(wav):
         sampling_rate=16000,
         return_tensors="pt",
         padding=True
-    )
+    )
+    if device.type == "cuda":
+        try:
+            inputs = inputs.pin_memory()
+        except Exception:
+            pass
+        inputs = inputs.to(device, non_blocking=True)
+    else:
+        inputs = inputs.to(device)
 
     # Forward pass
-    with torch.no_grad():
+    with torch.inference_mode():
         logits = gruut_model(**inputs).logits
 
     # Greedy decode → IPA phonemes
@@ -157,13 +219,21 @@ def run_wavlm_large_phoneme(wav):
         sampling_rate=16000,
         return_tensors="pt",
         padding=True
-    )
+    )
+    if device.type == "cuda":
+        try:
+            inputs = inputs.pin_memory()
+        except Exception:
+            pass
+        inputs = inputs.to(device, non_blocking=True)
+    else:
+        inputs = inputs.to(device)
 
     input_values = inputs.input_values
     attention_mask = inputs.get("attention_mask", None)
 
     # Forward pass
-    with torch.no_grad():
+    with torch.inference_mode():
         logits = wavlm_model(input_values, attention_mask=attention_mask).logits
 
     # Greedy decode → phoneme tokens
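Every runner in this diff gains the same three-step pattern: pin host memory when CUDA is available, copy with `non_blocking=True`, and run the forward pass under `torch.inference_mode()`. A distilled sketch of that pattern, with a generic CTC model and processor standing in for the concrete ones above (illustrative, not code from this repo):

```python
import torch

def run_generic_ctc(model, processor, wav, device):
    inputs = processor(wav, sampling_rate=16000, return_tensors="pt", padding=True)
    if device.type == "cuda":
        # Pinned (page-locked) host memory lets the non_blocking copy below
        # overlap with other host work; pin_memory() is not guaranteed on every
        # object a processor returns, hence the same guard used in the diff.
        try:
            inputs = inputs.pin_memory()
        except Exception:
            pass
        inputs = inputs.to(device, non_blocking=True)
    else:
        inputs = inputs.to(device)

    # inference_mode() is a stricter no_grad(): it also skips autograd's view
    # and version-counter bookkeeping, shaving a little overhead per forward pass.
    with torch.inference_mode():
        logits = model(**inputs).logits

    ids = torch.argmax(logits, dim=-1)  # greedy decode, as in the runners above
    return processor.batch_decode(ids)[0]
```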
utils_display.py  CHANGED

@@ -34,6 +34,10 @@ def make_clickable_model(model_name):
         link = "https://huggingface.co/vitouphy/wav2vec2-xls-r-300m-timit-phoneme"
     elif model_name_list[0] == "Whisper":
         link = "https://huggingface.co/openai/whisper-base"
+    elif model_name_list[0] == "WavLM":
+        link = "https://huggingface.co/speech31/wavlm-large-english-phoneme"
+    elif model_name_list[0] == "LJSpeech Gruut":
+        link = "https://huggingface.co/bookbot/wav2vec2-ljspeech-gruut"
     else:
         link = f"https://huggingface.co/{model_name}"
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
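With the two new branches, the WavLM and LJSpeech Gruut rows now link to their model cards instead of falling through to the generic `huggingface.co/{model_name}` URL. Assuming the display name arrives exactly as shown on the leaderboard, the new WavLM branch yields:

```python
from utils_display import make_clickable_model

html = make_clickable_model("WavLM")
# -> '<a target="_blank" href="https://huggingface.co/speech31/wavlm-large-english-phoneme" ...>WavLM</a>'
```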