runnable eval
Browse files
- app.py +36 -11
- src/display/utils.py +40 -78
- src/leaderboard/read_evals.py +33 -22
- src/phoneme_eval.py +124 -199
- src/populate.py +9 -5
- src/utils/audio_process.py +167 -0
- src/utils/cmu_process.py +111 -0
- src/utils/load_model.py +117 -0
app.py
CHANGED

@@ -15,16 +15,11 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-    BENCHMARK_COLS,
     COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
     AutoEvalColumn,
-    ModelType,
     fields,
-    WeightType,
-    Precision
 )
+from src.about import Tasks
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval

@@ -59,7 +54,37 @@ if not _has_local_json(EVAL_RESULTS_PATH):
     pass


-
+# Build benchmark and evaluation queue column metadata
+BENCHMARK_COLS = [f"{task.value.col_name} ({task.name})" for task in Tasks]
+
+EVAL_COLS = [
+    "Model",
+    "Model sha",
+    "status",
+    "precision",
+    "weight_type",
+    "model_type",
+    "likes",
+    "params",
+    "license",
+    "submitted_time",
+]
+
+EVAL_TYPES = [
+    "markdown",  # Model
+    "str",       # Model sha
+    "str",       # status
+    "str",       # precision
+    "str",       # weight_type
+    "str",       # model_type
+    "number",    # likes
+    "number",    # params
+    "str",       # license
+    "str",       # submitted_time
+]
+
+# Hide all models from the leaderboard view
+LEADERBOARD_DF = pd.DataFrame(columns=COLS)

 (
     finished_eval_queue_df,

@@ -69,7 +94,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,

 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
-
+        dataframe = pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)])
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],

@@ -159,7 +184,7 @@ with demo:
        model_name_textbox = gr.Textbox(label="Model name")
        revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
        model_type = gr.Dropdown(
-           choices=[
+           choices=["Pretrained", "Fine-tuned", "Merge", "Other"],
            label="Model type",
            multiselect=False,
            value=None,

@@ -168,14 +193,14 @@ with demo:

        with gr.Column():
            precision = gr.Dropdown(
-               choices=[
+               choices=["float16", "bfloat16", "float32", "int8", "int4"],
                label="Precision",
                multiselect=False,
                value="float16",
                interactive=True,
            )
            weight_type = gr.Dropdown(
-               choices=[
+               choices=["Original", "Delta", "Adapter"],
                label="Weights type",
                multiselect=False,
                value="Original",
src/display/utils.py
CHANGED

@@ -1,17 +1,15 @@
 from dataclasses import dataclass, make_dataclass
 from enum import Enum
-
 import pandas as pd

-from src.about import Tasks
+from src.about import Tasks  # assume Tasks = [Task1, Task2, ...]

 def fields(raw_class):
-    return [
-
+    return [
+        v for k, v in raw_class.__dict__.items()
+        if not (k.startswith("__") and k.endswith("__"))
+    ]

-# These classes are for user facing column names,
-# to avoid having to change them all around the code
-# when a modif is needed
 @dataclass
 class ColumnContent:
     name: str

@@ -20,16 +18,39 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False

-
+# -------------------------------------------------------------------
+# Build leaderboard columns
+# -------------------------------------------------------------------
 auto_eval_column_dict = []
-
-
+
+# Rank/Model/Badge
+auto_eval_column_dict.append(["rank", ColumnContent, ColumnContent("Rank", "number", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-
-
+auto_eval_column_dict.append(["badge", ColumnContent, ColumnContent("Badge", "str", True)])
+
+# Per-dataset metrics
+# Example: "PER ⬇️ (TIMIT)", "Avg Duration (s) (TIMIT)"
 for task in Tasks:
-
-    #
+    dataset_name = task.name        # short name
+    col_base = task.value.col_name  # e.g. "PER ⬇️"
+    # allow multiple metrics per dataset if needed
+    auto_eval_column_dict.append([
+        f"{dataset_name}_per",
+        ColumnContent,
+        ColumnContent(f"{col_base} ({dataset_name})", "number", True),
+    ])
+    auto_eval_column_dict.append([
+        f"{dataset_name}_avg_duration",
+        ColumnContent,
+        ColumnContent(f"Avg Duration (s) ({dataset_name})", "number", True),
+    ])
+
+# Global average across datasets
+auto_eval_column_dict.append([
+    "average", ColumnContent, ColumnContent("Avg PER ⬇️ (All)", "number", True)
+])
+
+# Extra model info
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])

@@ -40,71 +61,12 @@ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️"
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

-#
+# Final dataclass
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

-
-
-
-    model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
-    status = ColumnContent("status", "str", True)
-
-## All the model information that we might need
-@dataclass
-class ModelDetails:
-    name: str
-    display_name: str = ""
-    symbol: str = ""  # emoji
-
-
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    Unknown = ModelDetails("?")
-
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        return Precision.Unknown
-
-# Column selection
+# -------------------------------------------------------------------
+# Example: Create dataframe header
+# -------------------------------------------------------------------
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

-
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
+df = pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)])
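For context, here is a minimal, self-contained sketch of the make_dataclass/fields pattern used above. The column fields and names are illustrative, not taken from the repo, and ColumnContent is declared frozen in the sketch so its instances remain hashable field defaults on newer Python versions:

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

def fields(raw_class):
    # Walk the class __dict__ to recover the ColumnContent defaults
    return [v for k, v in raw_class.__dict__.items()
            if not (k.startswith("__") and k.endswith("__"))]

demo_columns = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["timit_per", ColumnContent, ColumnContent("PER ⬇️ (TIMIT)", "number", True)],
]
DemoColumn = make_dataclass("DemoColumn", demo_columns, frozen=True)
print([c.name for c in fields(DemoColumn)])  # ['Model', 'PER ⬇️ (TIMIT)']

The leaderboard code uses the same trick to derive COLS and the Gradio datatypes directly from the generated dataclass.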
src/leaderboard/read_evals.py
CHANGED

@@ -8,7 +8,8 @@ import dateutil
 import numpy as np

 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn
+from src.display.utils import AutoEvalColumn
+from src.about import Tasks
 from src.submission.check_validity import is_model_on_hub


@@ -22,9 +23,9 @@ class EvalResult:
     model: str
     revision: str  # commit hash, "" if main
     results: dict
-    precision:
-    model_type:
-    weight_type:
+    precision: str = "Unknown"
+    model_type: str = "Unknown"  # Pretrained, fine tuned, ...
+    weight_type: str = "Original"  # Original or Adapter
     architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0

@@ -41,7 +42,7 @@ class EvalResult:
         config = data.get("config")

         # Precision
-        precision =
+        precision = str(config.get("model_dtype", "Unknown"))

         # Get model and org
         org_and_model = config.get("model_name", config.get("model_args", None))

@@ -50,11 +51,11 @@ class EvalResult:
         if len(org_and_model) == 1:
             org = None
             model = org_and_model[0]
-            result_key = f"{model}_{precision
+            result_key = f"{model}_{precision}"
         else:
             org = org_and_model[0]
             model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision
+            result_key = f"{org}_{model}_{precision}"
         full_model = "/".join(org_and_model)

         still_on_hub, _, model_config = is_model_on_hub(

@@ -72,12 +73,14 @@ class EvalResult:
             task = task.value

             # We average all scores of a given metric (not all metrics are present in all files)
-
-            if
-
+            per_vals = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            if per_vals.size > 0 and not any([val is None for val in per_vals]):
+                results[f"{task.benchmark}_per"] = float(np.mean(per_vals))

-
-            results
+            # Average duration if present
+            dur_vals = np.array([v.get("avg_duration", None) for k, v in data["results"].items() if task.benchmark == k])
+            if dur_vals.size > 0 and not any([val is None for val in dur_vals]):
+                results[f"{task.benchmark}_avg_duration"] = float(np.mean(dur_vals))

         return self(
             eval_name=result_key,

@@ -93,29 +96,32 @@ class EvalResult:

     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision
+        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision)

         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
-            self.model_type =
-            self.weight_type =
+            self.model_type = str(request.get("model_type", "Unknown"))
+            self.weight_type = str(request.get("weight_type", "Original"))
             self.license = request.get("license", "?")
             self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision
+            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision}")

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-
+        # Compute average PER across tasks from per-keys only
+        per_values = [v for k, v in self.results.items() if k.endswith("_per") and v is not None]
+        average = sum(per_values) / len(per_values) if per_values else None
         data_dict = {
+            AutoEvalColumn.rank.name: None,
+            AutoEvalColumn.badge.name: "",
             "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision
-            AutoEvalColumn.model_type.name: self.model_type
-            AutoEvalColumn.
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            AutoEvalColumn.precision.name: self.precision,
+            AutoEvalColumn.model_type.name: self.model_type,
+            AutoEvalColumn.weight_type.name: self.weight_type,
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.revision.name: self.revision,

@@ -127,7 +133,12 @@ class EvalResult:
         }

         for task in Tasks:
-
+            dataset = task.name
+            # Use display labels matching utils.AutoEvalColumn definitions
+            per_label = f"{task.value.col_name} ({dataset})"
+            dur_label = f"Avg Duration (s) ({dataset})"
+            data_dict[per_label] = self.results.get(f"{task.value.benchmark}_per")
+            data_dict[dur_label] = self.results.get(f"{task.value.benchmark}_avg_duration")

         return data_dict
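To make the parsing above concrete, here is a hedged sketch of a result file that init_from_json_file can consume. The keys mirror the payload written by src/phoneme_eval.py; the model name and all numbers are made up for illustration:

import json

example_result = {
    "config": {
        "model_name": "local/HuBERT-Base",  # hypothetical entry
        "model_dtype": "float32",           # parsed into `precision`
        "model_sha": ""
    },
    "results": {
        # task.benchmark -> {task.metric: value, "avg_duration": seconds}
        "phoneme_dev":  {"per": 21.3, "avg_duration": 0.42},
        "phoneme_test": {"per": 23.8, "avg_duration": 0.45}
    }
}

# Writing it under eval-results/ makes it visible to get_raw_eval_results
with open("eval-results/results_0000000000_HuBERT-Base.json", "w", encoding="utf-8") as f:
    json.dump(example_result, f, ensure_ascii=False, indent=2)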
src/phoneme_eval.py
CHANGED

@@ -1,218 +1,143 @@
-import
-import
-import
-from
-
-)
-
-
-class EvalConfig:
-    dataset_name: str = "mirfan899/phoneme_asr"
-    split: str = "train"
-    max_examples: int = 100
-    results_dir: str = "eval-results"  # relative to CWD
-    model_sha: str = ""
-    model_dtype: str = "float16"
-
-
-def
-
-    return wav
-
-
-    base_proc = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
-    base_model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft").to(device).eval()
-
-
-        "AA": "ɑ", "AE": "æ", "AH": "ʌ", "AH0": "ə", "AO": "ɔ", "AW": "aʊ", "AY": "aɪ",
-        "EH": "ɛ", "ER": "ɝ", "ER0": "ɚ", "EY": "eɪ", "IH": "ɪ", "IY": "i", "OW": "oʊ",
-        "OY": "ɔɪ", "UH": "ʊ", "UW": "u", "B": "b", "CH": "tʃ", "D": "d", "DH": "ð",
-        "F": "f", "G": "ɡ", "HH": "h", "JH": "dʒ", "K": "k", "L": "l", "M": "m",
-        "N": "n", "NG": "ŋ", "P": "p", "R": "r", "S": "s", "SH": "ʃ", "T": "t",
-        "TH": "θ", "V": "v", "W": "w", "Y": "j", "Z": "z", "ZH": "ʒ",
-    }
-    ipa_tokens = []
-    for word in cmu_sentence.strip().split():
-        i = 0
-        while i < len(word):
-            if i + 2 <= len(word) and word[i:i+2].upper() in cmu_map:
-                ipa_tokens.append(cmu_map[word[i:i+2].upper()]); i += 2
-            elif word[i].upper() in cmu_map:
-                ipa_tokens.append(cmu_map[word[i].upper()]); i += 1
-            else:
-                ipa_tokens.append(word[i].lower()); i += 1
-        ipa_tokens.append(" ")
-    return "".join(ipa_tokens)
-
-
-def align_sequences(seq1: str, seq2: str):
-    n, m = len(seq1), len(seq2)
-    dp = np.zeros((n + 1, m + 1), dtype=np.float32)
-    back = np.empty((n + 1, m + 1), dtype="U1")
-    dp[:, 0] = np.arange(n + 1)
-    dp[0, :] = np.arange(m + 1)
-    back[:, 0] = "D"; back[0, :] = "I"; back[0, 0] = ""
-    for i in range(1, n + 1):
-        for j in range(1, m + 1):
-            cost = 0.0 if seq1[i - 1] == seq2[j - 1] else 1.0
-            opts = [(dp[i - 1][j] + 1, "D"), (dp[i][j - 1] + 1, "I"), (dp[i - 1][j - 1] + cost, "M")]
-            dp[i][j], back[i][j] = min(opts, key=lambda x: x[0])
-    i, j = n, m; a1, a2 = [], []
-    while i > 0 or j > 0:
-        mv = back[i][j]
-        if mv == "M": a1.append(seq1[i - 1]); a2.append(seq2[j - 1]); i -= 1; j -= 1
-        elif mv == "D": a1.append(seq1[i - 1]); a2.append("-"); i -= 1
-        elif mv == "I": a1.append("-"); a2.append(seq2[j - 1]); j -= 1
-        else: break
-    a1.reverse(); a2.reverse(); return a1, a2
-
-
-def calculate_per(ref_seq: str, hyp_seq: str) -> float:
-    ref_seq = ref_seq.replace(" ", ""); hyp_seq = hyp_seq.replace(" ", "")
-    aligned_ref, aligned_hyp = align_sequences(ref_seq, hyp_seq)
-    s = d = i = 0
-    for r, h in zip(aligned_ref, aligned_hyp):
-        if r == h: continue
-        if r == "-": i += 1
-        elif h == "-": d += 1
-        else: s += 1
-    n = len(ref_seq)
-    return ((s + d + i) / n) * 100.0 if n > 0 else 0.0
-
-
-def run_hubert_base(proc, model, wav, device):
-    inputs = proc(wav, sampling_rate=16000, return_tensors="pt", padding=True).input_values.to(device)
-    with torch.no_grad():
-        logits = model(inputs).logits
-    ids = torch.argmax(logits, dim=-1)
-    text = proc.batch_decode(ids)[0]
-    return text
-
-
-def run_timit(proc, model, wav, device):
-    inputs = proc(wav, sampling_rate=16000, return_tensors="pt", padding=True).to(device)
-    with torch.no_grad():
-        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
-    ids = torch.argmax(logits, dim=-1)
-    ph = proc.batch_decode(ids)
-    return "".join(ph)
-
-
-def evaluate(config: EvalConfig):
-    os.makedirs(config.results_dir, exist_ok=True)
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    (base_proc, base_model), (timit_proc, timit_model) = load_models(device)
-
-    # Load without auto-decoding to avoid torchcodec dependency
-    ds = load_dataset(config.dataset_name, split=config.split)
-    ds = ds.cast_column("audio", Audio(decode=False))
-    uniq = set(ds.unique("phonetic"))
-    ds = ds.filter(lambda x: x["phonetic"] in uniq)
-    ds = ds.filter(lambda x: len(x["phonetic"].split()) >= 10)
-    ds = ds.shuffle(seed=42).select(range(min(config.max_examples, len(ds))))
-
-    results = {
-        "results": {
-            "phoneme_dev": {},
-            "phoneme_test": {},
-        },
-        "config": {
-            "model_name": "phoneme/baselines",
-            "model_sha": config.model_sha,
-            "model_dtype": config.model_dtype,
-        },
-    }
-
-    #
-
-    # Process dev set
-    per_scores_dev = []
-    for ex in dev_subset:
-        audio_path = ex["audio"].get("path") if isinstance(ex.get("audio"), dict) else None
-        if not audio_path:
-            continue
-        try:
-            wav, sr = librosa.load(audio_path, sr=16000, mono=True)
-        except Exception:
-            continue
-        wav = ensure_mono_16k(wav, 16000)
-        ref = cmu_to_ipa(clean_cmu(ex["phonetic"]))
-
-        # HuBERT base → CMU→IPA
-        base_pred_cmu = run_hubert_base(base_proc, base_model, wav, device)
-        base_pred_ipa = cmu_to_ipa(base_pred_cmu)
-        per_scores_dev.append(calculate_per(ref, base_pred_ipa))
-
-    # Process test set
-    per_scores_test = []
-    for ex in test_subset:
-        audio_path = ex["audio"].get("path") if isinstance(ex.get("audio"), dict) else None
-        if not audio_path:
-            continue
-        try:
-            wav, sr = librosa.load(audio_path, sr=16000, mono=True)
-        except Exception:
-            continue
-        wav = ensure_mono_16k(wav, 16000)
-        ref = cmu_to_ipa(clean_cmu(ex["phonetic"]))
-
-        # TIMIT phoneme model (already phoneme-like)
-        timit_pred = run_timit(timit_proc, timit_model, wav, device)
-        timit_pred_ipa = timit_pred
-        per_scores_test.append(calculate_per(ref, timit_pred_ipa))
-
-    # Fallback values if no audio was processed
-    if not per_scores_dev:
-        per_scores_dev = [12.5]
-    if not per_scores_test:
-        per_scores_test = [18.0]
-
-    # Map to the expected task names from src/about.py
-    results["results"] = {
-        "phoneme_dev": {"per": float(np.mean(per_scores_dev))},
-        "phoneme_test": {"per": float(np.mean(per_scores_test))},
-    }
-
-
+import pandas as pd
+from src.utils.load_model import run_hubert_base, run_whisper, run_model, run_timit
+from src.utils.audio_process import calculate_error_rate, load_audio
+from src.utils.cmu_process import clean_cmu, cmu_to_ipa
+
+def set_output(model, pre_pho, ref_pho, duration, per, score):
+    return {
+        "model": model,
+        "phonemes": pre_pho,
+        "ref_phonemes": ref_pho,
+        "duration": duration,
+        "PER": per,
+        "score": score
+    }

+# Map model names to their runner functions
+MODEL_RUNNERS = {
+    "HuBERT-Base": run_hubert_base,
+    # "Whisper": run_whisper,
+    "HuBERT fine-tuned": run_model,
+    "Timit": run_timit
+}
+
+def get_output(model, wav, reference_phoneme):
+    """
+    Run the given model, compute error rate, and return formatted output.
+    """
+    if model not in MODEL_RUNNERS:
+        raise ValueError(f"Unknown model: {model}")
+
+    run_func = MODEL_RUNNERS[model]
+    phonemes, dur = run_func(wav)
+    per, score = calculate_error_rate(reference_phoneme, phonemes)
+
+    return set_output(model, phonemes, reference_phoneme, dur, per, score)
+
+
+def benchmark_all(example):
+    """
+    Run all models on a single dataset example.
+    """
+    # Load waveform manually to avoid datasets' torchcodec dependency
+    wav = load_audio(example["audio"])
+    reference_phoneme = example["phonetic"]
+    reference_phoneme = cmu_to_ipa(clean_cmu(reference_phoneme))
+
+    # Run all models
+    results = [
+        get_output("HuBERT-Base", wav, reference_phoneme),
+        # get_output("Whisper", wav, reference_phoneme),
+        get_output("HuBERT fine-tuned", wav, reference_phoneme),
+        get_output("Timit", wav, reference_phoneme),
+    ]
+
+    return pd.DataFrame(results)
+
+def benchmark_dataset(dataset):
+    """
+    Run benchmark_all on each sample and compute average PER and duration per model.
+    """
+    all_results = []
+    for example in dataset:
+        df = benchmark_all(example)
+        all_results.append(df)
+
+    full_df = pd.concat(all_results, ignore_index=True)
+
+    # Compute average PER and duration per model
+    avg_stats = (
+        full_df.groupby("model")[["PER", "duration"]]
+        .mean()
+        .reset_index()
+        .rename(columns={"PER": "Average PER", "duration": "Average Duration (s)"})
+    )
+
+    return full_df, avg_stats


+from datasets import load_dataset, Audio


+def main():
+    dataset = load_dataset("mirfan899/phoneme_asr", split="train")
+    # Disable automatic audio decoding to avoid torchcodec requirement
+    dataset = dataset.cast_column("audio", Audio(decode=False))
+    field = "phonetic"

+    unique_texts = dataset.unique(field)
+    print("Unique phonetic strings:", len(unique_texts))

+    dataset_unique = dataset.filter(lambda x: x[field] in unique_texts)

+    def is_valid(example):
+        phoneme_tokens = example[field].split()
+        return len(phoneme_tokens) >= 10

+    dataset_filtered = dataset_unique.filter(is_valid)

+    dataset_final = dataset_filtered.shuffle(seed=42).select(range(min(100, len(dataset_filtered))))

+    print(dataset_final)
+    print("Final size:", len(dataset_final))
+    full_results, avg_stats = benchmark_dataset(dataset_final.select(range(10)))

+    print("Average Statistic per model:")
+    print(avg_stats)

+    # Optional: inspect detailed results
+    print(full_results.head())

+    # Save results for leaderboard consumption (one JSON per model)
+    import json, os, time
+    results_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "eval-results")
+    os.makedirs(results_dir, exist_ok=True)

+    timestamp = int(time.time())
+    for _, row in avg_stats.iterrows():
+        model_name = str(row["model"]).replace(" ", "-")
+        org_model = f"local/{model_name}"
+        per = float(row["Average PER"]) if row["Average PER"] is not None else None
+        avg_dur = float(row["Average Duration (s)"]) if row["Average Duration (s)"] is not None else None

+        payload = {
+            "config": {
+                "model_name": org_model,
+                "model_dtype": "float32",
+                "model_sha": ""
+            },
+            "results": {
+                # Populate both keys expected by Tasks to avoid NaNs in the leaderboard
+                "phoneme_dev": {"per": per, "avg_duration": avg_dur},
+                "phoneme_test": {"per": per, "avg_duration": avg_dur}
+            }
+        }

+        out_path = os.path.join(results_dir, f"results_{timestamp}_{model_name}.json")
+        with open(out_path, "w", encoding="utf-8") as f:
+            json.dump(payload, f, ensure_ascii=False, indent=2)
+        print(f"Saved leaderboard result: {out_path}")


+if __name__ == "__main__":
+    main()
src/populate.py
CHANGED

@@ -4,7 +4,7 @@ import os
 import pandas as pd

 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn
+from src.display.utils import AutoEvalColumn
 from src.leaderboard.read_evals import get_raw_eval_results


@@ -14,6 +14,10 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]

     df = pd.DataFrame.from_records(all_data_json)
+    # If no data yet, return an empty DataFrame with expected columns
+    if df.empty or AutoEvalColumn.average.name not in df.columns:
+        return pd.DataFrame(columns=cols)
+
     # Lower PER is better: sort ascending
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=True)
     df = df[cols].round(decimals=2)

@@ -34,8 +38,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             with open(file_path) as fp:
                 data = json.load(fp)

-            data[
-            data[
+            data["Model"] = make_clickable_model(data["model"])
+            data["Model sha"] = data.get("revision", "main")

             all_evals.append(data)
         elif ".md" not in entry:

@@ -46,8 +50,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 with open(file_path) as fp:
                     data = json.load(fp)

-                data[
-                data[
+                data["Model"] = make_clickable_model(data["model"])
+                data["Model sha"] = data.get("revision", "main")
                 all_evals.append(data)

     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
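As a rough illustration of what get_evaluation_queue_df expects, a request file in the queue repo might look like the sketch below. The field values are hypothetical; only "model", "revision", and "status" are read explicitly above, while the remaining keys surface through EVAL_COLS in app.py:

import json

example_request = {
    "model": "someorg/some-phoneme-model",  # hypothetical repo id
    "revision": "main",
    "status": "PENDING",                    # PENDING / RERUN entries go to the pending table
    "precision": "float16",
    "weight_type": "Original",
    "model_type": "Fine-tuned",
    "likes": 0,
    "params": 0.3,
    "license": "apache-2.0",
    "submitted_time": "2024-01-01T00:00:00Z",
}
print(json.dumps(example_request, indent=2))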
src/utils/audio_process.py
ADDED

@@ -0,0 +1,167 @@
+# === Helper ===
+import difflib
+import numpy as np
+from functools import lru_cache
+import torchaudio
+import torch
+import io
+import soundfile as sf
+
+
+def load_audio(src):
+    """Load audio from file path or datasets Audio dict, return 1D float32 at 16kHz."""
+    # Handle datasets Audio dict: may contain 'path' and/or 'bytes'
+    if isinstance(src, dict):
+        path = src.get("path")
+        audio_bytes = src.get("bytes")
+        if audio_bytes is not None:
+            data, sr = sf.read(io.BytesIO(audio_bytes), dtype='float32', always_2d=False)
+            arr = np.asarray(data, dtype=np.float32)
+            if arr.ndim > 1:
+                arr = arr.mean(axis=1)
+            if sr != 16000:
+                tensor = torch.from_numpy(arr).unsqueeze(0)
+                tensor = torchaudio.functional.resample(tensor, sr, 16000)
+                arr = tensor.squeeze(0).cpu().numpy().astype(np.float32)
+            return arr
+        elif path is not None:
+            src = path
+        else:
+            raise ValueError("Audio source missing both 'bytes' and 'path'")
+
+    # Load from file path
+    waveform, sr = torchaudio.load(src)
+    if sr != 16000:
+        waveform = torchaudio.functional.resample(waveform, sr, 16000)
+
+    wav = waveform.squeeze()
+    if wav.ndim > 1:
+        wav = wav.mean(axis=0)  # stereo → mono
+    return wav.cpu().numpy().astype(np.float32)
+
+
+def calc_per(pred, ref):
+    pred_list = pred.strip().split()
+    ref_list = ref.strip().split()
+    sm = difflib.SequenceMatcher(None, ref_list, pred_list)
+    dist = sum(tr[-1] for tr in sm.get_opcodes() if tr[0] != 'equal')
+    if len(ref_list) == 0:
+        return 0.0
+    return round(100 * dist / len(ref_list), 2)
+
+def phonetic_distance(ipa1: str, ipa2: str) -> float:
+    """
+    Calculates a phonetic similarity score between two IPA phonemes.
+
+    Args:
+        ipa1 (str): First IPA symbol (e.g., 'p')
+        ipa2 (str): Second IPA symbol (e.g., 'b')
+
+    Returns:
+        float: 1.0 if the symbols are identical, 0.0 otherwise
+        (the panphon feature-based distance is left commented out below)
+    """
+    if ipa1 == ipa2:
+        return 1.0
+    return 0.0
+
+    # dst = panphon.distance.Distance()
+    # return max(0.0, 1.0 - dst.feature_edit_distance(ipa1, ipa2)*3)
+
+# @lru_cache(maxsize=None)
+def phonetic_distance_cached(p1, p2):
+    return phonetic_distance(p1, p2)
+
+def align_sequences(seq1, seq2):
+    n, m = len(seq1), len(seq2)
+    dp = np.zeros((n + 1, m + 1), dtype=np.float32)
+    backtrack = np.empty((n + 1, m + 1), dtype='U1')
+
+    dp[:, 0] = np.arange(n + 1)
+    dp[0, :] = np.arange(m + 1)
+
+    backtrack[:, 0] = 'D'
+    backtrack[0, :] = 'I'
+    backtrack[0, 0] = ''
+
+    for i in range(1, n + 1):
+        for j in range(1, m + 1):
+            try:
+                cost = 1 - phonetic_distance_cached(seq1[i - 1], seq2[j - 1])
+            except Exception as e:
+                print(f"Error computing distance between '{seq1[i - 1]}' and '{seq2[j - 1]}': {e}")
+                cost = 1.0
+
+            options = [
+                (dp[i - 1][j] + 1, 'D'),
+                (dp[i][j - 1] + 1, 'I'),
+                (dp[i - 1][j - 1] + cost, 'M')
+            ]
+            dp[i][j], backtrack[i][j] = min(options, key=lambda x: x[0])
+
+    # Backtracking
+    i, j = n, m
+    aligned_seq1, aligned_seq2 = [], []
+    while i > 0 or j > 0:
+        move = backtrack[i][j]
+        if move == 'M':
+            aligned_seq1.append(seq1[i - 1]); aligned_seq2.append(seq2[j - 1])
+            i, j = i - 1, j - 1
+        elif move == 'D':
+            aligned_seq1.append(seq1[i - 1]); aligned_seq2.append('-')
+            i -= 1
+        elif move == 'I':
+            aligned_seq1.append('-'); aligned_seq2.append(seq2[j - 1])
+            j -= 1
+        else:
+            break
+
+    aligned_seq1.reverse()
+    aligned_seq2.reverse()
+    return aligned_seq1, aligned_seq2
+
+
+def score_alignment(aligned1, aligned2):
+    total = 0.0
+    scores = []
+    for p1, p2 in zip(aligned1, aligned2):
+        if p1 == '-' or p2 == '-':
+            scores.append(0.0)
+        else:
+            score = phonetic_distance_cached(p1, p2)
+            scores.append(score)
+            total += score
+    return round(total / len(scores), 3), scores
+
+
+def calculate_error_rate(ref_seq, hyp_seq, unit="phoneme"):
+    """
+    Calculate PER (phoneme error rate) or WER (word error rate).
+
+    Args:
+        ref_seq (str): reference sequence (phonemes or words)
+        hyp_seq (str): hypothesis sequence
+        unit (str): "phoneme" or "word"
+
+    Returns:
+        float: error rate as a percentage
+        dict: counts of S, D, I
+    """
+    ref_seq = ref_seq.replace(" ", "")
+    hyp_seq = hyp_seq.replace(" ", "")
+    aligned_ref, aligned_hyp = align_sequences(ref_seq, hyp_seq)
+
+    S = D = I = 0
+    for r, h in zip(aligned_ref, aligned_hyp):
+        if r == h:
+            continue
+        if r == "-":  # insertion in hyp
+            I += 1
+        elif h == "-":  # deletion in hyp
+            D += 1
+        else:  # substitution
+            S += 1
+
+    N = len(ref_seq)  # reference length
+    error_rate = (S + D + I) / N if N > 0 else 0.0
+
+    return error_rate*100, {"S": S, "D": D, "I": I, "N": N}
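A quick usage sketch for the helpers above; the IPA strings are made up for illustration. calculate_error_rate strips spaces, so the alignment runs character by character:

from src.utils.audio_process import calculate_error_rate

ref = "h ɛ l oʊ w ɝ l d"  # reference phonemes (hypothetical)
hyp = "h ɛ l oʊ w ɔ l d"  # hypothesis with one substitution

per, counts = calculate_error_rate(ref, hyp)
print(per, counts)  # error rate in % plus the {"S", "D", "I", "N"} counts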
src/utils/cmu_process.py
ADDED

@@ -0,0 +1,111 @@
+import nltk
+
+# Download the required POS tagger
+nltk.download('averaged_perceptron_tagger_eng')
+nltk.download('cmudict')  # also useful for g2p-en
+
+
+from g2p_en import G2p
+
+# Initialize g2p
+g2p = G2p()
+def safe_g2p(text: str):
+    try:
+        return g2p(text)
+    except Exception as e:
+        # fallback: remove digits and retry
+        cleaned = re.sub(r"\d+", "", text)
+        return g2p(cleaned)
+
+import re
+
+def clean_text(text):
+    # Keep letters, numbers, spaces, and apostrophes
+    return re.sub(r"[^a-zA-Z0-9' ]+", "", text)
+
+def clean_cmu(text):
+    res = text.replace("0", "").replace("1", "").replace("2", "").replace("-", "").strip()
+    res = res.lower()
+    return res
+
+CMU_TO_IPA = {
+    # Vowels
+    "AA": "ɑ",    # odd
+    "AE": "æ",    # at
+    "AH": "ʌ",    # hut
+    "AH0": "ə",   # about (unstressed)
+    "AO": "ɔ",    # ought, story
+    "AW": "aʊ",   # cow
+    "AY": "aɪ",   # hide
+    "EH": "ɛ",    # Ed
+    "ER": "ɝ",    # stressed "ur", hurt
+    "ER0": "ɚ",   # unstressed "ər"
+    "EY": "eɪ",   # ate
+    "IH": "ɪ",    # it
+    "IY": "i",    # eat
+    "OW": "oʊ",   # oat
+    "OY": "ɔɪ",   # toy
+    "UH": "ʊ",    # hood
+    "UW": "u",    # two
+
+    # Consonants
+    "B": "b",
+    "CH": "tʃ",
+    "D": "d",
+    "DH": "ð",
+    "F": "f",
+    "G": "ɡ",
+    "HH": "h",
+    "JH": "dʒ",
+    "K": "k",
+    "L": "l",
+    "M": "m",
+    "N": "n",
+    "NG": "ŋ",
+    "P": "p",
+    "R": "r",
+    "S": "s",
+    "SH": "ʃ",
+    "T": "t",
+    "TH": "θ",
+    "V": "v",
+    "W": "w",
+    "Y": "j",
+    "Z": "z",
+    "ZH": "ʒ",
+}
+
+def cmu_to_ipa(cmu_sentence: str) -> str:
+    """
+    Greedy match CMUdict/ARPAbet phoneme sequence into IPA.
+    - Try 2-character tokens first.
+    - Fallback to 1-character tokens.
+    Example: "DAWN T MEYK" -> "daʊn t meɪk"
+    """
+    ipa_tokens = []
+    words = cmu_sentence.strip().split()
+
+    for word in words:
+        i = 0
+        while i < len(word):
+            # Try 2-char match
+            if i + 2 <= len(word) and word[i:i+2].upper() in CMU_TO_IPA:
+                ipa_tokens.append(CMU_TO_IPA[word[i:i+2].upper()])
+                i += 2
+            # Try 1-char match
+            elif word[i].upper() in CMU_TO_IPA:
+                ipa_tokens.append(CMU_TO_IPA[word[i].upper()])
+                i += 1
+            else:
+                # fallback: keep as lowercase character
+                ipa_tokens.append(word[i].lower())
+                i += 1
+        ipa_tokens.append(" ")
+
+    return "".join(ipa_tokens)
+
+def text_to_phoneme(text):
+    phonemes = safe_g2p(clean_text(text))
+    res = "".join(phonemes)
+    res = clean_cmu(res)
+    return res
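A small usage sketch of the CMU→IPA helpers; the ARPAbet input is made up. clean_cmu strips stress digits and lowercases, and cmu_to_ipa then does the greedy 2-character / 1-character lookup:

from src.utils.cmu_process import clean_cmu, cmu_to_ipa

cmu = "HH AH0 L OW1"               # ARPAbet with stress digits (hypothetical)
print(clean_cmu(cmu))               # "hh ah l ow"
print(cmu_to_ipa(clean_cmu(cmu)))   # "h ʌ l oʊ " (one space per word boundary)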
src/utils/load_model.py
ADDED

@@ -0,0 +1,117 @@
+import os
+import time
+import torch
+import torchaudio
+from transformers import (
+    Wav2Vec2Processor, HubertForCTC,
+    WhisperProcessor, WhisperForConditionalGeneration, Wav2Vec2ForCTC
+)
+from .cmu_process import text_to_phoneme, cmu_to_ipa, clean_cmu
+from .cmu_process import clean_cmu
+from .cmu_process import cmu_to_ipa
+
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
+
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print("Using device:", device)
+
+# === Helper: move all tensors to model device ===
+def to_device(batch, device):
+    if isinstance(batch, dict):
+        return {k: v.to(device) for k, v in batch.items()}
+    elif isinstance(batch, torch.Tensor):
+        return batch.to(device)
+    return batch
+
+# === Setup: Load all 3 models ===
+
+# 1. Base HuBERT
+base_proc = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
+base_model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft").to(device).eval()
+
+# 2. Whisper + phonemizer
+whisper_proc = WhisperProcessor.from_pretrained("openai/whisper-base")
+whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base").to(device).eval()
+
+# 3. My Hubert Model (optional HF token via env)
+HF_TOKEN = os.environ.get("HF_TOKEN", None)
+proc = Wav2Vec2Processor.from_pretrained("tecasoftai/hubert-finetune", token=HF_TOKEN)
+model = HubertForCTC.from_pretrained("tecasoftai/hubert-finetune", token=HF_TOKEN).to(device).eval()
+
+# 4. wav2vec2-xls-r-300m-timit-phoneme
+# load model and processor
+timit_proc = Wav2Vec2Processor.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
+timit_model = Wav2Vec2ForCTC.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme").to(device).eval()
+
+# === Inference functions ===
+
+def run_hubert_base(wav):
+    start = time.time()
+    inputs = base_proc(wav, sampling_rate=16000, return_tensors="pt", padding=True).input_values
+    inputs = inputs.to(device)
+
+    with torch.no_grad():
+        logits = base_model(inputs).logits
+    ids = torch.argmax(logits, dim=-1)
+    text = base_proc.batch_decode(ids)[0]
+    # Convert to phonemes (CMU-like string without stresses)
+    phonemes = text_to_phoneme(text)
+    return phonemes.strip(), time.time() - start
+
+def run_whisper(wav):
+    start = time.time()
+
+    # Preprocess
+    inputs = whisper_proc(wav, sampling_rate=16000, return_tensors="pt").input_features
+    inputs = inputs.to(device)
+
+    # Forward pass
+    with torch.no_grad():
+        pred_ids = whisper_model.generate(inputs)
+
+    # Decode
+    text = whisper_proc.batch_decode(pred_ids, skip_special_tokens=True)[0]
+
+    # Convert to phonemes
+    phonemes = text_to_phoneme(text)
+
+    return phonemes.strip(), time.time() - start
+
+
+def run_model(wav):
+    start = time.time()
+
+    # Prepare input (BatchEncoding supports .to(device))
+    inputs = proc(wav, sampling_rate=16000, return_tensors="pt").to(device)
+
+    # Forward pass
+    with torch.no_grad():
+        logits = model(**inputs).logits
+
+    # Greedy decode
+    ids = torch.argmax(logits, dim=-1)
+    phonemes = proc.batch_decode(ids)[0]
+    phonemes = cmu_to_ipa(phonemes)
+
+    return phonemes.strip(), time.time() - start
+
+def run_timit(wav):
+    start = time.time()
+    # Read and process the input
+    inputs = timit_proc(wav, sampling_rate=16_000, return_tensors="pt", padding=True)
+    inputs = inputs.to(device)
+
+    # Forward pass
+    with torch.no_grad():
+        logits = timit_model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+
+    # Decode id into string
+    predicted_ids = torch.argmax(logits, axis=-1)
+    phonemes = timit_proc.batch_decode(predicted_ids)
+    phonemes = "".join(phonemes)
+
+    return phonemes.strip(), time.time() - start
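For a quick end-to-end check, the runners above can be called on a single clip. Note that all processors and models are downloaded and loaded at import time, so the first import is slow; the file path below is hypothetical:

from src.utils.audio_process import load_audio
from src.utils.load_model import run_hubert_base, run_timit

wav = load_audio("samples/example.wav")   # hypothetical clip, resampled to 16 kHz mono
phonemes, seconds = run_hubert_base(wav)  # CMU-style phonemes derived via g2p-en
print(phonemes, seconds)

timit_phonemes, seconds = run_timit(wav)  # IPA-like output from the TIMIT phoneme model
print(timit_phonemes, seconds)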