| """ | |
| Copyright 2025 Balacoon | |
| Fetches samples from `balacoon/speech_gen_baselines` and | |
| `balacoon/speech_gen_eval_testsets` datasets. | |
| """ | |
| import re | |
| import logging | |
| import requests | |
| import pandas as pd | |
| from huggingface_hub import hf_hub_url | |

def get_samples_data(system_type: str, models: list[str], dataset: str) -> tuple[pd.DataFrame, list[str]]:
    """
    Fetches `demo` and `id_mapping` from `balacoon/speech_gen_eval_testsets` for the given dataset.
    Then fetches reference files according to `id_mapping` from `balacoon/speech_gen_eval_testsets`.
    Finally fetches synthetic samples for different models from `balacoon/speech_gen_baselines`
    according to `demo`.
    """
    testsets_repo = "balacoon/speech_gen_eval_testsets"
    # 1. get demo and id_mapping
    demo_path = f"{dataset}/demo"
    id_mapping_path = f"{dataset}/id_mapping"
    try:
        # read demo ids
        url = hf_hub_url(
            repo_id=testsets_repo,
            filename=demo_path,
            repo_type="dataset",
        )
        response = requests.get(url)
        response.raise_for_status()
        demo = response.text.splitlines()
        demo = [re.split(r"\s+", x.strip(), maxsplit=1) for x in demo]
        if system_type == "vocoder":
            # no need for mapping, mapping is to itself
            mapping = {name: name for name, _ in demo}
        else:
            # read id mapping
            url = hf_hub_url(
                repo_id=testsets_repo,
                filename=id_mapping_path,
                repo_type="dataset",
            )
            response = requests.get(url)
            response.raise_for_status()
            mapping = response.text.splitlines()
            mapping = [x.split() for x in mapping]
            mapping = {k: v for k, v in mapping}
    except Exception as e:
        logging.error(f"Failed to read demo / mapping for {dataset}: {e}")
        # return an empty frame and no datatypes, matching the declared return type
        return pd.DataFrame(), []
    # 2. get reference files
    if not all(x in mapping for x, _ in demo):
        raise ValueError(f"Failed to fetch demo or mapping for {dataset}, refresh the page.")
    ref_ids = list(set([mapping[x] for x, _ in demo]))
    reference_samples = {}
    for id in ref_ids:
        try:
            url = hf_hub_url(
                repo_id=testsets_repo,
                filename=f"{dataset}/wav/{id}.wav",
                repo_type="dataset",
            )
            reference_samples[id] = f"<audio src='{url}' controls></audio>"
        except Exception as e:
            logging.error(f"Failed to read reference {id} for {dataset}: {e}")
            continue
    # 3. get synthetic samples
    systems_samples = {model: {} for model in models}
    baselines_repo = "balacoon/speech_gen_baselines"
    for model in models:
        for id, _ in demo:
            try:
                filename = f"{system_type}/{model}/{dataset}/wav/{id}.wav"
                url = hf_hub_url(
                    repo_id=baselines_repo,
                    filename=filename,
                    repo_type="dataset",
                )
                systems_samples[model][id] = f"<audio src='{url}' controls></audio>"
            except Exception as e:
                logging.error(f"Failed to read sample {id} from {filename} in {dataset}: {e}")
                continue
    # filter out demo ids, checking if all samples are present
    filtered_demo = []
    for id, txt in demo:
        if id not in mapping:
            continue
        ref_id = mapping[id]
        if ref_id not in reference_samples:
            continue
        if all(id in systems_samples[model] for model in models):
            filtered_demo.append((id, txt))
    # finally create a dataframe
    rows = []
    for id, txt in filtered_demo:
        row = {
            "id": id,
            "text": txt,
            "reference": reference_samples[mapping[id]],
        }
        for model in models:
            row[model] = systems_samples[model][id]
        rows.append(row)
    # column types: id and text are plain text, audio cells are rendered as markdown
    datatypes = ["text", "text", "markdown"] + ["markdown"] * len(models)
    return pd.DataFrame(rows), datatypes
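

# Minimal usage sketch: the system type, model names, and dataset id below are
# placeholder assumptions for illustration only; substitute entries that actually
# exist in `balacoon/speech_gen_baselines` and `balacoon/speech_gen_eval_testsets`.
if __name__ == "__main__":
    df, datatypes = get_samples_data(
        system_type="tts",              # assumed system type (anything but "vocoder" uses id_mapping)
        models=["model_a", "model_b"],  # hypothetical model names
        dataset="vctk",                 # hypothetical dataset id
    )
    # `df` has one row per demo id, with HTML <audio> tags in the reference and
    # model columns; `datatypes` describes the columns for a table component
    # that can render markdown cells (e.g. a Gradio Dataframe).
    print(df.head())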