tcid / data.py
Ákos Hadnagy
Hook it up to the data-source
2018d03
raw
history blame
3 kB
from huggingface_hub import HfApi, HfFileSystem, login
import pandas as pd
# Module-level Hugging Face Hub filesystem client; get_data() uses it to glob
# for the CI result files.
fs = HfFileSystem()
# Model names that get_data() keeps in its result. The comparison there is
# case-insensitive and happens after a leading "models_" is stripped from the
# result index; rows for any other model are dropped.
IMPORTANT_MODELS = [
    "auto",
    "bert",  # old but dominant (encoder only)
    "gpt2",  # old (decoder)
    "t5",  # old (encoder-decoder)
    "modernbert",  # (encoder only)
    "vit",  # old (vision)
    "clip",  # old but dominant (vision)
    "detr",  # object detection, segmentation (vision)
    "table-transformer",  # object detection (vision) - maybe just detr?
    "got_ocr2",  # ocr (vision)
    "whisper",  # old but dominant (audio)
    "wav2vec2",  # old (audio)
    "llama",  # new and dominant (meta)
    "gemma3",  # new (google)
    "qwen2",  # new (Alibaba)
    "mistral3",  # new (Mistral)
    "qwen2_5_vl",  # new (vision)
    "llava",  # many models from it (vision)
    "smolvlm",  # new (video)
    "internvl",  # new (video)
    "gemma3n",  # new (omnimodal models)
    "qwen2_5_omni",  # new (omnimodal models)
]
def _latest_path(paths, source):
    """Return the lexicographically greatest entry of *paths*.

    Raises IndexError (the same exception type indexing an empty list would
    raise) with a message naming *source* when *paths* is empty.
    """
    if not paths:
        raise IndexError(f"no model_results.json found for {source}")
    return max(paths)


def _count_failures(failures, device):
    """Return the number of entries *failures* holds under *device*, or 0 if absent."""
    return len(failures[device]) if device in failures else 0


def _load_results(location, vendor):
    """Read one model_results.json into a DataFrame and add failure-count columns.

    The frame is indexed by the JSON's top-level keys (index name
    "model_name") and gains "failed_multi_no_<vendor>" and
    "failed_single_no_<vendor>", computed from its "failures" column.
    """
    frame = pd.read_json(location, orient="index")
    frame.index.name = "model_name"
    for device in ("multi", "single"):
        frame[f"failed_{device}_no_{vendor}"] = frame["failures"].apply(
            _count_failures, args=(device,)
        )
    return frame


def get_data() -> pd.DataFrame:
    """Load the latest AMD and NVIDIA CI results and join them per model.

    For each vendor, the dataset is globbed for ``model_results.json`` files
    and the path that sorts last is read. The two frames are outer-joined on
    the model name, reduced to the success / failure-count / failures /
    job-link columns, stripped of a leading ``models_`` in the index, and
    finally filtered down to the models listed in ``IMPORTANT_MODELS``
    (compared case-insensitively).

    Returns:
        The filtered, joined DataFrame indexed by model name.

    Raises:
        IndexError: if either glob returns no result file.
    """
    files_amd = fs.glob(
        "hf://datasets/optimum-amd/transformers_daily_ci/**/runs/**/ci_results_run_models_gpu/model_results.json"
    )
    df_amd = _load_results(f"hf://{_latest_path(files_amd, 'AMD')}", "amd")

    files_nvidia = fs.glob(
        "hf://datasets/hf-internal-testing/transformers_daily_ci/**/ci_results_run_models_gpu/model_results.json"
    )
    nvidia_repo_prefix = "datasets/hf-internal-testing/transformers_daily_ci/"
    nvidia_relative = _latest_path(files_nvidia, "NVIDIA")
    # Drop the repo prefix explicitly. str.lstrip() would treat the prefix as
    # a *set of characters* and could also eat the start of the run directory.
    if nvidia_relative.startswith(nvidia_repo_prefix):
        nvidia_relative = nvidia_relative[len(nvidia_repo_prefix):]
    df_nvidia = _load_results(
        f"https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/raw/main/{nvidia_relative}",
        "nvidia",
    )

    # Columns present in both frames get the _amd / _nvidia suffix; the
    # failed_*_no_* columns are already vendor-specific and keep their names.
    joined = df_amd.join(df_nvidia, rsuffix="_nvidia", lsuffix="_amd", how="outer")
    joined = joined[
        [
            "success_amd",
            "success_nvidia",
            "failed_multi_no_amd",
            "failed_multi_no_nvidia",
            "failed_single_no_amd",
            "failed_single_no_nvidia",
            "failures_amd",
            "failures_nvidia",
            "job_link_amd",
            "job_link_nvidia",
        ]
    ]
    joined.index = joined.index.str.replace("^models_", "", regex=True)

    important_models_lower = [model.lower() for model in IMPORTANT_MODELS]
    filtered_joined = joined[joined.index.str.lower().isin(important_models_lower)]
    return filtered_joined