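"""Check the WhisperKit evals dataset on the Hugging Face Hub for new benchmark
commits, map each one to the WhisperKit release on GitHub it belongs to, and
persist the result in dashboard_data/version.json. Writes has_updates=true/false
to GITHUB_OUTPUT so the calling GitHub Actions workflow can decide whether
follow-up steps need to run.

Required environment variables: GH_TOKEN, HF_TOKEN, GITHUB_OUTPUT.
"""
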
import json
import os
from datetime import datetime, timedelta

from github import Github
from huggingface_hub import HfApi, login


def check_dataset_updates(dataset_id):
    api = HfApi()
    github = Github(os.environ["GH_TOKEN"])
    repo = github.get_repo("argmaxinc/whisperkit")

    dataset_info = api.dataset_info(dataset_id)
    last_modified = dataset_info.lastModified.isoformat()
    current_sha = dataset_info.sha
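
    # Entries under benchmark_data/ are named "<date>_<commit_hash>" (for
    # example, a hypothetical "2024-01-15T093000_abc1234"); list them so the
    # date and hash can be parsed out below.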
    repo_tree = api.list_repo_tree(
        repo_id=dataset_id,
        repo_type="dataset",
        path_in_repo="benchmark_data",
        recursive=False,
    )
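
    # Keep only commits from the last six weeks.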
    cutoff_date = datetime.now(dataset_info.lastModified.tzinfo) - timedelta(weeks=6)
    commit_dates_hashes = [item.path.split("/")[-1] for item in repo_tree]

    new_commit_hashes = []
    for commit_date_hash in commit_dates_hashes:
        commit_date, commit_hash = commit_date_hash.split("_")
        commit_date = datetime.strptime(commit_date, "%Y-%m-%dT%H%M%S").replace(
            tzinfo=dataset_info.lastModified.tzinfo
        )
        if commit_date < cutoff_date:
            continue
        new_commit_hashes.append(commit_hash)
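
    # Resolve each recent commit on GitHub and pair it with the WhisperKit
    # release it belongs to; commits that fail to resolve are skipped.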
    commit_info = []
    for commit_hash in new_commit_hashes:
        try:
            commit = repo.get_commit(commit_hash)
            commit_date = commit.commit.author.date
            version = get_commit_version(repo, commit_hash)
            if version:
                commit_info.append((commit_hash, commit_date, version))
        except Exception as e:
            print(f"Error processing commit {commit_hash}: {e}")
            continue

    # Sort by commit date
    commit_info.sort(key=lambda x: x[1])

    # Extract sorted commit hashes and their matching versions
    new_releases = [info[0] for info in commit_info]
    new_versions = [info[2] for info in commit_info]
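
    # The version cache persists the dataset revision (sha) from the previous
    # run; if it is unchanged, there is nothing new to report.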
    cache_dir = "dashboard_data"
    cache_file = os.path.join(cache_dir, "version.json")

    releases, versions = [], []
    if os.path.exists(cache_file):
        with open(cache_file, "r") as f:
            cached_data = json.load(f)
        if cached_data.get("sha") == current_sha:
            with open(os.environ["GITHUB_OUTPUT"], "a") as fh:
                print("has_updates=false", file=fh)
            return
        releases = cached_data.get("releases", [])
        versions = cached_data.get("versions", [])

    # Append only the releases that are not already cached.
    updated_releases = []
    updated_versions = []
    for release, version in zip(new_releases, new_versions):
        if release not in releases:
            updated_releases.append(release)
            updated_versions.append(version)
    os.makedirs(cache_dir, exist_ok=True)
    with open(cache_file, "w") as f:
        json.dump(
            {
                "last_modified": last_modified,
                "sha": current_sha,
                "releases": releases + updated_releases,
                "versions": versions + updated_versions,
            },
            f,
        )

    with open(os.environ["GITHUB_OUTPUT"], "a") as fh:
        print("has_updates=true", file=fh)
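

# Return the tag of the first release created at or after the commit's author
# date; commits newer than every release fall back to the latest release tag.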
def get_commit_version(repo, commit_hash):
    try:
        releases = list(repo.get_releases())
        if not releases:
            return None
        releases.sort(key=lambda x: x.created_at)
        commit = repo.get_commit(commit_hash)
        commit_date = commit.commit.author.date
        for release in releases:
            if commit_date <= release.created_at:
                return release.tag_name.lstrip("v")
        return releases[-1].tag_name.lstrip("v")
    except Exception as e:
        print(f"Error processing commit {commit_hash}: {e}")
        return None
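

# Entry point: authenticate to the Hugging Face Hub, then check the evals
# dataset (the GITHUB_OUTPUT writes above assume a GitHub Actions run).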
if __name__ == "__main__":
    login(token=os.environ["HF_TOKEN"])
    check_dataset_updates("argmaxinc/whisperkit-evals-dataset")