"""Download and merge all data files from a Hugging Face dataset repo. Usage: HF_TOKEN must be exported in your environment (or pass --token). HF_DATASET_ID may be exported or passed via --repo. Example: export HF_TOKEN="hf_..." python download_dataset.py --repo kathiasi/tts-rubric-responses --outdir out This script downloads any files under `data/` (parquet or arrow/ipc), reads them, concatenates into a single table, and writes `combined.parquet` and `combined.csv` in `outdir`. """ import os import argparse import json from huggingface_hub import HfApi, hf_hub_download import pyarrow.parquet as pq import pyarrow.ipc as ipc import pandas as pd def read_parquet(path): try: tbl = pq.read_table(path) return tbl.to_pandas() except Exception as e: raise RuntimeError(f"Failed to read parquet {path}: {e}") def read_arrow(path): try: with open(path, 'rb') as f: reader = ipc.open_file(f) tbl = reader.read_all() return tbl.to_pandas() except Exception as e: raise RuntimeError(f"Failed to read arrow/ipc {path}: {e}") def download_and_merge(repo_id, outdir, token=None): api = HfApi() token = token or os.environ.get('HF_TOKEN') if not token: raise RuntimeError('HF_TOKEN not provided; export HF_TOKEN or pass --token') files = api.list_repo_files(repo_id=repo_id, repo_type='dataset', token=token) data_files = [f for f in files if f.startswith('data/')] if not data_files: print('No data/ files found in dataset repo. Files found:') print(json.dumps(files, indent=2)) return os.makedirs(outdir, exist_ok=True) dfs = [] for fname in sorted(data_files): print('Processing', fname) local_path = hf_hub_download(repo_id=repo_id, repo_type='dataset', filename=fname, token=token) if fname.endswith('.parquet'): df = read_parquet(local_path) elif fname.endswith('.arrow') or fname.endswith('.ipc'): df = read_arrow(local_path) else: print('Skipping unsupported data file:', fname) continue dfs.append(df) if not dfs: print('No supported data files were read.') return combined = pd.concat(dfs, ignore_index=True) out_parquet = os.path.join(outdir, 'combined.parquet') out_csv = os.path.join(outdir, 'combined.csv') print(f'Writing {len(combined)} rows to', out_parquet) combined.to_parquet(out_parquet, index=False) print('Also writing CSV to', out_csv) combined.to_csv(out_csv, index=False) print('Done.') if __name__ == '__main__': p = argparse.ArgumentParser() p.add_argument('--repo', help='Dataset repo id (user/name)', default=os.environ.get('HF_DATASET_ID')) p.add_argument('--outdir', help='Output directory', default='hf_dataset') p.add_argument('--token', help='Hugging Face token (optional)', default=None) args = p.parse_args() if not args.repo: print('Dataset repo id is required via --repo or HF_DATASET_ID env var') raise SystemExit(1) download_and_merge(args.repo, args.outdir, token=args.token)