import os
import io
import json

import pandas as pd
from huggingface_hub import HfApi


def _as_parquet_bytes(record: dict) -> bytes:
    """Return *record* serialized as a one-row Parquet file.

    The record is wrapped in a single-row ``DataFrame`` and written to an
    in-memory buffer without the index column.
    """
    df = pd.DataFrame([record])
    buf = io.BytesIO()
    df.to_parquet(buf, index=False)
    return buf.getvalue()


def persist_to_hf(
    dataset_repo: str,
    record: dict,
    anon_pdf_bytes: bytes,
    parquet_path: str,
    json_path: str,
    pdf_path: str,
) -> dict:
    """Upload a record (as Parquet and JSON) and a PDF to a Hub dataset repo.

    Three files are uploaded to ``dataset_repo`` (``repo_type="dataset"``),
    each with its own ``upload_file`` call and commit message, in this order:
    Parquet, JSON, PDF.

    Args:
        dataset_repo: Target repository id passed as ``repo_id``.
        record: Data written both as a one-row Parquet file and as
            pretty-printed UTF-8 JSON.
        anon_pdf_bytes: PDF content uploaded as-is.
        parquet_path: Destination path in the repo for the Parquet file.
        json_path: Destination path in the repo for the JSON file.
        pdf_path: Destination path in the repo for the PDF file.

    Returns:
        ``{"error": "HF_TOKEN not set"}`` when the ``HF_TOKEN`` environment
        variable is missing or empty (nothing is uploaded); otherwise
        ``{"status": "ok", "dataset_repo": ..., "files": [...]}`` listing the
        three destination paths.

    Note:
        Exceptions from serialization or from ``upload_file`` are not caught
        here and propagate to the caller. The uploads are separate calls, so
        a failure part-way through can leave earlier files already uploaded.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        return {"error": "HF_TOKEN not set"}

    api = HfApi(token=token)

    # Build every payload before the first upload so that a record that cannot
    # be serialized fails early instead of leaving a partial upload behind.
    pq_bytes = _as_parquet_bytes(record)
    js_bytes = json.dumps(record, ensure_ascii=False, indent=2).encode("utf-8")

    uploads = (
        (pq_bytes, parquet_path, "Add candidate parquet record"),
        (js_bytes, json_path, "Add candidate JSON record"),
        (anon_pdf_bytes, pdf_path, "Add anonymized PDF"),
    )
    for payload, path_in_repo, commit_message in uploads:
        api.upload_file(
            path_or_fileobj=payload,
            path_in_repo=path_in_repo,
            repo_id=dataset_repo,
            repo_type="dataset",
            commit_message=commit_message,
        )

    return {
        "status": "ok",
        "dataset_repo": dataset_repo,
        "files": [parquet_path, json_path, pdf_path],
    }