Corin1998 committed on
Commit
17716c9
·
verified ·
1 Parent(s): 333180d

Create storage.py

Browse files
Files changed (1) hide show
  1. pipelines/storage.py +53 -0
pipelines/storage.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import json
4
+ import pandas as pd
5
+ from huggingface_hub import HfApi
6
+
7
+ def _as_parquet_bytes(record: dict) -> bytes:
8
+ df = pd.DataFrame([record])
9
+ buf = io.BytesIO()
10
+ df.to_parquet(buf, index=False)
11
+ return buf.getvalue()
12
+
13
def persist_to_hf(
    dataset_repo: str,
    record: dict,
    anon_pdf_bytes: bytes,
    parquet_path: str,
    json_path: str,
    pdf_path: str,
) -> dict:
    """Store a record (as Parquet and JSON) plus a PDF in a Hub dataset repo.

    The access token is read from the ``HF_TOKEN`` environment variable.
    Both serialized forms of ``record`` are built before any network call,
    and all three files are pushed in a single commit, so a serialization
    or upload failure does not leave a partially written record behind.

    Args:
        dataset_repo: Repo id of the target dataset repository.
        record: Data to store; written once as a one-row Parquet file and
            once as pretty-printed UTF-8 JSON.
        anon_pdf_bytes: Raw PDF content uploaded as-is to ``pdf_path``.
        parquet_path: Destination path of the Parquet file inside the repo.
        json_path: Destination path of the JSON file inside the repo.
        pdf_path: Destination path of the PDF file inside the repo.

    Returns:
        ``{"error": "HF_TOKEN not set"}`` when no token is configured;
        otherwise ``{"status": "ok", "dataset_repo": ..., "files": [...]}``
        listing the three repo paths in Parquet, JSON, PDF order.

    Raises:
        Serialization errors from pandas/``json`` and any exception raised by
        the Hub client are propagated to the caller unchanged.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        return {"error": "HF_TOKEN not set"}

    # Imported locally so this block does not depend on the module-level
    # import list being extended.
    from huggingface_hub import CommitOperationAdd

    # Serialize everything up front: a record that cannot be encoded must
    # fail here, before anything has been written to the repository.
    pq_bytes = _as_parquet_bytes(record)
    js_bytes = json.dumps(record, ensure_ascii=False, indent=2).encode("utf-8")

    operations = [
        CommitOperationAdd(path_in_repo=parquet_path, path_or_fileobj=pq_bytes),
        CommitOperationAdd(path_in_repo=json_path, path_or_fileobj=js_bytes),
        CommitOperationAdd(path_in_repo=pdf_path, path_or_fileobj=anon_pdf_bytes),
    ]

    # One commit for all three files: either the whole record lands or none
    # of it does.
    api = HfApi(token=token)
    api.create_commit(
        repo_id=dataset_repo,
        repo_type="dataset",
        operations=operations,
        commit_message="Add candidate record (parquet, JSON) and anonymized PDF",
    )

    return {
        "status": "ok",
        "dataset_repo": dataset_repo,
        "files": [parquet_path, json_path, pdf_path],
    }