deepmage121 committed on
Commit
051d3f3
·
1 Parent(s): d7012da

post_merge fix with reset option

Browse files
Files changed (1) hide show
  1. post_merge.py +40 -39
post_merge.py CHANGED
@@ -3,9 +3,10 @@
3
  import io
4
  import json
5
  import logging
 
6
  from datetime import datetime, timezone
7
 
8
- from huggingface_hub import HfApi, hf_hub_download
9
  from huggingface_hub.utils import EntryNotFoundError
10
 
11
  from dedup import DATASET_REPO_ID, compute_fingerprint, compute_sha256
@@ -131,54 +132,54 @@ def update_dataset_card(api: HfApi) -> None:
131
  def full_rebuild() -> dict:
132
  """Rebuild manifest.json from scratch and regenerate the dataset card.
133
 
134
- Scans ALL data files on main, recomputes every hash/fingerprint,
135
- and writes a fresh manifest (replacing the old one entirely).
136
  """
137
  logger.info("Starting full rebuild of manifest + dataset card")
138
 
139
- all_data_files: list[str] = []
140
- for entry in api.list_repo_tree(
141
  repo_id=DATASET_REPO_ID,
142
  repo_type="dataset",
143
  revision="main",
144
- recursive=True,
145
- ):
146
- if not hasattr(entry, "rfilename"):
147
- continue
148
- path = entry.rfilename
149
- if path.startswith("data/") and (path.endswith(".json") or path.endswith(".jsonl")):
150
- all_data_files.append(path)
151
-
152
- logger.info("Found %d data files to index", len(all_data_files))
153
 
 
 
154
  now = datetime.now(timezone.utc).isoformat()
155
  manifest = {"files": {}}
156
 
157
- for file_path in all_data_files:
158
- try:
159
- local_path = hf_hub_download(
160
- repo_id=DATASET_REPO_ID,
161
- filename=file_path,
162
- repo_type="dataset",
163
- revision="main",
164
- )
165
- with open(local_path, "rb") as f:
166
- content = f.read()
167
-
168
- sha256 = compute_sha256(content)
169
- if file_path.endswith(".json"):
170
- fingerprint = compute_fingerprint(content)
171
- else:
172
- fingerprint = sha256
173
-
174
- manifest["files"][file_path] = {
175
- "sha256": sha256,
176
- "fingerprint": fingerprint,
177
- "added_at": now,
178
- }
179
- logger.info("Indexed %s", file_path)
180
- except Exception:
181
- logger.exception("Failed to index %s", file_path)
 
 
 
 
 
182
 
183
  # Upload fresh manifest
184
  manifest_bytes = json.dumps(manifest, indent=2, sort_keys=True).encode()
 
3
  import io
4
  import json
5
  import logging
6
+ import os
7
  from datetime import datetime, timezone
8
 
9
+ from huggingface_hub import HfApi, hf_hub_download, snapshot_download
10
  from huggingface_hub.utils import EntryNotFoundError
11
 
12
  from dedup import DATASET_REPO_ID, compute_fingerprint, compute_sha256
 
132
  def full_rebuild() -> dict:
133
  """Rebuild manifest.json from scratch and regenerate the dataset card.
134
 
135
+ Downloads the entire dataset at once via snapshot_download, then walks
136
+ the local directory to compute hashes/fingerprints for all data files.
137
  """
138
  logger.info("Starting full rebuild of manifest + dataset card")
139
 
140
+ # Download entire dataset in one shot
141
+ local_dir = snapshot_download(
142
  repo_id=DATASET_REPO_ID,
143
  repo_type="dataset",
144
  revision="main",
145
+ )
146
+ logger.info("Downloaded dataset snapshot to %s", local_dir)
 
 
 
 
 
 
 
147
 
148
+ # Walk local data/ directory to find all data files
149
+ data_root = os.path.join(local_dir, "data")
150
  now = datetime.now(timezone.utc).isoformat()
151
  manifest = {"files": {}}
152
 
153
+ if not os.path.isdir(data_root):
154
+ logger.warning("No data/ directory found in snapshot")
155
+ else:
156
+ for dirpath, _, filenames in os.walk(data_root):
157
+ for filename in filenames:
158
+ if not (filename.endswith(".json") or filename.endswith(".jsonl")):
159
+ continue
160
+ local_path = os.path.join(dirpath, filename)
161
+ # Convert to repo-relative path (data/...)
162
+ repo_path = os.path.relpath(local_path, local_dir)
163
+ try:
164
+ with open(local_path, "rb") as f:
165
+ content = f.read()
166
+
167
+ sha256 = compute_sha256(content)
168
+ if filename.endswith(".json"):
169
+ fingerprint = compute_fingerprint(content)
170
+ else:
171
+ fingerprint = sha256
172
+
173
+ manifest["files"][repo_path] = {
174
+ "sha256": sha256,
175
+ "fingerprint": fingerprint,
176
+ "added_at": now,
177
+ }
178
+ logger.info("Indexed %s", repo_path)
179
+ except Exception:
180
+ logger.exception("Failed to index %s", repo_path)
181
+
182
+ logger.info("Indexed %d data files total", len(manifest["files"]))
183
 
184
  # Upload fresh manifest
185
  manifest_bytes = json.dumps(manifest, indent=2, sort_keys=True).encode()