deepmage121 committed on
Commit
d7012da
·
1 Parent(s): 0591d06

post_merge fix with reset option

Files changed (1)
post_merge.py +82 -0
post_merge.py CHANGED
@@ -128,6 +128,79 @@ def update_dataset_card(api: HfApi) -> None:
     logger.info("Updated dataset card with %d configs: %s", len(configs), configs)
 
 
+def full_rebuild() -> dict:
+    """Rebuild manifest.json from scratch and regenerate the dataset card.
+
+    Scans ALL data files on main, recomputes every hash/fingerprint,
+    and writes a fresh manifest (replacing the old one entirely).
+    """
+    logger.info("Starting full rebuild of manifest + dataset card")
+
+    all_data_files: list[str] = []
+    for entry in api.list_repo_tree(
+        repo_id=DATASET_REPO_ID,
+        repo_type="dataset",
+        revision="main",
+        recursive=True,
+    ):
+        if not hasattr(entry, "rfilename"):
+            continue
+        path = entry.rfilename
+        if path.startswith("data/") and (path.endswith(".json") or path.endswith(".jsonl")):
+            all_data_files.append(path)
+
+    logger.info("Found %d data files to index", len(all_data_files))
+
+    now = datetime.now(timezone.utc).isoformat()
+    manifest = {"files": {}}
+
+    for file_path in all_data_files:
+        try:
+            local_path = hf_hub_download(
+                repo_id=DATASET_REPO_ID,
+                filename=file_path,
+                repo_type="dataset",
+                revision="main",
+            )
+            with open(local_path, "rb") as f:
+                content = f.read()
+
+            sha256 = compute_sha256(content)
+            if file_path.endswith(".json"):
+                fingerprint = compute_fingerprint(content)
+            else:
+                fingerprint = sha256
+
+            manifest["files"][file_path] = {
+                "sha256": sha256,
+                "fingerprint": fingerprint,
+                "added_at": now,
+            }
+            logger.info("Indexed %s", file_path)
+        except Exception:
+            logger.exception("Failed to index %s", file_path)
+
+    # Upload fresh manifest
+    manifest_bytes = json.dumps(manifest, indent=2, sort_keys=True).encode()
+    api.upload_file(
+        path_or_fileobj=io.BytesIO(manifest_bytes),
+        path_in_repo="manifest.json",
+        repo_id=DATASET_REPO_ID,
+        repo_type="dataset",
+        commit_message="Full rebuild of manifest.json",
+    )
+    logger.info("Uploaded rebuilt manifest.json (%d files)", len(manifest["files"]))
+
+    # Regenerate dataset card
+    update_dataset_card(api)
+
+    return {
+        "status": "ok",
+        "action": "full_rebuild",
+        "files_indexed": len(manifest["files"]),
+    }
+
+
 def handle_merge(pr_num: int) -> dict:
     """Run all post-merge actions for a PR."""
     logger.info("Handling merge for PR #%d", pr_num)
@@ -175,3 +248,12 @@ def handle_merge(pr_num: int) -> dict:
         "pr": pr_num,
         "files_added_to_manifest": len(untracked),
     }
+
+
+if __name__ == "__main__":
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+    )
+    result = full_rebuild()
+    print(json.dumps(result, indent=2))
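The new hunk calls two helpers, compute_sha256 and compute_fingerprint, that are defined earlier in post_merge.py and don't appear in this diff. A minimal sketch of what they plausibly look like, assuming the fingerprint is a hash over a canonical re-serialization of the JSON (so key order and whitespace don't affect it); this is an illustration, not the file's actual code:

# Hypothetical sketch -- the real helpers live earlier in post_merge.py.
import hashlib
import json


def compute_sha256(content: bytes) -> str:
    # Hash the raw bytes as-is.
    return hashlib.sha256(content).hexdigest()


def compute_fingerprint(content: bytes) -> str:
    # Assumed behavior: parse the JSON and hash a canonical re-serialization,
    # so two files with the same data but different formatting match.
    obj = json.loads(content)
    canonical = json.dumps(obj, sort_keys=True, separators=(",", ":")).encode()
    return hashlib.sha256(canonical).hexdigest()

Under that assumption, running python post_merge.py directly hits the new __main__ guard and performs the full reset: every data/*.json and data/*.jsonl file on main is re-downloaded, re-hashed, and written into a fresh manifest.json.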