broadfield-dev committed
Commit eb771ed · verified · 1 Parent(s): 70aa1ed

Update dataset_gen.py

Files changed (1)
  1. dataset_gen.py +14 -11
dataset_gen.py CHANGED
@@ -6,14 +6,15 @@ from huggingface_hub import HfApi
 
 OUTPUT_FILE = "pystructure_dataset.jsonl"
 
-def create_dataset_entry(code):
+def build_dataset_entry(code):
     """
-    Parses code and appends a training example to the local JSONL file.
+    Pure function: Takes code, returns the dataset entry dictionary.
+    Does NOT save to disk.
     """
     graph_data = parse_source_to_graph(code)
 
     if "error" in graph_data:
-        return {"status": "error", "message": graph_data["error"]}
+        return {"error": graph_data["error"]}
 
     # Flatten vectors for ML input
     vectors = [n['vec'] for n in graph_data['nodes']]
@@ -32,6 +33,16 @@ def create_dataset_entry(code):
             "edges": graph_data['connections']
         }
     }
+    return entry
+
+def create_dataset_entry(code):
+    """
+    Uses build_dataset_entry and appends it to the local JSONL file.
+    """
+    entry = build_dataset_entry(code)
+
+    if "error" in entry:
+        return {"status": "error", "message": entry["error"]}
 
     with open(OUTPUT_FILE, 'a') as f:
         f.write(json.dumps(entry) + '\n')
@@ -80,8 +91,6 @@ def upload_to_hub(token, repo_name_input):
         return {"status": "error", "message": "Invalid HF Token. Please check your write token."}
 
     # 2. Resolve Repo ID
-    # If user typed "my-dataset", convert to "username/my-dataset"
-    # If user typed "username/my-dataset", keep it as is
     if "/" in repo_name_input:
         full_repo_id = repo_name_input
     else:
@@ -96,9 +105,6 @@ def upload_to_hub(token, repo_name_input):
         )
 
     # 4. Upload with Sharding (Simulated Append)
-    # We assume the local file contains *new* data.
-    # We upload it with a unique timestamp filename.
-    # HF 'datasets' library will automatically load ALL jsonl files in the directory.
     timestamp = int(datetime.now().timestamp())
     remote_filename = f"data_shard_{timestamp}.jsonl"
 
@@ -109,9 +115,6 @@ def upload_to_hub(token, repo_name_input):
             repo_type="dataset"
         )
 
-    # Optional: Rename local file to avoid re-uploading duplicate data next time?
-    # For now, we leave it as is, but this logic implies the user manages their local file.
-
     return {
         "status": "success",
         "message": f"Successfully appended data to https://huggingface.co/datasets/{full_repo_id} (File: {remote_filename})"
 
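The two deleted comments carried the repo-ID convention: a bare name like "my-dataset" becomes "username/my-dataset", while an already-qualified ID passes through unchanged. The else branch presumably derives the username from the token; a hedged sketch using HfApi.whoami (the actual lookup in dataset_gen.py is not visible in this diff):

from huggingface_hub import HfApi

def resolve_repo_id(token, repo_name_input):
    # Already qualified ("username/my-dataset"): keep as is.
    if "/" in repo_name_input:
        return repo_name_input
    # Bare name ("my-dataset"): prefix with the token owner's namespace.
    # Assumption: the username comes from whoami; the diff does not show this step.
    username = HfApi().whoami(token=token)["name"]
    return f"{username}/{repo_name_input}"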
 