broadfield-dev committed
Commit eb771ed · verified · 1 Parent(s): 70aa1ed

Update dataset_gen.py

Files changed (1)
  1. dataset_gen.py +14 -11
dataset_gen.py CHANGED
@@ -6,14 +6,15 @@ from huggingface_hub import HfApi
 
 OUTPUT_FILE = "pystructure_dataset.jsonl"
 
-def create_dataset_entry(code):
+def build_dataset_entry(code):
     """
-    Parses code and appends a training example to the local JSONL file.
+    Pure function: Takes code, returns the dataset entry dictionary.
+    Does NOT save to disk.
     """
     graph_data = parse_source_to_graph(code)
 
     if "error" in graph_data:
-        return {"status": "error", "message": graph_data["error"]}
+        return {"error": graph_data["error"]}
 
     # Flatten vectors for ML input
     vectors = [n['vec'] for n in graph_data['nodes']]
@@ -32,6 +33,16 @@ def create_dataset_entry(code):
             "edges": graph_data['connections']
         }
     }
+    return entry
+
+def create_dataset_entry(code):
+    """
+    Uses build_dataset_entry and appends it to the local JSONL file.
+    """
+    entry = build_dataset_entry(code)
+
+    if "error" in entry:
+        return {"status": "error", "message": entry["error"]}
 
     with open(OUTPUT_FILE, 'a') as f:
         f.write(json.dumps(entry) + '\n')
@@ -80,8 +91,6 @@ def upload_to_hub(token, repo_name_input):
         return {"status": "error", "message": "Invalid HF Token. Please check your write token."}
 
     # 2. Resolve Repo ID
-    # If user typed "my-dataset", convert to "username/my-dataset"
-    # If user typed "username/my-dataset", keep it as is
     if "/" in repo_name_input:
         full_repo_id = repo_name_input
     else:
@@ -96,9 +105,6 @@ def upload_to_hub(token, repo_name_input):
         )
 
     # 4. Upload with Sharding (Simulated Append)
-    # We assume the local file contains *new* data.
-    # We upload it with a unique timestamp filename.
-    # HF 'datasets' library will automatically load ALL jsonl files in the directory.
     timestamp = int(datetime.now().timestamp())
     remote_filename = f"data_shard_{timestamp}.jsonl"
 
@@ -109,9 +115,6 @@ def upload_to_hub(token, repo_name_input):
             repo_type="dataset"
         )
 
-    # Optional: Rename local file to avoid re-uploading duplicate data next time?
-    # For now, we leave it as is, but this logic implies the user manages their local file.
-
     return {
         "status": "success",
         "message": f"Successfully appended data to https://huggingface.co/datasets/{full_repo_id} (File: {remote_filename})"
 
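The two deleted comments carried the repo-ID convention: a bare name like "my-dataset" becomes "username/my-dataset", while an already-qualified ID passes through unchanged. The else branch presumably derives the username from the token; a hedged sketch using HfApi.whoami (the actual lookup in dataset_gen.py is not visible in this diff):

from huggingface_hub import HfApi

def resolve_repo_id(token, repo_name_input):
    # Already qualified ("username/my-dataset"): keep as is.
    if "/" in repo_name_input:
        return repo_name_input
    # Bare name ("my-dataset"): prefix with the token owner's namespace.
    # Assumption: the username comes from whoami; the diff does not show this step.
    username = HfApi().whoami(token=token)["name"]
    return f"{username}/{repo_name_input}"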
 