broadfield-dev committed on
Commit
a2c589c
·
verified ·
1 Parent(s): 6bc59c8

Update dataset_gen.py

Browse files
Files changed (1) hide show
  1. dataset_gen.py +26 -25
dataset_gen.py CHANGED
@@ -3,35 +3,36 @@ import os
3
  from parser import parse_source_to_graph
4
  from datetime import datetime
5
 
6
- def create_hf_dataset(code_samples, output_file="software_structure_dataset.jsonl"):
 
 
7
  """
8
- Takes a list of code strings, parses them, and saves them
9
- in a format ready for Hugging Face 'datasets'.
10
  """
11
- data_entries = []
12
 
13
- for idx, code in enumerate(code_samples):
14
- graph_data = parse_source_to_graph(code)
15
-
16
- if "error" in graph_data:
17
- continue
18
-
19
- # Flatten vectors for ML input
20
- vectors = [n['vector'] for n in graph_data['nodes']]
21
 
22
- entry = {
23
- "id": f"sample_{idx}_{int(datetime.now().timestamp())}",
24
- "source_code": code,
25
- "graph_nodes": json.dumps(graph_data['nodes']),
26
- "graph_edges": json.dumps(graph_data['connections']),
27
- "structural_vectors": vectors, # The core feature for training
28
- "node_count": len(graph_data['nodes'])
 
 
 
 
 
 
 
29
  }
30
- data_entries.append(entry)
31
 
32
- # Write to JSONL
33
- with open(output_file, 'w') as f:
34
- for entry in data_entries:
35
- f.write(json.dumps(entry) + '\n')
36
 
37
- return output_file
 
3
  from parser import parse_source_to_graph
4
  from datetime import datetime
5
 
6
+ OUTPUT_FILE = "pystructure_dataset.jsonl"
7
+
8
+ def create_dataset_entry(code):
9
  """
10
+ Parses code and appends a training example to the JSONL file.
 
11
  """
12
+ graph_data = parse_source_to_graph(code)
13
 
14
+ if "error" in graph_data:
15
+ return {"status": "error", "message": graph_data["error"]}
 
 
 
 
 
 
16
 
17
+ vectors = [n['vector'] for n in graph_data['nodes']]
18
+
19
+ entry = {
20
+ "id": f"sample_{int(datetime.now().timestamp())}",
21
+ "timestamp": datetime.now().isoformat(),
22
+ "source_code": code,
23
+ "graph_structure": {
24
+ "nodes": [n['id'] for n in graph_data['nodes']],
25
+ "edges": graph_data['connections']
26
+ },
27
+ "structural_vectors": vectors,
28
+ "meta": {
29
+ "node_count": len(graph_data['nodes']),
30
+ "max_depth": max([n['level'] for n in graph_data['nodes']]) if graph_data['nodes'] else 0
31
  }
32
+ }
33
 
34
+ # Append to JSONL file
35
+ with open(OUTPUT_FILE, 'a') as f:
36
+ f.write(json.dumps(entry) + '\n')
 
37
 
38
+ return {"status": "success", "file": OUTPUT_FILE, "entry_id": entry['id']}