broadfield-dev committed
Commit 1831bf3 · verified · 1 Parent(s): 3c3a4be

Update dataset_gen.py

Files changed (1):
  1. dataset_gen.py +59 -13
dataset_gen.py CHANGED
@@ -7,23 +7,26 @@ from huggingface_hub import HfApi
 OUTPUT_FILE = "pystructure_dataset.jsonl"
 
 def create_dataset_entry(code):
+    """
+    Parses code and appends a training example to the local JSONL file.
+    """
     graph_data = parse_source_to_graph(code)
 
     if "error" in graph_data:
         return {"status": "error", "message": graph_data["error"]}
 
+    # Flatten vectors for ML input
     vectors = [n['vec'] for n in graph_data['nodes']]
 
     entry = {
         "id": f"sample_{int(datetime.now().timestamp())}",
         "timestamp": datetime.now().isoformat(),
-        "source_code": code, # We keep full source for training
+        "source_code": code,
         "meta": {
             "node_count": len(graph_data['nodes']),
             "max_depth": max([n['lvl'] for n in graph_data['nodes']]) if graph_data['nodes'] else 0,
-            "snippet": code[:50].replace('\n', ' ') + "..." # For UI preview
+            "snippet": code[:50].replace('\n', ' ') + "..."
         },
-        # Store compact structure for training
         "structure": {
             "vectors": vectors,
             "edges": graph_data['connections']
@@ -36,7 +39,9 @@ def create_dataset_entry(code):
     return {"status": "success", "id": entry['id']}
 
 def get_dataset_stats():
-    """Reads metadata from the JSONL file without loading heavy source code."""
+    """
+    Reads metadata from the local JSONL file for the UI table.
+    """
     entries = []
     if not os.path.exists(OUTPUT_FILE):
         return []
@@ -45,7 +50,6 @@ def get_dataset_stats():
         for line in f:
             try:
                 data = json.loads(line)
-                # Only return lightweight info for the UI table
                 entries.append({
                     "id": data['id'],
                     "timestamp": data['timestamp'],
@@ -54,22 +58,64 @@ def get_dataset_stats():
                 })
             except:
                 continue
-    return entries[::-1] # Newest first
+    return entries[::-1] # Return newest first
 
-def upload_to_hub(token, repo_id):
-    """Pushes the local JSONL file to Hugging Face."""
+def upload_to_hub(token, repo_name_input):
+    """
+    1. Autodetects the username from the token.
+    2. Creates the repo if it doesn't exist.
+    3. Uploads the local file as a unique shard to 'append' to the dataset.
+    """
     if not os.path.exists(OUTPUT_FILE):
-        return {"status": "error", "message": "No dataset found."}
+        return {"status": "error", "message": "No local dataset found to upload."}
 
     try:
         api = HfApi(token=token)
-        # Upload the specific file
+
+        # 1. Auto-detect the username
+        try:
+            user_info = api.whoami()
+            username = user_info['name']
+        except Exception:
+            return {"status": "error", "message": "Invalid HF token. Please check your write token."}
+
+        # 2. Resolve the repo ID:
+        # if the user typed "my-dataset", convert it to "username/my-dataset";
+        # if they typed "username/my-dataset", keep it as is
+        if "/" in repo_name_input:
+            full_repo_id = repo_name_input
+        else:
+            full_repo_id = f"{username}/{repo_name_input}"
+
+        # 3. Create the repo (idempotent)
+        # exist_ok=True prevents errors if the repo already exists
+        api.create_repo(
+            repo_id=full_repo_id,
+            repo_type="dataset",
+            exist_ok=True
+        )
+
+        # 4. Upload as a shard (simulated append)
+        # We assume the local file contains *new* data and upload it under a
+        # unique timestamped filename; the HF 'datasets' library automatically
+        # loads ALL .jsonl files in the repo, so each upload extends the dataset.
+        timestamp = int(datetime.now().timestamp())
+        remote_filename = f"data_shard_{timestamp}.jsonl"
+
         api.upload_file(
             path_or_fileobj=OUTPUT_FILE,
-            path_in_repo="dataset.jsonl",
-            repo_id=repo_id,
+            path_in_repo=remote_filename,
+            repo_id=full_repo_id,
             repo_type="dataset"
         )
-        return {"status": "success", "message": f"Uploaded to https://huggingface.co/datasets/{repo_id}"}
+
+        # Note: the local file is left in place, so the next run will re-upload
+        # the same rows; clearing or rotating it after upload is left to the user.
+
+        return {
+            "status": "success",
+            "message": f"Successfully appended data to https://huggingface.co/datasets/{full_repo_id} (File: {remote_filename})"
+        }
+
    except Exception as e:
        return {"status": "error", "message": str(e)}
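
A note on step 4: the shard scheme works as an append because the `datasets` library treats every data file in a dataset repo as part of one dataset. A minimal consumer-side sketch of that behavior (not part of this commit; "username/my-dataset" is a placeholder repo id):

# Consumer-side sketch: load every uploaded shard as one dataset.
# Assumes the 'datasets' library; "username/my-dataset" is a placeholder.
from datasets import load_dataset

ds = load_dataset("username/my-dataset", split="train")  # reads all *.jsonl shards
print(ds.num_rows)                        # grows with each uploaded shard
row = ds[0]
print(row["id"], row["meta"]["snippet"])  # lightweight fields from create_dataset_entry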
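The closing note in upload_to_hub leaves local-file management to the user; one way to avoid re-uploading the same rows on the next run is to rotate the file after a successful upload. A sketch of that idea (an assumption, not in the commit; rotate_local_file is a hypothetical helper):

import os
from datetime import datetime

OUTPUT_FILE = "pystructure_dataset.jsonl"

def rotate_local_file():
    # Hypothetical helper, not part of dataset_gen.py: move the uploaded
    # file aside so the next upload only contains newly collected rows.
    if not os.path.exists(OUTPUT_FILE):
        return None
    archived = f"{OUTPUT_FILE}.{int(datetime.now().timestamp())}.uploaded"
    os.rename(OUTPUT_FILE, archived)
    return archived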