# dataset_gen.py
import json
import os
from datetime import datetime

from huggingface_hub import HfApi

from parser import parse_source_to_graph

OUTPUT_FILE = "pystructure_dataset.jsonl"

def build_dataset_entry(code):
    """
    Pure function: takes code, returns the dataset entry dictionary.
    Does NOT save to disk.
    """
    graph_data = parse_source_to_graph(code)
    if "error" in graph_data:
        return {"error": graph_data["error"]}
    # Flatten vectors for ML input
    vectors = [n['vec'] for n in graph_data['nodes']]
    entry = {
        "id": f"sample_{int(datetime.now().timestamp())}",
        "timestamp": datetime.now().isoformat(),
        "source_code": code,
        "meta": {
            "node_count": len(graph_data['nodes']),
            "max_depth": max(n['lvl'] for n in graph_data['nodes']) if graph_data['nodes'] else 0,
            "snippet": code[:50].replace('\n', ' ') + "..."
        },
        "structure": {
            "vectors": vectors,
            "edges": graph_data['connections']
        }
    }
    return entry
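
# Illustrative shape of the returned entry (values are examples only; the
# vector layout and the edge format depend entirely on what
# parse_source_to_graph emits for each node):
#
#   {
#       "id": "sample_1700000000",
#       "timestamp": "2023-11-14T22:13:20",
#       "source_code": "def add(a, b):\n    return a + b\n",
#       "meta": {"node_count": 3, "max_depth": 1,
#                "snippet": "def add(a, b):     return a + b ..."},
#       "structure": {"vectors": [[...], [...], [...]], "edges": [...]}
#   }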

def create_dataset_entry(code):
    """
    Uses build_dataset_entry and appends the result to the local JSONL file.
    """
    entry = build_dataset_entry(code)
    if "error" in entry:
        return {"status": "error", "message": entry["error"]}
    with open(OUTPUT_FILE, 'a') as f:
        f.write(json.dumps(entry) + '\n')
    return {"status": "success", "id": entry['id']}

def get_dataset_stats():
    """
    Reads metadata from the local JSONL file for the UI table.
    """
    entries = []
    if not os.path.exists(OUTPUT_FILE):
        return []
    with open(OUTPUT_FILE, 'r') as f:
        for line in f:
            try:
                data = json.loads(line)
                entries.append({
                    "id": data['id'],
                    "timestamp": data['timestamp'],
                    "node_count": data['meta']['node_count'],
                    "snippet": data['meta']['snippet']
                })
            except (json.JSONDecodeError, KeyError):
                # Skip malformed or partially written lines
                continue
    return entries[::-1]  # Return newest first
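
# Each row is flat and ready for a UI table, e.g. (illustrative values):
#
#   {"id": "sample_1700000000", "timestamp": "2023-11-14T22:13:20",
#    "node_count": 3, "snippet": "x = 1..."}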

def upload_to_hub(token, repo_name_input):
    """
    1. Autodetects the username from the token.
    2. Creates the repo if it doesn't exist.
    3. Uploads the local file as a unique shard to 'append' to the dataset.
    """
    if not os.path.exists(OUTPUT_FILE):
        return {"status": "error", "message": "No local dataset found to upload."}
    try:
        api = HfApi(token=token)
        # 1. Auto-detect username
        try:
            user_info = api.whoami()
            username = user_info['name']
        except Exception:
            return {"status": "error", "message": "Invalid HF token. Please check your write token."}
        # 2. Resolve repo ID
        if "/" in repo_name_input:
            full_repo_id = repo_name_input
        else:
            full_repo_id = f"{username}/{repo_name_input}"
        # 3. Create repo (idempotent): exist_ok=True prevents errors
        # if the repo already exists
        api.create_repo(
            repo_id=full_repo_id,
            repo_type="dataset",
            exist_ok=True
        )
        # 4. Upload with sharding (simulated append): a unique filename per
        # upload avoids overwriting earlier shards in the dataset repo
        timestamp = int(datetime.now().timestamp())
        remote_filename = f"data_shard_{timestamp}.jsonl"
        api.upload_file(
            path_or_fileobj=OUTPUT_FILE,
            path_in_repo=remote_filename,
            repo_id=full_repo_id,
            repo_type="dataset"
        )
        return {
            "status": "success",
            "message": f"Successfully appended data to https://huggingface.co/datasets/{full_repo_id} (File: {remote_filename})"
        }
    except Exception as e:
        return {"status": "error", "message": str(e)}