import json
import os
from datetime import datetime

from huggingface_hub import HfApi

from parser import parse_source_to_graph

OUTPUT_FILE = "pystructure_dataset.jsonl"


def build_dataset_entry(code):
    """
    Pure function: takes code, returns the dataset entry dictionary.
    Does NOT save to disk.
    """
    graph_data = parse_source_to_graph(code)
    if "error" in graph_data:
        return {"error": graph_data["error"]}

    # Flatten vectors for ML input
    vectors = [n['vec'] for n in graph_data['nodes']]

    entry = {
        "id": f"sample_{int(datetime.now().timestamp())}",
        "timestamp": datetime.now().isoformat(),
        "source_code": code,
        "meta": {
            "node_count": len(graph_data['nodes']),
            "max_depth": max(n['lvl'] for n in graph_data['nodes']) if graph_data['nodes'] else 0,
            "snippet": code[:50].replace('\n', ' ') + "..."
        },
        "structure": {
            "vectors": vectors,
            "edges": graph_data['connections']
        }
    }
    return entry
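
# Illustrative usage sketch (assumption: parse_source_to_graph returns a dict
# like {"nodes": [{"vec": [...], "lvl": 0}, ...], "connections": [...]}):
#
#     entry = build_dataset_entry("def add(a, b):\n    return a + b")
#
# would yield entry["structure"]["vectors"] with one vector per parsed node and
# entry["structure"]["edges"] mirroring graph_data["connections"].
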
def create_dataset_entry(code):
    """
    Uses build_dataset_entry and appends the result to the local JSONL file.
    """
    entry = build_dataset_entry(code)
    if "error" in entry:
        return {"status": "error", "message": entry["error"]}

    with open(OUTPUT_FILE, 'a') as f:
        f.write(json.dumps(entry) + '\n')

    return {"status": "success", "id": entry['id']}
def get_dataset_stats():
    """
    Reads metadata from the local JSONL file for the UI table.
    """
    if not os.path.exists(OUTPUT_FILE):
        return []

    entries = []
    with open(OUTPUT_FILE, 'r') as f:
        for line in f:
            try:
                data = json.loads(line)
                entries.append({
                    "id": data['id'],
                    "timestamp": data['timestamp'],
                    "node_count": data['meta']['node_count'],
                    "snippet": data['meta']['snippet']
                })
            except (json.JSONDecodeError, KeyError):
                # Skip malformed or incomplete lines instead of aborting the whole read
                continue
    return entries[::-1]  # Return newest first
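
# Usage sketch: the newest-first rows map directly onto a UI table, e.g.
#
#     for row in get_dataset_stats():
#         print(row["timestamp"], row["node_count"], row["snippet"])
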
def upload_to_hub(token, repo_name_input):
    """
    1. Autodetects the username from the token.
    2. Creates the repo if it doesn't exist.
    3. Uploads the local file as a unique shard to 'append' to the dataset.
    """
    if not os.path.exists(OUTPUT_FILE):
        return {"status": "error", "message": "No local dataset found to upload."}

    try:
        api = HfApi(token=token)

        # 1. Auto-detect username
        try:
            user_info = api.whoami()
            username = user_info['name']
        except Exception:
            return {"status": "error", "message": "Invalid HF Token. Please check your write token."}

        # 2. Resolve repo ID
        if "/" in repo_name_input:
            full_repo_id = repo_name_input
        else:
            full_repo_id = f"{username}/{repo_name_input}"

        # 3. Create repo (idempotent): exist_ok=True prevents errors if the repo already exists
        api.create_repo(
            repo_id=full_repo_id,
            repo_type="dataset",
            exist_ok=True
        )

        # 4. Upload with sharding (simulated append): each upload gets a unique filename
        timestamp = int(datetime.now().timestamp())
        remote_filename = f"data_shard_{timestamp}.jsonl"
        api.upload_file(
            path_or_fileobj=OUTPUT_FILE,
            path_in_repo=remote_filename,
            repo_id=full_repo_id,
            repo_type="dataset"
        )

        return {
            "status": "success",
            "message": f"Successfully appended data to https://huggingface.co/datasets/{full_repo_id} (File: {remote_filename})"
        }
    except Exception as e:
        return {"status": "error", "message": str(e)}