# dataset_gen.py
import json
import os
from datetime import datetime

from huggingface_hub import HfApi

from parser import parse_source_to_graph

OUTPUT_FILE = "pystructure_dataset.jsonl"

def build_dataset_entry(code):
    """
    Pure function: takes code, returns the dataset entry dictionary.
    Does NOT save to disk.
    """
    graph_data = parse_source_to_graph(code)
    if "error" in graph_data:
        return {"error": graph_data["error"]}
    # Flatten vectors for ML input
    vectors = [n['vec'] for n in graph_data['nodes']]
    entry = {
        "id": f"sample_{int(datetime.now().timestamp())}",
        "timestamp": datetime.now().isoformat(),
        "source_code": code,
        "meta": {
            "node_count": len(graph_data['nodes']),
            "max_depth": max(n['lvl'] for n in graph_data['nodes']) if graph_data['nodes'] else 0,
            "snippet": code[:50].replace('\n', ' ') + "..."
        },
        "structure": {
            "vectors": vectors,
            "edges": graph_data['connections']
        }
    }
    return entry
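
# Illustrative shape of the returned entry (values are examples only; the
# vector layout and the edge format depend entirely on what
# parse_source_to_graph emits for each node):
#
#   {
#       "id": "sample_1700000000",
#       "timestamp": "2023-11-14T22:13:20",
#       "source_code": "def add(a, b):\n    return a + b\n",
#       "meta": {"node_count": 3, "max_depth": 1,
#                "snippet": "def add(a, b):     return a + b ..."},
#       "structure": {"vectors": [[...], [...], [...]], "edges": [...]}
#   }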

def create_dataset_entry(code):
    """
    Uses build_dataset_entry and appends the result to the local JSONL file.
    """
    entry = build_dataset_entry(code)
    if "error" in entry:
        return {"status": "error", "message": entry["error"]}
    with open(OUTPUT_FILE, 'a') as f:
        f.write(json.dumps(entry) + '\n')
    return {"status": "success", "id": entry['id']}

def get_dataset_stats():
    """
    Reads metadata from the local JSONL file for the UI table.
    """
    entries = []
    if not os.path.exists(OUTPUT_FILE):
        return []
    with open(OUTPUT_FILE, 'r') as f:
        for line in f:
            try:
                data = json.loads(line)
                entries.append({
                    "id": data['id'],
                    "timestamp": data['timestamp'],
                    "node_count": data['meta']['node_count'],
                    "snippet": data['meta']['snippet']
                })
            except (json.JSONDecodeError, KeyError):
                # Skip malformed or partially written lines
                continue
    return entries[::-1]  # Return newest first
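
# Each row is flat and ready for a UI table, e.g. (illustrative values):
#
#   {"id": "sample_1700000000", "timestamp": "2023-11-14T22:13:20",
#    "node_count": 3, "snippet": "x = 1..."}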

def upload_to_hub(token, repo_name_input):
    """
    1. Autodetects the username from the token.
    2. Creates the repo if it doesn't exist.
    3. Uploads the local file as a unique shard to 'append' to the dataset.
    """
    if not os.path.exists(OUTPUT_FILE):
        return {"status": "error", "message": "No local dataset found to upload."}
    try:
        api = HfApi(token=token)
        # 1. Auto-detect username
        try:
            user_info = api.whoami()
            username = user_info['name']
        except Exception:
            return {"status": "error", "message": "Invalid HF token. Please check your write token."}
        # 2. Resolve repo ID
        if "/" in repo_name_input:
            full_repo_id = repo_name_input
        else:
            full_repo_id = f"{username}/{repo_name_input}"
        # 3. Create repo (idempotent): exist_ok=True prevents errors
        # if the repo already exists
        api.create_repo(
            repo_id=full_repo_id,
            repo_type="dataset",
            exist_ok=True
        )
        # 4. Upload with sharding (simulated append): a unique filename per
        # upload avoids overwriting earlier shards in the dataset repo
        timestamp = int(datetime.now().timestamp())
        remote_filename = f"data_shard_{timestamp}.jsonl"
        api.upload_file(
            path_or_fileobj=OUTPUT_FILE,
            path_in_repo=remote_filename,
            repo_id=full_repo_id,
            repo_type="dataset"
        )
        return {
            "status": "success",
            "message": f"Successfully appended data to https://huggingface.co/datasets/{full_repo_id} (File: {remote_filename})"
        }
    except Exception as e:
        return {"status": "error", "message": str(e)}