import json
import os
from datetime import datetime

from huggingface_hub import HfApi

from parser import parse_source_to_graph

OUTPUT_FILE = "pystructure_dataset.jsonl"


def build_dataset_entry(code):
    """
    Pure function: takes code, returns the dataset entry dictionary.
    Does NOT save to disk.
    """
    graph_data = parse_source_to_graph(code)

    if "error" in graph_data:
        return {"error": graph_data["error"]}

    # Flatten per-node vectors for ML input
    vectors = [n['vec'] for n in graph_data['nodes']]

    entry = {
        # Note: second-resolution timestamps can collide if two entries are
        # created within the same second.
        "id": f"sample_{int(datetime.now().timestamp())}",
        "timestamp": datetime.now().isoformat(),
        "source_code": code,
        "meta": {
            "node_count": len(graph_data['nodes']),
            "max_depth": max(n['lvl'] for n in graph_data['nodes']) if graph_data['nodes'] else 0,
            "snippet": code[:50].replace('\n', ' ') + "..."
        },
        "structure": {
            "vectors": vectors,
            "edges": graph_data['connections']
        }
    }
    return entry


def create_dataset_entry(code):
    """
    Builds an entry with build_dataset_entry and appends it to the local JSONL file.
    """
    entry = build_dataset_entry(code)
    if "error" in entry:
        return {"status": "error", "message": entry["error"]}

    with open(OUTPUT_FILE, 'a') as f:
        f.write(json.dumps(entry) + '\n')

    return {"status": "success", "id": entry['id']}


def get_dataset_stats():
    """
    Reads metadata from the local JSONL file for the UI table.
    """
    entries = []
    if not os.path.exists(OUTPUT_FILE):
        return []

    with open(OUTPUT_FILE, 'r') as f:
        for line in f:
            try:
                data = json.loads(line)
                entries.append({
                    "id": data['id'],
                    "timestamp": data['timestamp'],
                    "node_count": data['meta']['node_count'],
                    "snippet": data['meta']['snippet']
                })
            except (json.JSONDecodeError, KeyError):
                # Skip malformed or incomplete lines rather than failing the read
                continue

    return entries[::-1]  # Return newest first


def upload_to_hub(token, repo_name_input):
    """
    1. Auto-detects the username from the token.
    2. Creates the repo if it doesn't exist.
    3. Uploads the local file as a unique shard to 'append' to the dataset.
    """
    if not os.path.exists(OUTPUT_FILE):
        return {"status": "error", "message": "No local dataset found to upload."}

    try:
        api = HfApi(token=token)

        # 1. Auto-detect username
        try:
            user_info = api.whoami()
            username = user_info['name']
        except Exception:
            return {"status": "error", "message": "Invalid HF token. Please check your write token."}

        # 2. Resolve repo ID: accept either "name" or a fully qualified "user/name"
        if "/" in repo_name_input:
            full_repo_id = repo_name_input
        else:
            full_repo_id = f"{username}/{repo_name_input}"

        # 3. Create repo (idempotent)
        # exist_ok=True prevents errors if the repo already exists
        api.create_repo(
            repo_id=full_repo_id,
            repo_type="dataset",
            exist_ok=True
        )

        # 4. Upload with sharding (simulated append)
        # Each upload gets a unique, timestamped filename so it never
        # overwrites shards from previous uploads.
        timestamp = int(datetime.now().timestamp())
        remote_filename = f"data_shard_{timestamp}.jsonl"

        api.upload_file(
            path_or_fileobj=OUTPUT_FILE,
            path_in_repo=remote_filename,
            repo_id=full_repo_id,
            repo_type="dataset"
        )

        return {
            "status": "success",
            "message": f"Successfully appended data to https://huggingface.co/datasets/{full_repo_id} (File: {remote_filename})"
        }

    except Exception as e:
        return {"status": "error", "message": str(e)}
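

# --- Usage sketch (illustrative, not part of the module API) ---
# A minimal end-to-end example, assuming parse_source_to_graph returns a dict
# with 'nodes' (each carrying 'vec' and 'lvl' keys) and 'connections', as the
# functions above expect. The sample snippet and the repo name
# "pystructure-demo" are hypothetical placeholders, not real resources.
if __name__ == "__main__":
    sample = "def greet(name):\n    return f'Hello, {name}!'"

    # Parse the snippet and append it to the local JSONL dataset
    result = create_dataset_entry(sample)
    print(result)  # e.g. {"status": "success", "id": "sample_1700000000"}

    # Show the newest-first metadata rows the UI table would display
    for row in get_dataset_stats():
        print(row["id"], row["node_count"], row["snippet"])

    # Uploading requires a Hugging Face *write* token; the token value and
    # repo name below are placeholders.
    # print(upload_to_hub(token="hf_...", repo_name_input="pystructure-demo"))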