import json
import os
from datetime import datetime

from huggingface_hub import HfApi

from parser import parse_source_to_graph

OUTPUT_FILE = "pystructure_dataset.jsonl"
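
# Note: the field names used below ('nodes', 'vec', 'lvl', 'connections',
# 'error') are inferred from how build_dataset_entry consumes the parser
# output; the parser module itself is not shown in this file. The assumed
# shape is roughly:
#
#   {"nodes": [{"vec": [...], "lvl": <int>, ...}, ...],
#    "connections": [...]}            # edge list between nodes
#
# or {"error": "<message>"} when parsing fails.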

def build_dataset_entry(code):
    """
    Pure function: takes code, returns the dataset entry dictionary.
    Does NOT save to disk.
    """
    graph_data = parse_source_to_graph(code)
    if "error" in graph_data:
        return {"error": graph_data["error"]}

    # Flatten vectors for ML input
    vectors = [n['vec'] for n in graph_data['nodes']]

    entry = {
        "id": f"sample_{int(datetime.now().timestamp())}",
        "timestamp": datetime.now().isoformat(),
        "source_code": code,
        "meta": {
            "node_count": len(graph_data['nodes']),
            "max_depth": max(n['lvl'] for n in graph_data['nodes']) if graph_data['nodes'] else 0,
            "snippet": code[:50].replace('\n', ' ') + "..."
        },
        "structure": {
            "vectors": vectors,
            "edges": graph_data['connections']
        }
    }
    return entry
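
# Illustrative usage of build_dataset_entry (values depend entirely on what
# parse_source_to_graph returns; this only sketches the shape of the entry):
#
#   entry = build_dataset_entry("def f():\n    return 1")
#   entry["meta"]["node_count"]      # number of parsed nodes
#   entry["structure"]["vectors"]    # one flattened vector per node
#   entry["structure"]["edges"]      # the parser's connection list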

def create_dataset_entry(code):
    """
    Uses build_dataset_entry and appends the result to the local JSONL file.
    """
    entry = build_dataset_entry(code)
    if "error" in entry:
        return {"status": "error", "message": entry["error"]}

    with open(OUTPUT_FILE, 'a') as f:
        f.write(json.dumps(entry) + '\n')

    return {"status": "success", "id": entry['id']}

def get_dataset_stats():
    """
    Reads metadata from the local JSONL file for the UI table.
    """
    entries = []
    if not os.path.exists(OUTPUT_FILE):
        return []

    with open(OUTPUT_FILE, 'r') as f:
        for line in f:
            try:
                data = json.loads(line)
                entries.append({
                    "id": data['id'],
                    "timestamp": data['timestamp'],
                    "node_count": data['meta']['node_count'],
                    "snippet": data['meta']['snippet']
                })
            except (json.JSONDecodeError, KeyError):
                # Skip malformed or incomplete lines instead of failing the whole read
                continue

    return entries[::-1]  # Return newest first
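
# get_dataset_stats() yields a list of flat dicts, newest first, e.g.
#   [{"id": ..., "timestamp": ..., "node_count": ..., "snippet": ...}, ...]
# which maps directly onto rows of the UI table mentioned in the docstring.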

def upload_to_hub(token, repo_name_input):
    """
    1. Autodetects the username from the token.
    2. Creates the repo if it doesn't exist.
    3. Uploads the local file as a unique shard to 'append' to the dataset.
    """
    if not os.path.exists(OUTPUT_FILE):
        return {"status": "error", "message": "No local dataset found to upload."}

    try:
        api = HfApi(token=token)

        # 1. Auto-detect username
        try:
            user_info = api.whoami()
            username = user_info['name']
        except Exception:
            return {"status": "error", "message": "Invalid HF Token. Please check your write token."}

        # 2. Resolve repo ID
        if "/" in repo_name_input:
            full_repo_id = repo_name_input
        else:
            full_repo_id = f"{username}/{repo_name_input}"

        # 3. Create repo (idempotent)
        # exist_ok=True prevents errors if the repo already exists
        api.create_repo(
            repo_id=full_repo_id,
            repo_type="dataset",
            exist_ok=True
        )

        # 4. Upload with sharding (simulated append)
        timestamp = int(datetime.now().timestamp())
        remote_filename = f"data_shard_{timestamp}.jsonl"

        api.upload_file(
            path_or_fileobj=OUTPUT_FILE,
            path_in_repo=remote_filename,
            repo_id=full_repo_id,
            repo_type="dataset"
        )

        return {
            "status": "success",
            "message": f"Successfully appended data to https://huggingface.co/datasets/{full_repo_id} (File: {remote_filename})"
        }
    except Exception as e:
        return {"status": "error", "message": str(e)}