import json
import os
from datetime import datetime

from huggingface_hub import HfApi

from parser import parse_source_to_graph

OUTPUT_FILE = "pystructure_dataset.jsonl"


def build_dataset_entry(code):
    """
    Pure function: takes code, returns the dataset entry dictionary.
    Does NOT save to disk.
    """
    graph_data = parse_source_to_graph(code)
    if "error" in graph_data:
        return {"error": graph_data["error"]}

    # Flatten vectors for ML input
    vectors = [n['vec'] for n in graph_data['nodes']]

    entry = {
        "id": f"sample_{int(datetime.now().timestamp())}",
        "timestamp": datetime.now().isoformat(),
        "source_code": code,
        "meta": {
            "node_count": len(graph_data['nodes']),
            "max_depth": max(n['lvl'] for n in graph_data['nodes']) if graph_data['nodes'] else 0,
            "snippet": code[:50].replace('\n', ' ') + "..."
        },
        "structure": {
            "vectors": vectors,
            "edges": graph_data['connections']
        }
    }
    return entry
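
# Illustrative usage sketch (assumption: parse_source_to_graph returns a dict
# like {"nodes": [{"vec": [...], "lvl": 0}, ...], "connections": [...]}):
#
#     entry = build_dataset_entry("def add(a, b):\n    return a + b")
#
# would yield entry["structure"]["vectors"] with one vector per parsed node and
# entry["structure"]["edges"] mirroring graph_data["connections"].
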
def create_dataset_entry(code):
    """
    Uses build_dataset_entry and appends the result to the local JSONL file.
    """
    entry = build_dataset_entry(code)
    if "error" in entry:
        return {"status": "error", "message": entry["error"]}

    with open(OUTPUT_FILE, 'a') as f:
        f.write(json.dumps(entry) + '\n')

    return {"status": "success", "id": entry['id']}
def get_dataset_stats():
    """
    Reads metadata from the local JSONL file for the UI table.
    """
    if not os.path.exists(OUTPUT_FILE):
        return []

    entries = []
    with open(OUTPUT_FILE, 'r') as f:
        for line in f:
            try:
                data = json.loads(line)
                entries.append({
                    "id": data['id'],
                    "timestamp": data['timestamp'],
                    "node_count": data['meta']['node_count'],
                    "snippet": data['meta']['snippet']
                })
            except (json.JSONDecodeError, KeyError):
                # Skip malformed or incomplete lines instead of aborting the whole read
                continue
    return entries[::-1]  # Return newest first
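
# Usage sketch: the newest-first rows map directly onto a UI table, e.g.
#
#     for row in get_dataset_stats():
#         print(row["timestamp"], row["node_count"], row["snippet"])
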
def upload_to_hub(token, repo_name_input):
    """
    1. Autodetects the username from the token.
    2. Creates the repo if it doesn't exist.
    3. Uploads the local file as a unique shard to 'append' to the dataset.
    """
    if not os.path.exists(OUTPUT_FILE):
        return {"status": "error", "message": "No local dataset found to upload."}

    try:
        api = HfApi(token=token)

        # 1. Auto-detect username
        try:
            user_info = api.whoami()
            username = user_info['name']
        except Exception:
            return {"status": "error", "message": "Invalid HF Token. Please check your write token."}

        # 2. Resolve repo ID
        if "/" in repo_name_input:
            full_repo_id = repo_name_input
        else:
            full_repo_id = f"{username}/{repo_name_input}"

        # 3. Create repo (idempotent): exist_ok=True prevents errors if the repo already exists
        api.create_repo(
            repo_id=full_repo_id,
            repo_type="dataset",
            exist_ok=True
        )

        # 4. Upload with sharding (simulated append): each upload gets a unique filename
        timestamp = int(datetime.now().timestamp())
        remote_filename = f"data_shard_{timestamp}.jsonl"
        api.upload_file(
            path_or_fileobj=OUTPUT_FILE,
            path_in_repo=remote_filename,
            repo_id=full_repo_id,
            repo_type="dataset"
        )

        return {
            "status": "success",
            "message": f"Successfully appended data to https://huggingface.co/datasets/{full_repo_id} (File: {remote_filename})"
        }
    except Exception as e:
        return {"status": "error", "message": str(e)}