import json
import os
from parser import parse_source_to_graph
from datetime import datetime
from huggingface_hub import HfApi

OUTPUT_FILE = "pystructure_dataset.jsonl"
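
# NOTE (assumed contract, inferred from how graph_data is used below):
# parse_source_to_graph(code) returns either {"error": "<message>"} or a
# dict shaped roughly like
#     {"nodes": [{"vec": [...], "lvl": 0}, ...], "connections": [[0, 1], ...]}
# where "vec" is a node's feature vector and "lvl" its nesting depth.
# This is an illustrative sketch of the schema, not a guaranteed interface.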

def build_dataset_entry(code):
    """
    Pure function: takes source code and returns the dataset entry dictionary.
    Does NOT save to disk.
    """
    graph_data = parse_source_to_graph(code)
    
    if "error" in graph_data:
        return {"error": graph_data["error"]}
        
    # Flatten vectors for ML input
    vectors = [n['vec'] for n in graph_data['nodes']]
    
    entry = {
        "id": f"sample_{int(datetime.now().timestamp())}",
        "timestamp": datetime.now().isoformat(),
        "source_code": code,
        "meta": {
            "node_count": len(graph_data['nodes']),
            "max_depth": max([n['lvl'] for n in graph_data['nodes']]) if graph_data['nodes'] else 0,
            "snippet": code[:50].replace('\n', ' ') + "..."
        },
        "structure": {
            "vectors": vectors,
            "edges": graph_data['connections']
        }
    }
    return entry

def create_dataset_entry(code):
    """
    Builds an entry via build_dataset_entry and appends it to the local JSONL file.
    """
    entry = build_dataset_entry(code)
    
    if "error" in entry:
        return {"status": "error", "message": entry["error"]}

    with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
        f.write(json.dumps(entry) + '\n')
            
    return {"status": "success", "id": entry['id']}

def get_dataset_stats():
    """
    Reads metadata from the local JSONL file for the UI table.
    """
    entries = []
    if not os.path.exists(OUTPUT_FILE):
        return []
        
    with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line)
                entries.append({
                    "id": data['id'],
                    "timestamp": data['timestamp'],
                    "node_count": data['meta']['node_count'],
                    "snippet": data['meta']['snippet']
                })
            except (json.JSONDecodeError, KeyError):
                # Skip malformed or incomplete lines rather than failing the whole read
                continue
    return entries[::-1] # Return newest first

def upload_to_hub(token, repo_name_input):
    """
    1. Autodetects username from token.
    2. Creates repo if it doesn't exist.
    3. Uploads the local file as a unique shard to 'append' to the dataset.
    """
    if not os.path.exists(OUTPUT_FILE):
        return {"status": "error", "message": "No local dataset found to upload."}
        
    try:
        api = HfApi(token=token)

        # 1. Auto-detect Username
        try:
            user_info = api.whoami()
            username = user_info['name']
        except Exception:
            return {"status": "error", "message": "Invalid HF Token. Please check your write token."}

        # 2. Resolve Repo ID
        if "/" in repo_name_input:
            full_repo_id = repo_name_input
        else:
            full_repo_id = f"{username}/{repo_name_input}"

        # 3. Create Repo (Idempotent)
        # exist_ok=True prevents errors if the repo already exists
        api.create_repo(
            repo_id=full_repo_id, 
            repo_type="dataset", 
            exist_ok=True
        )

        # 4. Upload with sharding (simulated append).
        # The Hub API has no true append; each upload adds a uniquely named
        # shard file, and loaders pick up all *.jsonl shards in the repo.
        timestamp = int(datetime.now().timestamp())
        remote_filename = f"data_shard_{timestamp}.jsonl"

        api.upload_file(
            path_or_fileobj=OUTPUT_FILE,
            path_in_repo=remote_filename,
            repo_id=full_repo_id,
            repo_type="dataset"
        )
        
        return {
            "status": "success", 
            "message": f"Successfully appended data to https://huggingface.co/datasets/{full_repo_id} (File: {remote_filename})"
        }

    except Exception as e:
        return {"status": "error", "message": str(e)}