File size: 1,762 Bytes
954b9d0
1804ce0
 
 
 
 
 
3595c1e
 
1804ce0
 
954b9d0
1804ce0
 
954b9d0
3595c1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1804ce0
 
 
 
 
3595c1e
 
 
1804ce0
954b9d0
1804ce0
 
 
 
 
 
 
 
 
954b9d0
1804ce0
954b9d0
 
1804ce0
954b9d0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from huggingface_hub import HfApi
from config.settings import (
    DATASET_ID,
    DATASET_VECTOR_STORE_PATH,
    DATASET_CHAT_HISTORY_PATH,
    DATASET_FINE_TUNED_PATH,
    DATASET_ANNOTATIONS_PATH,
    DATASET_TRAINING_DATA_PATH,
    DATASET_TRAINING_LOGS_PATH,
    HF_TOKEN
)

# Shared Hugging Face Hub client; HF_TOKEN from config.settings is passed as its token.
api = HfApi(token=HF_TOKEN)
# Repo id handed to the `repo_id` argument of the dataset calls in this file.
dataset_name = DATASET_ID

def ensure_dataset_directory(directory: str) -> None:
    """
    Make sure `directory` is present in the dataset repo.

    Lists the files of the `dataset_name` dataset repo and, when none of
    them sits under `directory/`, uploads an empty `directory/.gitkeep`
    placeholder file. Any exception raised along the way is caught and
    reported with `print`; nothing is re-raised to the caller.
    """
    try:
        repo_files = api.list_repo_files(repo_id=dataset_name, repo_type="dataset")
        prefix = f"{directory}/"
        for repo_path in repo_files:
            if repo_path.startswith(prefix):
                # At least one file already lives under this directory.
                return

        # No file found under the prefix: upload an empty placeholder so the
        # path shows up in the repo file listing.
        api.upload_file(
            path_or_fileobj=b"",
            path_in_repo=f"{directory}/.gitkeep",
            repo_id=dataset_name,
            repo_type="dataset",
        )
        print(f"✓ Created missing directory: {directory}")
    except Exception as exc:
        # Best-effort: report the failure and carry on.
        print(f"Error ensuring directory {directory}: {exc!s}")

# Ensure training directories exist
# NOTE(review): both training paths are also listed in `directories` below,
# where a .gitkeep is uploaded for them again unconditionally — confirm that
# running both passes is intended.
for _training_dir in (DATASET_TRAINING_DATA_PATH, DATASET_TRAINING_LOGS_PATH):
    ensure_dataset_directory(_training_dir)

# Initialize dataset structure: every path below gets an empty .gitkeep
# uploaded, without first checking whether the path already has files.
directories = [
    DATASET_VECTOR_STORE_PATH,
    DATASET_CHAT_HISTORY_PATH,
    DATASET_FINE_TUNED_PATH,
    DATASET_ANNOTATIONS_PATH,
    DATASET_TRAINING_DATA_PATH,
    DATASET_TRAINING_LOGS_PATH,
]

# A single try wraps the whole loop, so the first failed upload stops the
# remaining ones and the success message is skipped.
try:
    for directory in directories:
        _placeholder_path = f"{directory}/.gitkeep"
        api.upload_file(
            path_or_fileobj=b"",
            path_in_repo=_placeholder_path,
            repo_id=dataset_name,
            repo_type="dataset",
        )
        print(f"✓ Created directory: {directory}")

    print("\nDataset structure successfully initialized!")

except Exception as exc:
    print(f"Error occurred: {exc!s}")