File size: 1,613 Bytes
f56ede2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import argparse
import yaml
from datasets import load_dataset


def load_config(config_path):
    """Read a YAML configuration file and return its parsed content.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document (``None`` when
        the file contains no document).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the
    # platform's locale encoding (which is not UTF-8 everywhere).
    with open(config_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)


def download_huggingface_dataset(config):
    """Download one Hugging Face dataset and print a short summary.

    Args:
        config: Mapping with a ``'dataset_name'`` entry (the dataset id,
            either ``"owner/name"`` or a bare ``"name"``) and a
            ``'local_dir'`` entry, which is handed to ``load_dataset`` as
            ``cache_dir``.

    Raises:
        KeyError: If ``'dataset_name'`` or ``'local_dir'`` is missing.
    """
    # Get dataset details from config
    dataset_name = config['dataset_name']
    local_dir = config['local_dir']

    # Split the id into owner and repository name. rpartition is used
    # instead of unpacking split('/') so that an id without a namespace
    # (no '/') yields an empty owner rather than raising ValueError before
    # the download even starts.
    user_name, _, model_hub_name = dataset_name.rpartition('/')

    # Login using e.g. `huggingface-cli login` to access this dataset
    ds = load_dataset(dataset_name, cache_dir=local_dir)

    # Print information for verification
    print(f"User Name: {user_name or 'N/A'}")
    print(f"Model Hub Name: {model_hub_name}")
    print(f"Dataset saved to: {local_dir}")
    print(f"Dataset info: {ds}")


if __name__ == "__main__":
    # Command-line entry point: read a YAML list of dataset descriptions
    # and download every entry whose platform is supported.
    parser = argparse.ArgumentParser(description="Download dataset from Hugging Face")
    parser.add_argument('--config_path',
                        type=str,
                        default='configs/datasets_info.yaml',
                        help='Path to the dataset configuration YAML file')

    args = parser.parse_args()

    # An empty YAML file parses to None; treat that as "nothing to
    # download" instead of failing when the loop tries to iterate it.
    configs = load_config(args.config_path) or []

    # Iterate through the list of configurations
    for config in configs:
        # .get() lets an entry without a 'platform' key fall through to the
        # "unsupported" report rather than aborting the run with KeyError.
        platform = config.get('platform')
        if platform == 'HuggingFace':
            download_huggingface_dataset(config)
        else:
            print(f"Unsupported platform: {platform}")