File size: 1,613 Bytes
f56ede2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import argparse
import yaml
from datasets import load_dataset
def load_config(config_path):
with open(config_path, 'r') as file:
return yaml.safe_load(file)
def download_huggingface_dataset(config):
# Get dataset details from config
dataset_name = config['dataset_name']
local_dir = config['local_dir']
# Split dataset name into user_name and model_hub_name
user_name, model_hub_name = dataset_name.split('/')
# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset(dataset_name, cache_dir=local_dir)
# Print information for verification
print(f"User Name: {user_name}")
print(f"Model Hub Name: {model_hub_name}")
print(f"Dataset saved to: {local_dir}")
print(f"Dataset info: {ds}")
if __name__ == "__main__":
# Set up argument parser
parser = argparse.ArgumentParser(description="Download dataset from Hugging Face")
parser.add_argument('--config_path',
type=str,
default='configs/datasets_info.yaml',
help='Path to the dataset configuration YAML file')
args = parser.parse_args()
# Load configuration from YAML file
configs = load_config(args.config_path)
# Iterate through the list of configurations
for config in configs:
# Download dataset if platform is HuggingFace
if config['platform'] == 'HuggingFace':
download_huggingface_dataset(config)
else:
print(f"Unsupported platform: {config['platform']}") |