File size: 2,192 Bytes
f43af3c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
from datasets import load_dataset
def load_data_from_hf(hf_dir=None, local_dir=None, split='validation'):
    """Load a dataset via ``datasets.load_dataset`` and print a summary of one split.

    The summary consists of the dataset structure, the features of the chosen
    split, a handful of optional metadata columns (first row only) and the
    first few examples of the split.

    Args:
        hf_dir: Dataset identifier passed straight to ``load_dataset``. Used
            whenever it is truthy.
        local_dir: Forwarded as ``data_files`` to the ``'json'`` loader when
            ``hf_dir`` is falsy, e.g. ``{'validation': 'dev.json'}``.
        split: Name of the split to inspect. Defaults to ``'validation'``.

    Returns:
        The object returned by ``load_dataset``.

    Raises:
        ValueError: If neither ``hf_dir`` nor ``local_dir`` is given.
        KeyError: If ``split`` is not present in the loaded dataset.
    """
    # Fail early with a clear message instead of handing data_files=None to
    # the JSON loader.
    if not hf_dir and local_dir is None:
        raise ValueError(
            "Either hf_dir or local_dir must be provided to load a dataset"
        )

    if hf_dir:
        ds = load_dataset(hf_dir)
    else:
        ds = load_dataset('json', data_files=local_dir)

    print("Dataset structure:")
    print(ds)

    if split not in ds:
        raise KeyError(
            f"split {split!r} not found; available splits: {list(ds)}"
        )
    subset = ds[split]

    # Print available features for the inspected split.
    print(f"\n{split.capitalize()} split features:")
    print(subset.features)

    # Metadata columns are optional: report each one from the first row when
    # it exists, otherwise say that it is missing.
    metadata_fields = (
        'dim_process',
        'num_seqs',
        'avg_seq_len',
        'min_seq_len',
        'max_seq_len',
    )
    for position, field in enumerate(metadata_fields):
        try:
            value = subset.data[field][0].as_py()
        except (KeyError, IndexError):
            print(f"{field} field not found in dataset")
        else:
            # Only the first metadata line is preceded by a blank line.
            lead = '\n' if position == 0 else ''
            print(lead + field.replace('_', ' ') + ': ' + str(value))

    # Show the actual data: the first few examples, long lists abbreviated.
    max_examples = 3
    long_list_threshold = 10
    preview_len = 5
    print(f"\nFirst few examples from {split} split:")
    for i, example in enumerate(subset):
        if i >= max_examples:
            break
        print(f"Example {i}:")
        for key, value in example.items():
            if isinstance(value, list) and len(value) > long_list_threshold:
                print(f" {key}: {value[:preview_len]}... (length: {len(value)})")
            else:
                print(f" {key}: {value}")

    return ds
if __name__ == '__main__':
    # Default demo: load the dataset directly from the Hugging Face Hub.
    # If loading from the Hub fails, the JSON data file can be loaded
    # locally instead, e.g.:
    #   load_data_from_hf(hf_dir=None, local_dir={'validation': 'dev.json'})
    load_data_from_hf(hf_dir='easytpp/taxi')