mamba / dataset.cli.py
flpelerin's picture
Update file dataset.cli.py
97e7631
raw
history blame contribute delete
568 Bytes
from argparse import ArgumentParser
from datasets import load_dataset
# Module-level command-line parser; its options are registered in the
# ``__main__`` section of this script.
#
# The human-readable title belongs in ``description`` (printed above the
# option list in ``--help``). ``prog`` is left at its default (the script
# name taken from ``sys.argv[0]``) so the usage line shows a command the
# user can actually type.
parser = ArgumentParser(
    description='Dataset downloader and concatenator',
)
def concat_ascii(texts):
    """Concatenate *texts* and drop every non-ASCII character.

    Args:
        texts: Iterable of strings; they are joined in order with no
            separator between them.

    Returns:
        The joined text with every character that cannot be encoded as
        ASCII removed (not replaced).
    """
    return ''.join(texts).encode('ascii', 'ignore').decode('ascii')


if __name__ == '__main__':
    # Without a dataset path there is nothing to load, so let argparse
    # reject the call with a usage error instead of handing None to
    # load_dataset.
    parser.add_argument(
        '-p', '--remote_path',
        required=True,
        help='dataset path or name passed to datasets.load_dataset',
    )
    parser.add_argument(
        '-o', '--output_file',
        default='dataset.txt',
        help='file the concatenated text is written to '
             '(default: %(default)s)',
    )
    args = parser.parse_args()

    dataset = load_dataset(args.remote_path)
    # Only the 'text' column of the 'train' split is used.
    text = concat_ascii(dataset['train']['text'])

    # The text is pure ASCII at this point; naming the encoding keeps the
    # output independent of the platform's default encoding.
    with open(args.output_file, 'w', encoding='utf-8') as f:
        f.write(text)