File size: 568 Bytes
88e8547 cd299d2 88e8547 7bfd78e 88e8547 cd299d2 97e7631 88e8547 97e7631 88e8547 97e7631 88e8547 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
from argparse import ArgumentParser
from datasets import load_dataset
# Command-line interface. The sentence describing the tool belongs in
# ``description``; ``prog`` is left at its default (the script name) so the
# usage line stays short and accurate.
parser = ArgumentParser(
    description='Dataset downloader and concatenator',
)
parser.add_argument(
    '-p', '--remote_path', required=True,
    help='dataset path or identifier passed to datasets.load_dataset',
)
parser.add_argument(
    '-o', '--output_file', default='dataset.txt',
    help='file the concatenated text is written to (default: %(default)s)',
)
parser.add_argument(
    '-s', '--split', default='train',
    help='dataset split to read (default: %(default)s)',
)
parser.add_argument(
    '-c', '--column', default='text',
    help='column holding the text entries (default: %(default)s)',
)


def concatenate_ascii(texts) -> str:
    """Join *texts* with no separator and drop every non-ASCII character.

    Args:
        texts: Iterable of strings (assumes every entry is a ``str``;
            a ``None`` entry would make ``str.join`` raise ``TypeError``).

    Returns:
        The concatenation of all entries with non-ASCII characters removed.
    """
    # Encoding with errors='ignore' silently discards characters that have
    # no ASCII representation; decoding turns the bytes back into ``str``.
    return ''.join(texts).encode('ascii', 'ignore').decode('ascii')


def main(argv=None) -> None:
    """Load the requested dataset and write its text column to one file.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = parser.parse_args(argv)
    dataset = load_dataset(args.remote_path)
    text = concatenate_ascii(dataset[args.split][args.column])
    # The text is pure ASCII at this point, so an explicit encoding makes
    # the output independent of the platform's default encoding.
    with open(args.output_file, 'w', encoding='ascii') as f:
        f.write(text)


if __name__ == '__main__':
    main()
|