"""Download a dataset and concatenate its training text into one ASCII file.

The script loads the dataset named by ``--remote_path`` with
``datasets.load_dataset``, joins every entry of the ``text`` column of the
``train`` split (no separator is inserted between entries), drops all
non-ASCII characters, and writes the result to ``--output_file``.
"""
from argparse import ArgumentParser

from datasets import load_dataset

# The parser is created at import time; its arguments are only registered
# when the file is executed as a script (see the guard below).
parser = ArgumentParser(
    prog='Dataset downloader and concatenator',
    description=''
)

if __name__ == '__main__':
    # Without a dataset path there is nothing to download, so let argparse
    # reject the invocation with a usage message instead of handing ``None``
    # to ``load_dataset``.
    parser.add_argument(
        '-p', '--remote_path',
        required=True,
        help='path or name of the dataset passed to datasets.load_dataset',
    )
    parser.add_argument(
        '-o', '--output_file',
        default='dataset.txt',
        help='file the concatenated text is written to (default: %(default)s)',
    )
    args = parser.parse_args()

    dataset = load_dataset(args.remote_path)

    # Join the entries back to back, then strip every character that cannot
    # be represented in ASCII ('ignore' silently drops them).
    text = ''.join(dataset['train']['text'])
    text = text.encode('ascii', 'ignore').decode('ascii')

    # The text is pure ASCII at this point; pin the encoding so the bytes
    # written do not depend on the platform's default locale encoding.
    with open(args.output_file, 'w', encoding='ascii') as f:
        f.write(text)