| from argparse import ArgumentParser | |
| from datasets import load_dataset | |
parser = ArgumentParser(
    prog='Dataset downloader and concatenator',
    description=''
)
# Options are registered right after construction so the module-level
# `parser` is fully configured for importers too, not only when this file
# is executed as a script.
parser.add_argument(
    '-p', '--remote_path',
    required=True,  # without it load_dataset(None) fails with an obscure error
    help='dataset path or name handed to datasets.load_dataset',
)
parser.add_argument(
    '-o', '--output_file',
    default='dataset.txt',
    help='file the concatenated text is written to (default: %(default)s)',
)


def to_ascii_text(samples):
    """Concatenate *samples* and drop every non-ASCII character.

    Args:
        samples: Iterable of strings. They are joined back to back with no
            separator inserted between them.

    Returns:
        A single ``str`` containing only ASCII characters; characters that
        cannot be encoded as ASCII are silently removed.
    """
    # encode(..., 'ignore') discards anything outside ASCII; decoding the
    # resulting bytes turns them back into a str.
    return ''.join(samples).encode('ascii', 'ignore').decode('ascii')


if __name__ == '__main__':
    args = parser.parse_args()
    dataset = load_dataset(args.remote_path)
    # Only the 'text' column of the 'train' split is exported.
    text = to_ascii_text(dataset['train']['text'])
    # The text is pure ASCII here, so pin the encoding instead of relying on
    # the platform's locale default.
    with open(args.output_file, 'w', encoding='ascii') as f:
        f.write(text)