File size: 568 Bytes
88e8547
 
 
 
cd299d2
88e8547
 
 
 
 
 
 
 
 
 
7bfd78e
88e8547
 
cd299d2
97e7631
 
88e8547
97e7631
88e8547
 
97e7631
88e8547
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28



from argparse import ArgumentParser
from datasets import load_dataset



# Command-line parser for this script. The human-readable title belongs in
# ``description`` (shown under the usage line in --help); ``prog`` is left at
# argparse's default, which is derived from the invoked script name, so the
# usage line shows a real command rather than a sentence.
parser = ArgumentParser(
    description='Dataset downloader and concatenator',
)



def concat_ascii(texts):
    """Concatenate *texts* and drop every non-ASCII character.

    Args:
        texts: Iterable of strings; they are joined in order with no
            separator between them.

    Returns:
        A single ``str`` containing only ASCII characters.
    """
    # encode(..., 'ignore') silently discards anything outside ASCII;
    # decoding the resulting bytes gives back a plain str.
    return ''.join(texts).encode('ascii', 'ignore').decode('ascii')


if __name__ == '__main__':
    # Without a remote path there is nothing to load, so let argparse reject
    # the invocation with a usage message instead of passing None onwards.
    parser.add_argument(
        '-p', '--remote_path',
        required=True,
        help='dataset path or name passed to datasets.load_dataset',
    )
    parser.add_argument(
        '-o', '--output_file',
        default='dataset.txt',
        help='file the concatenated text is written to (default: %(default)s)',
    )

    args = parser.parse_args()

    # Only the 'text' column of the 'train' split is exported.
    dataset = load_dataset(args.remote_path)
    text = concat_ascii(dataset['train']['text'])

    # The text is pure ASCII at this point, so pin the encoding explicitly
    # rather than depending on the platform's locale default.
    with open(args.output_file, 'w', encoding='ascii') as f:
        f.write(text)