File size: 5,124 Bytes
120dee6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#include <errno.h>


#define MAX_VOCAB_SIZE 4096
#define MAX_WORD_LEN   12


#include "tokenizer.h"
#include "utils.h"




/*
 * Return 1 when the NUL-terminated strings `a` and `b` are identical,
 * 0 otherwise.
 *
 * The comparison covers the terminating NUL as well, so a string is never
 * reported equal to a longer string it is merely a prefix of (for example
 * "--help" vs "--helpme"), and "" is equal only to "".
 */
static int strequ(char *a, char *b) {
    return strcmp(a, b) == 0;
}

/*
 * Print the command-line usage summary to stdout and terminate the process
 * with exit status 0.
 *
 * argc  unused; kept so the signature mirrors main().
 * argv  argv[0] is printed as the program name in the "Usage:" line.
 */
void help(int argc, char *argv[]) {
    /* One entry per line of the "Arguments:" section, in display order. */
    static const char *const option_lines[] = {
        "  --help                              show this help\n",
        "  --dataset_path             <str>    path to dataset\n",
        "  --dataset_output_path      <str>    path to dataset output\n",
        "  --vocab_path               <str>    path to vocabulary\n",
        "  --vocab_output_path        <str>    path to vocabulary output\n",
        "  --tokenizer_path           <str>    path to tokenizer\n",
        "  --tokenizer_output_path    <str>    path to tokenizer output\n",
    };
    const size_t line_count = sizeof option_lines / sizeof option_lines[0];

    (void)argc;

    printf("Usage: %s [arg1] [arg2] ... [argN]\n", argv[0]);
    fputs("Arguments:\n", stdout);

    for (size_t k = 0; k < line_count; ++k)
        fputs(option_lines[k], stdout);

    fputs("\n", stdout);
    exit(0);
}




/*
 * Dump the tokenizer's vocabulary to stdout: first the vocabulary size, then
 * one line per entry giving its index, its string length and the string.
 */
void tokenizer_print_vocab(tokenizer_t *t) {
    int idx = 0;

    printf("vocab size: %d\n", t->vocab_size);

    while (idx < t->vocab_size) {
        char *entry = t->vocab[idx];
        int entry_len = strlen(entry);

        printf("id: %d, len: %d, str: %s\n", idx, entry_len, entry);
        ++idx;
    }
}

/*
 * Encode the file fname[0] with tokenizer `t` and write every resulting token
 * id, as a raw native `int`, to fname[1] (created or truncated, mode 0644).
 *
 * t      tokenizer whose encode_file()/decode() callbacks are used.
 * fname  fname[0] is the input path, fname[1] the output path.
 *
 * The input is opened and checked before the output is created, so a missing
 * input no longer truncates an existing output file. Both descriptors are
 * closed on every path out of the function, and a short or failed write()
 * stops the run instead of silently producing a corrupt output file.
 *
 * A progress line is printed whenever the iteration counter is a multiple of
 * 100000 and a token was produced on that iteration.
 */
void tokenize_whole_file(tokenizer_t *t, char *fname[2]) {
    int infd = open(fname[0], O_RDONLY);
    if (infd < 0) {
        ERROR("failed to open %s\n", fname[0]);
        return;
    }

    int outfd = open(fname[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (outfd < 0) {
        ERROR("failed to open %s\n", fname[1]);
        close(infd);
        return;
    }

    for (int i = 0; ; ++i) {
        /* Probe for end of input; when a character is available, step back
         * over it so the encoder sees it. (rdchar/seekback come from a
         * project header -- presumably "read one byte" / "rewind n bytes";
         * confirm there.) */
        if (rdchar(infd) == EOF) break;
        seekback(infd, 1);

        int id = t->encode_file(t, infd);

        /* The encoder returned EOF for this position: consume one character
         * and try again from the next one. */
        if (id == EOF) { (void)rdchar(infd); continue; }

        if (write(outfd, &id, sizeof(id)) != (ssize_t)sizeof(id)) {
            ERROR("failed to write to %s\n", fname[1]);
            break;
        }

        if (i % 100000 == 0) {
            /* off_t rather than int so offsets beyond INT_MAX are not truncated. */
            off_t ncurrent = lseek(infd, 0, SEEK_CUR);
            off_t ntotal   = lseek(infd, 0, SEEK_END);
            (void)           lseek(infd, ncurrent, SEEK_SET);

            /* Guard the division: ntotal is 0 for an empty file and -1 when
             * the descriptor is not seekable. */
            double percent = ntotal > 0 ? ((double)ncurrent / (double)ntotal) * 100.0 : 0.0;

            printf("iteration %d, encoded %d (%s), proccessed %lld of %lld (%.2f%%)\n", i, id, t->decode(t, id), (long long)ncurrent, (long long)ntotal, percent);

            /* NOTE(review): stops as soon as a progress check finds fewer
             * than 100 bytes left, so the tail of the input (or nearly all of
             * an input shorter than 100 bytes) is not encoded. Kept as-is
             * because the intent is not visible here -- confirm whether this
             * is still wanted. */
            if (ntotal - ncurrent < 100) break;
        }
    }

    close(infd);
    /* close() can report a deferred write error for the output file. */
    if (close(outfd) < 0) ERROR("failed to close %s\n", fname[1]);
}
#define tokenize_whole_file(T, ...) tokenize_whole_file(T, (char*[2]){__VA_ARGS__})




int main(int argc, char *argv[]) {
    tokenizer_t *tokenizer = Tokenizer(NULL);


    char *dataset_path          = "data/dataset.txt",
         *dataset_output_path   = "bin/dataset.bin",
         *vocab_path            = "data/vocab.txt",
         *vocab_output_path     = NULL,
         *tokenizer_path        = NULL,
         *tokenizer_output_path = "bin/tokenizer.bin";


    for (int i = 1; i < argc; i += 2) {
        if      (strequ(argv[i], "--dataset_path"))             dataset_path             = argv[i + 1], printf("setting dataset path to \"%s\"\n",           dataset_path);
        else if (strequ(argv[i], "--dataset_output_path"))      dataset_output_path      = argv[i + 1], printf("setting dataset output path to \"%s\"\n",    dataset_output_path);
        else if (strequ(argv[i], "--vocab_path"))               vocab_path               = argv[i + 1], printf("setting vocabulary path to \"%s\"\n",        vocab_path);
        else if (strequ(argv[i], "--vocab_output_path"))        vocab_output_path        = argv[i + 1], printf("setting vocabulary output path to \"%s\"\n", vocab_output_path);
        else if (strequ(argv[i], "--tokenizer_path"))           tokenizer_path           = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n",  tokenizer_path);
        else if (strequ(argv[i], "--tokenizer_output_path"))    tokenizer_output_path    = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n",  tokenizer_output_path);
        else if (strequ(argv[i], "--help")) help(argc, argv);
        else                                ERROR("unknown option \"%s\"\n", argv[i]);
    }


    CALLBACK_ON_FILE(vocab_path,
        INFO("loading vocabulary from \"%s\"\n", vocab_path);
        tokenizer->load_vocab(tokenizer, vocab_path);
        tokenizer_print_vocab(tokenizer);
    );

    CALLBACK_ON_FILE(tokenizer_path,
        INFO("loading tokenizer from \"%s\"\n", tokenizer_path);
        tokenizer->load_tokenizer(tokenizer, tokenizer_path);
    );

    CALLBACK_ON_FILE(vocab_output_path,
        INFO("exporting vocabulary to \"%s\"\n", vocab_output_path);
        tokenizer->save_vocab(tokenizer, vocab_output_path)
    );

    CALLBACK_ON_FILE(tokenizer_output_path,
        INFO("exporting tokenizer to \"%s\"\n", tokenizer_output_path);
        tokenizer->save_tokenizer(tokenizer, tokenizer_output_path)
    );

    CALLBACK_ON_FILE(dataset_path,
        CALLBACK_ON_FILE(dataset_output_path,
            INFO("loading dataset from \"%s\" and exporting it to \"%s\"\n", dataset_path, dataset_output_path);
            tokenize_whole_file(tokenizer, dataset_path, dataset_output_path);
        );
    );

    return EXIT_SUCCESS;
}