#pragma once

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>


#include "utils.h"




#ifndef MAX_VOCAB_SIZE
#define MAX_VOCAB_SIZE 32000
#endif

#ifndef MAX_WORD_LEN
#define MAX_WORD_LEN 16
#endif
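/* MAX_WORD_LEN includes the terminating '\0', so a token holds at most MAX_WORD_LEN - 1 characters */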




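/* Tokenizer object: a fixed-size vocabulary table plus function pointers,
 * so call sites read like methods, e.g. t->encode_word(t, word).         */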
STRUCTURE(tokenizer_t,
    int  vocab_size;
    char vocab[MAX_VOCAB_SIZE][MAX_WORD_LEN];

    /* vocabulary construction */
    int   (*add_word)       (tokenizer_t *, char *);

    /* encoding: single word, in-memory character stream, open file descriptor */
    int   (*encode_word)    (tokenizer_t *, char *);
    int   (*encode_stream)  (tokenizer_t *, char **);
    int   (*encode_file)    (tokenizer_t *, int);

    /* decoding: token id back to its string */
    char *(*decode)         (tokenizer_t *, int);
    char *(*decode_file)    (tokenizer_t *, int);

    /* persistence: plain-text vocab file vs. raw binary dump of the table */
    void  (*save_vocab)     (tokenizer_t *, char *);
    void  (*load_vocab)     (tokenizer_t *, char *);

    void  (*save_tokenizer) (tokenizer_t *, char *);
    void  (*load_tokenizer) (tokenizer_t *, char *);
);




/* small read helpers: rdchar returns the next byte (0..255) or EOF, rdint the next int or EOF */
int  rdchar(int fd) { unsigned char c; return read(fd, &c, 1) == 1 ? c : EOF; }
int  rdint(int fd)  { int d; return read(fd, &d, sizeof(d)) == (ssize_t)sizeof(d) ? d : EOF; }
void seekback(int fd, int n) { lseek(fd, -n, SEEK_CUR); }

/* local reimplementations; the prototypes match the <string.h> declarations */
char *strcpy(char *dst, const char *src) {
    char *ret = dst;
    for (; (*dst++ = *src++); );
    return ret;
}
int strcmp(const char *a, const char *b) {
    for (; *a && *a == *b; ++a, ++b);
    return (unsigned char)*a - (unsigned char)*b;
}




/* append a word at the next free slot; encode_word does a binary search,
 * so callers are expected to add words in sorted (strcmp) order          */
int tokenizer_add_word(tokenizer_t *t, char *word) {
    if (t->vocab_size >= MAX_VOCAB_SIZE) return -1;
    if (strlen(word)   >= MAX_WORD_LEN)  return -1;   /* won't fit, including the '\0' */
    strcpy(t->vocab[t->vocab_size], word);
    return t->vocab_size++;
}


/* binary-search the vocabulary for an exact match; returns the token id or -1
 * (assumes vocab[] is kept in sorted strcmp order)                            */
int tokenizer_encode_word(tokenizer_t *t, char *word) {
    int left = 0,
        right = t->vocab_size - 1;

    for (; left <= right; ) {
        int mid = left + (right - left) / 2,
            cmp = strcmp(word, t->vocab[mid]);
        
        if      (cmp == 0) return mid;
        else if (cmp < 0)  right = mid - 1;
        else               left  = mid + 1;
    }

    return -1;
}

/* greedy longest-match: grow the candidate word one byte at a time, remember the
 * longest prefix found in the vocabulary, and advance *stream past that prefix  */
int tokenizer_encode_stream(tokenizer_t *t, char **stream) {
    char word[MAX_WORD_LEN] = {0};
    int id = -1, i = 0, j = 0;

    for (; (*stream)[i] && i < MAX_WORD_LEN - 1; ++i) {
        word[i] = (*stream)[i];

        int tmp = t->encode_word(t, word);
        if (tmp != -1) id = tmp, j = i + 1;
    }

    *stream += j;
    return id;
}


/* same greedy longest-match, but reading bytes from a file descriptor; any bytes
 * read past the matched prefix are pushed back with seekback()                   */
int tokenizer_encode_file(tokenizer_t *t, int fd) {
    char word[MAX_WORD_LEN] = {0};
    int c = EOF, id = -1, i = 0, j = 0;

    for (; (c = rdchar(fd)) != EOF && i < MAX_WORD_LEN - 1; ++i) {
        word[i] = (char)c;

        int tmp = t->encode_word(t, word);
        if (tmp != -1) id = tmp, j = i + 1;
    }

    /* bytes consumed: i stored, plus the one that ended the loop (unless it was EOF) */
    seekback(fd, i + (c != EOF) - j);
    return id;
}


/* map a token id back to its string; out-of-range ids return NULL */
char *tokenizer_decode(tokenizer_t *t, int id) { return (id < 0 || id >= t->vocab_size) ? NULL : t->vocab[id]; }

char *tokenizer_decode_file(tokenizer_t *t, int fd) {
    int id = rdint(fd);
    if (id == EOF) ERROR("read EOF from file\n");

    return t->decode(t, id);
}


void tokenizer_save_vocab(tokenizer_t *t, char *fname) {
    int fd = open(fname, O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) ERROR("failed to open \"%s\"\n", fname);

    int max_len = 0;
    for (int i = 0; i < t->vocab_size; ++i) {
        char *str = t->vocab[i];

        int len = strlen(str),
            n   = write(fd, str, len);

        max_len = len > max_len ? len : max_len;

        if (n != len) ERROR("failed to write to %s, only wrote %d bytes out of %d\n", fname, n, len);

        /* newline delimiter, so load_vocab can split the file back into tokens */
        if (write(fd, "\n", 1) != 1) ERROR("failed to write delimiter to %s\n", fname);
    }

    printf("wrote %d tokens to file \"%s\"\nMax token length was %d\n", t->vocab_size, fname, max_len);
    close(fd);
}

void tokenizer_load_vocab(tokenizer_t *t, char *fname) {
    int fd = open(fname, O_RDONLY);
    if (fd < 0) ERROR("failed to open \"%s\"\n", fname);

    char word[MAX_WORD_LEN] = {0};
    int c, i = 0;

    /* tokens are newline-delimited; overlong tokens are truncated to fit a slot */
    for (; (c = rdchar(fd)) != EOF; ) {
        if (c == '\n') {
            word[i] = '\0';
            t->add_word(t, word);
            i = 0;
        } else if (i < MAX_WORD_LEN - 1) {
            word[i++] = (char)c;
        }
    }
    if (i > 0) { word[i] = '\0'; t->add_word(t, word); }   /* last token without a trailing '\n' */

    close(fd);
}


void tokenizer_save_tokenizer(tokenizer_t *t, char *fname) {
    int fd = open(fname, O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) ERROR("failed to open \"%s\"\n", fname);

    /* dump only the populated rows; load_tokenizer recovers vocab_size from the byte count */
    int len = t->vocab_size * MAX_WORD_LEN,
        n   = write(fd, t->vocab, len);
    if (n != len) ERROR("failed to write to %s, only wrote %d bytes out of %d\n", fname, n, len);

    printf("wrote %d bytes (%d tokens) to \"%s\"\n", n, t->vocab_size, fname);
    close(fd);
}

void tokenizer_load_tokenizer(tokenizer_t *t, char *fname) {
    int fd = open(fname, O_RDONLY);
    if (fd < 0) ERROR("failed to open \"%s\"\n", fname);

    int n = read(fd, t->vocab, sizeof(t->vocab));
    if (n < 0 || n % MAX_WORD_LEN != 0) ERROR("failed to read a whole vocabulary from %s (read %d bytes)\n", fname, n);

    /* each row is a fixed MAX_WORD_LEN bytes, so the byte count gives the token count */
    t->vocab_size = n / MAX_WORD_LEN;

    printf("read %d bytes (%d tokens) from \"%s\"\n", n, t->vocab_size, fname);
    close(fd);
}




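/* single global instance: Tokenizer() re-initializes it, optionally loads a
 * serialized vocabulary, and hands back its address                         */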
tokenizer_t _tokenizer;
tokenizer_t *Tokenizer(char *fname) {
    _tokenizer = (tokenizer_t) {
        .vocab_size       = 0,

        .add_word         = tokenizer_add_word,

        .encode_word      = tokenizer_encode_word,
        .encode_stream    = tokenizer_encode_stream,
        .encode_file      = tokenizer_encode_file,

        .decode           = tokenizer_decode,
        .decode_file      = tokenizer_decode_file,

        .save_vocab       = tokenizer_save_vocab,
        .load_vocab       = tokenizer_load_vocab,

        .save_tokenizer   = tokenizer_save_tokenizer,
        .load_tokenizer   = tokenizer_load_tokenizer
    };

    if (fname) _tokenizer.load_tokenizer(&_tokenizer, fname);

    INFO("vocabulary size: %d (%d max)\n", _tokenizer.vocab_size, MAX_VOCAB_SIZE);
    INFO("max token length: %d\n", MAX_WORD_LEN);
    INFO("size of structure: %ld bytes\n", sizeof(tokenizer_t));

    return &_tokenizer; 
}
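
/*
 * Usage sketch (illustrative only, not part of the header): how the singleton
 * might be driven from a separate .c file. "vocab.txt" and "tok.bin" are
 * hypothetical file names.
 *
 *     #include "tokenizer.h"
 *
 *     int main(void) {
 *         tokenizer_t *t = Tokenizer(NULL);        // start with an empty vocabulary
 *         t->load_vocab(t, "vocab.txt");           // newline-delimited, sorted tokens
 *         t->save_tokenizer(t, "tok.bin");         // binary dump for faster reloads
 *
 *         char *text = "hello world", *p = text;
 *         int id = t->encode_stream(t, &p);        // longest-prefix match, advances p
 *         if (id != -1) printf("%d -> %s\n", id, t->decode(t, id));
 *         return 0;
 *     }
 */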