flopml commited on
Commit
120dee6
·
0 Parent(s):

tracking with lfs

Browse files
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ data/dataset_tinystories-v2_100k-rows.txt filter=lfs diff=lfs merge=lfs -text
2
+ data filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Ignore everything in bin/
2
+
3
+ bin/
4
+ bin/dataset.bin
5
+ bin/tokenizer
6
+ bin/vocab.bin
Makefile ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
CC := gcc
CFLAGS := -O3 -static -march=native -ffast-math #-Wall -Wextra -Werror
CLIBS := -lc

TARGET := bin/tokenizer
SRCS := $(wildcard src/*.c)


all: $(TARGET)

clean:
	rm -f $(TARGET) bin/*

run: all
	./$(TARGET) \
		--dataset_path data/dataset_tinystories-v2_100k-rows.txt \
		--vocab_path data/vocab_tinstories-v2_size-4096_wordlen-12.txt \
		--dataset_output_path bin/dataset_tinystories-v2_100k-rows.bin \
		--tokenizer_output_path bin/tokenizer_tinystories-v2_size-4096_wordlen-12.bin
# NOTE(review): "tinstories" above looks like a typo for "tinystories" —
# confirm against the actual on-disk vocab filename before "fixing" it.


# bin/ is entirely gitignored, so it may not exist on a fresh clone; create it
# before linking into it. Libraries are conventionally placed after objects.
$(TARGET): $(SRCS)
	@mkdir -p $(dir $@)
	$(CC) $(CFLAGS) -o $@ $^ $(CLIBS)


.PHONY: all clean run
README.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Implements the flop tokenizer, a sub-word tokenizer for autoregressive language modeling.
2
+
3
+
4
+ TODO:
5
+ - Better printing during encoding of file and loading / exporting?
6
+ - Include Python script for BPE training
7
+ - Add time to logging during encoding
data/.gitkeep ADDED
File without changes
data/dataset_tinystories-v2_100k-rows.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:407ca81af9dabb73461834d630fa9c1e58d16ae951d0cede99f1b8a3238a214f
3
+ size 80573031
src/.gitkeep ADDED
File without changes
src/main.c ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <stdio.h>
2
+ #include <stdlib.h>
3
+ #include <string.h>
4
+ #include <fcntl.h>
5
+ #include <sys/stat.h>
6
+ #include <unistd.h>
7
+ #include <errno.h>
8
+
9
+
10
+ #define MAX_VOCAB_SIZE 4096
11
+ #define MAX_WORD_LEN 12
12
+
13
+
14
+ #include "tokenizer.h"
15
+ #include "utils.h"
16
+
17
+
18
+
19
+
20
/*
 * Exact string equality: returns 1 iff a and b contain exactly the same
 * characters up to and including the terminating NUL.
 *
 * Bug fixed: the previous version stopped at the first NUL of EITHER string
 * and returned 1, so any string matched any of its own extensions
 * (e.g. strequ("--help", "--helpme") was true). Compare the terminating
 * positions as well.
 */
static int strequ(char *a, char *b) {
	int i = 0;
	for (; a[i] && b[i]; ++i)
		if (a[i] != b[i])
			return 0;

	/* both must end at the same index for true equality */
	return a[i] == b[i];
}
27
+
28
/* Print the usage text for every supported command-line option, then exit(0). */
void help(int argc, char *argv[]) {
	printf("Usage: %s [arg1] [arg2] ... [argN]\n", argv[0]);
	printf("Arguments:\n");

	/* one entry per option; printed verbatim in declaration order */
	static const char *option_lines[] = {
		" --help show this help\n",
		" --dataset_path <str> path to dataset\n",
		" --dataset_output_path <str> path to dataset output\n",
		" --vocab_path <str> path to vocabulary\n",
		" --vocab_output_path <str> path to vocabulary output\n",
		" --tokenizer_path <str> path to tokenizer\n",
		" --tokenizer_output_path <str> path to tokenizer output\n",
	};

	for (size_t k = 0; k < sizeof option_lines / sizeof option_lines[0]; ++k)
		printf("%s", option_lines[k]);

	printf("\n");
	exit(0);
}
41
+
42
+
43
+
44
+
45
+ void tokenizer_print_vocab(tokenizer_t *t) {
46
+ printf("vocab size: %d\n", t->vocab_size);
47
+
48
+ for (int i = 0; i < t->vocab_size; ++i) {
49
+ char *str = t->vocab[i];
50
+ int len = strlen(str);
51
+ printf("id: %d, len: %d, str: %s\n", i, len, str);
52
+ }
53
+ }
54
+
55
/*
 * Encode the whole text file fname[0] and append each token id, as a raw
 * binary int, to fname[1]. Progress is printed every 100000 tokens.
 *
 * fname is a 2-element array: [0] = input path, [1] = output path. The macro
 * below lets callers pass the two paths as plain varargs.
 * Exits via ERROR() if either file cannot be opened.
 */
void tokenize_whole_file(tokenizer_t *t, char *fname[2]) {
	int infd = open(fname[0], O_RDONLY),
	    outfd = open(fname[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (infd < 0) ERROR("failed to open %s\n", fname[0]);
	if (outfd < 0) ERROR("failed to open %s\n", fname[1]);

	for (int i = 0; ; ++i) {
		/* probe one byte to detect end of input, then undo the read */
		if (rdchar(infd) == EOF) break;
		else seekback(infd, 1);

		int id = t->encode_file(t, infd);
		/* no vocabulary entry matched at this offset: skip one byte and retry
		 * (encode_file returns -1, which equals EOF, on a miss) */
		if (id == EOF) { (void)rdchar(infd); continue; }

		/* NOTE(review): write() result is unchecked — a short write would
		 * silently corrupt the output stream; consider verifying it. */
		write(outfd, &id, sizeof(id));

		if (i % 100000 == 0) {
			/* progress report: remember the position, seek to the end to learn
			 * the total size, then restore the position */
			int ncurrent = lseek(infd, 0, SEEK_CUR),
			    ntotal = 0;

			ntotal = lseek(infd, 0, SEEK_END);
			(void) lseek(infd, ncurrent, SEEK_SET);

			printf("iteration %d, encoded %d (%s), proccessed %d of %d (%.2f%%)\n", i, id, t->decode(t, id), ncurrent, ntotal, ((float)ncurrent / ntotal) * 100);

			/* stop near the end of input — only evaluated at progress ticks */
			if (ntotal - ncurrent < 100) break;
		}
	}
	/* NOTE(review): infd/outfd are never closed — harmless for a one-shot
	 * CLI tool, but worth fixing if this is ever called repeatedly. */
}
/* Convenience wrapper: tokenize_whole_file(t, in_path, out_path). */
#define tokenize_whole_file(T, ...) tokenize_whole_file(T, (char*[2]){__VA_ARGS__})
85
+
86
+
87
+
88
+
89
+ int main(int argc, char *argv[]) {
90
+ tokenizer_t *tokenizer = Tokenizer(NULL);
91
+
92
+
93
+ char *dataset_path = "data/dataset.txt",
94
+ *dataset_output_path = "bin/dataset.bin",
95
+ *vocab_path = "data/vocab.txt",
96
+ *vocab_output_path = NULL,
97
+ *tokenizer_path = NULL,
98
+ *tokenizer_output_path = "bin/tokenizer.bin";
99
+
100
+
101
+ for (int i = 1; i < argc; i += 2) {
102
+ if (strequ(argv[i], "--dataset_path")) dataset_path = argv[i + 1], printf("setting dataset path to \"%s\"\n", dataset_path);
103
+ else if (strequ(argv[i], "--dataset_output_path")) dataset_output_path = argv[i + 1], printf("setting dataset output path to \"%s\"\n", dataset_output_path);
104
+ else if (strequ(argv[i], "--vocab_path")) vocab_path = argv[i + 1], printf("setting vocabulary path to \"%s\"\n", vocab_path);
105
+ else if (strequ(argv[i], "--vocab_output_path")) vocab_output_path = argv[i + 1], printf("setting vocabulary output path to \"%s\"\n", vocab_output_path);
106
+ else if (strequ(argv[i], "--tokenizer_path")) tokenizer_path = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n", tokenizer_path);
107
+ else if (strequ(argv[i], "--tokenizer_output_path")) tokenizer_output_path = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n", tokenizer_output_path);
108
+ else if (strequ(argv[i], "--help")) help(argc, argv);
109
+ else ERROR("unknown option \"%s\"\n", argv[i]);
110
+ }
111
+
112
+
113
+ CALLBACK_ON_FILE(vocab_path,
114
+ INFO("loading vocabulary from \"%s\"\n", vocab_path);
115
+ tokenizer->load_vocab(tokenizer, vocab_path);
116
+ tokenizer_print_vocab(tokenizer);
117
+ );
118
+
119
+ CALLBACK_ON_FILE(tokenizer_path,
120
+ INFO("loading tokenizer from \"%s\"\n", tokenizer_path);
121
+ tokenizer->load_tokenizer(tokenizer, tokenizer_path);
122
+ );
123
+
124
+ CALLBACK_ON_FILE(vocab_output_path,
125
+ INFO("exporting vocabulary to \"%s\"\n", vocab_output_path);
126
+ tokenizer->save_vocab(tokenizer, vocab_output_path)
127
+ );
128
+
129
+ CALLBACK_ON_FILE(tokenizer_output_path,
130
+ INFO("exporting tokenizer to \"%s\"\n", tokenizer_output_path);
131
+ tokenizer->save_tokenizer(tokenizer, tokenizer_output_path)
132
+ );
133
+
134
+ CALLBACK_ON_FILE(dataset_path,
135
+ CALLBACK_ON_FILE(dataset_output_path,
136
+ INFO("loading dataset from \"%s\" and exporting it to \"%s\"\n", dataset_path, dataset_output_path);
137
+ tokenize_whole_file(tokenizer, dataset_path, dataset_output_path);
138
+ );
139
+ );
140
+
141
+ return EXIT_SUCCESS;
142
+ }
src/tokenizer.h ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <stdio.h>
4
+ #include <stdlib.h>
5
+ #include <string.h>
6
+ #include <unistd.h>
7
+ #include <fcntl.h>
8
+
9
+
10
+ #include "utils.h"
11
+
12
+
13
+
14
+
15
+ #ifndef MAX_VOCAB_SIZE
16
+ #define MAX_VOCAB_SIZE 32000
17
+ #endif
18
+
19
+ #ifndef MAX_WORD_LEN
20
+ #define MAX_WORD_LEN 16
21
+ #endif
22
+
23
+
24
+
25
+
26
/*
 * tokenizer_t: a fixed-capacity sub-word vocabulary plus function-pointer
 * "methods" (bound to the tokenizer_* functions below by Tokenizer()).
 */
STRUCTURE(tokenizer_t,
    int vocab_size;                            /* number of entries in use */
    char vocab[MAX_VOCAB_SIZE][MAX_WORD_LEN];  /* token strings, indexed by id */

    /* append a word; returns its id, or -1 when the table is full */
    int (*add_word) (tokenizer_t *, char *);

    /* encoders: exact lookup / greedy longest-match from a string or fd;
     * all return the token id, or -1 on no match */
    int (*encode_word) (tokenizer_t *, char *);
    int (*encode_stream) (tokenizer_t *, char **);
    int (*encode_file) (tokenizer_t *, int);

    /* id -> string; decode_file reads a binary id from the fd first */
    char *(*decode) (tokenizer_t *, int);
    char *(*decode_file) (tokenizer_t *, int);

    /* text vocabulary I/O */
    void (*save_vocab) (tokenizer_t *, char *);
    void (*load_vocab) (tokenizer_t *, char *);

    /* raw binary table I/O (whole vocab array) */
    void (*save_tokenizer) (tokenizer_t *, char *);
    void (*load_tokenizer) (tokenizer_t *, char *);
);
45
+
46
+
47
+
48
+
49
/*
 * Read one byte / one int from fd; return EOF on end-of-file or read error.
 *
 * Bug fixed: the old code compared read()'s ssize_t result with sizeof (a
 * size_t), so a -1 error result converted to SIZE_MAX, was NOT "< sizeof",
 * and an uninitialized value was returned as success. Compare against the
 * signed expected count instead.
 *
 * NOTE(review): rdchar's return type is char, so a literal 0xFF input byte is
 * indistinguishable from EOF on signed-char platforms — fine for ASCII text,
 * confirm before feeding binary data. rdint has the same ambiguity for the
 * value -1.
 */
char rdchar(int fd) { char c; return read(fd, &c, sizeof(c)) == (ssize_t)sizeof(c) ? c : EOF; }
int rdint(int fd) { int d; return read(fd, &d, sizeof(d)) == (ssize_t)sizeof(d) ? d : EOF; }
/* Move the file offset n bytes backwards (best effort; lseek errors ignored). */
void seekback(int fd, int n) { lseek(fd, -n, SEEK_CUR); }
52
+
53
/*
 * Minimal libc-style string helpers.
 *
 * NOTE(review): these deliberately shadow the standard strcpy/strcmp, which
 * are reserved names — consider renaming (compiler builtins may bypass them).
 *
 * Bugs fixed:
 *  - strcpy must return the ORIGINAL destination pointer (the standard
 *    contract); the old version returned a pointer past the copied NUL.
 *  - strcmp compares as unsigned char per the C standard, so bytes >= 0x80
 *    order consistently with byte-sorted vocabularies.
 */
char *strcpy(char *dst, const char *src) {
	char *ret = dst;
	for (; (*dst++ = *src++); );
	return ret;
}

int strcmp(const char *a, const char *b) {
	for (; *a && *a == *b; ++a, ++b);
	return (unsigned char)*a - (unsigned char)*b;
}
58
+
59
+
60
+
61
+
62
+ int tokenizer_add_word(tokenizer_t *t, char *word) {
63
+ if (t->vocab_size >= MAX_VOCAB_SIZE) return -1;
64
+ strcpy(t->vocab[t->vocab_size], word);
65
+ return t->vocab_size++;
66
+ }
67
+
68
+
69
+ int tokenizer_encode_word(tokenizer_t *t, char* word) {
70
+ int left = 0,
71
+ right = t->vocab_size - 1;
72
+
73
+ for (; left <= right; ) {
74
+ int mid = left + (right - left) / 2,
75
+ cmp = strcmp(word, t->vocab[mid]);
76
+
77
+ if (cmp == 0) return mid;
78
+ else if (cmp < 0) right = mid - 1;
79
+ else left = mid + 1;
80
+ }
81
+
82
+ return -1;
83
+ }
84
+
85
+ int tokenizer_encode_stream(tokenizer_t *t, char **stream) {
86
+ char word[MAX_WORD_LEN] = {};
87
+ int id = -1, i = 0, j = 0;
88
+
89
+ for (; (*stream)[i] && i < MAX_WORD_LEN; ++i) {
90
+ word[i] = (*stream)[i];
91
+
92
+ int tmp = t->encode_word(t, word);
93
+ if (tmp != -1) id = tmp, j = i + 1;
94
+ }
95
+
96
+ *stream += j;
97
+ return id;
98
+ }
99
+
100
+
101
+ int tokenizer_encode_file(tokenizer_t *t, int fd) {
102
+ char c, word[MAX_WORD_LEN] = {};
103
+ int id = -1, i = 0, j = 0;
104
+
105
+ for (; (c = rdchar(fd)) != EOF && i < MAX_WORD_LEN; ++i) {
106
+ word[i] = c;
107
+
108
+ int tmp = t->encode_word(t, word);
109
+ if (tmp != -1) id = tmp, j = i + 1;
110
+ }
111
+
112
+ seekback(fd, MAX_WORD_LEN - j + 1);
113
+ return id;
114
+ }
115
+
116
+
117
/* Map a token id back to its string. NOTE(review): no bounds check — id is
 * assumed to lie in [0, vocab_size); confirm at call sites. */
char *tokenizer_decode(tokenizer_t *t, int id) { return t->vocab[id]; }
118
+
119
+ char *tokenizer_decode_file(tokenizer_t *t, int fd) {
120
+ int id = rdint(fd);
121
+ if (id == EOF) ERROR("read EOF from file\n");
122
+
123
+ return t->decode(t, id);
124
+ }
125
+
126
+
127
+ void tokenizer_save_vocab(tokenizer_t *t, char *fname) {
128
+ int fd = open(fname, O_WRONLY | O_CREAT | O_TRUNC, 0644);
129
+ if (fd < 0) ERROR("failed to open \"%s\"\n", fname);
130
+
131
+ int max_len = 0;
132
+ for (int i = 0; i < t->vocab_size; ++i) {
133
+ char *str = t->vocab[i];
134
+
135
+ int len = strlen(str),
136
+ n = write(fd, str, len);
137
+
138
+ max_len = len > max_len ? len : max_len;
139
+
140
+ if (n != len) ERROR("failed to write to %s, only wrote %d bytes out of %d\n", fname, n, len);
141
+ }
142
+
143
+ printf("wrote %d tokens to file \"%s\"\nMax token length was %d\n", t->vocab_size, fname, max_len);
144
+ close(fd);
145
+ }
146
+
147
+ void tokenizer_load_vocab(tokenizer_t *t, char *fname) {
148
+ int fd = open(fname, O_RDONLY);
149
+ if (fd < 0) ERROR("failed to open \"%s\"\n", fname);
150
+
151
+ char c, word[MAX_WORD_LEN];
152
+ for (; (c = rdchar(fd)) != EOF; ) {
153
+ for (int i = 0; i < MAX_WORD_LEN; ++i, c = rdchar(fd)) {
154
+ word[i] = c;
155
+
156
+ if (word[i] == EOF || word[i] == '\n') {
157
+ word[i] = '\0';
158
+ break;
159
+ }
160
+ }
161
+
162
+ t->add_word(t, word);
163
+ }
164
+ }
165
+
166
+
167
+ void tokenizer_save_tokenizer(tokenizer_t *t, char *fname) {
168
+ int fd = open(fname, O_WRONLY | O_CREAT | O_TRUNC, 0644);
169
+ if (fd < 0) ERROR("failed to open \"%s\"\n", fname);
170
+
171
+ int n = write(fd, t->vocab, sizeof(t->vocab));
172
+ if (n != sizeof(t->vocab)) ERROR("failed to write to %s, only wrote %d bytes out of %ld\n", fname, n, sizeof(t->vocab));
173
+
174
+ printf("wrote %d bytes (%d tokens) to \"%s\"\n", n, t->vocab_size, fname);
175
+ close(fd);
176
+ }
177
+
178
+ void tokenizer_load_tokenizer(tokenizer_t *t, char *fname) {
179
+ int fd = open(fname, O_RDONLY);
180
+ if (fd < 0) ERROR("failed to open \"%s\"\n", fname);
181
+
182
+ int n = read(fd, t->vocab, sizeof(t->vocab));
183
+ if (n != sizeof(t->vocab)) ERROR("failed to read from %s, only read %d bytes out of %ld\n", fname, n, sizeof(t->vocab));
184
+
185
+ t->vocab_size = n / MAX_WORD_LEN;
186
+
187
+ printf("read %d bytes (%d tokens) from \"%s\"\n", n, t->vocab_size, fname);
188
+ close(fd);
189
+ }
190
+
191
+
192
+
193
+
194
/* The single shared tokenizer instance handed out by Tokenizer().
 * NOTE(review): file-scope identifiers starting with '_' are reserved in C;
 * consider renaming (and making this static — it is in a header). */
tokenizer_t _tokenizer;

/*
 * Construct (re-initialize) the shared tokenizer, binding all method
 * pointers. When fname is non-NULL, the serialized table is loaded from it.
 * Returns a pointer to the shared instance — NOT a fresh allocation; every
 * call resets the same global object.
 */
tokenizer_t *Tokenizer(char *fname) {
	_tokenizer = (tokenizer_t) {
		.vocab_size = 0,

		.add_word = tokenizer_add_word,

		.encode_word = tokenizer_encode_word,
		.encode_stream = tokenizer_encode_stream,
		.encode_file = tokenizer_encode_file,

		.decode = tokenizer_decode,
		.decode_file = tokenizer_decode_file,

		.save_vocab = tokenizer_save_vocab,
		.load_vocab = tokenizer_load_vocab,

		.save_tokenizer = tokenizer_save_tokenizer,
		.load_tokenizer = tokenizer_load_tokenizer
	};

	if (fname) _tokenizer.load_tokenizer(&_tokenizer, fname);

	INFO("vocabulary size: %d (%d max)\n", _tokenizer.vocab_size, MAX_VOCAB_SIZE);
	INFO("max token length: %d\n", MAX_WORD_LEN);
	/* NOTE(review): sizeof is size_t — %ld should be %zu for portability */
	INFO("size of structure: %ld bytes\n", sizeof(tokenizer_t));

	return &_tokenizer;
}
src/utils.h ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once



/* Declare `typedef struct NAME NAME;` together with its member list. */
#define STRUCTURE(NAME, ...) \
	typedef struct NAME NAME; struct NAME { __VA_ARGS__ }



/* Pause until a byte arrives on stdin (used after warnings). */
#define HOLD() do { (void)fgetc(stdin); } while (0)



/*
 * Core logger: "<file>:<line>: <message>" in the given ANSI color, flushed
 * immediately. All of these macros are wrapped in do { } while (0) so that
 * `if (x) ERROR(...); else ...` parses correctly — the old comma-expression
 * and bare-brace forms were unsafe after an unbraced if/else.
 */
#define LOG(FD, COLOR, ...) \
	do { \
		fprintf(FD, COLOR); \
		fprintf(FD, "%s:%d: ", __FILE__, __LINE__); \
		fprintf(FD, __VA_ARGS__); \
		fprintf(FD, "\033[0m"); \
		fflush(FD); \
	} while (0)

#define COMMENT(...) LOG(stdout, "\033[38;5;78m", __VA_ARGS__)
#define WARNING(...) do { LOG(stdout, "\033[38;5;208m", __VA_ARGS__); HOLD(); } while (0)
#define INFO(...)    LOG(stdout, "\033[38;5;228m", __VA_ARGS__)
#define ERROR(...)   do { LOG(stderr, "\033[38;5;196m", __VA_ARGS__); exit(EXIT_FAILURE); } while (0)



/* Run the statements in __VA_ARGS__ only when FNAME is non-NULL; otherwise
 * print a warning naming the skipped variable. */
#define CALLBACK_ON_FILE(FNAME, ...) \
	do { \
		if ((FNAME) != NULL) { __VA_ARGS__; } \
		else WARNING("\"%s\" not set, skipping", #FNAME); \
	} while (0)