Commit 120dee6 · 0 parent(s)

tracking with lfs

Changed files:
- .gitattributes +2 -0
- .gitignore +6 -0
- Makefile +29 -0
- README.md +7 -0
- data/.gitkeep +0 -0
- data/dataset_tinystories-v2_100k-rows.txt +3 -0
- src/.gitkeep +0 -0
- src/main.c +142 -0
- src/tokenizer.h +222 -0
- src/utils.h +28 -0
.gitattributes
ADDED
@@ -0,0 +1,2 @@
+data/dataset_tinystories-v2_100k-rows.txt filter=lfs diff=lfs merge=lfs -text
+data filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,6 @@
+# Ignore everything in bin/
+
+bin/
+bin/dataset.bin
+bin/tokenizer
+bin/vocab.bin
Makefile
ADDED
@@ -0,0 +1,29 @@
+CC := gcc
+CFLAGS := -O3 -static -march=native -ffast-math #-Wall -Wextra -Werror
+CLIBS := -lc
+
+TARGET := bin/tokenizer
+SRCS := $(wildcard src/*.c)
+
+
+
+all: $(TARGET)
+
+clean:
+	rm -f $(TARGET) bin/*
+
+run: all
+	./$(TARGET) \
+		--dataset_path data/dataset_tinystories-v2_100k-rows.txt \
+		--vocab_path data/vocab_tinstories-v2_size-4096_wordlen-12.txt \
+		--dataset_output_path bin/dataset_tinystories-v2_100k-rows.bin \
+		--tokenizer_output_path bin/tokenizer_tinystories-v2_size-4096_wordlen-12.bin
+
+
+
+$(TARGET): $(SRCS)
+	$(CC) $(CFLAGS) $(CLIBS) -o $@ $^
+
+
+
+.PHONY: all clean run
README.md
ADDED
@@ -0,0 +1,7 @@
+Implements the flop tokenizer, a sub-word tokenizer for autoregressive language modeling.
+
+
+TODO:
+- Better printing during file encoding and during loading / exporting?
+- Include a Python script for BPE training
+- Add timing to the log output during encoding
data/.gitkeep
ADDED
File without changes
data/dataset_tinystories-v2_100k-rows.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:407ca81af9dabb73461834d630fa9c1e58d16ae951d0cede99f1b8a3238a214f
+size 80573031
src/.gitkeep
ADDED
File without changes
src/main.c
ADDED
@@ -0,0 +1,142 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <errno.h>
+
+
+#define MAX_VOCAB_SIZE 4096
+#define MAX_WORD_LEN 12
+
+
+#include "tokenizer.h"
+#include "utils.h"
+
+
+
+
+static int strequ(char *a, char *b) {
+    for (int i = 0; a[i] && b[i]; ++i)
+        if (a[i] != b[i])
+            return 0;
+
+    return 1;
+}
+
+void help(int argc, char *argv[]) {
+    printf("Usage: %s [arg1] [arg2] ... [argN]\n", argv[0]);
+    printf("Arguments:\n");
+    printf("  --help                         show this help\n");
+    printf("  --dataset_path <str>           path to dataset\n");
+    printf("  --dataset_output_path <str>    path to dataset output\n");
+    printf("  --vocab_path <str>             path to vocabulary\n");
+    printf("  --vocab_output_path <str>      path to vocabulary output\n");
+    printf("  --tokenizer_path <str>         path to tokenizer\n");
+    printf("  --tokenizer_output_path <str>  path to tokenizer output\n");
+    printf("\n");
+    exit(0);
+}
+
+
+
+
+void tokenizer_print_vocab(tokenizer_t *t) {
+    printf("vocab size: %d\n", t->vocab_size);
+
+    for (int i = 0; i < t->vocab_size; ++i) {
+        char *str = t->vocab[i];
+        int len = strlen(str);
+        printf("id: %d, len: %d, str: %s\n", i, len, str);
+    }
+}
+
+void tokenize_whole_file(tokenizer_t *t, char *fname[2]) {
+    int infd = open(fname[0], O_RDONLY),
+        outfd = open(fname[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
+
+    if (infd < 0) ERROR("failed to open %s\n", fname[0]);
+    if (outfd < 0) ERROR("failed to open %s\n", fname[1]);
+
+    for (int i = 0; ; ++i) {
+        if (rdchar(infd) == EOF) break;
+        else seekback(infd, 1);
+
+        int id = t->encode_file(t, infd);
+        if (id == EOF) { (void)rdchar(infd); continue; }
+
+        write(outfd, &id, sizeof(id));
+
+        if (i % 100000 == 0) {
+            int ncurrent = lseek(infd, 0, SEEK_CUR),
+                ntotal = 0;
+
+            ntotal = lseek(infd, 0, SEEK_END);
+            (void) lseek(infd, ncurrent, SEEK_SET);
+
+            printf("iteration %d, encoded %d (%s), processed %d of %d (%.2f%%)\n", i, id, t->decode(t, id), ncurrent, ntotal, ((float)ncurrent / ntotal) * 100);
+
+            if (ntotal - ncurrent < 100) break;
+        }
+    }
+}
+#define tokenize_whole_file(T, ...) tokenize_whole_file(T, (char*[2]){__VA_ARGS__})
+
+
+
+
+int main(int argc, char *argv[]) {
+    tokenizer_t *tokenizer = Tokenizer(NULL);
+
+
+    char *dataset_path = "data/dataset.txt",
+         *dataset_output_path = "bin/dataset.bin",
+         *vocab_path = "data/vocab.txt",
+         *vocab_output_path = NULL,
+         *tokenizer_path = NULL,
+         *tokenizer_output_path = "bin/tokenizer.bin";
+
+
+    for (int i = 1; i < argc; i += 2) {
+        if (strequ(argv[i], "--dataset_path")) dataset_path = argv[i + 1], printf("setting dataset path to \"%s\"\n", dataset_path);
+        else if (strequ(argv[i], "--dataset_output_path")) dataset_output_path = argv[i + 1], printf("setting dataset output path to \"%s\"\n", dataset_output_path);
+        else if (strequ(argv[i], "--vocab_path")) vocab_path = argv[i + 1], printf("setting vocabulary path to \"%s\"\n", vocab_path);
+        else if (strequ(argv[i], "--vocab_output_path")) vocab_output_path = argv[i + 1], printf("setting vocabulary output path to \"%s\"\n", vocab_output_path);
+        else if (strequ(argv[i], "--tokenizer_path")) tokenizer_path = argv[i + 1], printf("setting tokenizer path to \"%s\"\n", tokenizer_path);
+        else if (strequ(argv[i], "--tokenizer_output_path")) tokenizer_output_path = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n", tokenizer_output_path);
+        else if (strequ(argv[i], "--help")) help(argc, argv);
+        else ERROR("unknown option \"%s\"\n", argv[i]);
+    }
+
+
+    CALLBACK_ON_FILE(vocab_path,
+        INFO("loading vocabulary from \"%s\"\n", vocab_path);
+        tokenizer->load_vocab(tokenizer, vocab_path);
+        tokenizer_print_vocab(tokenizer);
+    );
+
+    CALLBACK_ON_FILE(tokenizer_path,
+        INFO("loading tokenizer from \"%s\"\n", tokenizer_path);
+        tokenizer->load_tokenizer(tokenizer, tokenizer_path);
+    );
+
+    CALLBACK_ON_FILE(vocab_output_path,
+        INFO("exporting vocabulary to \"%s\"\n", vocab_output_path);
+        tokenizer->save_vocab(tokenizer, vocab_output_path)
+    );
+
+    CALLBACK_ON_FILE(tokenizer_output_path,
+        INFO("exporting tokenizer to \"%s\"\n", tokenizer_output_path);
+        tokenizer->save_tokenizer(tokenizer, tokenizer_output_path)
+    );
+
+    CALLBACK_ON_FILE(dataset_path,
+        CALLBACK_ON_FILE(dataset_output_path,
+            INFO("loading dataset from \"%s\" and exporting it to \"%s\"\n", dataset_path, dataset_output_path);
+            tokenize_whole_file(tokenizer, dataset_path, dataset_output_path);
+        );
+    );
+
+    return EXIT_SUCCESS;
+}
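
Note the trick around `tokenize_whole_file`: the function takes a `char *[2]`, and a function-like macro defined immediately after it rewrites later call sites to pack a variable argument list into a compound-literal array. A minimal sketch of the same pattern in isolation (illustrative only, not part of this commit):

#include <stdio.h>

void print_pair(char *s[2]) { printf("%s %s\n", s[0], s[1]); }

/* Defined after the function, so the definition above is untouched; a
 * function-like macro does not re-expand its own name, so the expansion
 * calls the real function. */
#define print_pair(...) print_pair((char*[2]){__VA_ARGS__})

int main(void) {
    print_pair("hello", "world");  /* expands to print_pair((char*[2]){"hello", "world"}) */
    return 0;
}
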
src/tokenizer.h
ADDED
@@ -0,0 +1,222 @@
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+
+#include "utils.h"
+
+
+
+
+#ifndef MAX_VOCAB_SIZE
+#define MAX_VOCAB_SIZE 32000
+#endif
+
+#ifndef MAX_WORD_LEN
+#define MAX_WORD_LEN 16
+#endif
+
+
+
+
+STRUCTURE(tokenizer_t,
+    int vocab_size;
+    char vocab[MAX_VOCAB_SIZE][MAX_WORD_LEN];
+
+    int (*add_word) (tokenizer_t *, char *);
+
+    int (*encode_word) (tokenizer_t *, char *);
+    int (*encode_stream) (tokenizer_t *, char **);
+    int (*encode_file) (tokenizer_t *, int);
+
+    char *(*decode) (tokenizer_t *, int);
+    char *(*decode_file) (tokenizer_t *, int);
+
+    void (*save_vocab) (tokenizer_t *, char *);
+    void (*load_vocab) (tokenizer_t *, char *);
+
+    void (*save_tokenizer) (tokenizer_t *, char *);
+    void (*load_tokenizer) (tokenizer_t *, char *);
+);
+
+
+
+
+char rdchar(int fd) { char c; return read(fd, &c, sizeof(c)) < sizeof(c) ? EOF : c; }
+int rdint(int fd) { int d; return read(fd, &d, sizeof(d)) < sizeof(d) ? EOF : d; }
+void seekback(int fd, int n) { lseek(fd, -n, SEEK_CUR); }
+
+char *strcpy(char *dst, const char *src) { for (; (*dst++ = *src++); ); return dst; }
+int strcmp(const char *a, const char *b) {
+    for (; *a && *a == *b; ++a, ++b);
+    return *a - *b;
+}
+
+
+
+
+int tokenizer_add_word(tokenizer_t *t, char *word) {
+    if (t->vocab_size >= MAX_VOCAB_SIZE) return -1;
+    strcpy(t->vocab[t->vocab_size], word);
+    return t->vocab_size++;
+}
+
+
+int tokenizer_encode_word(tokenizer_t *t, char* word) {
+    int left = 0,
+        right = t->vocab_size - 1;
+
+    for (; left <= right; ) {
+        int mid = left + (right - left) / 2,
+            cmp = strcmp(word, t->vocab[mid]);
+
+        if (cmp == 0) return mid;
+        else if (cmp < 0) right = mid - 1;
+        else left = mid + 1;
+    }
+
+    return -1;
+}
+
+int tokenizer_encode_stream(tokenizer_t *t, char **stream) {
+    char word[MAX_WORD_LEN] = {};
+    int id = -1, i = 0, j = 0;
+
+    for (; (*stream)[i] && i < MAX_WORD_LEN; ++i) {
+        word[i] = (*stream)[i];
+
+        int tmp = t->encode_word(t, word);
+        if (tmp != -1) id = tmp, j = i + 1;
+    }
+
+    *stream += j;
+    return id;
+}
+
+
+int tokenizer_encode_file(tokenizer_t *t, int fd) {
+    char c, word[MAX_WORD_LEN] = {};
+    int id = -1, i = 0, j = 0;
+
+    for (; (c = rdchar(fd)) != EOF && i < MAX_WORD_LEN; ++i) {
+        word[i] = c;
+
+        int tmp = t->encode_word(t, word);
+        if (tmp != -1) id = tmp, j = i + 1;
+    }
+
+    seekback(fd, MAX_WORD_LEN - j + 1);
+    return id;
+}
+
+
+char *tokenizer_decode(tokenizer_t *t, int id) { return t->vocab[id]; }
+
+char *tokenizer_decode_file(tokenizer_t *t, int fd) {
+    int id = rdint(fd);
+    if (id == EOF) ERROR("read EOF from file\n");
+
+    return t->decode(t, id);
+}
+
+
+void tokenizer_save_vocab(tokenizer_t *t, char *fname) {
+    int fd = open(fname, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+    if (fd < 0) ERROR("failed to open \"%s\"\n", fname);
+
+    int max_len = 0;
+    for (int i = 0; i < t->vocab_size; ++i) {
+        char *str = t->vocab[i];
+
+        int len = strlen(str),
+            n = write(fd, str, len);
+
+        max_len = len > max_len ? len : max_len;
+
+        if (n != len) ERROR("failed to write to %s, only wrote %d bytes out of %d\n", fname, n, len);
+    }
+
+    printf("wrote %d tokens to file \"%s\"\nMax token length was %d\n", t->vocab_size, fname, max_len);
+    close(fd);
+}
+
+void tokenizer_load_vocab(tokenizer_t *t, char *fname) {
+    int fd = open(fname, O_RDONLY);
+    if (fd < 0) ERROR("failed to open \"%s\"\n", fname);
+
+    char c, word[MAX_WORD_LEN];
+    for (; (c = rdchar(fd)) != EOF; ) {
+        for (int i = 0; i < MAX_WORD_LEN; ++i, c = rdchar(fd)) {
+            word[i] = c;
+
+            if (word[i] == EOF || word[i] == '\n') {
+                word[i] = '\0';
+                break;
+            }
+        }
+
+        t->add_word(t, word);
+    }
+}
+
+
+void tokenizer_save_tokenizer(tokenizer_t *t, char *fname) {
+    int fd = open(fname, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+    if (fd < 0) ERROR("failed to open \"%s\"\n", fname);
+
+    int n = write(fd, t->vocab, sizeof(t->vocab));
+    if (n != sizeof(t->vocab)) ERROR("failed to write to %s, only wrote %d bytes out of %ld\n", fname, n, sizeof(t->vocab));
+
+    printf("wrote %d bytes (%d tokens) to \"%s\"\n", n, t->vocab_size, fname);
+    close(fd);
+}
+
+void tokenizer_load_tokenizer(tokenizer_t *t, char *fname) {
+    int fd = open(fname, O_RDONLY);
+    if (fd < 0) ERROR("failed to open \"%s\"\n", fname);
+
+    int n = read(fd, t->vocab, sizeof(t->vocab));
+    if (n != sizeof(t->vocab)) ERROR("failed to read from %s, only read %d bytes out of %ld\n", fname, n, sizeof(t->vocab));
+
+    t->vocab_size = n / MAX_WORD_LEN;
+
+    printf("read %d bytes (%d tokens) from \"%s\"\n", n, t->vocab_size, fname);
+    close(fd);
+}
+
+
+
+
+tokenizer_t _tokenizer;
+tokenizer_t *Tokenizer(char *fname) {
+    _tokenizer = (tokenizer_t) {
+        .vocab_size = 0,
+
+        .add_word = tokenizer_add_word,
+
+        .encode_word = tokenizer_encode_word,
+        .encode_stream = tokenizer_encode_stream,
+        .encode_file = tokenizer_encode_file,
+
+        .decode = tokenizer_decode,
+        .decode_file = tokenizer_decode_file,
+
+        .save_vocab = tokenizer_save_vocab,
+        .load_vocab = tokenizer_load_vocab,
+
+        .save_tokenizer = tokenizer_save_tokenizer,
+        .load_tokenizer = tokenizer_load_tokenizer
+    };
+
+    if (fname) _tokenizer.load_tokenizer(&_tokenizer, fname);
+
+    INFO("vocabulary size: %d (%d max)\n", _tokenizer.vocab_size, MAX_VOCAB_SIZE);
+    INFO("max token length: %d\n", MAX_WORD_LEN);
+    INFO("size of structure: %ld bytes\n", sizeof(tokenizer_t));
+
+    return &_tokenizer;
+}
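
Since `tokenizer_encode_word` binary-searches `t->vocab`, lookups only succeed if the vocabulary is stored in lexicographic order; `encode_stream` and `encode_file` then grow a candidate word one character at a time and keep the longest prefix that matched. A minimal usage sketch against this header (illustrative only, not part of this commit; it assumes words are inserted already sorted):

#define MAX_VOCAB_SIZE 4096
#define MAX_WORD_LEN 12
#include "tokenizer.h"

int main(void) {
    tokenizer_t *t = Tokenizer(NULL);  /* no tokenizer file: start empty */

    /* add_word assigns ids in insertion order and encode_word binary-searches,
     * so the words are inserted here in lexicographic order */
    t->add_word(t, "he");
    t->add_word(t, "hello");
    t->add_word(t, "lo");

    char *stream = "hellolo";
    int id = t->encode_stream(t, &stream);  /* longest matching prefix: "hello" */
    printf("id=%d token=%s remaining=%s\n", id, t->decode(t, id), stream);
    return 0;
}
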
src/utils.h
ADDED
@@ -0,0 +1,28 @@
+#pragma once
+
+
+
+
+#define STRUCTURE(NAME, ...) \
+    typedef struct NAME NAME; struct NAME { __VA_ARGS__ }
+
+
+
+
+#define HOLD() (void)fgetc(stdin);
+
+
+
+
+#define LOG(FD, COLOR, ...) fprintf(FD, COLOR), fprintf(FD, "%s:%d: ", __FILE__, __LINE__), fprintf(FD, __VA_ARGS__), fprintf(FD, "\033[0m"), fflush(FD)
+#define COMMENT(...) LOG(stdout, "\033[38;5;78m", __VA_ARGS__)
+#define WARNING(...) LOG(stdout, "\033[38;5;208m", __VA_ARGS__), HOLD()
+#define INFO(...)    LOG(stdout, "\033[38;5;228m", __VA_ARGS__)
+#define ERROR(...) { LOG(stderr, "\033[38;5;196m", __VA_ARGS__); exit(EXIT_FAILURE); }
+
+
+
+
+#define CALLBACK_ON_FILE(FNAME, ...) \
+    if (FNAME != NULL) { __VA_ARGS__; } \
+    else WARNING("\"%s\" not set, skipping", #FNAME);