#pragma once

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "utils.h"


/* maximum number of vocabulary entries */
#ifndef MAX_VOCAB_SIZE
#define MAX_VOCAB_SIZE 32000
#endif

/* fixed size of each vocab slot; tokens should stay below this so the
 * terminating '\0' always fits */
#ifndef MAX_WORD_LEN
#define MAX_WORD_LEN 16
#endif


STRUCTURE(tokenizer_t,
    int vocab_size;
    char vocab[MAX_VOCAB_SIZE][MAX_WORD_LEN];

    /* append a token and get its id back (-1 if the table is full) */
    int (*add_word) (tokenizer_t *, char *);

    /* encoding: exact word lookup, plus greedy longest-match over a string or a file */
    int (*encode_word) (tokenizer_t *, char *);
    int (*encode_stream) (tokenizer_t *, char **);
    int (*encode_file) (tokenizer_t *, int);

    /* decoding: token id back to its string */
    char *(*decode) (tokenizer_t *, int);
    char *(*decode_file) (tokenizer_t *, int);

    /* plain-text vocabulary, one token per line */
    void (*save_vocab) (tokenizer_t *, char *);
    void (*load_vocab) (tokenizer_t *, char *);

    /* binary snapshot of the whole vocab table */
    void (*save_tokenizer) (tokenizer_t *, char *);
    void (*load_tokenizer) (tokenizer_t *, char *);
);
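
/*
 * The struct carries its methods as function pointers, so everything is called
 * through the instance itself.  A rough sketch of the calling convention (the
 * instance is assumed to come from the Tokenizer() constructor defined at the
 * bottom of this file):
 *
 *     tokenizer_t *t = Tokenizer(NULL);
 *     int id = t->add_word(t, "hello");
 *     printf("%s -> %d\n", t->decode(t, id), id);
 */
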
/* getchar-style readers: return the next char / int from fd, or EOF on
 * end-of-file or short read */
int rdchar(int fd) { unsigned char c; return read(fd, &c, sizeof(c)) != sizeof(c) ? EOF : (int)c; }
int rdint(int fd) { int d; return read(fd, &d, sizeof(d)) != sizeof(d) ? EOF : d; }

/* step the file offset back by n bytes */
void seekback(int fd, int n) { lseek(fd, -n, SEEK_CUR); }

/* minimal local reimplementations of strcpy/strcmp */
char *strcpy(char *dst, const char *src) { char *ret = dst; for (; (*dst++ = *src++); ); return ret; }
int strcmp(const char *a, const char *b) {
    for (; *a && *a == *b; ++a, ++b);
    return *a - *b;
}


/* append `word` (at most MAX_WORD_LEN - 1 chars) to the end of the vocabulary
 * and return its id, or -1 if the table is full */
int tokenizer_add_word(tokenizer_t *t, char *word) {
    if (t->vocab_size >= MAX_VOCAB_SIZE) return -1;
    strcpy(t->vocab[t->vocab_size], word);
    return t->vocab_size++;
}

/* binary search for `word`; this assumes the vocabulary is kept sorted */
int tokenizer_encode_word(tokenizer_t *t, char *word) {
    int left = 0,
        right = t->vocab_size - 1;

    for (; left <= right; ) {
        int mid = left + (right - left) / 2,
            cmp = strcmp(word, t->vocab[mid]);

        if (cmp == 0) return mid;
        else if (cmp < 0) right = mid - 1;
        else left = mid + 1;
    }

    return -1;
}

/* greedy longest-match encode over a string: grow the candidate prefix one char
 * at a time, remember the longest prefix that is a vocabulary entry, advance
 * *stream past it, and return its id (-1 if nothing matched) */
int tokenizer_encode_stream(tokenizer_t *t, char **stream) {
    char word[MAX_WORD_LEN] = {0};
    int id = -1, i = 0, j = 0;

    /* stop one short of MAX_WORD_LEN so `word` always stays '\0'-terminated */
    for (; (*stream)[i] && i < MAX_WORD_LEN - 1; ++i) {
        word[i] = (*stream)[i];

        int tmp = t->encode_word(t, word);
        if (tmp != -1) id = tmp, j = i + 1;
    }

    *stream += j;
    return id;
}
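
/*
 * Illustrative sketch of driving encode_stream: it consumes one token per call
 * and advances the caller's pointer, so a whole string is tokenized by looping
 * until it returns -1 (the `t`, `ids` and `count` variables are hypothetical
 * caller-side names, not part of this header):
 *
 *     char *p = "some input text";
 *     int ids[128], count = 0;
 *     for (int id; (id = t->encode_stream(t, &p)) != -1; ) ids[count++] = id;
 */
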
/* same greedy longest-match, reading characters from a file descriptor; the
 * bytes read past the matched token are seeked back so the next call resumes
 * right after the match */
int tokenizer_encode_file(tokenizer_t *t, int fd) {
    char word[MAX_WORD_LEN] = {0};
    int c, id = -1, i = 0, j = 0;

    for (; i < MAX_WORD_LEN - 1 && (c = rdchar(fd)) != EOF; ++i) {
        word[i] = c;

        int tmp = t->encode_word(t, word);
        if (tmp != -1) id = tmp, j = i + 1;
    }

    seekback(fd, i - j);    /* i chars consumed, the match covered the first j */
    return id;
}

/* id -> token string; no bounds checking, the caller must pass a valid id */
char *tokenizer_decode(tokenizer_t *t, int id) { return t->vocab[id]; }

/* read one token id (a raw int) from fd and decode it */
char *tokenizer_decode_file(tokenizer_t *t, int fd) {
    int id = rdint(fd);
    if (id == EOF) ERROR("read EOF from file\n");

    return t->decode(t, id);
}

/* write the vocabulary as plain text, one token per line (the format
 * tokenizer_load_vocab reads back) */
void tokenizer_save_vocab(tokenizer_t *t, char *fname) {
    int fd = open(fname, O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) ERROR("failed to open \"%s\"\n", fname);

    int max_len = 0;
    for (int i = 0; i < t->vocab_size; ++i) {
        char *str = t->vocab[i];

        int len = strlen(str),
            n = write(fd, str, len);

        max_len = len > max_len ? len : max_len;

        if (n != len) ERROR("failed to write to %s, only wrote %d bytes out of %d\n", fname, n, len);

        write(fd, "\n", 1);    /* newline separator so load_vocab can split entries */
    }

    printf("wrote %d tokens to file \"%s\"\nMax token length was %d\n", t->vocab_size, fname, max_len);
    close(fd);
}

/* read a newline-separated vocabulary file and append each token; the file
 * should already be sorted if encode_word's binary search is to be used */
void tokenizer_load_vocab(tokenizer_t *t, char *fname) {
    int fd = open(fname, O_RDONLY);
    if (fd < 0) ERROR("failed to open \"%s\"\n", fname);

    int c;
    char word[MAX_WORD_LEN];
    for (; (c = rdchar(fd)) != EOF; ) {
        for (int i = 0; i < MAX_WORD_LEN; ++i, c = rdchar(fd)) {
            if (c == EOF || c == '\n') {
                word[i] = '\0';
                break;
            }

            word[i] = c;
        }

        t->add_word(t, word);
    }

    close(fd);
}

/* dump the whole fixed-size vocab table as one binary blob */
void tokenizer_save_tokenizer(tokenizer_t *t, char *fname) {
    int fd = open(fname, O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) ERROR("failed to open \"%s\"\n", fname);

    int n = write(fd, t->vocab, sizeof(t->vocab));
    if (n != sizeof(t->vocab)) ERROR("failed to write to %s, only wrote %d bytes out of %zu\n", fname, n, sizeof(t->vocab));

    printf("wrote %d bytes (%d tokens) to \"%s\"\n", n, t->vocab_size, fname);
    close(fd);
}

/* read a binary snapshot back; the snapshot always holds the full table with
 * unused slots zeroed, so the actual vocabulary size is recovered by counting
 * the non-empty entries at the front */
void tokenizer_load_tokenizer(tokenizer_t *t, char *fname) {
    int fd = open(fname, O_RDONLY);
    if (fd < 0) ERROR("failed to open \"%s\"\n", fname);

    int n = read(fd, t->vocab, sizeof(t->vocab));
    if (n != sizeof(t->vocab)) ERROR("failed to read from %s, only read %d bytes out of %zu\n", fname, n, sizeof(t->vocab));

    t->vocab_size = 0;
    for (int i = 0; i < MAX_VOCAB_SIZE && t->vocab[i][0]; ++i) ++t->vocab_size;

    printf("read %d bytes (%d tokens) from \"%s\"\n", n, t->vocab_size, fname);
    close(fd);
}


/* build (or rebuild) the single shared tokenizer instance, optionally loading a
 * binary snapshot, and return a pointer to it */
tokenizer_t _tokenizer;
tokenizer_t *Tokenizer(char *fname) {
    _tokenizer = (tokenizer_t) {
        .vocab_size = 0,

        .add_word = tokenizer_add_word,

        .encode_word = tokenizer_encode_word,
        .encode_stream = tokenizer_encode_stream,
        .encode_file = tokenizer_encode_file,

        .decode = tokenizer_decode,
        .decode_file = tokenizer_decode_file,

        .save_vocab = tokenizer_save_vocab,
        .load_vocab = tokenizer_load_vocab,

        .save_tokenizer = tokenizer_save_tokenizer,
        .load_tokenizer = tokenizer_load_tokenizer
    };

    if (fname) _tokenizer.load_tokenizer(&_tokenizer, fname);

    INFO("vocabulary size: %d (%d max)\n", _tokenizer.vocab_size, MAX_VOCAB_SIZE);
    INFO("max token length: %d\n", MAX_WORD_LEN);
    INFO("size of structure: %zu bytes\n", sizeof(tokenizer_t));

    return &_tokenizer;
}
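
/*
 * Minimal end-to-end usage sketch, assuming this header is saved as
 * "tokenizer.h" and that utils.h provides the STRUCTURE/ERROR/INFO macros used
 * above; the file names are placeholders, and the vocabulary file is expected
 * to be sorted so encode_word's binary search works:
 *
 *     #include "tokenizer.h"
 *
 *     int main(void) {
 *         tokenizer_t *t = Tokenizer(NULL);       // fresh, empty tokenizer
 *         t->load_vocab(t, "vocab.txt");          // newline-separated tokens
 *
 *         int fd = open("input.txt", O_RDONLY);
 *         for (int id; (id = t->encode_file(t, fd)) != -1; )
 *             printf("%d -> %s\n", id, t->decode(t, id));
 *         close(fd);
 *
 *         t->save_tokenizer(t, "tokenizer.bin");  // reload later with Tokenizer("tokenizer.bin")
 *         return 0;
 *     }
 */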