|
|
#pragma once |
|
|
|
|
|
|
|
|
#include <stdint.h> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* First byte of the tokenizer vocabulary blob linked into the binary.
 * Symbols of this form are emitted by the linker/objcopy when
 * tokenizer.bin is embedded as raw data (e.g. `ld -r -b binary`). */
extern char _binary_tokenizer_bin_start[];

/* One past the last byte of the embedded vocabulary blob. */
extern char _binary_tokenizer_bin_end[];

/* Size of TokenizerDecode's output buffer, including the trailing NUL.
 * The decoder emits at most MAX_WORD_LEN - 1 bytes of token text. */
#define MAX_WORD_LEN 24
|
|
|
|
|
/* One vocabulary entry: a single byte appended to a prefix token.
 * Packed so the in-memory layout is exactly 3 bytes, matching the
 * on-disk record format of the embedded tokenizer.bin blob. */
typedef struct __attribute__((packed)) token_t {

/* The byte this token appends to its prefix. */
uint8_t byte;

/* Vocab index of the prefix token; 0 acts as the "no prefix" sentinel
 * (see TokenizerDecode, which stops walking the chain at prev == 0). */
uint16_t prev;

} token_t;
|
|
|
|
|
|
|
|
typedef struct Tokenizer Tokenizer;

/* Function-pointer "object" bundling the vocab table with its ops. */
struct Tokenizer {

/* Vocabulary table; points directly into the embedded binary blob. */
token_t *vocab;

/* Returns the number of entries in `vocab`. */
uint16_t (*get_vocab_size) (void);

/* Returns the index of the token with the given (byte, prev) pair,
 * or 0 if no such token exists. */
uint16_t (*find) (Tokenizer *, uint8_t, uint16_t);

/* Greedily consumes bytes from *text (advancing the pointer) and
 * returns the last token matched; 0 if the first byte has no token. */
uint16_t (*encode) (Tokenizer *, uint8_t **);

/* Returns the NUL-terminated byte string for a token. NOTE(review):
 * the implementation returns a pointer into a static buffer — the
 * result is overwritten by the next call and is not reentrant. */
uint8_t *(*decode) (Tokenizer *, uint16_t);

};
|
|
|
|
|
|
|
|
static uint16_t TokenizerGetVocabSize() { return (_binary_tokenizer_bin_end - _binary_tokenizer_bin_start) / 3; } |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static uint16_t TokenizerFind(Tokenizer *tokenizer, uint8_t byte, uint16_t prev) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (uint16_t i = prev; i < tokenizer->get_vocab_size(); ++i) |
|
|
if (tokenizer->vocab[i].byte == byte && tokenizer->vocab[i].prev == prev) |
|
|
return i; |
|
|
|
|
|
return 0; |
|
|
} |
|
|
|
|
|
|
|
|
/* Greedily extends a token match one byte at a time.
 * Consumes bytes from *seed_text (advancing the caller's pointer past
 * every byte that extended the match) and returns the last token found.
 * Returns 0 when the very first byte matches no token. On return,
 * *seed_text points at the first unconsumed byte. */
static uint16_t TokenizerEncode(Tokenizer *tokenizer, uint8_t **seed_text) {

uint16_t token = 0;

while (**seed_text) {

uint16_t extended = tokenizer->find(tokenizer, **seed_text, token);

/* No longer match exists; stop without consuming this byte. */
if (!extended)

break;

token = extended;

++*seed_text;

}

return token;

}
|
|
|
|
|
|
|
|
/* Reconstructs a token's byte string by walking the prev-chain backwards.
 * Returns a NUL-terminated string; for token 0 the result is "".
 * NOTE(review): the result points into a static buffer — it is
 * overwritten on the next call and this function is not reentrant. */
static uint8_t *TokenizerDecode(Tokenizer *tokenizer, uint16_t token) {

/* Shared output buffer; last slot permanently holds the terminator. */
static uint8_t dest[MAX_WORD_LEN + 1];

dest[MAX_WORD_LEN] = '\0';

uint16_t prev = token;

/* Fill from the back: bytes come out last-first as we walk the chain. */
uint16_t i = MAX_WORD_LEN - 1;

/* Stops at prev == 0 (chain sentinel) or when the buffer is full.
 * `i > 0` (not >= 0) guards the unsigned underflow of --i, so at most
 * MAX_WORD_LEN - 1 bytes are emitted; longer chains are silently
 * truncated — the leading bytes of the token are dropped. */
for (; prev && i > 0; prev = tokenizer->vocab[prev].prev, --i)

dest[i] = tokenizer->vocab[prev].byte;

/* i points one slot before the first written byte. */
return dest + i + 1;

}
|
|
|
|
|
|
|
|
/* Global tokenizer instance bound to the embedded vocab blob.
 * NOTE(review): this is a non-static object DEFINITION in a header
 * (#pragma once only prevents double inclusion per translation unit);
 * if this header is included from more than one .c file it will cause
 * a duplicate-symbol link error — confirm single inclusion, or make it
 * static. */
Tokenizer tokenizer = {

/* Reinterpret the raw blob as records; valid because token_t is packed
 * to exactly 3 bytes with no alignment requirement beyond 1. */
.vocab = (token_t *) _binary_tokenizer_bin_start,

.get_vocab_size = TokenizerGetVocabSize,

.find = TokenizerFind,

.encode = TokenizerEncode,

.decode = TokenizerDecode

};
|
|
|
|
|
|