flopml commited on
Commit
120dee6
·
0 Parent(s):

tracking with lfs

Browse files
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ data/dataset_tinystories-v2_100k-rows.txt filter=lfs diff=lfs merge=lfs -text
2
+ data filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Ignore everything in bin/
2
+
3
+ bin/
4
+ bin/dataset.bin
5
+ bin/tokenizer
6
+ bin/vocab.bin
Makefile ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
CC := gcc
CFLAGS := -O3 -static -march=native -ffast-math #-Wall -Wextra -Werror
CLIBS := -lc

TARGET := bin/tokenizer
SRCS := $(wildcard src/*.c)


all: $(TARGET)

clean:
	rm -f $(TARGET) bin/*

run: all
	./$(TARGET) \
		--dataset_path data/dataset_tinystories-v2_100k-rows.txt \
		--vocab_path data/vocab_tinstories-v2_size-4096_wordlen-12.txt \
		--dataset_output_path bin/dataset_tinystories-v2_100k-rows.bin \
		--tokenizer_output_path bin/tokenizer_tinystories-v2_size-4096_wordlen-12.bin
# NOTE(review): "tinstories" above looks like a typo for "tinystories" —
# confirm against the actual on-disk vocab filename before "fixing" it.


# bin/ is entirely gitignored, so it may not exist on a fresh clone; create it
# before linking into it. Libraries are conventionally placed after objects.
$(TARGET): $(SRCS)
	@mkdir -p $(dir $@)
	$(CC) $(CFLAGS) -o $@ $^ $(CLIBS)


.PHONY: all clean run
README.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Implements the flop tokenizer, a sub-word tokenizer for autoregressive language modeling.
2
+
3
+
4
+ TODO:
5
+ - Better printing during encoding of file and loading / exporting?
6
+ - Include Python script for BPE training
7
+ - Add time to logging during encoding
data/.gitkeep ADDED
File without changes
data/dataset_tinystories-v2_100k-rows.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:407ca81af9dabb73461834d630fa9c1e58d16ae951d0cede99f1b8a3238a214f
3
+ size 80573031
src/.gitkeep ADDED
File without changes
src/main.c ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <stdio.h>
2
+ #include <stdlib.h>
3
+ #include <string.h>
4
+ #include <fcntl.h>
5
+ #include <sys/stat.h>
6
+ #include <unistd.h>
7
+ #include <errno.h>
8
+
9
+
10
+ #define MAX_VOCAB_SIZE 4096
11
+ #define MAX_WORD_LEN 12
12
+
13
+
14
+ #include "tokenizer.h"
15
+ #include "utils.h"
16
+
17
+
18
+
19
+
20
/*
 * Exact string equality: returns 1 iff a and b contain exactly the same
 * characters up to and including the terminating NUL.
 *
 * Bug fixed: the previous version stopped at the first NUL of EITHER string
 * and returned 1, so any string matched any of its own extensions
 * (e.g. strequ("--help", "--helpme") was true). Compare the terminating
 * positions as well.
 */
static int strequ(char *a, char *b) {
	int i = 0;
	for (; a[i] && b[i]; ++i)
		if (a[i] != b[i])
			return 0;

	/* both must end at the same index for true equality */
	return a[i] == b[i];
}
27
+
28
/* Print the usage text for every supported command-line option, then exit(0). */
void help(int argc, char *argv[]) {
	printf("Usage: %s [arg1] [arg2] ... [argN]\n", argv[0]);
	printf("Arguments:\n");

	/* one entry per option; printed verbatim in declaration order */
	static const char *option_lines[] = {
		" --help show this help\n",
		" --dataset_path <str> path to dataset\n",
		" --dataset_output_path <str> path to dataset output\n",
		" --vocab_path <str> path to vocabulary\n",
		" --vocab_output_path <str> path to vocabulary output\n",
		" --tokenizer_path <str> path to tokenizer\n",
		" --tokenizer_output_path <str> path to tokenizer output\n",
	};

	for (size_t k = 0; k < sizeof option_lines / sizeof option_lines[0]; ++k)
		printf("%s", option_lines[k]);

	printf("\n");
	exit(0);
}
41
+
42
+
43
+
44
+
45
+ void tokenizer_print_vocab(tokenizer_t *t) {
46
+ printf("vocab size: %d\n", t->vocab_size);
47
+
48
+ for (int i = 0; i < t->vocab_size; ++i) {
49
+ char *str = t->vocab[i];
50
+ int len = strlen(str);
51
+ printf("id: %d, len: %d, str: %s\n", i, len, str);
52
+ }
53
+ }
54
+
55
/*
 * Encode the whole text file fname[0] and append each token id, as a raw
 * binary int, to fname[1]. Progress is printed every 100000 tokens.
 *
 * fname is a 2-element array: [0] = input path, [1] = output path. The macro
 * below lets callers pass the two paths as plain varargs.
 * Exits via ERROR() if either file cannot be opened.
 */
void tokenize_whole_file(tokenizer_t *t, char *fname[2]) {
	int infd = open(fname[0], O_RDONLY),
	    outfd = open(fname[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (infd < 0) ERROR("failed to open %s\n", fname[0]);
	if (outfd < 0) ERROR("failed to open %s\n", fname[1]);

	for (int i = 0; ; ++i) {
		/* probe one byte to detect end of input, then undo the read */
		if (rdchar(infd) == EOF) break;
		else seekback(infd, 1);

		int id = t->encode_file(t, infd);
		/* no vocabulary entry matched at this offset: skip one byte and retry
		 * (encode_file returns -1, which equals EOF, on a miss) */
		if (id == EOF) { (void)rdchar(infd); continue; }

		/* NOTE(review): write() result is unchecked — a short write would
		 * silently corrupt the output stream; consider verifying it. */
		write(outfd, &id, sizeof(id));

		if (i % 100000 == 0) {
			/* progress report: remember the position, seek to the end to learn
			 * the total size, then restore the position */
			int ncurrent = lseek(infd, 0, SEEK_CUR),
			    ntotal = 0;

			ntotal = lseek(infd, 0, SEEK_END);
			(void) lseek(infd, ncurrent, SEEK_SET);

			printf("iteration %d, encoded %d (%s), proccessed %d of %d (%.2f%%)\n", i, id, t->decode(t, id), ncurrent, ntotal, ((float)ncurrent / ntotal) * 100);

			/* stop near the end of input — only evaluated at progress ticks */
			if (ntotal - ncurrent < 100) break;
		}
	}
	/* NOTE(review): infd/outfd are never closed — harmless for a one-shot
	 * CLI tool, but worth fixing if this is ever called repeatedly. */
}
/* Convenience wrapper: tokenize_whole_file(t, in_path, out_path). */
#define tokenize_whole_file(T, ...) tokenize_whole_file(T, (char*[2]){__VA_ARGS__})
85
+
86
+
87
+
88
+
89
+ int main(int argc, char *argv[]) {
90
+ tokenizer_t *tokenizer = Tokenizer(NULL);
91
+
92
+
93
+ char *dataset_path = "data/dataset.txt",
94
+ *dataset_output_path = "bin/dataset.bin",
95
+ *vocab_path = "data/vocab.txt",
96
+ *vocab_output_path = NULL,
97
+ *tokenizer_path = NULL,
98
+ *tokenizer_output_path = "bin/tokenizer.bin";
99
+
100
+
101
+ for (int i = 1; i < argc; i += 2) {
102
+ if (strequ(argv[i], "--dataset_path")) dataset_path = argv[i + 1], printf("setting dataset path to \"%s\"\n", dataset_path);
103
+ else if (strequ(argv[i], "--dataset_output_path")) dataset_output_path = argv[i + 1], printf("setting dataset output path to \"%s\"\n", dataset_output_path);
104
+ else if (strequ(argv[i], "--vocab_path")) vocab_path = argv[i + 1], printf("setting vocabulary path to \"%s\"\n", vocab_path);
105
+ else if (strequ(argv[i], "--vocab_output_path")) vocab_output_path = argv[i + 1], printf("setting vocabulary output path to \"%s\"\n", vocab_output_path);
106
+ else if (strequ(argv[i], "--tokenizer_path")) tokenizer_path = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n", tokenizer_path);
107
+ else if (strequ(argv[i], "--tokenizer_output_path")) tokenizer_output_path = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n", tokenizer_output_path);
108
+ else if (strequ(argv[i], "--help")) help(argc, argv);
109
+ else ERROR("unknown option \"%s\"\n", argv[i]);
110
+ }
111
+
112
+
113
+ CALLBACK_ON_FILE(vocab_path,
114
+ INFO("loading vocabulary from \"%s\"\n", vocab_path);
115
+ tokenizer->load_vocab(tokenizer, vocab_path);
116
+ tokenizer_print_vocab(tokenizer);
117
+ );
118
+
119
+ CALLBACK_ON_FILE(tokenizer_path,
120
+ INFO("loading tokenizer from \"%s\"\n", tokenizer_path);
121
+ tokenizer->load_tokenizer(tokenizer, tokenizer_path);
122
+ );
123
+
124
+ CALLBACK_ON_FILE(vocab_output_path,
125
+ INFO("exporting vocabulary to \"%s\"\n", vocab_output_path);
126
+ tokenizer->save_vocab(tokenizer, vocab_output_path)
127
+ );
128
+
129
+ CALLBACK_ON_FILE(tokenizer_output_path,
130
+ INFO("exporting tokenizer to \"%s\"\n", tokenizer_output_path);
131
+ tokenizer->save_tokenizer(tokenizer, tokenizer_output_path)
132
+ );
133
+
134
+ CALLBACK_ON_FILE(dataset_path,
135
+ CALLBACK_ON_FILE(dataset_output_path,
136
+ INFO("loading dataset from \"%s\" and exporting it to \"%s\"\n", dataset_path, dataset_output_path);
137
+ tokenize_whole_file(tokenizer, dataset_path, dataset_output_path);
138
+ );
139
+ );
140
+
141
+ return EXIT_SUCCESS;
142
+ }
src/tokenizer.h ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <stdio.h>
4
+ #include <stdlib.h>
5
+ #include <string.h>
6
+ #include <unistd.h>
7
+ #include <fcntl.h>
8
+
9
+
10
+ #include "utils.h"
11
+
12
+
13
+
14
+
15
+ #ifndef MAX_VOCAB_SIZE
16
+ #define MAX_VOCAB_SIZE 32000
17
+ #endif
18
+
19
+ #ifndef MAX_WORD_LEN
20
+ #define MAX_WORD_LEN 16
21
+ #endif
22
+
23
+
24
+
25
+
26
/*
 * tokenizer_t: a fixed-capacity sub-word vocabulary plus function-pointer
 * "methods" (bound to the tokenizer_* functions below by Tokenizer()).
 */
STRUCTURE(tokenizer_t,
    int vocab_size;                            /* number of entries in use */
    char vocab[MAX_VOCAB_SIZE][MAX_WORD_LEN];  /* token strings, indexed by id */

    /* append a word; returns its id, or -1 when the table is full */
    int (*add_word) (tokenizer_t *, char *);

    /* encoders: exact lookup / greedy longest-match from a string or fd;
     * all return the token id, or -1 on no match */
    int (*encode_word) (tokenizer_t *, char *);
    int (*encode_stream) (tokenizer_t *, char **);
    int (*encode_file) (tokenizer_t *, int);

    /* id -> string; decode_file reads a binary id from the fd first */
    char *(*decode) (tokenizer_t *, int);
    char *(*decode_file) (tokenizer_t *, int);

    /* text vocabulary I/O */
    void (*save_vocab) (tokenizer_t *, char *);
    void (*load_vocab) (tokenizer_t *, char *);

    /* raw binary table I/O (whole vocab array) */
    void (*save_tokenizer) (tokenizer_t *, char *);
    void (*load_tokenizer) (tokenizer_t *, char *);
);
45
+
46
+
47
+
48
+
49
/*
 * Read one byte / one int from fd; return EOF on end-of-file or read error.
 *
 * Bug fixed: the old code compared read()'s ssize_t result with sizeof (a
 * size_t), so a -1 error result converted to SIZE_MAX, was NOT "< sizeof",
 * and an uninitialized value was returned as success. Compare against the
 * signed expected count instead.
 *
 * NOTE(review): rdchar's return type is char, so a literal 0xFF input byte is
 * indistinguishable from EOF on signed-char platforms — fine for ASCII text,
 * confirm before feeding binary data. rdint has the same ambiguity for the
 * value -1.
 */
char rdchar(int fd) { char c; return read(fd, &c, sizeof(c)) == (ssize_t)sizeof(c) ? c : EOF; }
int rdint(int fd) { int d; return read(fd, &d, sizeof(d)) == (ssize_t)sizeof(d) ? d : EOF; }
/* Move the file offset n bytes backwards (best effort; lseek errors ignored). */
void seekback(int fd, int n) { lseek(fd, -n, SEEK_CUR); }
52
+
53
/*
 * Minimal libc-style string helpers.
 *
 * NOTE(review): these deliberately shadow the standard strcpy/strcmp, which
 * are reserved names — consider renaming (compiler builtins may bypass them).
 *
 * Bugs fixed:
 *  - strcpy must return the ORIGINAL destination pointer (the standard
 *    contract); the old version returned a pointer past the copied NUL.
 *  - strcmp compares as unsigned char per the C standard, so bytes >= 0x80
 *    order consistently with byte-sorted vocabularies.
 */
char *strcpy(char *dst, const char *src) {
	char *ret = dst;
	for (; (*dst++ = *src++); );
	return ret;
}

int strcmp(const char *a, const char *b) {
	for (; *a && *a == *b; ++a, ++b);
	return (unsigned char)*a - (unsigned char)*b;
}
58
+
59
+
60
+
61
+
62
+ int tokenizer_add_word(tokenizer_t *t, char *word) {
63
+ if (t->vocab_size >= MAX_VOCAB_SIZE) return -1;
64
+ strcpy(t->vocab[t->vocab_size], word);
65
+ return t->vocab_size++;
66
+ }
67
+
68
+
69
+ int tokenizer_encode_word(tokenizer_t *t, char* word) {
70
+ int left = 0,
71
+ right = t->vocab_size - 1;
72
+
73
+ for (; left <= right; ) {
74
+ int mid = left + (right - left) / 2,
75
+ cmp = strcmp(word, t->vocab[mid]);
76
+
77
+ if (cmp == 0) return mid;
78
+ else if (cmp < 0) right = mid - 1;
79
+ else left = mid + 1;
80
+ }
81
+
82
+ return -1;
83
+ }
84
+
85
+ int tokenizer_encode_stream(tokenizer_t *t, char **stream) {
86
+ char word[MAX_WORD_LEN] = {};
87
+ int id = -1, i = 0, j = 0;
88
+
89
+ for (; (*stream)[i] && i < MAX_WORD_LEN; ++i) {
90
+ word[i] = (*stream)[i];
91
+
92
+ int tmp = t->encode_word(t, word);
93
+ if (tmp != -1) id = tmp, j = i + 1;
94
+ }
95
+
96
+ *stream += j;
97
+ return id;
98
+ }
99
+
100
+
101
+ int tokenizer_encode_file(tokenizer_t *t, int fd) {
102
+ char c, word[MAX_WORD_LEN] = {};
103
+ int id = -1, i = 0, j = 0;
104
+
105
+ for (; (c = rdchar(fd)) != EOF && i < MAX_WORD_LEN; ++i) {
106
+ word[i] = c;
107
+
108
+ int tmp = t->encode_word(t, word);
109
+ if (tmp != -1) id = tmp, j = i + 1;
110
+ }
111
+
112
+ seekback(fd, MAX_WORD_LEN - j + 1);
113
+ return id;
114
+ }
115
+
116
+
117
/* Map a token id back to its string. NOTE(review): no bounds check — id is
 * assumed to lie in [0, vocab_size); confirm at call sites. */
char *tokenizer_decode(tokenizer_t *t, int id) { return t->vocab[id]; }
118
+
119
+ char *tokenizer_decode_file(tokenizer_t *t, int fd) {
120
+ int id = rdint(fd);
121
+ if (id == EOF) ERROR("read EOF from file\n");
122
+
123
+ return t->decode(t, id);
124
+ }
125
+
126
+
127
+ void tokenizer_save_vocab(tokenizer_t *t, char *fname) {
128
+ int fd = open(fname, O_WRONLY | O_CREAT | O_TRUNC, 0644);
129
+ if (fd < 0) ERROR("failed to open \"%s\"\n", fname);
130
+
131
+ int max_len = 0;
132
+ for (int i = 0; i < t->vocab_size; ++i) {
133
+ char *str = t->vocab[i];
134
+
135
+ int len = strlen(str),
136
+ n = write(fd, str, len);
137
+
138
+ max_len = len > max_len ? len : max_len;
139
+
140
+ if (n != len) ERROR("failed to write to %s, only wrote %d bytes out of %d\n", fname, n, len);
141
+ }
142
+
143
+ printf("wrote %d tokens to file \"%s\"\nMax token length was %d\n", t->vocab_size, fname, max_len);
144
+ close(fd);
145
+ }
146
+
147
+ void tokenizer_load_vocab(tokenizer_t *t, char *fname) {
148
+ int fd = open(fname, O_RDONLY);
149
+ if (fd < 0) ERROR("failed to open \"%s\"\n", fname);
150
+
151
+ char c, word[MAX_WORD_LEN];
152
+ for (; (c = rdchar(fd)) != EOF; ) {
153
+ for (int i = 0; i < MAX_WORD_LEN; ++i, c = rdchar(fd)) {
154
+ word[i] = c;
155
+
156
+ if (word[i] == EOF || word[i] == '\n') {
157
+ word[i] = '\0';
158
+ break;
159
+ }
160
+ }
161
+
162
+ t->add_word(t, word);
163
+ }
164
+ }
165
+
166
+
167
+ void tokenizer_save_tokenizer(tokenizer_t *t, char *fname) {
168
+ int fd = open(fname, O_WRONLY | O_CREAT | O_TRUNC, 0644);
169
+ if (fd < 0) ERROR("failed to open \"%s\"\n", fname);
170
+
171
+ int n = write(fd, t->vocab, sizeof(t->vocab));
172
+ if (n != sizeof(t->vocab)) ERROR("failed to write to %s, only wrote %d bytes out of %ld\n", fname, n, sizeof(t->vocab));
173
+
174
+ printf("wrote %d bytes (%d tokens) to \"%s\"\n", n, t->vocab_size, fname);
175
+ close(fd);
176
+ }
177
+
178
+ void tokenizer_load_tokenizer(tokenizer_t *t, char *fname) {
179
+ int fd = open(fname, O_RDONLY);
180
+ if (fd < 0) ERROR("failed to open \"%s\"\n", fname);
181
+
182
+ int n = read(fd, t->vocab, sizeof(t->vocab));
183
+ if (n != sizeof(t->vocab)) ERROR("failed to read from %s, only read %d bytes out of %ld\n", fname, n, sizeof(t->vocab));
184
+
185
+ t->vocab_size = n / MAX_WORD_LEN;
186
+
187
+ printf("read %d bytes (%d tokens) from \"%s\"\n", n, t->vocab_size, fname);
188
+ close(fd);
189
+ }
190
+
191
+
192
+
193
+
194
/* The single shared tokenizer instance handed out by Tokenizer().
 * NOTE(review): file-scope identifiers starting with '_' are reserved in C;
 * consider renaming (and making this static — it is in a header). */
tokenizer_t _tokenizer;

/*
 * Construct (re-initialize) the shared tokenizer, binding all method
 * pointers. When fname is non-NULL, the serialized table is loaded from it.
 * Returns a pointer to the shared instance — NOT a fresh allocation; every
 * call resets the same global object.
 */
tokenizer_t *Tokenizer(char *fname) {
	_tokenizer = (tokenizer_t) {
		.vocab_size = 0,

		.add_word = tokenizer_add_word,

		.encode_word = tokenizer_encode_word,
		.encode_stream = tokenizer_encode_stream,
		.encode_file = tokenizer_encode_file,

		.decode = tokenizer_decode,
		.decode_file = tokenizer_decode_file,

		.save_vocab = tokenizer_save_vocab,
		.load_vocab = tokenizer_load_vocab,

		.save_tokenizer = tokenizer_save_tokenizer,
		.load_tokenizer = tokenizer_load_tokenizer
	};

	if (fname) _tokenizer.load_tokenizer(&_tokenizer, fname);

	INFO("vocabulary size: %d (%d max)\n", _tokenizer.vocab_size, MAX_VOCAB_SIZE);
	INFO("max token length: %d\n", MAX_WORD_LEN);
	/* NOTE(review): sizeof is size_t — %ld should be %zu for portability */
	INFO("size of structure: %ld bytes\n", sizeof(tokenizer_t));

	return &_tokenizer;
}
src/utils.h ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once



/* Declare `typedef struct NAME NAME;` together with its member list. */
#define STRUCTURE(NAME, ...) \
	typedef struct NAME NAME; struct NAME { __VA_ARGS__ }



/* Pause until a byte arrives on stdin (used after warnings). */
#define HOLD() do { (void)fgetc(stdin); } while (0)



/*
 * Core logger: "<file>:<line>: <message>" in the given ANSI color, flushed
 * immediately. All of these macros are wrapped in do { } while (0) so that
 * `if (x) ERROR(...); else ...` parses correctly — the old comma-expression
 * and bare-brace forms were unsafe after an unbraced if/else.
 */
#define LOG(FD, COLOR, ...) \
	do { \
		fprintf(FD, COLOR); \
		fprintf(FD, "%s:%d: ", __FILE__, __LINE__); \
		fprintf(FD, __VA_ARGS__); \
		fprintf(FD, "\033[0m"); \
		fflush(FD); \
	} while (0)

#define COMMENT(...) LOG(stdout, "\033[38;5;78m", __VA_ARGS__)
#define WARNING(...) do { LOG(stdout, "\033[38;5;208m", __VA_ARGS__); HOLD(); } while (0)
#define INFO(...)    LOG(stdout, "\033[38;5;228m", __VA_ARGS__)
#define ERROR(...)   do { LOG(stderr, "\033[38;5;196m", __VA_ARGS__); exit(EXIT_FAILURE); } while (0)



/* Run the statements in __VA_ARGS__ only when FNAME is non-NULL; otherwise
 * print a warning naming the skipped variable. */
#define CALLBACK_ON_FILE(FNAME, ...) \
	do { \
		if ((FNAME) != NULL) { __VA_ARGS__; } \
		else WARNING("\"%s\" not set, skipping", #FNAME); \
	} while (0)