#include #include #include #include #include #include #include #define MAX_VOCAB_SIZE 4096 #define MAX_WORD_LEN 12 #include "tokenizer.h" #include "utils.h" static int strequ(char *a, char *b) { for (int i = 0; a[i] && b[i]; ++i) if (a[i] != b[i]) return 0; return 1; } void help(int argc, char *argv[]) { printf("Usage: %s [arg1] [arg2] ... [argN]\n", argv[0]); printf("Arguments:\n"); printf(" --help show this help\n"); printf(" --dataset_path path to dataset\n"); printf(" --dataset_output_path path to dataset output\n"); printf(" --vocab_path path to vocabulary\n"); printf(" --vocab_output_path path to vocabulary output\n"); printf(" --tokenizer_path path to tokenizer\n"); printf(" --tokenizer_output_path path to tokenizer output\n"); printf("\n"); exit(0); } void tokenizer_print_vocab(tokenizer_t *t) { printf("vocab size: %d\n", t->vocab_size); for (int i = 0; i < t->vocab_size; ++i) { char *str = t->vocab[i]; int len = strlen(str); printf("id: %d, len: %d, str: %s\n", i, len, str); } } void tokenize_whole_file(tokenizer_t *t, char *fname[2]) { int infd = open(fname[0], O_RDONLY), outfd = open(fname[1], O_WRONLY | O_CREAT | O_TRUNC, 0644); if (infd < 0) ERROR("failed to open %s\n", fname[0]); if (outfd < 0) ERROR("failed to open %s\n", fname[1]); for (int i = 0; ; ++i) { if (rdchar(infd) == EOF) break; else seekback(infd, 1); int id = t->encode_file(t, infd); if (id == EOF) { (void)rdchar(infd); continue; } write(outfd, &id, sizeof(id)); if (i % 100000 == 0) { int ncurrent = lseek(infd, 0, SEEK_CUR), ntotal = 0; ntotal = lseek(infd, 0, SEEK_END); (void) lseek(infd, ncurrent, SEEK_SET); printf("iteration %d, encoded %d (%s), proccessed %d of %d (%.2f%%)\n", i, id, t->decode(t, id), ncurrent, ntotal, ((float)ncurrent / ntotal) * 100); if (ntotal - ncurrent < 100) break; } } } #define tokenize_whole_file(T, ...) tokenize_whole_file(T, (char*[2]){__VA_ARGS__}) int main(int argc, char *argv[]) { tokenizer_t *tokenizer = Tokenizer(NULL); char *dataset_path = "data/dataset.txt", *dataset_output_path = "bin/dataset.bin", *vocab_path = "data/vocab.txt", *vocab_output_path = NULL, *tokenizer_path = NULL, *tokenizer_output_path = "bin/tokenizer.bin"; for (int i = 1; i < argc; i += 2) { if (strequ(argv[i], "--dataset_path")) dataset_path = argv[i + 1], printf("setting dataset path to \"%s\"\n", dataset_path); else if (strequ(argv[i], "--dataset_output_path")) dataset_output_path = argv[i + 1], printf("setting dataset output path to \"%s\"\n", dataset_output_path); else if (strequ(argv[i], "--vocab_path")) vocab_path = argv[i + 1], printf("setting vocabulary path to \"%s\"\n", vocab_path); else if (strequ(argv[i], "--vocab_output_path")) vocab_output_path = argv[i + 1], printf("setting vocabulary output path to \"%s\"\n", vocab_output_path); else if (strequ(argv[i], "--tokenizer_path")) tokenizer_path = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n", tokenizer_path); else if (strequ(argv[i], "--tokenizer_output_path")) tokenizer_output_path = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n", tokenizer_output_path); else if (strequ(argv[i], "--help")) help(argc, argv); else ERROR("unknown option \"%s\"\n", argv[i]); } CALLBACK_ON_FILE(vocab_path, INFO("loading vocabulary from \"%s\"\n", vocab_path); tokenizer->load_vocab(tokenizer, vocab_path); tokenizer_print_vocab(tokenizer); ); CALLBACK_ON_FILE(tokenizer_path, INFO("loading tokenizer from \"%s\"\n", tokenizer_path); tokenizer->load_tokenizer(tokenizer, tokenizer_path); ); CALLBACK_ON_FILE(vocab_output_path, INFO("exporting vocabulary to \"%s\"\n", vocab_output_path); tokenizer->save_vocab(tokenizer, vocab_output_path) ); CALLBACK_ON_FILE(tokenizer_output_path, INFO("exporting tokenizer to \"%s\"\n", tokenizer_output_path); tokenizer->save_tokenizer(tokenizer, tokenizer_output_path) ); CALLBACK_ON_FILE(dataset_path, CALLBACK_ON_FILE(dataset_output_path, INFO("loading dataset from \"%s\" and exporting it to \"%s\"\n", dataset_path, dataset_output_path); tokenize_whole_file(tokenizer, dataset_path, dataset_output_path); ); ); return EXIT_SUCCESS; }