/*
 * tokenizer/c_src/main.c
 * (imported from the "flopml" repository, commit 915f29f,
 *  "adding py_src directory")
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#include <errno.h>
#define MAX_VOCAB_SIZE 4096
#define MAX_WORD_LEN 12
#include "tokenizer.h"
#include "utils.h"
/*
 * Compare two NUL-terminated strings for exact equality.
 * Returns 1 if equal, 0 otherwise.
 *
 * Fix: the previous version stopped at the first terminator of EITHER
 * string and returned 1, so any prefix matched (e.g. "--vocab" would
 * have been treated as equal to "--vocab_path").  Both strings must now
 * end at the same position.
 */
static int strequ(char *a, char *b) {
    int i = 0;
    for (; a[i] && b[i]; ++i)
        if (a[i] != b[i])
            return 0;
    return a[i] == b[i]; /* equal only if both hit '\0' together */
}
/*
 * Print the usage banner and the list of accepted command-line options,
 * then terminate the process with exit status 0.  Never returns.
 */
void help(int argc, char *argv[]) {
    static const char *const option_lines[] = {
        " --help show this help\n",
        " --dataset_path <str> path to dataset\n",
        " --dataset_output_path <str> path to dataset output\n",
        " --vocab_path <str> path to vocabulary\n",
        " --vocab_output_path <str> path to vocabulary output\n",
        " --tokenizer_path <str> path to tokenizer\n",
        " --tokenizer_output_path <str> path to tokenizer output\n",
    };
    (void)argc; /* only argv[0] is needed for the usage line */
    printf("Usage: %s [arg1] [arg2] ... [argN]\n", argv[0]);
    printf("Arguments:\n");
    for (size_t k = 0; k < sizeof option_lines / sizeof option_lines[0]; ++k)
        fputs(option_lines[k], stdout);
    printf("\n");
    exit(0);
}
/*
 * Dump the tokenizer's whole vocabulary to stdout: the vocab size,
 * then one line per entry with its id, string length, and string.
 */
void tokenizer_print_vocab(tokenizer_t *t) {
    printf("vocab size: %d\n", t->vocab_size);
    for (int id = 0; id < t->vocab_size; ++id) {
        char *entry = t->vocab[id];
        printf("id: %d, len: %d, str: %s\n", id, (int)strlen(entry), entry);
    }
}
/*
 * Encode the text file fname[0] with tokenizer t and write the resulting
 * token ids as raw ints to fname[1].  Progress is printed every 100000
 * iterations.  On any open/write failure, ERROR() is invoked (as in the
 * open-failure paths of the original).
 *
 * Fixes over the previous version:
 *  - both file descriptors are now closed before returning (they leaked),
 *  - the write() result is checked instead of being silently ignored,
 *  - "proccessed" typo corrected in the progress message.
 */
void tokenize_whole_file(tokenizer_t *t, char *fname[2]) {
    int infd = open(fname[0], O_RDONLY);
    if (infd < 0) ERROR("failed to open %s\n", fname[0]);
    int outfd = open(fname[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (outfd < 0) ERROR("failed to open %s\n", fname[1]);
    for (int i = 0; ; ++i) {
        if (rdchar(infd) == EOF) break; /* probe for end of input */
        seekback(infd, 1);              /* un-read the probe byte */
        int id = t->encode_file(t, infd);
        /* encode_file returning EOF: skip one byte and keep going */
        if (id == EOF) { (void)rdchar(infd); continue; }
        if (write(outfd, &id, sizeof(id)) != (ssize_t)sizeof(id))
            ERROR("failed to write to %s\n", fname[1]);
        if (i % 100000 == 0) {
            /* NOTE(review): offsets are truncated to int, which caps
             * progress reporting at 2 GiB inputs; lseek returns off_t. */
            int ncurrent = lseek(infd, 0, SEEK_CUR);
            int ntotal = lseek(infd, 0, SEEK_END);
            (void)lseek(infd, ncurrent, SEEK_SET);
            printf("iteration %d, encoded %d (%s), processed %d of %d (%.2f%%)\n",
                   i, id, t->decode(t, id), ncurrent, ntotal,
                   ((float)ncurrent / ntotal) * 100);
            /* presumably a guard against a short unencodable tail — TODO confirm */
            if (ntotal - ncurrent < 100) break;
        }
    }
    close(outfd);
    close(infd);
}
/* Convenience: call as tokenize_whole_file(t, in_path, out_path). */
#define tokenize_whole_file(T, ...) tokenize_whole_file(T, (char*[2]){__VA_ARGS__})
int main(int argc, char *argv[]) {
tokenizer_t *tokenizer = Tokenizer(NULL);
char *dataset_path = "data/dataset.txt",
*dataset_output_path = "bin/dataset.bin",
*vocab_path = "data/vocab.txt",
*vocab_output_path = NULL,
*tokenizer_path = NULL,
*tokenizer_output_path = "bin/tokenizer.bin";
for (int i = 1; i < argc; i += 2) {
if (strequ(argv[i], "--dataset_path")) dataset_path = argv[i + 1], printf("setting dataset path to \"%s\"\n", dataset_path);
else if (strequ(argv[i], "--dataset_output_path")) dataset_output_path = argv[i + 1], printf("setting dataset output path to \"%s\"\n", dataset_output_path);
else if (strequ(argv[i], "--vocab_path")) vocab_path = argv[i + 1], printf("setting vocabulary path to \"%s\"\n", vocab_path);
else if (strequ(argv[i], "--vocab_output_path")) vocab_output_path = argv[i + 1], printf("setting vocabulary output path to \"%s\"\n", vocab_output_path);
else if (strequ(argv[i], "--tokenizer_path")) tokenizer_path = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n", tokenizer_path);
else if (strequ(argv[i], "--tokenizer_output_path")) tokenizer_output_path = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n", tokenizer_output_path);
else if (strequ(argv[i], "--help")) help(argc, argv);
else ERROR("unknown option \"%s\"\n", argv[i]);
}
CALLBACK_ON_FILE(vocab_path,
INFO("loading vocabulary from \"%s\"\n", vocab_path);
tokenizer->load_vocab(tokenizer, vocab_path);
tokenizer_print_vocab(tokenizer);
);
CALLBACK_ON_FILE(tokenizer_path,
INFO("loading tokenizer from \"%s\"\n", tokenizer_path);
tokenizer->load_tokenizer(tokenizer, tokenizer_path);
);
CALLBACK_ON_FILE(vocab_output_path,
INFO("exporting vocabulary to \"%s\"\n", vocab_output_path);
tokenizer->save_vocab(tokenizer, vocab_output_path)
);
CALLBACK_ON_FILE(tokenizer_output_path,
INFO("exporting tokenizer to \"%s\"\n", tokenizer_output_path);
tokenizer->save_tokenizer(tokenizer, tokenizer_output_path)
);
CALLBACK_ON_FILE(dataset_path,
CALLBACK_ON_FILE(dataset_output_path,
INFO("loading dataset from \"%s\" and exporting it to \"%s\"\n", dataset_path, dataset_output_path);
tokenize_whole_file(tokenizer, dataset_path, dataset_output_path);
);
);
return EXIT_SUCCESS;
}