#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#include <errno.h>

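/* Vocabulary limits. Defined before including tokenizer.h, presumably so the
 * header can size its tables from them (they are not referenced in this file). */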
#define MAX_VOCAB_SIZE 4096
#define MAX_WORD_LEN 12

#include "tokenizer.h" |
|
|
#include "utils.h" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
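/* strcmp-style equality check: returns 1 only if a and b match exactly.
 * The check after the loop rejects prefix matches such as
 * strequ("--vocab", "--vocab_path"). */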
static int strequ(const char *a, const char *b) {
    int i = 0;
    for (; a[i] && b[i]; ++i)
        if (a[i] != b[i])
            return 0;
    return a[i] == b[i]; /* both strings must end here, otherwise one is a prefix */
}

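/* Print usage information and exit. */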
void help(int argc, char *argv[]) {
    (void)argc; /* unused */
    printf("Usage: %s [--option <value>] ...\n", argv[0]);
    printf("Arguments:\n");
    printf("  --help                        show this help\n");
    printf("  --dataset_path <str>          path to dataset\n");
    printf("  --dataset_output_path <str>   path to dataset output\n");
    printf("  --vocab_path <str>            path to vocabulary\n");
    printf("  --vocab_output_path <str>     path to vocabulary output\n");
    printf("  --tokenizer_path <str>        path to tokenizer\n");
    printf("  --tokenizer_output_path <str> path to tokenizer output\n");
    printf("\n");
    exit(0);
}

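/* Dump every vocabulary entry with its id and length, for debugging. */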
void tokenizer_print_vocab(tokenizer_t *t) {
    printf("vocab size: %d\n", t->vocab_size);

    for (int i = 0; i < t->vocab_size; ++i) {
        char *str = t->vocab[i];
        int len = (int)strlen(str);
        printf("id: %d, len: %d, str: %s\n", i, len, str);
    }
}

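/* Stream-encode fname[0] token by token and append each id to fname[1] as a
 * raw sizeof(int) record, logging progress every 100000 tokens. The macro
 * below lets callers pass the two paths as plain arguments. */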
void tokenize_whole_file(tokenizer_t *t, char *fname[2]) {
    int infd = open(fname[0], O_RDONLY),
        outfd = open(fname[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);

    if (infd < 0) ERROR("failed to open %s\n", fname[0]);
    if (outfd < 0) ERROR("failed to open %s\n", fname[1]);

    for (int i = 0; ; ++i) {
        /* peek one character to detect end of input, then rewind */
        if (rdchar(infd) == EOF) break;
        else seekback(infd, 1);

        int id = t->encode_file(t, infd);
        if (id == EOF) { (void)rdchar(infd); continue; } /* skip an unencodable byte */

        if (write(outfd, &id, sizeof(id)) != (ssize_t)sizeof(id))
            ERROR("failed to write to %s\n", fname[1]);

        if (i % 100000 == 0) {
            int ncurrent = lseek(infd, 0, SEEK_CUR),
                ntotal = lseek(infd, 0, SEEK_END);
            (void)lseek(infd, ncurrent, SEEK_SET);

            printf("iteration %d, encoded %d (%s), processed %d of %d (%.2f%%)\n",
                   i, id, t->decode(t, id), ncurrent, ntotal, ((float)ncurrent / ntotal) * 100);

            if (ntotal - ncurrent < 100) break; /* close enough to the end */
        }
    }

    close(infd);
    close(outfd);
}

/* Let callers pass the two paths as separate arguments. */
#define tokenize_whole_file(T, ...) tokenize_whole_file(T, (char*[2]){__VA_ARGS__})

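/* Parse command line options, then run whichever load/save/tokenize steps have
 * a configured path. Example invocation (binary name and paths illustrative):
 *   ./tokenize --dataset_path data/dataset.txt --dataset_output_path bin/dataset.bin
 */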
int main(int argc, char *argv[]) {
    tokenizer_t *tokenizer = Tokenizer(NULL);

    /* default paths; NULL disables the corresponding step */
    char *dataset_path = "data/dataset.txt",
         *dataset_output_path = "bin/dataset.bin",
         *vocab_path = "data/vocab.txt",
         *vocab_output_path = NULL,
         *tokenizer_path = NULL,
         *tokenizer_output_path = "bin/tokenizer.bin";

    for (int i = 1; i < argc; i += 2) {
        if (strequ(argv[i], "--help")) help(argc, argv);

        /* every remaining option expects a value argument */
        if (i + 1 >= argc) ERROR("missing value for option \"%s\"\n", argv[i]);

        if (strequ(argv[i], "--dataset_path")) dataset_path = argv[i + 1], printf("setting dataset path to \"%s\"\n", dataset_path);
        else if (strequ(argv[i], "--dataset_output_path")) dataset_output_path = argv[i + 1], printf("setting dataset output path to \"%s\"\n", dataset_output_path);
        else if (strequ(argv[i], "--vocab_path")) vocab_path = argv[i + 1], printf("setting vocabulary path to \"%s\"\n", vocab_path);
        else if (strequ(argv[i], "--vocab_output_path")) vocab_output_path = argv[i + 1], printf("setting vocabulary output path to \"%s\"\n", vocab_output_path);
        else if (strequ(argv[i], "--tokenizer_path")) tokenizer_path = argv[i + 1], printf("setting tokenizer path to \"%s\"\n", tokenizer_path);
        else if (strequ(argv[i], "--tokenizer_output_path")) tokenizer_output_path = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n", tokenizer_output_path);
        else ERROR("unknown option \"%s\"\n", argv[i]);
    }

    CALLBACK_ON_FILE(vocab_path,
        INFO("loading vocabulary from \"%s\"\n", vocab_path);
        tokenizer->load_vocab(tokenizer, vocab_path);
        tokenizer_print_vocab(tokenizer);
    );

    CALLBACK_ON_FILE(tokenizer_path,
        INFO("loading tokenizer from \"%s\"\n", tokenizer_path);
        tokenizer->load_tokenizer(tokenizer, tokenizer_path);
    );

    CALLBACK_ON_FILE(vocab_output_path,
        INFO("exporting vocabulary to \"%s\"\n", vocab_output_path);
        tokenizer->save_vocab(tokenizer, vocab_output_path);
    );

    CALLBACK_ON_FILE(tokenizer_output_path,
        INFO("exporting tokenizer to \"%s\"\n", tokenizer_output_path);
        tokenizer->save_tokenizer(tokenizer, tokenizer_output_path);
    );

    CALLBACK_ON_FILE(dataset_path,
        CALLBACK_ON_FILE(dataset_output_path,
            INFO("loading dataset from \"%s\" and exporting it to \"%s\"\n", dataset_path, dataset_output_path);
            tokenize_whole_file(tokenizer, dataset_path, dataset_output_path);
        );
    );

    return EXIT_SUCCESS;
}