File size: 5,124 Bytes
120dee6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#include <errno.h>
#define MAX_VOCAB_SIZE 4096
#define MAX_WORD_LEN 12
#include "tokenizer.h"
#include "utils.h"
/*
 * Exact string equality: returns 1 iff a and b are identical
 * NUL-terminated strings, 0 otherwise.
 *
 * Fix: the previous version stopped at the shorter string's terminator and
 * returned 1, so any prefix compared equal to a longer option name
 * (e.g. "--vocab" matched "--vocab_path"). We now also require both
 * strings to end at the same index.
 */
static int strequ(char *a, char *b) {
  int i = 0;
  for (; a[i] && b[i]; ++i)
    if (a[i] != b[i])
      return 0;
  /* both must hit their terminator at the same position */
  return a[i] == b[i];
}
/*
 * Print the usage banner and the option table to stdout, then
 * terminate the process with exit status 0.  Never returns.
 */
void help(int argc, char *argv[]) {
  (void)argc; /* unused; kept for signature compatibility */
  printf("Usage: %s [arg1] [arg2] ... [argN]\n", argv[0]);
  static const char *table[] = {
    "Arguments:",
    " --help show this help",
    " --dataset_path <str> path to dataset",
    " --dataset_output_path <str> path to dataset output",
    " --vocab_path <str> path to vocabulary",
    " --vocab_output_path <str> path to vocabulary output",
    " --tokenizer_path <str> path to tokenizer",
    " --tokenizer_output_path <str> path to tokenizer output",
  };
  for (size_t row = 0; row < sizeof table / sizeof table[0]; ++row)
    printf("%s\n", table[row]);
  printf("\n");
  exit(0);
}
/*
 * Dump the whole vocabulary to stdout: the size first, then one line
 * per entry with its id, string length, and the string itself.
 */
void tokenizer_print_vocab(tokenizer_t *t) {
  const int count = t->vocab_size;
  printf("vocab size: %d\n", count);
  for (int id = 0; id < count; ++id) {
    char *entry = t->vocab[id];
    printf("id: %d, len: %d, str: %s\n", id, (int)strlen(entry), entry);
  }
}
/*
 * Stream-encode the text file fname[0] with tokenizer t, appending each
 * token id as a raw int to the binary file fname[1].  Progress is printed
 * every 100000 iterations.
 *
 * NOTE(review): rdchar/seekback come from utils.h and encode_file from
 * tokenizer.h; this assumes encode_file consumes bytes from infd and
 * returns EOF when it cannot match a token — confirm against their
 * definitions.
 *
 * Fixes: both descriptors were leaked (never closed), the write() result
 * was ignored, and the progress message misspelled "processed".
 */
void tokenize_whole_file(tokenizer_t *t, char *fname[2]) {
  int infd = open(fname[0], O_RDONLY);
  if (infd < 0) ERROR("failed to open %s\n", fname[0]);
  int outfd = open(fname[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
  if (outfd < 0) {
    close(infd);
    ERROR("failed to open %s\n", fname[1]);
  }
  for (int i = 0; ; ++i) {
    /* peek one byte to detect end of input, then rewind */
    if (rdchar(infd) == EOF) break;
    seekback(infd, 1);
    int id = t->encode_file(t, infd);
    if (id == EOF) {
      (void)rdchar(infd); /* skip the byte no token starts with */
      continue;
    }
    if (write(outfd, &id, sizeof(id)) != (ssize_t)sizeof(id))
      ERROR("failed to write to %s\n", fname[1]);
    if (i % 100000 == 0) {
      /* probe position and total size, then restore the position */
      int ncurrent = lseek(infd, 0, SEEK_CUR);
      int ntotal = lseek(infd, 0, SEEK_END);
      (void)lseek(infd, ncurrent, SEEK_SET);
      printf("iteration %d, encoded %d (%s), processed %d of %d (%.2f%%)\n",
             i, id, t->decode(t, id), ncurrent, ntotal,
             ((float)ncurrent / ntotal) * 100);
      if (ntotal - ncurrent < 100) break; /* close enough to EOF; stop */
    }
  }
  close(infd);
  if (close(outfd) < 0) ERROR("failed to close %s\n", fname[1]);
}
#define tokenize_whole_file(T, ...) tokenize_whole_file(T, (char*[2]){__VA_ARGS__})
int main(int argc, char *argv[]) {
tokenizer_t *tokenizer = Tokenizer(NULL);
char *dataset_path = "data/dataset.txt",
*dataset_output_path = "bin/dataset.bin",
*vocab_path = "data/vocab.txt",
*vocab_output_path = NULL,
*tokenizer_path = NULL,
*tokenizer_output_path = "bin/tokenizer.bin";
for (int i = 1; i < argc; i += 2) {
if (strequ(argv[i], "--dataset_path")) dataset_path = argv[i + 1], printf("setting dataset path to \"%s\"\n", dataset_path);
else if (strequ(argv[i], "--dataset_output_path")) dataset_output_path = argv[i + 1], printf("setting dataset output path to \"%s\"\n", dataset_output_path);
else if (strequ(argv[i], "--vocab_path")) vocab_path = argv[i + 1], printf("setting vocabulary path to \"%s\"\n", vocab_path);
else if (strequ(argv[i], "--vocab_output_path")) vocab_output_path = argv[i + 1], printf("setting vocabulary output path to \"%s\"\n", vocab_output_path);
else if (strequ(argv[i], "--tokenizer_path")) tokenizer_path = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n", tokenizer_path);
else if (strequ(argv[i], "--tokenizer_output_path")) tokenizer_output_path = argv[i + 1], printf("setting tokenizer output path to \"%s\"\n", tokenizer_output_path);
else if (strequ(argv[i], "--help")) help(argc, argv);
else ERROR("unknown option \"%s\"\n", argv[i]);
}
CALLBACK_ON_FILE(vocab_path,
INFO("loading vocabulary from \"%s\"\n", vocab_path);
tokenizer->load_vocab(tokenizer, vocab_path);
tokenizer_print_vocab(tokenizer);
);
CALLBACK_ON_FILE(tokenizer_path,
INFO("loading tokenizer from \"%s\"\n", tokenizer_path);
tokenizer->load_tokenizer(tokenizer, tokenizer_path);
);
CALLBACK_ON_FILE(vocab_output_path,
INFO("exporting vocabulary to \"%s\"\n", vocab_output_path);
tokenizer->save_vocab(tokenizer, vocab_output_path)
);
CALLBACK_ON_FILE(tokenizer_output_path,
INFO("exporting tokenizer to \"%s\"\n", tokenizer_output_path);
tokenizer->save_tokenizer(tokenizer, tokenizer_output_path)
);
CALLBACK_ON_FILE(dataset_path,
CALLBACK_ON_FILE(dataset_output_path,
INFO("loading dataset from \"%s\" and exporting it to \"%s\"\n", dataset_path, dataset_output_path);
tokenize_whole_file(tokenizer, dataset_path, dataset_output_path);
);
);
return EXIT_SUCCESS;
}
|