| | import os |
| | from tokenizers import Tokenizer |
| | from tokenizers.models import BPE |
| | from tokenizers.trainers import BpeTrainer |
| | from tokenizers.pre_tokenizers import Whitespace, ByteLevel |
| | from tokenizers.processors import TemplateProcessing |
| |
|
def train_sovereign_tokenizer(
    corpus_path,
    vocab_size=50257,
    output_path="data/processed/aravalli_tokenizer.json",
):
    """Train a custom byte-level BPE tokenizer for Indic/Ecological technical terms.

    Args:
        corpus_path: Directory containing the training corpus as ``.txt`` files
            (non-recursive; only files directly inside the directory are used).
        vocab_size: Target vocabulary size. Defaults to 50,257 to match
            ``model_config.yaml``.
        output_path: Where the trained tokenizer JSON is written. The parent
            directory is created if it does not exist.

    Raises:
        FileNotFoundError: If ``corpus_path`` contains no ``.txt`` files.
    """
    # Byte-level BPE with an explicit unknown token.
    tokenizer = Tokenizer(BPE(unk_token="<|unk|>"))

    # add_prefix_space=True treats the first word like any mid-sentence word,
    # so "word" and " word" merge to the same byte-level pieces.
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)

    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        # Special tokens are assigned the first ids, in this order.
        special_tokens=[
            "<|endoftext|>",
            "<|unk|>",
            "<|pad|>",
            "CATEGORY_SN",
            "CATEGORY_NE",
            "CATEGORY_IPN",
        ],
        show_progress=True,
        # Seed with the full byte alphabet so every byte is representable.
        initial_alphabet=ByteLevel.alphabet(),
    )

    print(f"Commencing Tokenizer Training on {corpus_path}...")
    files = [
        os.path.join(corpus_path, f)
        for f in os.listdir(corpus_path)
        if f.endswith(".txt")
    ]
    # Fail fast with a clear message instead of an opaque training error.
    if not files:
        raise FileNotFoundError(f"No .txt corpus files found in {corpus_path}")
    tokenizer.train(files, trainer)

    # Append <|endoftext|> to every encoded sequence. Look the id up from the
    # trained tokenizer rather than hard-coding 0, so this stays correct even
    # if the special-token ordering above ever changes.
    eot_id = tokenizer.token_to_id("<|endoftext|>")
    tokenizer.post_processor = TemplateProcessing(
        single="$A <|endoftext|>",
        special_tokens=[("<|endoftext|>", eot_id)],
    )

    # Ensure the destination directory exists before saving ("." guards
    # against a bare filename with no directory component).
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    tokenizer.save(output_path)
    print(f"Sovereign Tokenizer Enacted and Saved to {output_path}")
| |
|
| | if __name__ == "__main__": |
| | |
| | if not os.path.exists("data/raw/"): |
| | os.makedirs("data/raw/") |
| | train_sovereign_tokenizer("data/raw/") |
| |
|