"""Train the Sovereign BPE tokenizer for Indic and Ecological technical terms."""
import os
import sys

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import TemplateProcessing


def train_sovereign_tokenizer(
    corpus_path,
    vocab_size=50257,
    output_path="data/processed/aravalli_tokenizer.json",
):
    """Train a custom BPE tokenizer on all ``.txt`` files under *corpus_path*.

    Optimized for Indic and Ecological technical terms.
    Target vocab: 50,257 tokens (matches model_config.yaml).

    Args:
        corpus_path: Directory containing the training corpus (``.txt`` files).
        vocab_size: Target vocabulary size for the BPE trainer.
        output_path: Where to save the trained tokenizer JSON
            (parent directories are created if missing).

    Raises:
        FileNotFoundError: If *corpus_path* contains no ``.txt`` files.
    """
    # 1. Initialize an empty BPE model.
    # ByteLevel pre-tokenization ensures any UTF-8 input is representable
    # without falling back to the <|unk|> token.
    tokenizer = Tokenizer(BPE(unk_token="<|unk|>"))

    # 2. Pre-tokenizer: treat text as a byte sequence.
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)

    # 3. Trainer, including special tokens for GOEC protocols (SN, NE, IPN).
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[
            "<|endoftext|>",
            "<|unk|>",
            "<|pad|>",
            "CATEGORY_SN",
            "CATEGORY_NE",
            "CATEGORY_IPN",
        ],
        show_progress=True,
        # Seed the vocab with the full byte alphabet so every byte has a token.
        initial_alphabet=ByteLevel.alphabet(),
    )

    # 4. Collect corpus files and train.
    files = [
        os.path.join(corpus_path, name)
        for name in os.listdir(corpus_path)
        if name.endswith(".txt")
    ]
    # BUG FIX: the original trained even when the directory was empty,
    # silently producing a degenerate vocabulary. Fail fast instead.
    if not files:
        raise FileNotFoundError(
            f"No .txt corpus files found in {corpus_path!r}; "
            "cannot train the tokenizer."
        )
    print(f"Commencing Tokenizer Training on {corpus_path}...")
    tokenizer.train(files, trainer)

    # 5. Post-processing: append <|endoftext|> to every sequence.
    # BUG FIX: look up the real token id instead of hard-coding 0, which is
    # only correct while <|endoftext|> happens to be the first special token.
    eot_id = tokenizer.token_to_id("<|endoftext|>")
    tokenizer.post_processor = TemplateProcessing(
        single="$A <|endoftext|>",
        special_tokens=[("<|endoftext|>", eot_id)],
    )

    # 6. Save the Sovereign Lens.
    # BUG FIX: ensure the output directory exists before saving.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    tokenizer.save(output_path)
    print(f"Sovereign Tokenizer Enacted and Saved to {output_path}")


if __name__ == "__main__":
    corpus_dir = "data/raw/"
    # Keep the original convenience of creating the corpus directory, but
    # BUG FIX: do not proceed to train on a freshly created (empty) directory.
    os.makedirs(corpus_dir, exist_ok=True)
    try:
        train_sovereign_tokenizer(corpus_dir)
    except FileNotFoundError as err:
        sys.exit(f"{err} Place training .txt files in {corpus_dir} and rerun.")