"""Export a SentencePiece tokenizer's vocabulary to a JSON file.

Loads a trained SentencePiece model, builds a {piece: id} mapping over the
full vocabulary, and writes it out as pretty-printed UTF-8 JSON.
"""

import json

# Original hard-coded locations, kept as overridable defaults so the script
# still behaves the same when run directly.
MODEL_PATH = "/Users/apple/Desktop/indictrans2/IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/my_tokenizer/custom_tokenizer.model"
VOCAB_SAVE_PATH = "/Users/apple/Desktop/indictrans2/IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/my_tokenizer/vocab.json"


def build_vocab(sp) -> dict:
    """Return a {piece: id} dict for every piece in processor *sp*.

    *sp* is any object exposing the SentencePiece processor interface
    (``GetPieceSize()`` and ``IdToPiece(i)``). Ids are assigned in model
    order, so the mapping round-trips with the model's own numbering.
    """
    return {sp.IdToPiece(i): i for i in range(sp.GetPieceSize())}


def main(model_path: str = MODEL_PATH, vocab_save_path: str = VOCAB_SAVE_PATH) -> None:
    """Load the model at *model_path* and dump its vocab to *vocab_save_path*."""
    # Third-party dependency imported lazily so build_vocab stays importable
    # even where sentencepiece is not installed.
    import sentencepiece as spm

    sp = spm.SentencePieceProcessor()
    sp.Load(model_path)

    vocab = build_vocab(sp)
    print(f"Total vocab size: {len(vocab)}")

    # ensure_ascii=False keeps Indic-script pieces human-readable in the JSON.
    with open(vocab_save_path, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)

    print(f"Vocab file saved at: {vocab_save_path}")


if __name__ == "__main__":
    main()