Spaces:
Running
Running
File size: 1,394 Bytes
3a3b216 e772457 3a3b216 8c66169 817f16e 3a3b216 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
from typing import List
from graphgen.bases import BaseTokenizer
from .tiktoken_tokenizer import TiktokenTokenizer
try:
from transformers import AutoTokenizer
_HF_AVAILABLE = True
except ImportError:
_HF_AVAILABLE = False
def get_tokenizer_impl(tokenizer_name: str = "cl100k_base") -> BaseTokenizer:
    """Resolve *tokenizer_name* to a concrete tokenizer implementation.

    Resolution order:
      1. tiktoken, when the name is one of its known encodings;
      2. HuggingFace ``AutoTokenizer``, when ``transformers`` is installed.

    Args:
        tokenizer_name: A tiktoken encoding name (e.g. ``"cl100k_base"``)
            or a HuggingFace model identifier.

    Returns:
        A ``BaseTokenizer`` subclass instance wrapping the chosen backend.

    Raises:
        ValueError: If the name is unknown to tiktoken and the HuggingFace
            fallback is unavailable.
    """
    # 1. tiktoken — guard the import so a missing tiktoken installation
    # falls through to the HuggingFace path instead of raising ImportError.
    try:
        import tiktoken

        if tokenizer_name in tiktoken.list_encoding_names():
            return TiktokenTokenizer(model_name=tokenizer_name)
    except ImportError:
        pass
    # 2. HuggingFace
    if _HF_AVAILABLE:
        from .hf_tokenizer import HFTokenizer

        return HFTokenizer(model_name=tokenizer_name)
    raise ValueError(
        f"Unknown tokenizer {tokenizer_name} and HuggingFace not available."
    )
class Tokenizer(BaseTokenizer):
    """Facade over the concrete tokenizer backends.

    All work is delegated to the implementation selected by
    ``get_tokenizer_impl`` for the configured model name.
    """

    def __init__(self, model_name: str = "cl100k_base"):
        super().__init__(model_name)
        # A falsy model name means the caller never configured one.
        if not self.model_name:
            raise ValueError("TOKENIZER_MODEL must be specified in the ENV variables.")
        # Resolve the backing implementation once, up front.
        self._impl = get_tokenizer_impl(self.model_name)

    def encode(self, text: str) -> List[int]:
        """Turn *text* into a list of token ids."""
        return self._impl.encode(text)

    def decode(self, token_ids: List[int]) -> str:
        """Turn a list of token ids back into text."""
        return self._impl.decode(token_ids)

    def count_tokens(self, text: str) -> int:
        """Return how many tokens *text* encodes to."""
        return self._impl.count_tokens(text)
|