from typing import List

from graphgen.bases import BaseTokenizer

from .tiktoken_tokenizer import TiktokenTokenizer

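# Optional dependency: HuggingFace tokenizers are only required for model
# names that are not built-in tiktoken encodings.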
try:
    from transformers import AutoTokenizer

    _HF_AVAILABLE = True
except ImportError:
    _HF_AVAILABLE = False


def get_tokenizer_impl(tokenizer_name: str = "cl100k_base") -> BaseTokenizer:
    """Resolve the concrete tokenizer: tiktoken encodings first, HuggingFace otherwise."""
    import tiktoken

    # 1. tiktoken (built-in encodings such as cl100k_base)
    if tokenizer_name in tiktoken.list_encoding_names():
        return TiktokenTokenizer(model_name=tokenizer_name)

    # 2. HuggingFace
    if _HF_AVAILABLE:
        from .hf_tokenizer import HFTokenizer

        return HFTokenizer(model_name=tokenizer_name)

    raise ValueError(
        f"Unknown tiktoken encoding {tokenizer_name!r} and HuggingFace "
        "`transformers` is not installed."
    )


class Tokenizer(BaseTokenizer):
    """
    Facade over the available tokenization implementations; the concrete
    backend (tiktoken or HuggingFace) is selected from the given model name.
    """

    def __init__(self, model_name: str = "cl100k_base"):
        super().__init__(model_name)
        if not self.model_name:
            raise ValueError("TOKENIZER_MODEL must be set in the environment variables.")
        self._impl = get_tokenizer_impl(self.model_name)

    def encode(self, text: str) -> List[int]:
        return self._impl.encode(text)

    def decode(self, token_ids: List[int]) -> str:
        return self._impl.decode(token_ids)

    def count_tokens(self, text: str) -> int:
        return self._impl.count_tokens(text)
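

if __name__ == "__main__":
    # Minimal usage sketch: exercises the facade with the default cl100k_base
    # encoding, which resolves to the tiktoken backend; any other model name
    # would fall through to the HuggingFace path if `transformers` is installed.
    tokenizer = Tokenizer(model_name="cl100k_base")
    token_ids = tokenizer.encode("Hello, GraphGen!")
    print(token_ids)
    print(tokenizer.decode(token_ids))
    print(tokenizer.count_tokens("Hello, GraphGen!"))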