from __future__ import annotations

from abc import ABC, abstractmethod
from typing import List


class BaseTokenizer(ABC):
    def __init__(self, model_name: str = "cl100k_base"):
        self.model_name = model_name

    @abstractmethod
    def encode(self, text: str) -> List[int]:
        """Encode text -> token ids."""
        raise NotImplementedError

    @abstractmethod
    def decode(self, token_ids: List[int]) -> str:
        """Decode token ids -> text."""
        raise NotImplementedError

    def count_tokens(self, text: str) -> int:
        return len(self.encode(text))

    def chunk_by_token_size(
        self,
        content: str,
        *,
        overlap_token_size: int = 128,
        max_token_size: int = 1024,
    ) -> List[dict]:
        """Split content into chunks of at most max_token_size tokens.

        Consecutive chunks overlap by overlap_token_size tokens, so the
        window advances by max_token_size - overlap_token_size each step.
        """
        if overlap_token_size >= max_token_size:
            raise ValueError(
                "overlap_token_size must be smaller than max_token_size"
            )
        tokens = self.encode(content)
        results = []
        step = max_token_size - overlap_token_size
        for index, start in enumerate(range(0, len(tokens), step)):
            chunk_ids = tokens[start : start + max_token_size]
            results.append(
                {
                    "tokens": len(chunk_ids),
                    "content": self.decode(chunk_ids).strip(),
                    "chunk_order_index": index,
                }
            )
        return results
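

# Illustrative sketch: a minimal concrete subclass backed by tiktoken, which
# the "cl100k_base" default suggests but the base class does not require.
# The class name `TiktokenTokenizer` and the lazy import are assumptions made
# for this example only; implementers just need to supply encode() and decode().
class TiktokenTokenizer(BaseTokenizer):
    def __init__(self, model_name: str = "cl100k_base"):
        super().__init__(model_name)
        import tiktoken  # assumed optional dependency, imported lazily

        self._encoding = tiktoken.get_encoding(model_name)

    def encode(self, text: str) -> List[int]:
        return self._encoding.encode(text)

    def decode(self, token_ids: List[int]) -> str:
        return self._encoding.decode(token_ids)


# Example usage (hypothetical values):
#     tokenizer = TiktokenTokenizer()
#     chunks = tokenizer.chunk_by_token_size(
#         long_document, overlap_token_size=128, max_token_size=1024
#     )
#     # Each chunk dict carries "tokens", "content", and "chunk_order_index".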