Spaces:
Running
Running
File size: 1,262 Bytes
3a3b216 8c66169 3a3b216 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import List
class BaseTokenizer(ABC):
    """Abstract base class for tokenizers.

    Subclasses implement the ``encode``/``decode`` pair; this base class
    provides token counting and overlapping chunking on top of them.
    """

    def __init__(self, model_name: str = "cl100k_base"):
        # Name of the underlying tokenizer model (e.g. a tiktoken
        # encoding name) — stored for subclasses to use.
        self.model_name = model_name

    @abstractmethod
    def encode(self, text: str) -> List[int]:
        """Encode text -> token ids."""
        raise NotImplementedError

    @abstractmethod
    def decode(self, token_ids: List[int]) -> str:
        """Decode token ids -> text."""
        raise NotImplementedError

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens ``encode`` produces for *text*."""
        return len(self.encode(text))

    def chunk_by_token_size(
        self,
        content: str,
        *,
        overlap_token_size: int = 128,
        max_token_size: int = 1024,
    ) -> List[dict]:
        """Split *content* into chunks of at most *max_token_size* tokens.

        Consecutive chunks overlap by *overlap_token_size* tokens. Each
        returned dict holds the chunk's token count (``"tokens"``), its
        decoded, whitespace-stripped text (``"content"``), and its
        position (``"chunk_order_index"``). Empty *content* yields ``[]``.

        Raises:
            ValueError: if *max_token_size* is not positive, if
                *overlap_token_size* is negative, or if the overlap is
                not smaller than the chunk size (no forward progress).
        """
        if max_token_size <= 0:
            raise ValueError("max_token_size must be positive")
        if overlap_token_size < 0:
            raise ValueError("overlap_token_size must be non-negative")
        if overlap_token_size >= max_token_size:
            # Without this guard the step below is <= 0: range() either
            # raises an opaque error (step == 0) or is empty (step < 0),
            # silently discarding the entire input.
            raise ValueError(
                "overlap_token_size must be smaller than max_token_size"
            )

        tokens = self.encode(content)
        results: List[dict] = []
        step = max_token_size - overlap_token_size
        for index, start in enumerate(range(0, len(tokens), step)):
            chunk_ids = tokens[start : start + max_token_size]
            results.append(
                {
                    "tokens": len(chunk_ids),
                    "content": self.decode(chunk_ids).strip(),
                    "chunk_order_index": index,
                }
            )
        return results
|