Spaces:
Running
Running
| from dataclasses import dataclass | |
| from typing import List | |
| import tiktoken | |
| try: | |
| from transformers import AutoTokenizer | |
| TRANSFORMERS_AVAILABLE = True | |
| except ImportError: | |
| AutoTokenizer = None | |
| TRANSFORMERS_AVAILABLE = False | |
def get_tokenizer(tokenizer_name: str = "cl100k_base"):
    """
    Resolve a tokenizer object from its name.

    tiktoken encoding names are checked first; any other name is handed to
    Hugging Face ``AutoTokenizer.from_pretrained``.

    :param tokenizer_name: tiktoken encoding name or Hugging Face model name
    :return: tokenizer instance
    :raises ValueError: if the name is not a tiktoken encoding and either
        Transformers is not installed or loading from Hugging Face fails
    """
    known_encodings = tiktoken.list_encoding_names()
    if tokenizer_name in known_encodings:
        return tiktoken.get_encoding(tokenizer_name)

    # Not a tiktoken encoding: the only remaining source is Hugging Face.
    if not TRANSFORMERS_AVAILABLE:
        raise ValueError("Hugging Face Transformers is not available, please install it first.")

    try:
        hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    except Exception as e:
        # Surface every loading failure as a single exception type, keeping the cause.
        raise ValueError(f"Failed to load tokenizer from Hugging Face: {e}") from e
    return hf_tokenizer
@dataclass
class Tokenizer:
    """
    Thin wrapper around the tokenizer returned by ``get_tokenizer``.

    :param model_name: tokenizer name passed to ``get_tokenizer``
    """

    model_name: str = "cl100k_base"

    def __post_init__(self):
        # Invoked by the dataclass-generated __init__; without the @dataclass
        # decorator this hook would never run and ``tokenizer`` would be unset.
        self.tokenizer = get_tokenizer(self.model_name)

    def encode_string(self, text: str) -> List[int]:
        """
        Encode text to tokens

        :param text: text to encode
        :return: tokens
        """
        return self.tokenizer.encode(text)

    def decode_tokens(self, tokens: List[int]) -> str:
        """
        Decode tokens to text

        :param tokens: tokens to decode
        :return: text
        """
        return self.tokenizer.decode(tokens)

    def chunk_by_token_size(
        self, content: str, overlap_token_size: int = 128, max_token_size: int = 1024
    ) -> List[dict]:
        """
        Split content into overlapping chunks measured in tokens.

        Consecutive chunks start ``max_token_size - overlap_token_size`` tokens
        apart, so each chunk repeats the last ``overlap_token_size`` tokens of
        the previous one.

        :param content: text to split
        :param overlap_token_size: number of tokens shared by neighbouring chunks
        :param max_token_size: maximum number of tokens per chunk
        :return: list of dicts with keys ``tokens`` (token count of the chunk),
            ``content`` (decoded, whitespace-stripped text) and
            ``chunk_order_index`` (0-based position of the chunk)
        :raises ValueError: if ``max_token_size`` is not positive or
            ``overlap_token_size`` is not in ``[0, max_token_size)``
        """
        if max_token_size <= 0:
            raise ValueError(
                f"max_token_size must be positive, got {max_token_size}"
            )
        # A stride <= 0 would make range() fail or silently yield nothing, and a
        # negative overlap would silently skip tokens between chunks.
        if not 0 <= overlap_token_size < max_token_size:
            raise ValueError(
                "overlap_token_size must be in [0, max_token_size), got "
                f"overlap_token_size={overlap_token_size}, "
                f"max_token_size={max_token_size}"
            )

        tokens = self.encode_string(content)
        stride = max_token_size - overlap_token_size
        results = []
        for index, start in enumerate(range(0, len(tokens), stride)):
            window = tokens[start : start + max_token_size]
            results.append(
                {
                    # Slicing clamps at the end, so the last chunk may be shorter.
                    "tokens": len(window),
                    "content": self.decode_tokens(window).strip(),
                    "chunk_order_index": index,
                }
            )
        return results