from __future__ import annotations

from abc import ABC, abstractmethod
from typing import List


class BaseTokenizer(ABC):
    """Minimal tokenizer interface with token-based chunking."""

    def __init__(self, model_name: str = "cl100k_base"):
        self.model_name = model_name

    @abstractmethod
    def encode(self, text: str) -> List[int]:
        """Encode text -> token ids."""
        raise NotImplementedError

    @abstractmethod
    def decode(self, token_ids: List[int]) -> str:
        """Decode token ids -> text."""
        raise NotImplementedError

    def count_tokens(self, text: str) -> int:
        return len(self.encode(text))

    def chunk_by_token_size(
        self,
        content: str,
        *,
        overlap_token_size: int = 128,
        max_token_size: int = 1024,
    ) -> List[dict]:
        """Split content into chunks of at most max_token_size tokens,
        with consecutive chunks sharing overlap_token_size tokens."""
        if overlap_token_size >= max_token_size:
            raise ValueError("overlap_token_size must be smaller than max_token_size")
        tokens = self.encode(content)
        results: List[dict] = []
        # Slide a fixed-size window over the token stream, advancing by the
        # non-overlapping portion each step so adjacent chunks share context.
        step = max_token_size - overlap_token_size
        for index, start in enumerate(range(0, len(tokens), step)):
            chunk_ids = tokens[start : start + max_token_size]
            results.append(
                {
                    "tokens": len(chunk_ids),
                    "content": self.decode(chunk_ids).strip(),
                    "chunk_order_index": index,
                }
            )
        return results
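

# A minimal usage sketch, not a definitive implementation: a concrete subclass
# backed by the tiktoken package, which the "cl100k_base" default above
# suggests but which is an assumption here. Any encode/decode pair that
# round-trips text through integer token ids would work equally well.

import tiktoken


class TiktokenTokenizer(BaseTokenizer):
    def __init__(self, model_name: str = "cl100k_base"):
        super().__init__(model_name)
        # get_encoding loads the named byte-pair encoding shipped with tiktoken.
        self._encoding = tiktoken.get_encoding(model_name)

    def encode(self, text: str) -> List[int]:
        return self._encoding.encode(text)

    def decode(self, token_ids: List[int]) -> str:
        return self._encoding.decode(token_ids)


if __name__ == "__main__":
    tokenizer = TiktokenTokenizer()
    chunks = tokenizer.chunk_by_token_size(
        "some long document text " * 500,
        overlap_token_size=128,
        max_token_size=1024,
    )
    # Each chunk dict carries its token count, decoded text, and order index.
    print(len(chunks), chunks[0]["tokens"], chunks[0]["chunk_order_index"])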