Spaces:
Running
Running
File size: 981 Bytes
43d27f2 31086ae 43d27f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
import re
from typing import Any, List
from graphgen.bases.base_splitter import BaseSplitter
class CharacterSplitter(BaseSplitter):
"""Splitting text that looks at characters."""
def __init__(
self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs: Any
) -> None:
"""Create a new TextSplitter."""
super().__init__(**kwargs)
self._separator = separator
self._is_separator_regex = is_separator_regex
def split_text(self, text: str) -> List[str]:
"""Split incoming text and return chunks."""
# First we naively chunk the large input into a bunch of smaller ones.
separator = (
self._separator if self._is_separator_regex else re.escape(self._separator)
)
splits = self._split_text_with_regex(text, separator, self.keep_separator)
_separator = "" if self.keep_separator else self._separator
return self._merge_splits(splits, _separator)
|