Spaces:
Sleeping
Sleeping
File size: 2,123 Bytes
266d7bc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
from src.config import settings
class TextSplitter:
"""Wrapper around LangChain's RecursiveCharacterTextSplitter for splitting
Markdown or HTML text into chunks with optional overlap.
Allows custom chunk sizes, overlaps, and separators, falling back
to settings defaults if not provided.
"""
def __init__(
self,
chunk_size: int | None = None,
chunk_overlap: int | None = None,
separators: list[str] | None = None,
):
"""Initialize a TextSplitter instance.
Args:
chunk_size (int | None): Maximum size of each chunk. Defaults to
`settings.text_splitter.chunk_size`.
chunk_overlap (int | None): Number of overlapping characters between chunks.
Defaults to `settings.text_splitter.chunk_overlap`.
separators (list[str] | None): List of separators to use when splitting text.
Defaults to `settings.text_splitter.separators` or
["\n\n", "\n", ".", "!", "?", " ", ""].
"""
config = settings.text_splitter
self.separators = (
separators
or config.separators
or [
"\n---\n",
"\n\n",
"\n```\n",
"\n## ",
"\n# ",
"\n**",
"\n",
". ",
"! ",
"? ",
" ",
"",
]
)
self.splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size or config.chunk_size,
chunk_overlap=chunk_overlap or config.chunk_overlap,
separators=self.separators,
)
def split_text(self, text: str) -> list[str]:
"""Split the input text into chunks based on configured size, overlap, and separators.
Args:
text (str): The text to split.
Returns:
list[str]: List of text chunks.
"""
return self.splitter.split_text(text)
|