Spaces:
Sleeping
Sleeping
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from src.config import settings | |
| class TextSplitter: | |
| """Wrapper around LangChain's RecursiveCharacterTextSplitter for splitting | |
| Markdown or HTML text into chunks with optional overlap. | |
| Allows custom chunk sizes, overlaps, and separators, falling back | |
| to settings defaults if not provided. | |
| """ | |
| def __init__( | |
| self, | |
| chunk_size: int | None = None, | |
| chunk_overlap: int | None = None, | |
| separators: list[str] | None = None, | |
| ): | |
| """Initialize a TextSplitter instance. | |
| Args: | |
| chunk_size (int | None): Maximum size of each chunk. Defaults to | |
| `settings.text_splitter.chunk_size`. | |
| chunk_overlap (int | None): Number of overlapping characters between chunks. | |
| Defaults to `settings.text_splitter.chunk_overlap`. | |
| separators (list[str] | None): List of separators to use when splitting text. | |
| Defaults to `settings.text_splitter.separators` or | |
| ["\n\n", "\n", ".", "!", "?", " ", ""]. | |
| """ | |
| config = settings.text_splitter | |
| self.separators = ( | |
| separators | |
| or config.separators | |
| or [ | |
| "\n---\n", | |
| "\n\n", | |
| "\n```\n", | |
| "\n## ", | |
| "\n# ", | |
| "\n**", | |
| "\n", | |
| ". ", | |
| "! ", | |
| "? ", | |
| " ", | |
| "", | |
| ] | |
| ) | |
| self.splitter = RecursiveCharacterTextSplitter( | |
| chunk_size=chunk_size or config.chunk_size, | |
| chunk_overlap=chunk_overlap or config.chunk_overlap, | |
| separators=self.separators, | |
| ) | |
| def split_text(self, text: str) -> list[str]: | |
| """Split the input text into chunks based on configured size, overlap, and separators. | |
| Args: | |
| text (str): The text to split. | |
| Returns: | |
| list[str]: List of text chunks. | |
| """ | |
| return self.splitter.split_text(text) | |