File size: 2,123 Bytes
266d7bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from langchain_text_splitters import RecursiveCharacterTextSplitter

from src.config import settings


class TextSplitter:
    """Wrapper around LangChain's RecursiveCharacterTextSplitter for splitting
    Markdown or HTML text into chunks with optional overlap.

    Allows custom chunk sizes, overlaps, and separators, falling back
    to settings defaults if not provided.
    """

    # Built-in separator fallback, tried in order: horizontal rules, blank
    # lines, code fences, headings, lines starting with bold text, single
    # line breaks, sentence ends, spaces, and finally individual characters.
    _DEFAULT_SEPARATORS: tuple[str, ...] = (
        "\n---\n",
        "\n\n",
        "\n```\n",
        "\n## ",
        "\n# ",
        "\n**",
        "\n",
        ". ",
        "! ",
        "? ",
        " ",
        "",
    )

    def __init__(
        self,
        chunk_size: int | None = None,
        chunk_overlap: int | None = None,
        separators: list[str] | None = None,
    ):
        """Initialize a TextSplitter instance.

        Args:
            chunk_size (int | None): Maximum size of each chunk. Defaults to
                `settings.text_splitter.chunk_size` when omitted (or 0).
            chunk_overlap (int | None): Number of overlapping characters between chunks.
                Defaults to `settings.text_splitter.chunk_overlap` when None.
                An explicit 0 is honoured and disables overlap.
            separators (list[str] | None): List of separators to use when splitting text.
                Defaults to `settings.text_splitter.separators`, or, if that is
                empty too, to the built-in Markdown-oriented list stored in
                `_DEFAULT_SEPARATORS`.

        """
        config = settings.text_splitter

        # An empty separator list is not usable, so falsy values fall through
        # to the next candidate here on purpose.
        self.separators = (
            separators or config.separators or list(self._DEFAULT_SEPARATORS)
        )

        # Zero overlap is a legitimate request; only None means "use the
        # configured default". A plain `or` would silently discard the 0.
        if chunk_overlap is None:
            chunk_overlap = config.chunk_overlap

        self.splitter = RecursiveCharacterTextSplitter(
            # A chunk size of 0 is not meaningful, so it keeps falling back
            # to the configured size exactly as before.
            chunk_size=chunk_size or config.chunk_size,
            chunk_overlap=chunk_overlap,
            separators=self.separators,
        )

    def split_text(self, text: str) -> list[str]:
        """Split the input text into chunks based on configured size, overlap, and separators.

        Args:
            text (str): The text to split.

        Returns:
            list[str]: List of text chunks, as produced by the wrapped
                RecursiveCharacterTextSplitter.

        """
        return self.splitter.split_text(text)