File size: 1,139 Bytes
43d27f2
 
 
 
 
 
 
 
31086ae
43d27f2
 
 
 
31086ae
43d27f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from typing import Any

from graphgen.models.splitter.recursive_character_splitter import (
    RecursiveCharacterSplitter,
)


class MarkdownTextRefSplitter(RecursiveCharacterSplitter):
    """Recursive character splitter preconfigured with Markdown separators.

    The separator list is ordered from Markdown structure (headings, code
    fences, horizontal rules) down to generic whitespace and finally the
    empty string.
    """

    def __init__(self, **kwargs: Any) -> None:
        """Assemble the Markdown separator list and pass it to the base class.

        Args:
            **kwargs: Forwarded unchanged to ``RecursiveCharacterSplitter``.
        """
        # NOTE(review): several entries below are written as regular
        # expressions. Whether the base class interprets separators as regex
        # by default is not visible from this file -- confirm.

        # ATX-style headings: a newline, one to six '#', then a space.
        # Setext-style headings (text underlined with '---' or '===') are
        # not covered by this pattern.
        heading_separators = ["\n#{1,6} "]

        # Closing fence of a code block.
        code_fence_separators = ["```\n"]

        # Horizontal rules made of three or more '*', '-' or '_' on a line of
        # their own. Variants containing spaces (e.g. "* * *") are not covered.
        rule_separators = [
            "\n\\*\\*\\*+\n",
            "\n---+\n",
            "\n___+\n",
        ]

        # Generic fallbacks: blank line, line break, single space, and finally
        # the empty string.
        fallback_separators = ["\n\n", "\n", " ", ""]

        super().__init__(
            separators=[
                *heading_separators,
                *code_fence_separators,
                *rule_separators,
                *fallback_separators,
            ],
            **kwargs,
        )