Spaces:
Running
Running
File size: 1,139 Bytes
43d27f2 31086ae 43d27f2 31086ae 43d27f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
from typing import Any
from graphgen.models.splitter.recursive_character_splitter import (
RecursiveCharacterSplitter,
)
class MarkdownTextRefSplitter(RecursiveCharacterSplitter):
    """Recursive character splitter preconfigured with Markdown separators.

    The only thing this subclass adds is a fixed separator list, arranged
    from the coarsest Markdown boundary (headings) down to the empty string.
    All other configuration is forwarded to ``RecursiveCharacterSplitter``.
    """

    def __init__(self, **kwargs: Any) -> None:
        """Assemble the Markdown separator list and hand it to the base class.

        Args:
            **kwargs: Forwarded unchanged to
                ``RecursiveCharacterSplitter.__init__``.
        """
        # NOTE(review): the heading and horizontal-rule entries use regex
        # syntax (`#{1,6}`, `\*`, `+`), yet no regex-related option is passed
        # to the base class here. Confirm that RecursiveCharacterSplitter
        # interprets separators as regular expressions by default; otherwise
        # these entries only match their literal text.

        # ATX headings: one to six '#' characters followed by a space.
        # Setext headings (text underlined with '---' or '===') are not
        # covered by this pattern.
        heading_breaks = ["\n#{1,6} "]

        # A fence line terminated by a newline, i.e. the end of a code block.
        code_fence_breaks = ["```\n"]

        # Horizontal rules made of three or more '*', '-' or '_' characters on
        # their own line. Variants containing spaces (e.g. '* * *') are not
        # covered.
        horizontal_rule_breaks = [
            "\n\\*\\*\\*+\n",
            "\n---+\n",
            "\n___+\n",
        ]

        # Generic fallbacks: blank line, single newline, space, and finally
        # the empty string.
        whitespace_fallbacks = ["\n\n", "\n", " ", ""]

        super().__init__(
            separators=[
                *heading_breaks,
                *code_fence_breaks,
                *horizontal_rule_breaks,
                *whitespace_fallbacks,
            ],
            **kwargs,
        )
|