Spaces:
Build error
Build error
| import markdown | |
| import pdfplumber | |
| from pathlib import Path | |
| from tts_ui.utils import split_text_into_chunks, extract_text_from_epub, text_from_file | |
| class DocumentProcessor: | |
| def __init__(self, max_word_chunk_size=4000): | |
| self.max_word_chunk_size: int = max_word_chunk_size # Characters per chunk | |
| def process_doc(self, file_path: Path) -> list[str]: | |
| # get the file extension from the path | |
| ext: str = file_path.name.split(".")[-1].lower() | |
| match ext: | |
| case "pdf": | |
| return self._process_pdf(file_path) | |
| case "epub": | |
| return self._process_epub(file_path) | |
| case "md": | |
| return self._process_markdown(file_path) | |
| case "txt": | |
| return self._process_text(file_path) | |
| case _: | |
| raise Exception(f"No file found in {file_path}") | |
| def _process_pdf(self, file_path: str) -> list[str]: | |
| text = "" | |
| with pdfplumber.open(file_path) as pdf: | |
| for page in pdf.pages: | |
| text += page.extract_text() + "\n" | |
| return self._chunk_text(text) | |
| def _process_epub(self, file_path: str) -> list[str]: | |
| text = extract_text_from_epub(file_path) | |
| return self._chunk_text(text) | |
| def _process_markdown(self, file_path: str) -> list[str]: | |
| with open(file_path, "r") as f: | |
| md_text: str = f.read() | |
| return self._chunk_text(markdown.markdown(md_text)) | |
| def _process_text(self, file_path: str) -> list[str]: | |
| text = text_from_file(file_path) | |
| return self._chunk_text(text) | |
| def _chunk_text(self, text: str) -> list[str]: | |
| return split_text_into_chunks(text, self.max_word_chunk_size) | |