Spaces:
Sleeping
Sleeping
| import os | |
| from pypdf import PdfReader | |
| from config.rag_config import RAGConfig | |
def load_pdfs(pdf_dir=None):
    """Extract the text of every PDF file found directly inside a directory.

    Args:
        pdf_dir: Directory to scan. When falsy (``None`` or ``""``), the
            ``pdf_dir`` attribute of a freshly constructed ``RAGConfig`` is
            used instead.

    Returns:
        A list of strings, one per PDF, in sorted filename order. Each string
        is the text of the document's pages joined with newlines; a page for
        which ``extract_text()`` returns nothing contributes an empty string.
    """
    pdf_dir = pdf_dir or RAGConfig().pdf_dir
    texts = []
    # os.listdir() yields entries in arbitrary order; sort so the returned
    # list is ordered the same way on every platform and every run.
    for filename in sorted(os.listdir(pdf_dir)):
        # Match the extension case-insensitively so "REPORT.PDF" is not
        # silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        path = os.path.join(pdf_dir, filename)
        # A directory can also be named "*.pdf"; only real files are readable.
        if not os.path.isfile(path):
            continue
        reader = PdfReader(path)
        # extract_text() may return None/"" for a page; normalise to "".
        texts.append(
            "\n".join(page.extract_text() or "" for page in reader.pages)
        )
    return texts
def chunk_text(text, chunk_size=500, overlap=100):
    """Split ``text`` into fixed-size, overlapping character windows.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between neighbouring chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of substrings of ``text``; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would make the window never advance (an
        # endless loop in the naive implementation); a negative overlap
        # would silently skip text between chunks.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), step)
    ]