Spaces:
Sleeping
Sleeping
File size: 703 Bytes
94f5c4b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
import os
from pypdf import PdfReader
from config.rag_config import RAGConfig
def load_pdfs(pdf_dir=None):
    """Extract the text of every PDF file found directly inside a directory.

    Args:
        pdf_dir: Directory to scan. When falsy (``None`` or empty), the
            ``pdf_dir`` attribute of a freshly constructed ``RAGConfig`` is
            used instead.

    Returns:
        A list with one string per PDF, ordered by filename. Each string is
        the text of all pages of that PDF joined with newlines.
    """
    pdf_dir = pdf_dir or RAGConfig().pdf_dir
    texts = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # result reproducible across runs and platforms.
    for filename in sorted(os.listdir(pdf_dir)):
        # Match the extension case-insensitively so "REPORT.PDF" is not skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        reader = PdfReader(os.path.join(pdf_dir, filename))
        # `or ""` substitutes an empty string when a page yields no text, so
        # the join never receives a non-string value.
        texts.append(
            "\n".join(page.extract_text() or "" for page in reader.pages)
        )
    return texts
def chunk_text(text, chunk_size=500, overlap=100):
    """Split text into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size`` and, near the end of
    the text, may consist only of characters already present in the
    previous chunk.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between neighbouring chunks;
            must be smaller than ``chunk_size``.

    Returns:
        A list of chunk strings in order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    step = chunk_size - overlap
    # A non-positive step would never advance the window and the loop would
    # run forever, so reject it up front.
    if step <= 0:
        raise ValueError("overlap must be smaller than chunk_size")
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), step)
    ]
|