# hf-rag-multi / src/pdf_loader.py
# Hugging Face file-viewer header (not part of the program):
# uploaded by siyu618, "Upload 18 files", commit 94f5c4b (verified), 703 Bytes.
import os
from pypdf import PdfReader
from config.rag_config import RAGConfig
def load_pdfs(pdf_dir=None):
    """Extract the text of every PDF file found directly inside a directory.

    Args:
        pdf_dir: Directory to scan. When falsy (``None`` or ``""``), the
            ``pdf_dir`` attribute of a freshly built ``RAGConfig`` is used.

    Returns:
        A list with one string per PDF, ordered by file name. Each string is
        the text of all pages of that PDF joined with newlines.
    """
    pdf_dir = pdf_dir or RAGConfig().pdf_dir
    texts = []
    # os.listdir yields names in arbitrary order; sorting keeps the result
    # reproducible across runs and platforms.
    for filename in sorted(os.listdir(pdf_dir)):
        # Match the extension case-insensitively so "REPORT.PDF" is not skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        reader = PdfReader(os.path.join(pdf_dir, filename))
        # A page may yield no text; substitute "" so join never receives None.
        texts.append(
            "\n".join(page.extract_text() or "" for page in reader.pages)
        )
    return texts
def chunk_text(text, chunk_size=500, overlap=100):
    """Split ``text`` into fixed-size, overlapping character windows.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between neighbouring chunks;
            must be smaller than ``chunk_size``. A negative value makes the
            stride larger than ``chunk_size``, leaving gaps between chunks.

    Returns:
        A list of chunk strings; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        # A non-positive stride would never advance through the text and the
        # loop would run forever.
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]