Dyraa18's picture
Upload 8 files
fb2123c verified
raw
history blame
1.94 kB
import os
import re
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
def extract_and_clean_pdf(path: str, skip_pages: list[int] | None = None) -> list[str]:
    """Extract text from a PDF page by page, clean each page, and return the results.

    Args:
        path: Filesystem path to the PDF to read.
        skip_pages: 1-based page numbers to skip entirely (e.g. front/back matter).
            ``None`` means skip nothing.

    Returns:
        A list with one cleaned-text string per extracted (non-skipped) page.
    """
    # Set gives O(1) membership tests; also normalises None -> empty.
    skip = set(skip_pages or [])
    cleaned_pages: list[str] = []
    # Track the page count in a pre-initialised variable: with a bare loop
    # variable, an empty PDF would leave it unbound and the summary print
    # below would raise NameError.
    total = 0
    for total, page_layout in enumerate(extract_pages(path), start=1):
        if total in skip:
            print(f"Halaman {total} dilewati.")
            continue
        # Concatenate only the text containers; other layout elements
        # (figures, lines, ...) carry no extractable text.
        page_text = "".join(
            element.get_text()
            for element in page_layout
            if isinstance(element, LTTextContainer)
        )
        cleaned_pages.append(clean_text(page_text))
    print(f"\nTotal halaman diambil: {len(cleaned_pages)} halaman (dari {total} total halaman).")
    return cleaned_pages
def clean_text(text: str) -> str:
    """Normalise raw PDF-extracted text.

    Flattens newlines/tabs, drops characters outside printable ASCII and the
    Latin-1 accented range, collapses whitespace runs, re-inserts spaces lost
    at lowercase->uppercase and letter<->digit joins, and removes stray spaces
    before punctuation.
    """
    # Newlines and tabs become plain spaces first so later passes see one line.
    flat = text.replace("\n", " ").replace("\t", " ")
    # Keep printable ASCII (0x20-0x7E) plus the Latin-1 accented block; drop the rest.
    flat = re.sub(r'[^\x20-\x7EÀ-ÿ]', '', flat)
    # Collapse any whitespace run into a single space.
    flat = re.sub(r'\s+', ' ', flat)
    # PDF extraction often glues words together; restore the missing spaces.
    for boundary in (r'([a-z])([A-Z])', r'([a-z])([0-9])', r'([0-9])([a-zA-Z])'):
        flat = re.sub(boundary, r'\1 \2', flat)
    # No space is allowed immediately before punctuation.
    flat = re.sub(r'\s+([.,!?;:])', r'\1', flat)
    return flat.strip()
def save_cleaned_text(cleaned_pages: list[str], output_path: str) -> None:
    """Write cleaned pages to a UTF-8 text file, one page per double-newline block.

    Args:
        cleaned_pages: Cleaned text of each page, in order.
        output_path: Destination file path; parent directories are created
            as needed.
    """
    parent = os.path.dirname(output_path)
    # dirname is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError — only create directories when there is one to create.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        # Blank line between pages keeps page boundaries visible in the output.
        f.writelines(page + "\n\n" for page in cleaned_pages)
    print(f"File teks berhasil disimpan ke:\n{output_path}")
if __name__ == "__main__":
    # Raw strings for Windows paths: "\W", "\D", "\P", "\C" are invalid escape
    # sequences in a normal string literal (DeprecationWarning today, a
    # SyntaxError in future Python). The resulting values are byte-identical.
    pdf_path = r"D:\Webchatbot\Dataset\Penjas\PJOK_BS_KLS_V.pdf"
    output_txt = r"D:\Webchatbot\Dataset\Penjas\Clean\Penjas Kelas V.txt"
    # Skip front matter (pages 1-14) and back matter (pages 188-207).
    halaman_dihapus = list(range(1, 15)) + list(range(188, 208))
    hasil = extract_and_clean_pdf(pdf_path, skip_pages=halaman_dihapus)
    if hasil:
        save_cleaned_text(hasil, output_txt)
    else:
        print("Tidak ada halaman yang diekstrak.")