# Spaces: Dyraa18 / Web-Chatbot (Sleeping)
# File size: 1,938 Bytes
# d3fc2f7

import os
import re
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer


def extract_and_clean_pdf(path: str, skip_pages: "list[int] | None" = None) -> list[str]:
    """Extract the text of each PDF page and return it cleaned, one string per page.

    Pages are numbered from 1. Pages listed in ``skip_pages`` are reported on
    stdout and left out of the result. Only ``LTTextContainer`` layout
    elements contribute text; every kept page is passed through
    ``clean_text``.

    Args:
        path: Path of the PDF file handed to ``pdfminer``'s ``extract_pages``.
        skip_pages: 1-based page numbers to omit. ``None`` or an empty
            sequence keeps every page.

    Returns:
        The cleaned text of every kept page, in document order.
    """
    # A set makes the per-page membership test O(1) instead of a list scan.
    pages_to_skip = set(skip_pages or ())
    cleaned_pages = []
    # Tracked separately from the loop variable so that the summary below
    # still works when the PDF yields no pages at all (the loop variable
    # would be unbound in that case).
    total_pages = 0

    for page_number, page_layout in enumerate(extract_pages(path), start=1):
        total_pages = page_number

        if page_number in pages_to_skip:
            print(f"Halaman {page_number} dilewati.")
            continue

        page_text = "".join(
            element.get_text()
            for element in page_layout
            if isinstance(element, LTTextContainer)
        )
        cleaned_pages.append(clean_text(page_text))

    print(f"\nTotal halaman diambil: {len(cleaned_pages)} halaman (dari {total_pages} total halaman).")
    return cleaned_pages


def clean_text(text: str) -> str:
    """Normalise raw extracted text into a single tidy line.

    Steps, in order:

    1. Every run of whitespace (newlines, tabs, carriage returns, form
       feeds, non-breaking and other Unicode spaces) becomes one space.
    2. Characters outside printable ASCII and the ``À``-``ÿ`` range are
       removed.
    3. Spaces left adjacent by step 2 are collapsed.
    4. A space is inserted at lowercase→uppercase, lowercase→digit and
       digit→letter boundaries.
    5. Whitespace directly before ``. , ! ? ; :`` is removed.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Convert *all* whitespace to spaces before filtering. Filtering first
    # would delete separators such as "\r", "\f" or NBSP ("\xa0") outright
    # and glue the surrounding words together.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x20-\x7EÀ-ÿ]', '', text)
    # Removing a character that sat between two spaces leaves a double space.
    text = re.sub(r' {2,}', ' ', text)
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    text = re.sub(r'([a-z])([0-9])', r'\1 \2', text)
    text = re.sub(r'([0-9])([a-zA-Z])', r'\1 \2', text)
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)
    return text.strip()


def save_cleaned_text(cleaned_pages: list[str], output_path: str):
    """Write the cleaned pages to a UTF-8 text file.

    Each page is written followed by a blank line. Missing parent
    directories of ``output_path`` are created. An existing file is
    overwritten. A confirmation message is printed afterwards.

    Args:
        cleaned_pages: Page texts to write, in order.
        output_path: Destination file path; may be a bare file name.
    """
    # os.path.dirname() returns "" for a bare file name, and
    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when the path actually has one.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(page + "\n\n" for page in cleaned_pages)

    print(f"File teks berhasil disimpan ke:\n{output_path}")


if __name__ == "__main__":
    # Raw strings: the Windows paths contain backslashes followed by letters
    # (\W, \D, \P, \C) that are invalid escape sequences in a normal string
    # literal and trigger a warning on recent Python versions. The resulting
    # path values are unchanged.
    pdf_path = r"D:\Webchatbot\Dataset\Penjas\PJOK_BS_KLS_V.pdf"
    output_txt = r"D:\Webchatbot\Dataset\Penjas\Clean\Penjas Kelas V.txt"

    # 1-based page numbers to leave out: pages 1-14 and 188-207.
    halaman_dihapus = [*range(1, 15), *range(188, 208)]
    hasil = extract_and_clean_pdf(pdf_path, skip_pages=halaman_dihapus)

    if hasil:
        save_cleaned_text(hasil, output_txt)
    else:
        print("Tidak ada halaman yang diekstrak.")