import os
import re

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer


def extract_and_clean_pdf(path: str, skip_pages: list[int] | None = None) -> list[str]:
    """Extract text from a PDF page by page, skip the given page numbers, and clean each kept page."""
    skip_pages = skip_pages or []
    cleaned_pages = []
    for i, page_layout in enumerate(extract_pages(path), start=1):
        if i in skip_pages:
            print(f"Page {i} skipped.")
            continue
        page_text = ""
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                page_text += element.get_text()
        cleaned_text = clean_text(page_text)
        cleaned_pages.append(cleaned_text)
    print(f"\nTotal pages extracted: {len(cleaned_pages)} pages (out of {i} total pages).")
    return cleaned_pages


def clean_text(text: str) -> str:
    """Normalize whitespace, drop non-printable characters, and fix spacing around words and punctuation."""
    text = text.replace("\n", " ").replace("\t", " ")
    text = re.sub(r'[^\x20-\x7EÀ-ÿ]', '', text)         # keep printable ASCII and Latin-1 characters only
    text = re.sub(r'\s+', ' ', text)                     # collapse repeated whitespace
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)     # split words fused as "wordWord"
    text = re.sub(r'([a-z])([0-9])', r'\1 \2', text)     # split letter-digit runs
    text = re.sub(r'([0-9])([a-zA-Z])', r'\1 \2', text)  # split digit-letter runs
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)         # remove space before punctuation
    return text.strip()


def save_cleaned_text(cleaned_pages: list[str], output_path: str):
    """Write each cleaned page to the output file, separated by blank lines."""
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for page in cleaned_pages:
            f.write(page + "\n\n")
    print(f"Text file saved to:\n{output_path}")


if __name__ == "__main__":
    pdf_path = r"D:\Webchatbot\Dataset\Penjas\PJOK_BS_KLS_V.pdf"
    output_txt = r"D:\Webchatbot\Dataset\Penjas\Clean\Penjas Kelas V.txt"
    # Skip front matter (pages 1-14) and back matter (pages 188-207).
    halaman_dihapus = list(range(1, 15)) + list(range(188, 208))
    hasil = extract_and_clean_pdf(pdf_path, skip_pages=halaman_dihapus)
    if hasil:
        save_cleaned_text(hasil, output_txt)
    else:
        print("No pages were extracted.")