Dyraa18's picture
Upload 8 files
fb2123c verified
raw
history blame
3.35 kB
import os, re, sys
from typing import List, Optional, Set
import fitz
import pytesseract
from PIL import Image
from io import BytesIO
# --- Input / output locations -------------------------------------------
PDF_PATH = r"D:\Webchatbot\Dataset\Penjas\PJOK_BS_KLS_VI.pdf"
OUTPUT_TXT = r"D:\Webchatbot\Dataset\Penjas\Clean\Penjas Kelas VI.txt"

# 1-based page numbers that main() skips without running OCR. range() is
# end-exclusive, so this covers pages 1-21, 200-210 and 213-225.
SKIP_PAGES = [*range(1, 22), *range(200, 211), *range(213, 226)]

# --- OCR settings -------------------------------------------------------
# Path to the Tesseract executable; set to "" to keep pytesseract's default.
TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Language spec handed to pytesseract as its `lang` argument.
OCR_LANG = "ind+eng"
# Render resolution; main() turns it into a zoom factor of DPI / 72.
DPI = 300
# Point pytesseract at an explicit Tesseract binary when a path is configured;
# an empty TESSERACT_CMD leaves pytesseract's own setting untouched.
if TESSERACT_CMD:
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
# Matches things that look like web addresses so clean_text() can delete them:
#   * anything starting with http:// or https://
#   * anything starting with "www."
#   * a bare token containing ".<tld>" for a small list of TLDs
# The \b after the TLD list requires the TLD to end there. Without it ".co",
# ".id" or ".go" also match as the start of an ordinary word, so OCR output
# with a missing space after a full stop ("selesai.coba", "1.identifikasi")
# would be deleted as if it were a URL.
URL_RE = re.compile(
    r"(https?://\S+|www\.\S+|\b\S+\.(?:com|org|net|edu|gov|go|id|co)\b\S*)",
    flags=re.IGNORECASE,
)

# A line that consists only of a chapter heading: "bab"/"BAB" (all-lower or
# all-upper only, since no IGNORECASE flag is set), an arabic or upper-case
# roman numeral, and optionally a ":" or dash followed by a title.
# clean_text() drops such lines entirely.
BAB_LINE_RE = re.compile(
    r"^\s*(?:bab|BAB)\s*(?:[0-9]+|[IVXLCDM]+)\s*(?:[:\-–—]\s*.*)?\s*$"
)

# The chapter marker at the start of a line (case-insensitive), including an
# optional ":" or dash separator. clean_text() strips this prefix and keeps
# whatever follows it on the line.
BAB_PREFIX_RE = re.compile(
    r"^\s*(?:bab|BAB)\s*(?:[0-9]+|[IVXLCDM]+)\s*(?:[:\-–—]\s*)?",
    flags=re.IGNORECASE,
)
# Characters clean_text() keeps: tab, LF, CR, printable ASCII, and the
# Latin-1 range U+00C0-U+00FF (accented letters). Everything else is removed.
# The accented range is spelled with \u escapes on purpose: written as literal
# characters it is corrupted when the file is re-encoded, and the corrupted
# form contains a reversed range that makes the pattern fail to compile.
_DISALLOWED_CHARS_RE = re.compile(r"[^\x09\x0A\x0D\x20-\x7E\u00C0-\u00FF]")


def clean_text(text: str) -> str:
    """Normalise raw OCR text into clean, newline-separated lines.

    Steps, in order:
      1. Remove everything matched by ``URL_RE``.
      2. Replace tabs with spaces and delete characters outside the allowed
         set (see ``_DISALLOWED_CHARS_RE``).
      3. Per line: collapse runs of whitespace, drop empty lines, drop lines
         that are only a chapter heading (``BAB_LINE_RE``), and strip a
         leading chapter marker (``BAB_PREFIX_RE``) from the remaining lines.

    Args:
        text: Raw text; ``None`` or an empty string is treated as "".

    Returns:
        The cleaned lines joined with "\\n", without leading or trailing
        whitespace. Returns "" when nothing survives the cleaning.
    """
    text = URL_RE.sub("", text or "")
    text = text.replace("\t", " ")
    text = _DISALLOWED_CHARS_RE.sub("", text)

    cleaned_lines: List[str] = []
    for raw_ln in text.splitlines():
        # split()/join collapses whitespace runs and trims both ends.
        ln = " ".join(raw_ln.split())
        if not ln or BAB_LINE_RE.match(ln):
            continue
        # Keep the title text that follows a "BAB <n>" marker on the line.
        ln = BAB_PREFIX_RE.sub("", ln).strip()
        if ln:
            cleaned_lines.append(ln)

    return "\n".join(cleaned_lines).strip()
def pixmap_to_pil(pix: fitz.Pixmap) -> Image.Image:
    """Turn a PyMuPDF pixmap into a PIL image via an in-memory PNG.

    A pixmap that reports an alpha channel is first re-created with
    ``fitz.Pixmap(fitz.csRGB, pix)``; any other pixmap is encoded as-is.

    Args:
        pix: The rendered page pixmap.

    Returns:
        A PIL image opened from the PNG bytes of the pixmap.
    """
    source = fitz.Pixmap(fitz.csRGB, pix) if pix.alpha else pix
    png_buffer = BytesIO(source.tobytes("png"))
    return Image.open(png_buffer)
def ocr_page(img: Image.Image, lang: str) -> str:
    """Run Tesseract on one page image and return the cleaned text.

    Args:
        img: The page image to recognise.
        lang: Language spec forwarded to ``pytesseract.image_to_string``.

    Returns:
        The recognised text after it has been passed through ``clean_text``.
    """
    raw_text = pytesseract.image_to_string(img, lang=lang)
    return clean_text(raw_text)
def main():
    """OCR every non-skipped page of ``PDF_PATH`` and write the text out.

    Pages are numbered from 1 for ``SKIP_PAGES`` and for the log output.
    Each remaining page is rendered at ``DPI``, run through ``ocr_page``,
    and the non-empty results are written to ``OUTPUT_TXT`` with a blank
    line after each page. A summary is printed at the end.

    Exits with status 1 when ``PDF_PATH`` does not exist.
    """
    if not os.path.exists(PDF_PATH):
        print(f"PDF tidak ditemukan: {PDF_PATH}")
        sys.exit(1)

    skip: Set[int] = set(SKIP_PAGES or [])
    # PDF coordinates are 72 units per inch, so DPI / 72 is the render scale.
    zoom = DPI / 72.0
    mat = fitz.Matrix(zoom, zoom)

    results: List[str] = []
    skipped = 0

    # The context manager closes the document even when rendering a page
    # raises part-way through the loop.
    with fitz.open(PDF_PATH) as doc:
        total = doc.page_count
        print(f"[*] Total halaman: {total} | DPI render: {DPI}")

        for page_num in range(1, total + 1):
            if page_num in skip:
                skipped += 1
                print(f"Halaman {page_num} dilewati.")
                continue

            # load_page() is 0-based while page_num is 1-based.
            page = doc.load_page(page_num - 1)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = pixmap_to_pil(pix)

            print(f"Halaman {page_num}: OCR …")
            try:
                txt = ocr_page(img, OCR_LANG)
            except Exception as e:
                # Best effort: one failing page is reported and treated as
                # empty so the remaining pages are still processed.
                print(f"[!] OCR gagal halaman {page_num}: {e}")
                txt = ""

            txt = txt.strip()
            if txt:
                results.append(txt)
            else:
                print(f"Halaman {page_num}: hasil kosong/pendek.")

    # dirname() is "" for a bare file name, and makedirs("") raises, so only
    # create the directory when the output path actually has one.
    out_dir = os.path.dirname(OUTPUT_TXT)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(OUTPUT_TXT, "w", encoding="utf-8") as f:
        # results only ever holds non-empty, stripped page texts.
        f.writelines(f"{t}\n\n" for t in results)

    print("\nRingkasan:")
    print(f"- Total halaman : {total}")
    print(f"- Dilewati (skip) : {skipped}")
    print(f"- Tersimpan (non-skip): {len(results)}")
    print(f"[*] Output: {OUTPUT_TXT}")
# Run the OCR pipeline only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()