Spaces:

Dyraa18
/

Rag-dan-Guardrail

Running

App Files Files Community

Rag-dan-Guardrail / Rag-Pipeline /cleans2.py

Dyraa18

Upload 8 files

fb2123c verified about 1 month ago

raw

history blame

3.35 kB


	import os, re, sys
	from typing import List, Optional, Set
	import fitz
	import pytesseract
	from PIL import Image
	from io import BytesIO

	# Source PDF to OCR and the text file that receives the cleaned result.
	PDF_PATH = r"D:\Webchatbot\Dataset\Penjas\PJOK_BS_KLS_VI.pdf"
	OUTPUT_TXT = r"D:\Webchatbot\Dataset\Penjas\Clean\Penjas Kelas VI.txt"
	# 1-based page numbers that are never OCR'd: 1-21, 200-210 and 213-225.
	SKIP_PAGES = [*range(1, 22), *range(200, 211), *range(213, 226)]
	# Explicit path to the Tesseract binary; a falsy value keeps pytesseract's default.
	TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
	# Language argument handed to pytesseract.image_to_string.
	OCR_LANG = "ind+eng"
	# Render resolution; main() turns it into a zoom factor of DPI / 72.
	DPI = 300


	# Module-level side effect: tell pytesseract which executable to launch.
	if TESSERACT_CMD:
	    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD


	# Matches things that look like links so they can be removed from OCR text:
	# explicit http(s) URLs, "www." hosts, and bare tokens containing one of the
	# listed domain suffixes (e.g. "buku.kemdikbud.go.id/katalog").
	# The alternatives are separated by a plain "|"; an escaped "\|" would be a
	# literal pipe character and the pattern would never match a real URL.
	URL_RE = re.compile(
	    r"(https?://\S+|www\.\S+|\b\S+\.(?:com|org|net|edu|gov|go|id|co)\S*)",
	    flags=re.IGNORECASE,
	)

	# Matches a line that consists only of a chapter heading: "bab"/"BAB", a
	# chapter number (Arabic digits or upper-case Roman numerals) and, optionally,
	# a colon/dash followed by the title. Leading/trailing whitespace is optional
	# because callers strip each line before matching.
	BAB_LINE_RE = re.compile(
	    r"^\s*(?:bab|BAB)\s+(?:[0-9]+|[IVXLCDM]+)\s*(?:[:\-–—]\s*.*)?\s*$"
	)

	# Matches a chapter marker at the start of a line ("BAB 3 ", "Bab II: ") so
	# it can be cut off while the rest of the line is kept.
	# Only the word "bab" is case-insensitive. The Roman numerals stay upper-case
	# and must end on a word boundary; otherwise ordinary sentences such as
	# "Bab ini membahas ..." would lose their first letters.
	BAB_PREFIX_RE = re.compile(
	    r"^\s*(?i:bab)\s+(?:[0-9]+|[IVXLCDM]+)\b\s*(?:[:\-–—]\s*)?"
	)

	def clean_text(text: str) -> str:
	    """Return ``text`` with URLs, unsupported characters and chapter markers removed.

	    Processing order:
	      1. Everything matched by ``URL_RE`` is deleted (a falsy ``text`` is
	         treated as the empty string).
	      2. Tabs become spaces, then every character outside tab/LF/CR,
	         printable ASCII and the ``À-ÿ`` range is dropped.
	      3. Each line has its whitespace runs collapsed to one space and is
	         stripped; empty lines and lines matching ``BAB_LINE_RE`` are
	         discarded, and a leading ``BAB_PREFIX_RE`` match is cut off.

	    Returns:
	        The surviving lines joined with ``"\\n"`` and stripped.
	    """
	    without_urls = URL_RE.sub("", text or "")
	    filtered = re.sub(
	        r"[^\x09\x0A\x0D\x20-\x7EÀ-ÿ]", "", without_urls.replace("\t", " ")
	    )

	    kept: List[str] = []
	    for line in filtered.splitlines():
	        line = re.sub(r"\s+", " ", line).strip()
	        # Blank lines and heading-only lines contribute nothing to the output.
	        if not line or BAB_LINE_RE.match(line):
	            continue

	        # Removing the chapter prefix can leave the line empty.
	        line = BAB_PREFIX_RE.sub("", line).strip()
	        if line:
	            kept.append(line)

	    return "\n".join(kept).strip()

	def pixmap_to_pil(pix: fitz.Pixmap) -> Image.Image:
	    """Convert a PyMuPDF pixmap into a PIL image by way of an in-memory PNG.

	    A pixmap that reports an alpha channel is first rebuilt with
	    ``fitz.Pixmap(fitz.csRGB, pix)``; any other pixmap is encoded as-is.
	    """
	    source = fitz.Pixmap(fitz.csRGB, pix) if pix.alpha else pix
	    png_buffer = BytesIO(source.tobytes("png"))
	    return Image.open(png_buffer)

	def ocr_page(img: Image.Image, lang: str) -> str:
	    """OCR ``img`` with pytesseract using ``lang`` and return the cleaned text."""
	    raw_text = pytesseract.image_to_string(img, lang=lang)
	    return clean_text(raw_text)

	def main():
	    """OCR every non-skipped page of ``PDF_PATH`` and write the text to ``OUTPUT_TXT``.

	    Each page that is not listed in ``SKIP_PAGES`` is rendered at ``DPI``,
	    passed through ``ocr_page`` and, when the cleaned text is non-empty,
	    appended to the output file followed by a blank line. Progress and a final
	    summary are printed to stdout.

	    Exits with status 1 (via ``sys.exit``) when ``PDF_PATH`` does not exist.
	    A failure while OCR-ing a single page is reported and that page is
	    treated as empty; it does not abort the run.
	    """
	    if not os.path.exists(PDF_PATH):
	        print(f"PDF tidak ditemukan: {PDF_PATH}")
	        sys.exit(1)

	    skip: Set[int] = set(SKIP_PAGES or [])

	    # get_pixmap renders at 72 dpi for an identity matrix, so scaling both
	    # axes by DPI / 72 yields the requested resolution.
	    zoom = DPI / 72.0
	    mat = fitz.Matrix(zoom, zoom)

	    results: List[str] = []
	    skipped = 0
	    kept = 0

	    doc = fitz.open(PDF_PATH)
	    try:
	        total = doc.page_count
	        print(f"[*] Total halaman: {total} | DPI render: {DPI}")

	        # Page numbers are 1-based for the user; load_page is 0-based.
	        for page_num in range(1, total + 1):
	            if page_num in skip:
	                skipped += 1
	                print(f"Halaman {page_num} dilewati.")
	                continue

	            page = doc.load_page(page_num - 1)
	            pix = page.get_pixmap(matrix=mat, alpha=False)
	            img = pixmap_to_pil(pix)

	            print(f"Halaman {page_num}: OCR …")
	            try:
	                txt = ocr_page(img, OCR_LANG)
	            except Exception as e:
	                # Best effort: one unreadable page must not lose the others.
	                print(f"[!] OCR gagal halaman {page_num}: {e}")
	                txt = ""

	            page_text = txt.strip()
	            if page_text:
	                results.append(page_text)
	                kept += 1
	            else:
	                print(f"Halaman {page_num}: hasil kosong/pendek.")
	    finally:
	        # Release the document even when rendering a page raises.
	        doc.close()

	    # os.makedirs("") raises, so only create a directory when the output
	    # path actually has a directory component.
	    out_dir = os.path.dirname(OUTPUT_TXT)
	    if out_dir:
	        os.makedirs(out_dir, exist_ok=True)

	    # results only ever holds stripped, non-empty page texts.
	    with open(OUTPUT_TXT, "w", encoding="utf-8") as f:
	        f.writelines(f"{t}\n\n" for t in results)

	    print("\nRingkasan:")
	    print(f"- Total halaman : {total}")
	    print(f"- Dilewati (skip) : {skipped}")
	    print(f"- Tersimpan (non-skip): {kept}")
	    print(f"[*] Output: {OUTPUT_TXT}")


	if __name__ == "__main__":
	    main()