import os
import re

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer


def extract_and_clean_pdf(path: str, skip_pages: list[int] | None = None) -> list[str]:
    """Extract text from a PDF page by page, skip the given page numbers, and clean each kept page."""
    skip_pages = skip_pages or []
    cleaned_pages = []
    for i, page_layout in enumerate(extract_pages(path), start=1):
        if i in skip_pages:
            print(f"Page {i} skipped.")
            continue
        page_text = ""
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                page_text += element.get_text()
        cleaned_text = clean_text(page_text)
        cleaned_pages.append(cleaned_text)
    print(f"\nTotal pages extracted: {len(cleaned_pages)} pages (out of {i} total pages).")
    return cleaned_pages


def clean_text(text: str) -> str:
    """Normalize whitespace, drop non-printable characters, and fix spacing around words and punctuation."""
    text = text.replace("\n", " ").replace("\t", " ")
    text = re.sub(r'[^\x20-\x7EÀ-ÿ]', '', text)         # keep printable ASCII and Latin-1 characters only
    text = re.sub(r'\s+', ' ', text)                     # collapse repeated whitespace
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)     # split words fused as "wordWord"
    text = re.sub(r'([a-z])([0-9])', r'\1 \2', text)     # split letter-digit runs
    text = re.sub(r'([0-9])([a-zA-Z])', r'\1 \2', text)  # split digit-letter runs
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)         # remove space before punctuation
    return text.strip()


def save_cleaned_text(cleaned_pages: list[str], output_path: str):
    """Write each cleaned page to the output file, separated by blank lines."""
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for page in cleaned_pages:
            f.write(page + "\n\n")
    print(f"Text file saved to:\n{output_path}")


if __name__ == "__main__":
    pdf_path = r"D:\Webchatbot\Dataset\Penjas\PJOK_BS_KLS_V.pdf"
    output_txt = r"D:\Webchatbot\Dataset\Penjas\Clean\Penjas Kelas V.txt"
    # Skip front matter (pages 1-14) and back matter (pages 188-207).
    halaman_dihapus = list(range(1, 15)) + list(range(188, 208))
    hasil = extract_and_clean_pdf(pdf_path, skip_pages=halaman_dihapus)
    if hasil:
        save_cleaned_text(hasil, output_txt)
    else:
        print("No pages were extracted.")