iamkoder001
/

ARAVALLI-1

ecological-intelligence

environmental-protection

Model card Files Files and versions

ARAVALLI-1 / data /scripts /cleaner.py

iamkoder001's picture

Create data/scripts/cleaner.py

5648af1 verified 4 days ago

history blame contribute delete

2.04 kB

	import os
	import pymupdf4llm
	import pathlib
	import hashlib

	class SovereignCleaner:
	    """Convert raw PDFs into Markdown files stamped with a source hash.

	    Every ``.pdf`` file found in ``raw_dir`` is hashed with SHA-256,
	    converted to Markdown with PyMuPDF4LLM, and written to ``clean_dir``
	    with a ``SOURCE_HASH`` header line, so each output can be traced back
	    to the exact input bytes (the GOEC Audit Trail).

	    Args:
	        raw_dir: Directory that holds the raw PDF files.
	        clean_dir: Directory that receives the ``.md`` output; it is
	            created (including missing parents) when it does not exist.
	    """

	    def __init__(self, raw_dir="data/raw/", clean_dir="data/processed/texts/"):
	        self.raw_dir = raw_dir
	        self.clean_dir = clean_dir
	        # exist_ok avoids the check-then-create race of a separate
	        # os.path.exists() test.
	        os.makedirs(self.clean_dir, exist_ok=True)

	    def _get_file_hash(self, filepath):
	        """Return the SHA-256 hex digest of the file at *filepath*.

	        The file is read in 4096-byte blocks so large PDFs are never
	        loaded into memory in one piece.

	        Raises:
	            OSError: If *filepath* cannot be opened or read.
	        """
	        sha256_hash = hashlib.sha256()
	        with open(filepath, "rb") as f:
	            for byte_block in iter(lambda: f.read(4096), b""):
	                sha256_hash.update(byte_block)
	        return sha256_hash.hexdigest()

	    @staticmethod
	    def _markdown_name(pdf_name):
	        """Return *pdf_name* with only its trailing ``.pdf`` changed to ``.md``.

	        ``str.replace`` would rewrite every ``.pdf`` occurrence
	        ("a.pdf.final.pdf" -> "a.md.final.md"); only the suffix may change.
	        A name without a ``.pdf`` suffix simply gets ``.md`` appended.
	        """
	        if pdf_name.lower().endswith(".pdf"):
	            return pdf_name[: -len(".pdf")] + ".md"
	        return pdf_name + ".md"

	    def clean_all(self):
	        """Convert every PDF in ``raw_dir`` to a hashed Markdown file.

	        Processing is best-effort: a failure on one document is reported
	        with ``print`` and the remaining documents are still processed.

	        Raises:
	            OSError: If ``raw_dir`` itself cannot be listed.
	        """
	        # Sorted for a reproducible processing order (os.listdir order is
	        # arbitrary); the suffix test is case-insensitive so "X.PDF" is
	        # not silently skipped.
	        files = sorted(
	            f for f in os.listdir(self.raw_dir) if f.lower().endswith(".pdf")
	        )
	        print(f"Cleaning {len(files)} documents for ARAVALLI-1...")

	        for file in files:
	            raw_path = os.path.join(self.raw_dir, file)

	            try:
	                # Hash inside the try block: an unreadable entry must not
	                # abort the whole batch.
	                file_hash = self._get_file_hash(raw_path)

	                # Use PyMuPDF4LLM for Markdown extraction (keeps tables/headings)
	                md_text = pymupdf4llm.to_markdown(raw_path)

	                # Metadata injection for the model's context
	                header = f"--- SOURCE_HASH: {file_hash} ---\n"
	                final_text = header + md_text

	                clean_path = os.path.join(
	                    self.clean_dir, self._markdown_name(file)
	                )
	                with open(clean_path, "w", encoding="utf-8") as f:
	                    f.write(final_text)
	                print(f"Verified & Cleaned: {file}")
	            except Exception as e:
	                # Deliberate per-document boundary: report and continue.
	                print(f"Failed to clean {file}: {e}")

	if __name__ == "__main__":
	    # Script entry point: run one full cleaning pass using the default
	    # raw and processed directories.
	    SovereignCleaner().clean_all()