Transformers
English
Hindi
Sanskrit
sovereign-ai
ecological-intelligence
indian-llm
environmental-protection
ARAVALLI-1/data/scripts/cleaner.py
iamkoder001's picture
Create data/scripts/cleaner.py
5648af1 verified
import os
import pymupdf4llm
import pathlib
import hashlib
class SovereignCleaner:
    """
    Convert raw PDF documents into training-ready Markdown text files.

    Every PDF found in ``raw_dir`` is hashed with SHA-256, converted to
    Markdown via ``pymupdf4llm.to_markdown``, and written to ``clean_dir``
    with a ``SOURCE_HASH`` header line, so each output can be traced back
    to the exact input bytes (the GOEC Audit Trail).
    """

    def __init__(self, raw_dir="data/raw/", clean_dir="data/processed/texts/"):
        """
        Remember the input/output directories and create the output one.

        Args:
            raw_dir: Directory scanned for PDF files by ``clean_all``.
            clean_dir: Directory that receives the generated ``.md`` files;
                created together with any missing parents.
        """
        self.raw_dir = raw_dir
        self.clean_dir = clean_dir
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(self.clean_dir, exist_ok=True)

    def _get_file_hash(self, filepath):
        """
        Return the hexadecimal SHA-256 digest of the file at *filepath*.

        The file is read in 4096-byte blocks so large documents are never
        loaded into memory at once.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        sha256_hash = hashlib.sha256()
        with open(filepath, "rb") as f:
            # iter() with a sentinel stops at the first empty read (EOF).
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()

    @staticmethod
    def _markdown_name(filename):
        """Return *filename* with only its final extension replaced by ``.md``."""
        # splitext touches just the last suffix, so a ".pdf" that appears
        # in the middle of the name is preserved.
        stem, _ext = os.path.splitext(filename)
        return stem + ".md"

    def clean_all(self):
        """
        Convert every PDF in ``raw_dir`` to a Markdown file in ``clean_dir``.

        Files are matched by a case-insensitive ``.pdf`` suffix and processed
        in sorted order. A failure on one document (unreadable file,
        extraction error, write error) is reported on stdout and does not
        stop the remaining documents from being processed.

        Raises:
            OSError: If ``raw_dir`` itself cannot be listed.
        """
        files = sorted(
            f for f in os.listdir(self.raw_dir) if f.lower().endswith(".pdf")
        )
        print(f"Cleaning {len(files)} documents for ARAVALLI-1...")
        for file in files:
            raw_path = os.path.join(self.raw_dir, file)
            try:
                # Hashing happens inside the try so that an unreadable entry
                # is reported like any other failure instead of aborting
                # the whole batch.
                file_hash = self._get_file_hash(raw_path)
                # Use PyMuPDF4LLM for Markdown extraction (keeps tables/headings)
                md_text = pymupdf4llm.to_markdown(raw_path)
                # Metadata injection for the model's context
                header = f"--- SOURCE_HASH: {file_hash} ---\n"
                clean_path = os.path.join(
                    self.clean_dir, self._markdown_name(file)
                )
                with open(clean_path, "w", encoding="utf-8") as f:
                    f.write(header + md_text)
            except Exception as e:
                # Batch boundary: the extractor's exception types are not
                # known here, so report the failure and keep going.
                print(f"Failed to clean {file}: {e}")
            else:
                print(f"Verified & Cleaned: {file}")
if __name__ == "__main__":
    # Script entry point: build a cleaner with its default directories
    # and process every raw PDF in one pass.
    SovereignCleaner().clean_all()