Transformers
English
Hindi
Sanskrit
sovereign-ai
ecological-intelligence
indian-llm
environmental-protection
File size: 2,036 Bytes
5648af1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import os
import pymupdf4llm
import pathlib
import hashlib

class SovereignCleaner:
    """
    Cleans raw PDF ingestion and converts it to training-ready text.

    Every PDF found in ``raw_dir`` is hashed with SHA-256, converted to
    Markdown with ``pymupdf4llm.to_markdown`` and written to ``clean_dir``
    behind a ``--- SOURCE_HASH: <digest> ---`` header line, so each output
    file records the digest of the exact source bytes it came from (used
    for the GOEC Audit Trail).
    """

    def __init__(self, raw_dir="data/raw/", clean_dir="data/processed/texts/"):
        """
        Store the input/output directories and make sure the output exists.

        Args:
            raw_dir: Directory that is scanned for PDF files.
            clean_dir: Directory that receives the generated ``.md`` files;
                created (including missing parents) if it does not exist.
        """
        self.raw_dir = raw_dir
        self.clean_dir = clean_dir
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test when several cleaners start at once.
        os.makedirs(self.clean_dir, exist_ok=True)

    def _get_file_hash(self, filepath):
        """
        Return the hexadecimal SHA-256 digest of the file at ``filepath``.

        The file is read in 4 KiB blocks so large PDFs are never loaded
        into memory in one piece.
        """
        sha256_hash = hashlib.sha256()
        with open(filepath, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()

    @staticmethod
    def _markdown_name(filename):
        """
        Return ``filename`` with its final extension swapped for ``.md``.

        Only the trailing extension is replaced, so a name that happens to
        contain ``.pdf`` elsewhere (``report.pdf.v2.pdf``) keeps the rest
        of its stem intact.
        """
        stem, _extension = os.path.splitext(filename)
        return stem + ".md"

    def clean_all(self):
        """
        Iterates through raw PDFs and extracts structured text.

        Files in ``raw_dir`` whose name ends in ``.pdf`` (any letter case)
        are processed in sorted order. A failure on one document is printed
        and the batch continues with the next one. Returns ``None``.
        """
        files = sorted(
            f for f in os.listdir(self.raw_dir) if f.lower().endswith(".pdf")
        )
        print(f"Cleaning {len(files)} documents for ARAVALLI-1...")

        for file in files:
            raw_path = os.path.join(self.raw_dir, file)
            clean_path = os.path.join(self.clean_dir, self._markdown_name(file))

            # Batch boundary: hashing, extraction and writing all sit inside
            # the try so that one unreadable or corrupt document is reported
            # and skipped instead of aborting the remaining files.
            try:
                file_hash = self._get_file_hash(raw_path)

                # Use PyMuPDF4LLM for Markdown extraction (keeps tables/headings)
                md_text = pymupdf4llm.to_markdown(raw_path)

                # Metadata injection for the model's context
                header = f"--- SOURCE_HASH: {file_hash} ---\n"

                with open(clean_path, "w", encoding="utf-8") as f:
                    f.write(header + md_text)
                print(f"Verified & Cleaned: {file}")
            except Exception as e:
                print(f"Failed to clean {file}: {e}")

if __name__ == "__main__":
    # Script entry point: process the default raw directory with the
    # default output location.
    SovereignCleaner().clean_all()