Spaces:

Kalpokoch
/

ChatbotDemo

Sleeping

App Files Files

Kalpokoch committed on Aug 22

Commit

4fe8577

verified ·

1 Parent(s): a179120

Update create_granular_chunks.py

Browse files

Files changed (1) hide show

create_granular_chunks.py +157 -354

create_granular_chunks.py CHANGED Viewed

@@ -1,414 +1,217 @@
-# create_granular_chunks.py - Enhanced Version for NEEPCO DOP Policies
 import os
 import json
 import re
-from typing import List, Dict, Any, Set
 import nltk
-# Download required NLTK data
-nltk.download('punkt', quiet=True)
-nltk.download('punkt_tab', quiet=True)
 # --- Configuration ---
 INPUT_FILE = "combined_context.jsonl"
-OUTPUT_FILE = "granular_chunks_final.jsonl"
 # --- Global State ---
 chunk_counter = 0
 def get_unique_id() -> str:
     """Returns a unique, incrementing ID for each chunk."""
     global chunk_counter
     chunk_counter += 1
     return f"chunk-{chunk_counter}"
-# --- Key Enhancement: NEEPCO-specific entity extraction ---
-def extract_key_entities(text: str) -> Dict[str, Set[str]]:
-    """Extract key entities specific to NEEPCO DOP policies."""
-    entities = {
-        'positions': set(),
-        'amounts': set(),
-        'sections': set(),
-        'procedures': set(),
-        'authorities': set()
     }
-    # Position patterns (E-1 to E-9, specific roles)
-    position_patterns = [
-        r'\b(?:Director|CMD|ED|CGM|GM|DGM|Sr\.?\s*M(?:gr)?|Manager|HOP|HOD)\b',
-        r'\bE-[1-9]\b',
-        r'\b(?:Chairman|Secretary|Chief|Head)\b'
-    ]
-    # Amount patterns (₹, crore, lakh)
-    amount_patterns = [
-        r'₹\s*\d+(?:[.,]\d+)*\s*(?:crore|lakh|thousand)?',
-        r'\d+(?:[.,]\d+)*\s*(?:crore|lakh|thousand)',
-        r'Full\s+Power[s]?'
-    ]
-    # Section patterns
-    section_patterns = [r'\b(?:Section|Annexure|Clause)\s*[IVX]+\b', r'\b(?:clause|sub-clause)\s*\d+\b']
-    # Extract entities
-    for pattern in position_patterns:
-        entities['positions'].update(re.findall(pattern, text, re.IGNORECASE))
-    for pattern in amount_patterns:
-        entities['amounts'].update(re.findall(pattern, text, re.IGNORECASE))
-    for pattern in section_patterns:
-        entities['sections'].update(re.findall(pattern, text, re.IGNORECASE))
-    return entities
-def create_question_answer_chunks(context: Dict) -> List[Dict]:
-    """Create targeted Q&A style chunks that anticipate user questions."""
-    chunks = []
-    section = context.get("section", "")
-    title = context.get("title", "")
-    clause = context.get("clause") or context.get("Clause")
-    # Generate approval authority questions
-    if "delegation" in context:
-        delegation = context["delegation"]
-        if isinstance(delegation, dict):
-            for authority, limit in delegation.items():
-                if limit and str(limit) not in ["---", "NIL"]:
-                    qa_text = (f"Question: Who can approve {title.lower()} and what is their limit? "
-                             f"Answer: {authority} can approve {title.lower()} up to {limit}. "
-                             f"This is covered under {section} clause {clause}.")
-                    entities = extract_key_entities(qa_text)
-                    chunk = {
-                        "id": get_unique_id(),
-                        "text": qa_text,
-                        "metadata": {
-                            "section": section,
-                            "clause": clause,
-                            "title": title,
-                            "chunk_type": "approval_authority",
-                            "authority": authority,
-                            "limit": str(limit),
-                            "entities": {k: list(v) for k, v in entities.items() if v}
-                        }
-                    }
-                    chunks.append(chunk)
-    # Generate procedure-specific chunks
-    if "items" in context:
-        for item in context["items"]:
-            if isinstance(item, str):
-                qa_text = (f"Question: What are the requirements for {title.lower()}? "
-                         f"Answer: For {title.lower()}, one requirement is: {item}. "
-                         f"This is specified in {section} clause {clause}.")
-                entities = extract_key_entities(qa_text)
-                chunk = {
-                    "id": get_unique_id(),
-                    "text": qa_text,
-                    "metadata": {
-                        "section": section,
-                        "clause": clause,
-                        "title": title,
-                        "chunk_type": "requirement",
-                        "requirement": item,
-                        "entities": {k: list(v) for k, v in entities.items() if v}
-                    }
-                }
-                chunks.append(chunk)
-    return chunks
-def create_context_rich_chunks(context: Dict) -> List[Dict]:
-    """Create chunks with rich contextual information."""
-    chunks = []
-    section = context.get("section", "")
-    title = context.get("title", "")
-    clause = context.get("clause") or context.get("Clause")
-    # Handle delegation information with full context
-    if "delegation" in context:
-        delegation = context["delegation"]
-        if isinstance(delegation, dict):
-            # Create a comprehensive delegation summary
-            delegation_items = []
-            for auth, limit in delegation.items():
-                if limit and str(limit) not in ["---", "NIL"]:
-                    delegation_items.append(f"{auth}: {limit}")
-                elif str(limit) in ["---", "NIL"]:
-                    delegation_items.append(f"{auth}: No authority")
-            if delegation_items:
-                delegation_text = (f"In {section} clause {clause} regarding '{title}', "
-                                 f"the delegation of powers is as follows: {'; '.join(delegation_items)}. ")
-                # Add remarks if available
-                if "remarks" in context:
-                    remarks = format_remarks(context["remarks"])
-                    delegation_text += f"Important notes: {remarks}"
-                entities = extract_key_entities(delegation_text)
-                chunk = {
-                    "id": get_unique_id(),
-                    "text": delegation_text,
-                    "metadata": {
-                        "section": section,
-                        "clause": clause,
-                        "title": title,
-                        "chunk_type": "delegation_summary",
-                        "delegation_count": len(delegation_items),
-                        "entities": {k: list(v) for k, v in entities.items() if v}
-                    }
-                }
-                chunks.append(chunk)
-    # Handle composition information (for committees)
-    if "composition" in context:
-        composition = context["composition"]
-        if isinstance(composition, list):
-            comp_text = f"The composition for '{title}' in {section} clause {clause} includes: "
-            comp_details = []
-            for item in composition:
-                if isinstance(item, dict):
-                    for role, members in item.items():
-                        if isinstance(members, list):
-                            comp_details.append(f"{role}: {', '.join(members)}")
-                        else:
-                            comp_details.append(f"{role}: {members}")
-            comp_text += "; ".join(comp_details) + "."
-            if "approving_authority" in context:
-                comp_text += f" The approving authority is: {context['approving_authority']}."
-            entities = extract_key_entities(comp_text)
-            chunk = {
-                "id": get_unique_id(),
-                "text": comp_text,
-                "metadata": {
-                    "section": section,
-                    "clause": clause,
-                    "title": title,
-                    "chunk_type": "composition",
-                    "entities": {k: list(v) for k, v in entities.items() if v}
-                }
-            }
-            chunks.append(chunk)
-    return chunks
-def create_method_specific_chunks(context: Dict) -> List[Dict]:
-    """Handle method-specific information (like different tender types)."""
-    chunks = []
-    if "methods" in context:
-        for method in context["methods"]:
-            if isinstance(method, dict) and "method" in method:
-                method_name = method["method"]
-                delegation = method.get("delegation", {})
-                if isinstance(delegation, dict):
-                    method_text = (f"For {context.get('title', 'procurement')} using {method_name}, "
-                                 f"the approval limits are: ")
-                    limits = []
-                    for auth, limit in delegation.items():
-                        if limit and str(limit) not in ["---", "NIL"]:
-                            limits.append(f"{auth} can approve up to {limit}")
-                    method_text += "; ".join(limits) + f". This is covered under {context.get('section')} clause {context.get('clause')}."
-                    entities = extract_key_entities(method_text)
-                    chunk = {
-                        "id": get_unique_id(),
-                        "text": method_text,
-                        "metadata": {
-                            "section": context.get("section"),
-                            "clause": context.get("clause"),
-                            "title": context.get("title"),
-                            "method": method_name,
-                            "chunk_type": "method_specific",
-                            "entities": {k: list(v) for k, v in entities.items() if v}
-                        }
-                    }
-                    chunks.append(chunk)
-    return chunks
 def format_remarks(remarks: Any) -> str:
-    """Enhanced remarks formatting with better structure."""
     if isinstance(remarks, list):
-        formatted_remarks = []
         for item in remarks:
             if isinstance(item, dict):
                 for key, value in item.items():
-                    formatted_remarks.append(f"{key}: {value}")
             else:
-                formatted_remarks.append(str(item))
-        return " | ".join(formatted_remarks)
-    return str(remarks) if remarks else ""
-def process_subclauses(subclauses: List[Dict], parent_context: Dict) -> List[Dict]:
-    """Process subclauses with enhanced context preservation."""
     chunks = []
-    for subclause in subclauses:
-        if isinstance(subclause, dict):
-            # Merge parent context
-            full_context = {**parent_context, **subclause}
-            # Generate different types of chunks
-            chunks.extend(create_question_answer_chunks(full_context))
-            chunks.extend(create_context_rich_chunks(full_context))
-            chunks.extend(create_method_specific_chunks(full_context))
-            # Recursively process nested structures
-            if "subclauses" in subclause:
-                chunks.extend(process_subclauses(subclause["subclauses"], full_context))
     return chunks
 def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
-    """Enhanced entry processing with multiple chunking strategies."""
     context = {**(parent_context or {}), **data}
     chunks = []
-    # Process subclauses first (most important for DOP policies)
-    if "subclauses" in data:
-        chunks.extend(process_subclauses(data["subclauses"], context))
-    # Generate various chunk types for the main entry
-    chunks.extend(create_question_answer_chunks(context))
-    chunks.extend(create_context_rich_chunks(context))
-    chunks.extend(create_method_specific_chunks(context))
-    # Handle special cases for financial concurrence
-    if context.get("section") == "Financial Concurrence":
-        fc_text = (f"Financial Concurrence requirements for {context.get('title', 'this matter')}: "
-                   f"{context.get('description', 'See policy details')}. ")
-        if "exclusions" in context:
-            fc_text += f"Exclusions from financial concurrence: {'; '.join(context['exclusions'])}."
-        entities = extract_key_entities(fc_text)
-        chunk = {
-            "id": get_unique_id(),
-            "text": fc_text,
-            "metadata": {
-                "section": context.get("section"),
-                "clause": context.get("clause"),
-                "title": context.get("title"),
-                "chunk_type": "financial_concurrence",
-                "entities": {k: list(v) for k, v in entities.items() if v}
-            }
-        }
-        chunks.append(chunk)
-    # Handle Annexure items (Board-level approvals)
-    if context.get("section") == "Annexure A":
-        annexure_text = (f"Board of Directors approval is required for {context.get('title')}: "
-                        f"{context.get('description', 'various matters')}. ")
-        if "items" in context:
-            annexure_text += f"Specific items include: {'; '.join(context['items'])}."
-        entities = extract_key_entities(annexure_text)
-        chunk = {
-            "id": get_unique_id(),
-            "text": annexure_text,
-            "metadata": {
-                "section": context.get("section"),
-                "clause": context.get("clause"),
-                "title": context.get("title"),
-                "chunk_type": "board_approval",
-                "entities": {k: list(v) for k, v in entities.items() if v}
-            }
-        }
-        chunks.append(chunk)
     return chunks
 def main():
-    """Enhanced main function with better logging and deduplication."""
-    print(f"Processing '{INPUT_FILE}' with enhanced NEEPCO DOP chunking...")
     all_chunks = []
-    line_count = 0
     try:
         with open(INPUT_FILE, 'r', encoding='utf-8') as f:
             for i, line in enumerate(f):
-                line_count += 1
                 try:
                     data = json.loads(line)
                     processed = process_entry(data)
                     if processed:
                         all_chunks.extend(processed)
-                        if line_count % 10 == 0:
-                            print(f"Processed {line_count} lines, generated {len(all_chunks)} chunks so far...")
                 except json.JSONDecodeError:
                     print(f"Warning: Skipping malformed JSON on line {i+1}")
                     continue
     except FileNotFoundError:
         print(f"Error: Input file '{INPUT_FILE}' not found.")
         return
-    print(f"Generated {len(all_chunks)} total chunks from {line_count} lines.")
-    # Enhanced deduplication by text similarity
-    unique_chunks = []
-    seen_texts = set()
     for chunk in all_chunks:
-        # Create a normalized version for comparison
-        normalized_text = re.sub(r'\s+', ' ', chunk['text'].lower().strip())
-        if normalized_text not in seen_texts:
-            seen_texts.add(normalized_text)
-            unique_chunks.append(chunk)
-    print(f"After deduplication: {len(unique_chunks)} unique chunks.")
-    # Sort chunks by section and clause for better organization
-    def sort_key(chunk):
-        section = chunk['metadata'].get('section', 'ZZZ')
-        clause = chunk['metadata'].get('clause', 999)
-        if isinstance(clause, str):
-            try:
-                clause = int(re.search(r'\d+', clause).group())
-            except:
-                clause = 999
-        return (section, clause)
-    unique_chunks.sort(key=sort_key)
-    # Write output
     with open(OUTPUT_FILE, 'w', encoding='utf-8') as outf:
         for chunk in unique_chunks:
             outf.write(json.dumps(chunk, ensure_ascii=False) + "\n")
-    print(f"Successfully wrote enhanced chunks to '{OUTPUT_FILE}'.")
-    # Print some statistics
-    chunk_types = {}
-    for chunk in unique_chunks:
-        chunk_type = chunk['metadata'].get('chunk_type', 'unknown')
-        chunk_types[chunk_type] = chunk_types.get(chunk_type, 0) + 1
-    print("\nChunk type distribution:")
-    for chunk_type, count in sorted(chunk_types.items()):
-        print(f"  {chunk_type}: {count}")
 if __name__ == "__main__":
     main()

+# create_granular_chunks.py
 import os
 import json
 import re
+from typing import List, Dict, Any
 import nltk
+# Download punkt tokenizer if not already done (Ensure this runs once in your environment setup)
+nltk.download('punkt')
+nltk.download('punkt_tab')  # Also download punkt_tab to avoid LookupError
 # --- Configuration ---
 INPUT_FILE = "combined_context.jsonl"
+OUTPUT_FILE = "granular_chunks_final.jsonl"  # Keep filename consistent
 # --- Global State ---
 chunk_counter = 0
 def get_unique_id() -> str:
     """Returns a unique, incrementing ID for each chunk."""
     global chunk_counter
     chunk_counter += 1
     return f"chunk-{chunk_counter}"
def create_chunk(context: Dict, text: str) -> Dict:
    """Wrap a piece of text in the standard chunk record.

    Args:
        context: Merged entry context the metadata is taken from.
        text: Chunk text; leading/trailing whitespace is stripped.

    Returns:
        A dict with a fresh ``id``, the stripped ``text`` and a ``metadata``
        dict.  Metadata starts with section / clause / title /
        source_description (entries whose value is ``None`` are dropped) and
        is followed by every other context field holding a scalar
        (str, int, float or bool) value.
    """
    # "clause" may be spelled with a capital C in the input; try both.
    clause = context.get("clause") or context.get("Clause")
    metadata = {
        "section": context.get("section"),
        "clause": clause,
        "title": context.get("title"),
        "source_description": context.get("description"),
    }

    # Copy the remaining scalar fields; nested lists/dicts are left out.
    extras = {
        name: field
        for name, field in context.items()
        if name not in metadata and isinstance(field, (str, int, float, bool))
    }
    metadata.update(extras)

    chunk_id = get_unique_id()
    cleaned_text = text.strip()
    present_metadata = {
        name: field for name, field in metadata.items() if field is not None
    }
    return {"id": chunk_id, "text": cleaned_text, "metadata": present_metadata}
def format_delegation_text(delegation: Any) -> str:
    """Render a delegation value as a readable clause.

    For a dict, each ``authority -> limit`` pair becomes
    ``"the limit for <authority> is <limit>"`` and the pairs are joined with
    ", ".  Limits that grant no power -- falsy values, blank strings and the
    ``---`` placeholder (with or without surrounding whitespace) -- are
    normalised to ``NIL`` so that "no authority" is stated explicitly.

    Args:
        delegation: A dict of authority -> limit, any other value (returned
            via ``str``), or ``None``.

    Returns:
        The formatted text, or "No specific delegation provided." when the
        value is ``None`` or an empty dict.
    """
    if delegation is None:
        # str(None) would leak the literal "None" into the chunk text.
        return "No specific delegation provided."
    if not isinstance(delegation, dict):
        return str(delegation)

    parts = []
    for auth, limit in delegation.items():
        # strip() so padded placeholders such as " --- " are still recognised.
        has_power = bool(limit) and str(limit).strip() not in ("", "---")
        parts.append(f"the limit for {auth} is {limit if has_power else 'NIL'}")
    return ", ".join(parts) if parts else "No specific delegation provided."
 def format_remarks(remarks: Any) -> str:
+    """Safely formats the 'remarks' field, handling various data types."""
     if isinstance(remarks, list):
+        remark_parts = []
         for item in remarks:
             if isinstance(item, dict):
                 for key, value in item.items():
+                    remark_parts.append(f"{key}: {value}")
             else:
+                remark_parts.append(str(item))
+        return " ".join(remark_parts)
+    return str(remarks)
def _describe_composition(composition: Any) -> List[str]:
    """Turn a composition structure into "the <role> is/are ..." phrases."""
    if isinstance(composition, dict):
        composition = [composition]
    if not isinstance(composition, (list, tuple)):
        return []

    phrases = []
    for item in composition:
        if not isinstance(item, dict):
            continue
        for role, members in item.items():
            if isinstance(members, (list, tuple)):
                # str() each member so numbers or None cannot break the join.
                names = ", ".join(str(member) for member in members)
                phrases.append(f"the {role} are: {names}")
            else:
                phrases.append(f"the {role} is {members}")
    return phrases


def build_descriptive_text(context: Dict) -> str:
    """Compose one natural-language description from an entry's context.

    The text is assembled as
    ``Regarding the policy '<title>' specifically for '<description|method>',
    <delegation or composition clause>. Important remarks include: <remarks>``
    with every part optional.  Punctuation is attached directly to the
    preceding word (no space before the comma, single spaces between
    sentences, no doubled full stop).

    Args:
        context: Merged entry context; the keys read are ``title``,
            ``description``, ``method``, ``delegation``, ``composition`` and
            ``remarks``.

    Returns:
        The description, or an empty string when none of the keys is present.
    """
    lead_parts = []
    title = context.get("title")
    if title:
        lead_parts.append(f"Regarding the policy '{title}'")
    specific_desc = context.get("description") or context.get("method")
    if specific_desc and specific_desc != title:
        lead_parts.append(f"specifically for '{specific_desc}'")
    text = " ".join(lead_parts)

    # Delegation takes precedence over composition, as before.
    detail = ""
    if "delegation" in context:
        delegation_text = format_delegation_text(context["delegation"])
        detail = f"financial delegations are: {delegation_text}"
    elif "composition" in context:
        composition_parts = _describe_composition(context["composition"])
        if composition_parts:
            detail = f"the composition is: {'; '.join(composition_parts)}"

    if detail:
        # Normalise to exactly one closing full stop.
        detail = detail.rstrip(". ") + "."
        if text:
            text = f"{text}, {detail}"
        else:
            # No lead-in: start the sentence with the clause itself.
            text = detail[0].upper() + detail[1:]

    remarks = context.get("remarks")
    if remarks:
        remarks_text = format_remarks(remarks)
        # Close the previous sentence so the remarks start a new one.
        if text and not text.endswith((".", "!", "?")):
            text += "."
        text = f"{text} Important remarks include: {remarks_text}"

    return text.strip()
def _split_sentences(text: str) -> List[str]:
    """Split text into sentences with NLTK, or a regex if NLTK is unusable."""
    try:
        from nltk.tokenize import sent_tokenize
    except ImportError:
        sent_tokenize = None

    sentences = None
    if sent_tokenize is not None:
        try:
            # Explicit language, as the original call did.
            sentences = sent_tokenize(text, language='english')
        except LookupError:
            # punkt / punkt_tab data is not installed; use the fallback.
            sentences = None
    if sentences is None:
        sentences = re.split(r"(?<=[.!?])\s+", text)
    return [sentence.strip() for sentence in sentences if sentence.strip()]


def _hard_wrap(sentence: str, limit: int) -> List[str]:
    """Break a sentence longer than ``limit`` into pieces of at most ``limit``."""
    if len(sentence) <= limit:
        return [sentence]

    pieces = []
    current = ""
    for word in sentence.split():
        # A single word longer than the limit has to be sliced.
        while len(word) > limit:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:limit])
            word = word[limit:]
        if not word:
            continue
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= limit:
            current = candidate
        else:
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces


def _overlap_tail(previous: str, size: int) -> str:
    """Return up to ``size`` trailing characters of ``previous`` for overlap."""
    if size <= 0:
        # Guard explicitly: previous[-0:] would be the WHOLE string.
        return ""
    tail = previous[-size:]
    # Drop the leading (possibly cut-off) word when more text follows it.
    _, separator, rest = tail.partition(" ")
    if separator and rest.strip():
        tail = rest
    return tail.strip()


def split_text_into_chunks(text: str, max_char_length: int = 1500, overlap: int = 200) -> List[str]:
    """Split text into chunks of at most ``max_char_length`` characters.

    Text is split on sentence boundaries (NLTK ``sent_tokenize``; a simple
    punctuation regex is used when NLTK or its punkt data is unavailable) and
    sentences are packed greedily, separated by single spaces.  A sentence
    that is itself longer than the limit is wrapped on whitespace (and sliced
    if a single word is too long).  Each chunk after the first starts with up
    to ``overlap`` trailing characters of the previous chunk, reduced as
    needed so the limit is never exceeded.

    Args:
        text: Text to split; surrounding whitespace is ignored.
        max_char_length: Maximum length of every returned chunk.
        overlap: Maximum number of characters repeated from the previous
            chunk; values <= 0 disable overlap.

    Returns:
        The non-empty chunks in order; an empty list for blank text.

    Raises:
        ValueError: If ``max_char_length`` is smaller than 1.
    """
    if max_char_length < 1:
        raise ValueError("max_char_length must be at least 1")

    text = text.strip()
    if not text:
        return []
    if len(text) <= max_char_length:
        return [text]

    pieces = []
    for sentence in _split_sentences(text):
        pieces.extend(_hard_wrap(sentence, max_char_length))

    chunks = []
    current_chunk = ""
    for piece in pieces:
        candidate = f"{current_chunk} {piece}" if current_chunk else piece
        if len(candidate) <= max_char_length:
            current_chunk = candidate
            continue
        # Every piece fits on its own, so current_chunk is non-empty here.
        chunks.append(current_chunk)
        # Leave room for the piece plus the joining space.
        budget = max_char_length - len(piece) - 1
        tail = _overlap_tail(current_chunk, min(overlap, budget))
        current_chunk = f"{tail} {piece}" if tail else piece
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
 def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
+    """
+    Processes a JSON policy entry and returns granular, context-rich chunks.
+    Applies recursive traversal and implements chunk size limiting.
+    """
     context = {**(parent_context or {}), **data}
     chunks = []
+    # Handler 1: Simple Item Lists (ex: rules, exclusions)
+    list_key = next((key for key in ["items", "exclusions"] if key in data and isinstance(data.get(key), list)), None)
+    if list_key:
+        base_title = context.get('title', 'a policy')
+        for item in data[list_key]:
+            if isinstance(item, str):
+                # Build chunk text with clear descriptive prefix for relevance
+                text = f"A rule regarding '{base_title}' is: {item}."
+                # Split if too long
+                for sub_chunk in split_text_into_chunks(text):
+                    chunks.append(create_chunk(context, sub_chunk))
+        return chunks
+    # Handler 2: Recursive traversal for nested dictionaries/lists
+    has_recursed = False
+    for key, value in data.items():
+        if isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
+            for item in value:
+                chunks.extend(process_entry(item, context))
+            has_recursed = True
+    # Handler 3: Leaf nodes with delegation, composition or description
+    if not has_recursed and ("delegation" in data or "composition" in data or "description" in data):
+        text = build_descriptive_text(context)
+        # Split long descriptive text intelligently
+        for chunk_text in split_text_into_chunks(text):
+            chunks.append(create_chunk(context, chunk_text))
     return chunks
 def main():
+    """Main orchestration to read input, process, and write chunks."""
+    print(f"Starting to process '{INPUT_FILE}' for improved granular chunking...")
     all_chunks = []
     try:
         with open(INPUT_FILE, 'r', encoding='utf-8') as f:
             for i, line in enumerate(f):
                 try:
                     data = json.loads(line)
                     processed = process_entry(data)
                     if processed:
                         all_chunks.extend(processed)
                 except json.JSONDecodeError:
                     print(f"Warning: Skipping malformed JSON on line {i+1}")
                     continue
     except FileNotFoundError:
         print(f"Error: Input file '{INPUT_FILE}' not found.")
         return
+    print(f"Generated {len(all_chunks)} chunks before deduplication.")
+    # Deduplicate by text content (retaining last occurrences)
+    unique_chunks_map = {}
     for chunk in all_chunks:
+        unique_chunks_map[chunk['text']] = chunk
+    unique_chunks = list(unique_chunks_map.values())
+    print(f"{len(unique_chunks)} unique chunks after deduplication.")
+    # Write output in JSONL format for later vector DB ingestion
     with open(OUTPUT_FILE, 'w', encoding='utf-8') as outf:
         for chunk in unique_chunks:
             outf.write(json.dumps(chunk, ensure_ascii=False) + "\n")
+    print(f"Successfully wrote improved granular chunks to '{OUTPUT_FILE}'.")
 if __name__ == "__main__":
     main()