Spaces:

Kalpokoch
/

ChatbotDemo

Running

App Files Files

Kalpokoch committed on Aug 21

Commit

74a1e84

verified ·

1 Parent(s): ea6be63

Update create_granular_chunks.py

Browse files

Files changed (1) hide show

create_granular_chunks.py +195 -219

create_granular_chunks.py CHANGED Viewed

@@ -1,241 +1,217 @@
-# create_granular_chunks.py (place this in root directory)
 import json
 import re
-import hashlib
-from typing import List, Dict, Any, Set
-import tiktoken
-def count_tokens(text: str, model: str = "gpt-3.5-turbo") -> int:
-    """Count tokens using tiktoken."""
-    try:
-        encoding = tiktoken.encoding_for_model(model)
-        return len(encoding.encode(text))
-    except Exception:
-        # Fallback to simple word-based estimation
-        return len(text.split()) * 1.3
-def extract_financial_keywords(text: str) -> List[str]:
-    """Extract financial keywords from text."""
-    financial_patterns = [
-        r'₹[\d,]+(?:\.\d{1,2})?(?:\s*(?:crore|lakh|thousand))?',
-        r'\b(?:budget|cost|expenditure|estimate|payment|procurement)\b',
-        r'\b(?:tender|contract|purchase|award)\b',
-        r'\b(?:crore|lakh|thousand)\b'
-    ]
-    keywords = set()
-    for pattern in financial_patterns:
-        matches = re.findall(pattern, text, re.IGNORECASE)
-        keywords.update(matches)
-    return list(keywords)[:10]  # Limit to 10 keywords
-def extract_authority_keywords(text: str) -> List[str]:
-    """Extract authority/designation keywords from text."""
-    authority_patterns = [
-        r'\b(?:D\([TPF]\)|ED|CGM|GM|DGM|Sr\.?\s*M(?:anager)?)\b',
-        r'\b(?:Director|Manager|Chief|Head)\b',
-        r'\b(?:CMD|BOD|HOP|HOD|HOF)\b',
-        r'\b(?:approval|sanction|delegation|authority|power)\b'
-    ]
-    keywords = set()
-    for pattern in authority_patterns:
-        matches = re.findall(pattern, text, re.IGNORECASE)
-        keywords.update(matches)
-    return list(keywords)[:10]  # Limit to 10 keywords
-def create_chunk_text_from_item(item: Dict) -> str:
-    """Create comprehensive chunk text from a single item."""
-    parts = []
-    # Add section and title context
-    if item.get('section'):
-        parts.append(f"Regarding the policy '{item.get('title', 'Unknown')}' under section '{item['section']}':")
-    # Add main description
-    if item.get('description'):
-        parts.append(item['description'])
-    # Add items if present
-    if item.get('items'):
-        if len(item['items']) == 1:
-            parts.append(f"This covers: {item['items'][0]}")
-        else:
-            parts.append("This covers the following:")
-            for i, sub_item in enumerate(item['items'], 1):
-                parts.append(f"{i}. {sub_item}")
-    # Add delegation information
-    if item.get('delegation'):
-        parts.append("Authority delegation:")
-        for role, limit in item['delegation'].items():
-            if limit and limit != "NIL":
-                parts.append(f"- {role}: {limit}")
-    # Add subclauses
-    if item.get('subclauses'):
-        parts.append("This includes:")
-        for subclause in item['subclauses']:
-            if subclause.get('description'):
-                parts.append(f"• {subclause['description']}")
-            if subclause.get('delegation'):
-                for role, limit in subclause['delegation'].items():
-                    if limit and limit != "NIL":
-                        parts.append(f"  - {role}: {limit}")
-    # Add methods (for complex delegation structures)
-    if item.get('methods'):
-        for method in item['methods']:
-            if method.get('delegation'):
-                parts.append(f"For {method.get('method', 'this method')}:")
-                for role, limit in method['delegation'].items():
-                    if limit and limit != "NIL":
-                        parts.append(f"- {role}: {limit}")
-    # Add remarks
-    if item.get('remarks'):
-        parts.append("Important notes:")
-        if isinstance(item['remarks'], list):
-            for remark in item['remarks']:
-                if isinstance(remark, str):
-                    parts.append(f"• {remark}")
-        elif isinstance(item['remarks'], str):
-            parts.append(f"• {item['remarks']}")
-    return " ".join(parts)
-def split_into_token_chunks(text: str, max_tokens: int = 400, overlap_tokens: int = 50) -> List[str]:
-    """Split text into chunks based on token count."""
-    sentences = re.split(r'[.!?]\s+', text)
     chunks = []
     current_chunk = ""
-    current_tokens = 0
     for sentence in sentences:
-        sentence = sentence.strip()
-        if not sentence:
-            continue
-        sentence_tokens = count_tokens(sentence)
-        # If adding this sentence would exceed max_tokens, finalize current chunk
-        if current_tokens + sentence_tokens > max_tokens and current_chunk:
             chunks.append(current_chunk.strip())
-            # Start new chunk with overlap
-            if overlap_tokens > 0 and chunks:
-                overlap_text = current_chunk[-overlap_tokens*5:]  # Rough overlap estimation
-                current_chunk = overlap_text + " " + sentence
             else:
                 current_chunk = sentence
-            current_tokens = count_tokens(current_chunk)
-        else:
-            current_chunk += (" " if current_chunk else "") + sentence
-            current_tokens += sentence_tokens
-    # Add the last chunk if it has content
-    if current_chunk.strip():
         chunks.append(current_chunk.strip())
     return chunks
-def create_chunk_hash(text: str) -> str:
-    """Create a hash of the chunk text for deduplication."""
-    return hashlib.md5(text.encode('utf-8')).hexdigest()[:12]
-def process_jsonl_file(file_path: str, output_path: str):
-    """Process the JSONL file and create granular chunks."""
-    print(f"Starting to process '{file_path}' with token-based chunking and keyword enhancement...")
     all_chunks = []
-    chunk_hashes = set()  # For deduplication
-    chunk_id_counter = 1
     try:
-        with open(file_path, 'r', encoding='utf-8') as file:
-            for line_num, line in enumerate(file, 1):
                 try:
-                    item = json.loads(line.strip())
-                    # Create comprehensive text from the item
-                    chunk_text = create_chunk_text_from_item(item)
-                    if not chunk_text.strip():
-                        continue
-                    # Split into token-based chunks
-                    text_chunks = split_into_token_chunks(chunk_text)
-                    for i, chunk in enumerate(text_chunks):
-                        if not chunk.strip():
-                            continue
-                        # Check for duplicates
-                        chunk_hash = create_chunk_hash(chunk)
-                        if chunk_hash in chunk_hashes:
-                            continue
-                        chunk_hashes.add(chunk_hash)
-                        # Extract keywords
-                        financial_keywords = extract_financial_keywords(chunk)
-                        authority_keywords = extract_authority_keywords(chunk)
-                        # Create chunk object
-                        chunk_obj = {
-                            'id': f'chunk-{chunk_id_counter}',
-                            'text': chunk,
-                            'metadata': {
-                                'section': item.get('section', ''),
-                                'clause': item.get('clause', ''),
-                                'title': item.get('title', ''),
-                                'chunk_index': i,
-                                'source_line': line_num,
-                                'financial_keywords': financial_keywords,
-                                'authority_keywords': authority_keywords,
-                                'token_count': count_tokens(chunk)
-                            }
-                        }
-                        all_chunks.append(chunk_obj)
-                        chunk_id_counter += 1
-                except json.JSONDecodeError as e:
-                    print(f"Warning: Invalid JSON on line {line_num}: {e}")
                     continue
     except FileNotFoundError:
-        print(f"Error: File '{file_path}' not found.")
         return
-    except Exception as e:
-        print(f"Error reading file: {e}")
-        return
     print(f"Generated {len(all_chunks)} chunks before deduplication.")
-    print(f"{len(chunk_hashes)} unique chunks after deduplication.")
-    # Write chunks to output file
-    try:
-        with open(output_path, 'w', encoding='utf-8') as output_file:
-            for chunk in all_chunks:
-                json.dump(chunk, output_file, ensure_ascii=False)
-                output_file.write('\n')
-        print(f"Successfully wrote improved granular chunks to '{output_path}'.")
-        print(f"Sample chunk structure:")
-        if all_chunks:
-            sample = all_chunks[0]
-            print(f"  ID: {sample['id']}")
-            print(f"  Text length: {len(sample['text'])} chars")
-            print(f"  Section: {sample['metadata']['section']}")
-            print(f"  Financial keywords: {sample['metadata']['financial_keywords'][:3]}...")
-            print(f"  Token count: {sample['metadata']['token_count']}")
-    except Exception as e:
-        print(f"Error writing output file: {e}")
 if __name__ == "__main__":
-    input_file = "combined_context.jsonl"
-    output_file = "granular_chunks_final.jsonl"
-    process_jsonl_file(input_file, output_file)

+# create_granular_chunks.py
+import os
 import json
 import re
+from typing import List, Dict, Any
+import nltk
# Fetch the Punkt sentence-tokenizer data only when it is not already
# installed, so importing this module does not trigger a download on every run.
# 'punkt_tab' is fetched as well to avoid a LookupError from sent_tokenize.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource)

# --- Configuration ---
INPUT_FILE = "combined_context.jsonl"
OUTPUT_FILE = "granular_chunks_final.jsonl"  # Keep filename consistent
# --- Global State ---
# Number of chunk IDs handed out so far; advanced only by get_unique_id().
chunk_counter = 0


def get_unique_id() -> str:
    """Advance the module-level counter and return the next ``chunk-<n>`` ID.

    The first call returns ``"chunk-1"``; each later call increments by one.
    """
    global chunk_counter
    next_number = chunk_counter + 1
    chunk_counter = next_number
    return "chunk-" + str(next_number)
def create_chunk(context: Dict, text: str) -> Dict:
    """Wrap *text* in a chunk record carrying an ID and metadata from *context*.

    Args:
        context: Merged policy fields for the entry being chunked.
        text: Chunk text; surrounding whitespace is stripped.

    Returns:
        A dict with keys ``id``, ``text`` and ``metadata``. Metadata entries
        whose value is ``None`` are omitted.
    """
    metadata = {
        "section": context.get("section"),
        # The clause key is looked up under both spellings.
        "clause": context.get("clause") or context.get("Clause"),
        "title": context.get("title"),
        "source_description": context.get("description"),
    }

    # Copy every remaining primitive-valued field; nested lists/dicts are skipped.
    # NOTE(review): "description" is not one of the keys above, so a string
    # description is stored twice (also as "source_description") - confirm intent.
    primitive_types = (str, int, float, bool)
    extras = {
        name: value
        for name, value in context.items()
        if name not in metadata and isinstance(value, primitive_types)
    }
    metadata.update(extras)

    chunk_id = get_unique_id()
    cleaned_text = text.strip()
    present_metadata = {name: value for name, value in metadata.items() if value is not None}
    return {"id": chunk_id, "text": cleaned_text, "metadata": present_metadata}
def format_delegation_text(delegation: Any) -> str:
    """Render a delegation mapping as a comma-separated, readable phrase.

    Non-dict input is returned as ``str(delegation)``. For a dict, each
    authority yields "the limit for <authority> is <limit>"; a falsy limit or
    the placeholder '---' is reported as 'NIL' so "no power" cases stay
    explicit. An empty dict yields a fixed fallback sentence.
    """
    if not isinstance(delegation, dict):
        return str(delegation)

    phrases = []
    for authority, limit in delegation.items():
        has_power = bool(limit) and str(limit) != '---'
        shown_limit = limit if has_power else 'NIL'
        phrases.append(f"the limit for {authority} is {shown_limit}")

    if not phrases:
        return "No specific delegation provided."
    return ", ".join(phrases)
def format_remarks(remarks: Any) -> str:
    """Flatten the 'remarks' field into a single space-separated string.

    Args:
        remarks: ``None``, a string, a dict, a list of strings/dicts, or any
            other value.

    Returns:
        ``""`` for ``None`` (so the literal text "None" never leaks into a
        chunk). Dict entries, whether top-level or inside a list, become
        "key: value" fragments. Other list entries are converted with
        ``str``. Any other value is returned as ``str(remarks)``.
    """
    if remarks is None:
        return ""
    # A bare dict is treated like a one-element list so it is rendered as
    # "key: value" pairs instead of a Python dict repr.
    if isinstance(remarks, dict):
        remarks = [remarks]
    if not isinstance(remarks, list):
        return str(remarks)

    remark_parts = []
    for entry in remarks:
        if isinstance(entry, dict):
            remark_parts.extend(f"{key}: {value}" for key, value in entry.items())
        else:
            remark_parts.append(str(entry))
    return " ".join(remark_parts)
def build_descriptive_text(context: Dict) -> str:
    """Compose one natural-language passage from a policy entry's fields.

    The passage is assembled from, in order: the title, a more specific
    description (or method) when it differs from the title, the delegation
    or else the composition, and finally any remarks.

    Punctuation is attached directly to the preceding text, so the result
    has no space before the comma and no doubled spaces.

    Args:
        context: Merged policy fields. Recognised keys are ``title``,
            ``description``, ``method``, ``delegation``, ``composition``
            and ``remarks``.

    Returns:
        The assembled passage, or ``""`` when no recognised field is set.
    """
    lead_parts = []
    title = context.get("title")
    if title:
        lead_parts.append(f"Regarding the policy '{title}'")
    specific_desc = context.get("description") or context.get("method")
    if specific_desc and specific_desc != title:
        lead_parts.append(f"specifically for '{specific_desc}'")
    text = " ".join(lead_parts)

    detail = ""
    if "delegation" in context:
        delegation_text = format_delegation_text(context["delegation"])
        detail = f"financial delegations are: {delegation_text}."
    elif "composition" in context:
        composition = context["composition"]
        entries = composition if isinstance(composition, (list, tuple)) else []
        composition_parts = []
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            for role, members in entry.items():
                if isinstance(members, (list, tuple)):
                    # str() guards against non-string members breaking join().
                    joined = ", ".join(str(member) for member in members)
                    composition_parts.append(f"the {role} are: {joined}")
                else:
                    composition_parts.append(f"the {role} is {members}")
        # Skip the clause entirely rather than emit "the composition is: .".
        if composition_parts:
            detail = f"the composition is: {'; '.join(composition_parts)}."

    if detail:
        text = f"{text}, {detail}" if text else detail

    remarks = context.get("remarks")
    if remarks:
        remarks_text = f"Important remarks include: {format_remarks(remarks)}"
        text = f"{text} {remarks_text}" if text else remarks_text

    return text.strip()
def split_text_into_chunks(text: str, max_char_length: int = 1500, overlap: int = 200) -> List[str]:
    """Split *text* into sentence-aligned chunks of roughly bounded size.

    Text that already fits within ``max_char_length`` is returned as a single
    stripped chunk. Otherwise sentences are packed greedily; when a sentence
    does not fit, the current chunk is closed and the next one starts with
    the last ``overlap`` characters of the closed chunk followed by that
    sentence.

    Args:
        text: Text to split.
        max_char_length: Target maximum chunk length in characters. A single
            sentence longer than this is kept whole, and the overlap prefix
            is not counted against the limit, so chunks can exceed it.
        overlap: Number of trailing characters carried into the next chunk.
            ``0`` (or a negative value) disables overlap.

    Returns:
        The list of chunks. No empty chunk is produced when splitting.
    """
    text = text.strip()
    if len(text) <= max_char_length:
        return [text]

    try:
        # Imported here so a missing NLTK install, or missing punkt data,
        # falls back to a regex split instead of aborting the whole run.
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text, language='english')
    except (ImportError, LookupError):
        sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if not current_chunk:
            # Nothing accumulated yet: take the sentence even if it is over
            # the limit, instead of flushing an empty chunk first.
            current_chunk = sentence
        # +1 accounts for the joining space.
        elif len(current_chunk) + len(sentence) + 1 <= max_char_length:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk.strip())
            # `0 <` matters: current_chunk[-0:] would copy the whole chunk.
            if 0 < overlap < len(current_chunk):
                current_chunk = current_chunk[-overlap:] + " " + sentence
            else:
                current_chunk = sentence
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
    """Turn one policy entry into granular chunks, descending into children.

    The entry's own fields are layered over ``parent_context`` so nested
    entries inherit their ancestors' fields. Exactly one of three paths runs:

    1. An ``items`` or ``exclusions`` list: one rule sentence per string
       element, then return. Non-string elements are ignored.
    2. Any non-empty list made up solely of dicts: recurse into each dict.
    3. Otherwise, if the entry itself has a delegation, composition or
       description: emit its descriptive text.

    Args:
        data: The entry to process.
        parent_context: Fields inherited from enclosing entries, if any.

    Returns:
        The chunk records produced for this entry and its descendants.
    """
    inherited = parent_context or {}
    context = {**inherited, **data}
    results = []

    # Path 1: flat rule lists ("items" takes precedence over "exclusions").
    rule_key = None
    for candidate in ("items", "exclusions"):
        if isinstance(data.get(candidate), list):
            rule_key = candidate
            break

    if rule_key:
        policy_title = context.get('title', 'a policy')
        for rule in data[rule_key]:
            if not isinstance(rule, str):
                continue
            sentence = f"A rule regarding '{policy_title}' is: {rule}."
            results.extend(
                [create_chunk(context, piece) for piece in split_text_into_chunks(sentence)]
            )
        return results

    # Path 2: every non-empty list consisting only of dicts holds child entries.
    child_lists = [
        value
        for value in data.values()
        if isinstance(value, list) and value and all(isinstance(child, dict) for child in value)
    ]
    for children in child_lists:
        for child in children:
            results.extend(process_entry(child, context))

    # Path 3: a leaf is only described when nothing was recursed into.
    describes_itself = any(
        field in data for field in ("delegation", "composition", "description")
    )
    if not child_lists and describes_itself:
        description = build_descriptive_text(context)
        results.extend(
            [create_chunk(context, piece) for piece in split_text_into_chunks(description)]
        )
    return results
def main():
    """Read the input JSONL, chunk every entry, deduplicate, and write JSONL.

    Reads ``INPUT_FILE`` line by line and passes each JSON object to
    ``process_entry``. Blank lines are skipped silently. Malformed JSON and
    JSON values that are not objects are skipped with a warning, so one bad
    line cannot abort the run. Chunks are deduplicated by their text and
    written to ``OUTPUT_FILE``, one JSON object per line.

    Returns early, writing nothing, when the input file does not exist.
    """
    print(f"Starting to process '{INPUT_FILE}' for improved granular chunking...")

    all_chunks = []
    try:
        with open(INPUT_FILE, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, 1):
                if not line.strip():
                    continue  # Blank lines are not errors.
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Warning: Skipping malformed JSON on line {line_number}")
                    continue
                # process_entry merges the entry into a dict, so a list or
                # scalar here would raise TypeError.
                if not isinstance(data, dict):
                    print(f"Warning: Skipping non-object JSON on line {line_number}")
                    continue
                all_chunks.extend(process_entry(data))
    except FileNotFoundError:
        print(f"Error: Input file '{INPUT_FILE}' not found.")
        return

    print(f"Generated {len(all_chunks)} chunks before deduplication.")

    # Deduplicate by text. For repeated text the last chunk wins, but it keeps
    # the position of the first occurrence (dicts preserve insertion order).
    unique_chunks_map = {chunk['text']: chunk for chunk in all_chunks}
    unique_chunks = list(unique_chunks_map.values())
    print(f"{len(unique_chunks)} unique chunks after deduplication.")

    # Write output in JSONL format for later vector DB ingestion
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as outf:
        for chunk in unique_chunks:
            outf.write(json.dumps(chunk, ensure_ascii=False) + "\n")
    print(f"Successfully wrote improved granular chunks to '{OUTPUT_FILE}'.")


if __name__ == "__main__":
    main()