import json
import re
from typing import Dict, List, Optional
# Define the input and output filenames
INPUT_FILE = "combined_context.jsonl"
OUTPUT_FILE = "granular_chunks_improved.jsonl"
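# The input is expected to be JSON Lines: one JSON object per record, one record
# per line. A hypothetical (illustrative only) record might look like:
#   {"section": "2", "title": "Purchase of stores",
#    "delegation": {"Director": "₹50 lakh", "Registrar": "₹5 lakh"}}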
# Global counter to ensure all generated IDs are unique
chunk_counter = 0
def get_unique_id() -> int:
"""Returns a unique, incrementing ID."""
global chunk_counter
chunk_counter += 1
return chunk_counter
def parse_value_to_int(value_str: str) -> int:
"""Converts a financial string like '₹5 crore' or '₹50 lakh' to an integer."""
if not isinstance(value_str, str):
return 0
value_str = value_str.lower().replace('₹', '').strip()
if value_str in ["nil", "---", ""]:
return 0
try:
num_part = re.findall(r'[\d\.]+', value_str)
if not num_part:
return 0
num = float(num_part[0])
if 'crore' in value_str:
return int(num * 1_00_00_000)
if 'lakh' in value_str:
return int(num * 1_00_000)
return int(num)
except (ValueError, IndexError):
return 0
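# Illustrative conversions (assumed typical inputs, not executed here):
#   parse_value_to_int("₹5 crore")  -> 50_000_000
#   parse_value_to_int("₹50 lakh")  -> 5_000_000
#   parse_value_to_int("Nil")       -> 0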
def create_chunk(context: Dict, text_override: Optional[str] = None, id_override: Optional[str] = None) -> Dict:
"""Helper function to create a standardized chunk with rich metadata."""
chunk_id = id_override if id_override else f"chunk-{get_unique_id()}"
# Determine the primary text for the chunk
text = text_override
if not text:
# Create a sensible default text if none is provided
text_parts = [context.get("title"), context.get("description")]
text = ". ".join(filter(None, text_parts)) or str(context)
metadata = {
"section": context.get("section"),
"clause": context.get("clause"),
"subclause_id": context.get("id"),
"title": context.get("title"),
"description": context.get("description"),
"authority": context.get("authority"),
"limit_text": context.get("limit_text"),
"limit_inr": parse_value_to_int(str(context.get("limit_text", ""))),
"source": context.get("source"),
}
return {
"id": chunk_id,
"text": text,
"metadata": {k: v for k, v in metadata.items() if v is not None}
}
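# Illustrative result, assuming the global counter is fresh and a minimal,
# hypothetical context:
#   create_chunk({"title": "Purchase of stores", "limit_text": "₹50 lakh"})
#   -> {"id": "chunk-1",
#       "text": "Purchase of stores",
#       "metadata": {"title": "Purchase of stores",
#                    "limit_text": "₹50 lakh",
#                    "limit_inr": 5000000}}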
def _process_authority_power(data: Dict, context: Dict) -> List[Dict]:
"""
Specifically handles the complex "authority" and "extent_of_power" structures.
This logic is complex because the data types for these keys vary.
"""
chunks = []
title = context.get("title", "this rule")
# Case 1: Authority and Power are simple strings
if isinstance(data.get("authority"), str) and isinstance(data.get("extent_of_power"), str):
text = f"Regarding '{title}', the approving authority is {data['authority']} with '{data['extent_of_power']}'."
chunk_context = context.copy()
chunk_context["authority"] = data['authority']
chunk_context["limit_text"] = data['extent_of_power']
chunks.append(create_chunk(chunk_context, text_override=text))
# Case 2: Authority and Power are lists of dictionaries (most complex case)
elif isinstance(data.get("authority"), list) and isinstance(data.get("extent_of_power"), list):
authorities = data["authority"]
powers = data["extent_of_power"]
# Assuming the lists correspond to each other
for i in range(min(len(authorities), len(powers))):
auth_item = authorities[i]
power_item = powers[i]
# Extract descriptions from the dictionaries
auth_desc = next(iter(auth_item.values())) if isinstance(auth_item, dict) else str(auth_item)
power_desc = next(iter(power_item.values())) if isinstance(power_item, dict) else str(power_item)
text = f"For '{title}', the authority for '{auth_desc}' is given '{power_desc}'."
chunk_context = context.copy()
chunk_context["authority"] = auth_desc
chunk_context["limit_text"] = power_desc
chunks.append(create_chunk(chunk_context, text_override=text))
# Fallback for any other structure
else:
text = f"Regarding '{title}', the authority and power details are as follows: {json.dumps(data)}."
chunks.append(create_chunk(context, text_override=text))
return chunks
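# Illustrative Case 2 input (hypothetical): authority = [{"desc": "Director"}]
# and extent_of_power = [{"desc": "Full powers"}] are paired index by index,
# producing one chunk with authority "Director" and limit_text "Full powers".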
def process_chunk(data: Dict, context: Dict) -> List[Dict]:
"""
Processes a dictionary from the source file and deconstructs it into granular chunks.
"""
new_chunks = []
# Update context with current data, giving preference to new keys
current_context = context.copy()
current_context.update(data)
has_nested_chunks = False
# --- Rule-based deconstruction ---
# Rule 1: Handle "delegation" structure (most specific)
if "delegation" in data and isinstance(data["delegation"], dict):
for authority, limit_text in data["delegation"].items():
desc = current_context.get('description') or current_context.get('title')
text = f"Regarding '{desc}', the delegation for {authority} is '{limit_text}'."
chunk_context = current_context.copy()
chunk_context["authority"] = authority
chunk_context["limit_text"] = str(limit_text)
new_chunks.append(create_chunk(chunk_context, text_override=text))
return new_chunks
# Rule 2: Handle "authority" and "extent_of_power" structures
if "authority" in data and "extent_of_power" in data:
return _process_authority_power(data, current_context)
# Rule 3: Recursively process nested lists of dictionaries or strings
for key, value in data.items():
if isinstance(value, list) and value:
# Sub-rule 3a: List of dictionaries (e.g., subclauses, items)
if all(isinstance(item, dict) for item in value):
for item in value:
nested_results = process_chunk(item, current_context)
if nested_results:
new_chunks.extend(nested_results)
has_nested_chunks = True
# Sub-rule 3b: List of simple strings (e.g., items in Annexure A)
elif all(isinstance(item, str) for item in value):
title = current_context.get('title')
for item_text in value:
text = f"Regarding '{title}', a relevant item is: {item_text}."
new_chunks.append(create_chunk(current_context, text_override=text))
has_nested_chunks = True
# --- Finalization ---
# If we created specific chunks from children, we don't need the generic parent.
if has_nested_chunks:
return new_chunks
# Base case: If no specific rules were matched, create a single chunk for the item.
# This happens for "leaf" nodes that cannot be deconstructed further.
new_chunks.append(create_chunk(current_context))
return new_chunks
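# Illustrative deconstruction (hypothetical record): an input such as
#   {"title": "Purchase of stores",
#    "delegation": {"Director": "₹50 lakh", "Registrar": "₹5 lakh"}}
# yields two chunks, one per delegated authority, each carrying its own
# limit_text and the parsed limit_inr in its metadata.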
def main():
"""Main function to read, process, and write."""
print(f"Starting to process '{INPUT_FILE}'...")
final_chunks = []
try:
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
for i, line in enumerate(f):
try:
data = json.loads(line)
processed = process_chunk(data, {})
final_chunks.extend(processed)
except json.JSONDecodeError:
print(f"Warning: Skipping malformed JSON on line {i+1}")
continue
except FileNotFoundError:
print(f"Error: Input file '{INPUT_FILE}' not found.")
return
print(f"Deconstructed into {len(final_chunks)} granular chunks.")
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
for chunk in final_chunks:
            f.write(json.dumps(chunk, ensure_ascii=False) + '\n')  # keep ₹ and other non-ASCII text readable in the UTF-8 output
print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")
if __name__ == "__main__":
    main()