import json
import re
from typing import Dict, List, Optional

# Define the input and output filenames
INPUT_FILE = "combined_context.jsonl"
OUTPUT_FILE = "granular_chunks_improved.jsonl"

# Global counter to ensure all generated IDs are unique
chunk_counter = 0

def get_unique_id() -> int:
    """Returns a unique, incrementing ID."""
    global chunk_counter
    chunk_counter += 1
    return chunk_counter

def parse_value_to_int(value_str: str) -> int:
    """Converts a financial string like '₹5 crore' or '₹50 lakh' to an integer."""
    if not isinstance(value_str, str):
        return 0
    value_str = value_str.lower().replace('₹', '').strip()
    if value_str in ["nil", "---", ""]:
        return 0
    try:
        num_part = re.findall(r'[\d\.]+', value_str)
        if not num_part:
            return 0
        num = float(num_part[0])
        if 'crore' in value_str:
            return int(num * 1_00_00_000)
        if 'lakh' in value_str:
            return int(num * 1_00_000)
        return int(num)
    except (ValueError, IndexError):
        return 0
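
# Illustrative behavior of parse_value_to_int (derived from the rules above):
#   parse_value_to_int("₹5 crore")  -> 50000000
#   parse_value_to_int("₹50 lakh")  -> 5000000
#   parse_value_to_int("nil")       -> 0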

def create_chunk(context: Dict, text_override: Optional[str] = None, id_override: Optional[str] = None) -> Dict:
    """Helper function to create a standardized chunk with rich metadata."""
    chunk_id = id_override if id_override else f"chunk-{get_unique_id()}"
    
    # Determine the primary text for the chunk
    text = text_override
    if not text:
        # Create a sensible default text if none is provided
        text_parts = [context.get("title"), context.get("description")]
        text = ". ".join(filter(None, text_parts)) or str(context)

    metadata = {
        "section": context.get("section"),
        "clause": context.get("clause"),
        "subclause_id": context.get("id"),
        "title": context.get("title"),
        "description": context.get("description"),
        "authority": context.get("authority"),
        "limit_text": context.get("limit_text"),
        "limit_inr": parse_value_to_int(str(context.get("limit_text", ""))),
        "source": context.get("source"),
    }
    
    return {
        "id": chunk_id,
        "text": text,
        "metadata": {k: v for k, v in metadata.items() if v is not None}
    }
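
# Example (illustrative; assumes a fresh counter and this minimal context):
#   create_chunk({"section": "II", "title": "Purchase of stores"})
#   -> {"id": "chunk-1", "text": "Purchase of stores",
#       "metadata": {"section": "II", "title": "Purchase of stores", "limit_inr": 0}}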

def _process_authority_power(data: Dict, context: Dict) -> List[Dict]:
    """
    Specifically handles the complex "authority" and "extent_of_power" structures.
    This logic is complex because the data types for these keys vary.
    """
    chunks = []
    title = context.get("title", "this rule")
    
    # Case 1: Authority and Power are simple strings
    if isinstance(data.get("authority"), str) and isinstance(data.get("extent_of_power"), str):
        text = f"Regarding '{title}', the approving authority is {data['authority']} with '{data['extent_of_power']}'."
        chunk_context = context.copy()
        chunk_context["authority"] = data['authority']
        chunk_context["limit_text"] = data['extent_of_power']
        chunks.append(create_chunk(chunk_context, text_override=text))

    # Case 2: Authority and Power are lists of dictionaries (most complex case)
    elif isinstance(data.get("authority"), list) and isinstance(data.get("extent_of_power"), list):
        authorities = data["authority"]
        powers = data["extent_of_power"]
        # Assuming the lists correspond to each other
        for i in range(min(len(authorities), len(powers))):
            auth_item = authorities[i]
            power_item = powers[i]
            # Extract descriptions from the dictionaries; the "" default guards empty dicts
            auth_desc = next(iter(auth_item.values()), "") if isinstance(auth_item, dict) else str(auth_item)
            power_desc = next(iter(power_item.values()), "") if isinstance(power_item, dict) else str(power_item)

            text = f"For '{title}', the authority for '{auth_desc}' is given '{power_desc}'."
            chunk_context = context.copy()
            chunk_context["authority"] = auth_desc
            chunk_context["limit_text"] = power_desc
            chunks.append(create_chunk(chunk_context, text_override=text))

    # Fallback for any other structure
    else:
        text = f"Regarding '{title}', the authority and power details are as follows: {json.dumps(data)}."
        chunks.append(create_chunk(context, text_override=text))
        
    return chunks
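
# Sketch of the Case-2 pairing (the record shape and title are assumptions):
#   data = {"authority": [{"i": "Director"}], "extent_of_power": [{"i": "Full powers"}]}
#   with context {"title": "Purchase of stores"} yields one chunk whose text is
#   "For 'Purchase of stores', the authority for 'Director' is given 'Full powers'."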

def process_chunk(data: Dict, context: Dict) -> List[Dict]:
    """
    Processes a dictionary from the source file and deconstructs it into granular chunks.
    """
    new_chunks = []
    
    # Update context with current data, giving preference to new keys
    current_context = context.copy()
    current_context.update(data)
    
    has_nested_chunks = False

    # --- Rule-based deconstruction ---

    # Rule 1: Handle "delegation" structure (most specific)
    if "delegation" in data and isinstance(data["delegation"], dict):
        for authority, limit_text in data["delegation"].items():
            desc = current_context.get('description') or current_context.get('title')
            text = f"Regarding '{desc}', the delegation for {authority} is '{limit_text}'."
            chunk_context = current_context.copy()
            chunk_context["authority"] = authority
            chunk_context["limit_text"] = str(limit_text)
            new_chunks.append(create_chunk(chunk_context, text_override=text))
        return new_chunks

    # Rule 2: Handle "authority" and "extent_of_power" structures
    if "authority" in data and "extent_of_power" in data:
        return _process_authority_power(data, current_context)

    # Rule 3: Recursively process nested lists of dictionaries or strings
    for key, value in data.items():
        if isinstance(value, list) and value:
            # Sub-rule 3a: List of dictionaries (e.g., subclauses, items)
            if all(isinstance(item, dict) for item in value):
                for item in value:
                    nested_results = process_chunk(item, current_context)
                    if nested_results:
                        new_chunks.extend(nested_results)
                        has_nested_chunks = True
            
            # Sub-rule 3b: List of simple strings (e.g., items in Annexure A)
            elif all(isinstance(item, str) for item in value):
                title = current_context.get('title')
                for item_text in value:
                    text = f"Regarding '{title}', a relevant item is: {item_text}."
                    new_chunks.append(create_chunk(current_context, text_override=text))
                has_nested_chunks = True

    # --- Finalization ---

    # If we created specific chunks from children, we don't need the generic parent.
    if has_nested_chunks:
        return new_chunks

    # Base case: If no specific rules were matched, create a single chunk for the item.
    # This happens for "leaf" nodes that cannot be deconstructed further.
    new_chunks.append(create_chunk(current_context))
    return new_chunks
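
# Example of Rule 1 (the record below is an assumed sample, not real data):
#   process_chunk({"title": "Write-off of losses",
#                  "delegation": {"Director": "₹5 crore"}}, {})
#   -> one chunk with authority "Director", limit_text "₹5 crore",
#      and limit_inr 50000000 in its metadata.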

def main():
    """Main function to read, process, and write."""
    print(f"Starting to process '{INPUT_FILE}'...")
    final_chunks = []
    
    try:
        with open(INPUT_FILE, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                try:
                    data = json.loads(line)
                    processed = process_chunk(data, {})
                    final_chunks.extend(processed)
                except json.JSONDecodeError:
                    print(f"Warning: Skipping malformed JSON on line {i+1}")
                    continue
    except FileNotFoundError:
        print(f"Error: Input file '{INPUT_FILE}' not found.")
        return

    print(f"Deconstructed into {len(final_chunks)} granular chunks.")

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        for chunk in final_chunks:
            # ensure_ascii=False keeps non-ASCII text such as '₹' readable in the output
            f.write(json.dumps(chunk, ensure_ascii=False) + '\n')

    print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")

if __name__ == "__main__":
    main()
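
# Expected console output for a normal run (the script filename is an assumption):
#   $ python chunker.py
#   Starting to process 'combined_context.jsonl'...
#   Deconstructed into <N> granular chunks.
#   Successfully created improved granular chunks file: 'granular_chunks_improved.jsonl'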