import json
import re
from typing import Dict, List, Optional
# Define the input and output filenames
INPUT_FILE = "combined_context.jsonl"
OUTPUT_FILE = "granular_chunks_improved.jsonl"
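# The input is expected to be JSON Lines: one JSON object per record, one record
# per line. A hypothetical (illustrative only) record might look like:
#   {"section": "2", "title": "Purchase of stores",
#    "delegation": {"Director": "₹50 lakh", "Registrar": "₹5 lakh"}}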
# Global counter to ensure all generated IDs are unique
chunk_counter = 0
def get_unique_id() -> int:
"""Returns a unique, incrementing ID."""
global chunk_counter
chunk_counter += 1
return chunk_counter
def parse_value_to_int(value_str: str) -> int:
"""Converts a financial string like '₹5 crore' or '₹50 lakh' to an integer."""
if not isinstance(value_str, str):
return 0
value_str = value_str.lower().replace('₹', '').strip()
if value_str in ["nil", "---", ""]:
return 0
try:
num_part = re.findall(r'[\d\.]+', value_str)
if not num_part:
return 0
num = float(num_part[0])
if 'crore' in value_str:
return int(num * 1_00_00_000)
if 'lakh' in value_str:
return int(num * 1_00_000)
return int(num)
except (ValueError, IndexError):
return 0
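# Illustrative conversions (assumed typical inputs, not executed here):
#   parse_value_to_int("₹5 crore")  -> 50_000_000
#   parse_value_to_int("₹50 lakh")  -> 5_000_000
#   parse_value_to_int("Nil")       -> 0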
def create_chunk(context: Dict, text_override: Optional[str] = None, id_override: Optional[str] = None) -> Dict:
"""Helper function to create a standardized chunk with rich metadata."""
chunk_id = id_override if id_override else f"chunk-{get_unique_id()}"
# Determine the primary text for the chunk
text = text_override
if not text:
# Create a sensible default text if none is provided
text_parts = [context.get("title"), context.get("description")]
text = ". ".join(filter(None, text_parts)) or str(context)
metadata = {
"section": context.get("section"),
"clause": context.get("clause"),
"subclause_id": context.get("id"),
"title": context.get("title"),
"description": context.get("description"),
"authority": context.get("authority"),
"limit_text": context.get("limit_text"),
"limit_inr": parse_value_to_int(str(context.get("limit_text", ""))),
"source": context.get("source"),
}
return {
"id": chunk_id,
"text": text,
"metadata": {k: v for k, v in metadata.items() if v is not None}
}
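# Illustrative result, assuming the global counter is fresh and a minimal,
# hypothetical context:
#   create_chunk({"title": "Purchase of stores", "limit_text": "₹50 lakh"})
#   -> {"id": "chunk-1",
#       "text": "Purchase of stores",
#       "metadata": {"title": "Purchase of stores",
#                    "limit_text": "₹50 lakh",
#                    "limit_inr": 5000000}}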
def _process_authority_power(data: Dict, context: Dict) -> List[Dict]:
"""
Specifically handles the complex "authority" and "extent_of_power" structures.
This logic is complex because the data types for these keys vary.
"""
chunks = []
title = context.get("title", "this rule")
# Case 1: Authority and Power are simple strings
if isinstance(data.get("authority"), str) and isinstance(data.get("extent_of_power"), str):
text = f"Regarding '{title}', the approving authority is {data['authority']} with '{data['extent_of_power']}'."
chunk_context = context.copy()
chunk_context["authority"] = data['authority']
chunk_context["limit_text"] = data['extent_of_power']
chunks.append(create_chunk(chunk_context, text_override=text))
# Case 2: Authority and Power are lists of dictionaries (most complex case)
elif isinstance(data.get("authority"), list) and isinstance(data.get("extent_of_power"), list):
authorities = data["authority"]
powers = data["extent_of_power"]
# Assuming the lists correspond to each other
for i in range(min(len(authorities), len(powers))):
auth_item = authorities[i]
power_item = powers[i]
# Extract descriptions from the dictionaries
auth_desc = next(iter(auth_item.values())) if isinstance(auth_item, dict) else str(auth_item)
power_desc = next(iter(power_item.values())) if isinstance(power_item, dict) else str(power_item)
text = f"For '{title}', the authority for '{auth_desc}' is given '{power_desc}'."
chunk_context = context.copy()
chunk_context["authority"] = auth_desc
chunk_context["limit_text"] = power_desc
chunks.append(create_chunk(chunk_context, text_override=text))
# Fallback for any other structure
else:
text = f"Regarding '{title}', the authority and power details are as follows: {json.dumps(data)}."
chunks.append(create_chunk(context, text_override=text))
return chunks
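# Illustrative Case 2 input (hypothetical): authority = [{"desc": "Director"}]
# and extent_of_power = [{"desc": "Full powers"}] are paired index by index,
# producing one chunk with authority "Director" and limit_text "Full powers".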
def process_chunk(data: Dict, context: Dict) -> List[Dict]:
"""
Processes a dictionary from the source file and deconstructs it into granular chunks.
"""
new_chunks = []
# Update context with current data, giving preference to new keys
current_context = context.copy()
current_context.update(data)
has_nested_chunks = False
# --- Rule-based deconstruction ---
# Rule 1: Handle "delegation" structure (most specific)
if "delegation" in data and isinstance(data["delegation"], dict):
for authority, limit_text in data["delegation"].items():
desc = current_context.get('description') or current_context.get('title')
text = f"Regarding '{desc}', the delegation for {authority} is '{limit_text}'."
chunk_context = current_context.copy()
chunk_context["authority"] = authority
chunk_context["limit_text"] = str(limit_text)
new_chunks.append(create_chunk(chunk_context, text_override=text))
return new_chunks
# Rule 2: Handle "authority" and "extent_of_power" structures
if "authority" in data and "extent_of_power" in data:
return _process_authority_power(data, current_context)
# Rule 3: Recursively process nested lists of dictionaries or strings
for key, value in data.items():
if isinstance(value, list) and value:
# Sub-rule 3a: List of dictionaries (e.g., subclauses, items)
if all(isinstance(item, dict) for item in value):
for item in value:
nested_results = process_chunk(item, current_context)
if nested_results:
new_chunks.extend(nested_results)
has_nested_chunks = True
# Sub-rule 3b: List of simple strings (e.g., items in Annexure A)
elif all(isinstance(item, str) for item in value):
title = current_context.get('title')
for item_text in value:
text = f"Regarding '{title}', a relevant item is: {item_text}."
new_chunks.append(create_chunk(current_context, text_override=text))
has_nested_chunks = True
# --- Finalization ---
# If we created specific chunks from children, we don't need the generic parent.
if has_nested_chunks:
return new_chunks
# Base case: If no specific rules were matched, create a single chunk for the item.
# This happens for "leaf" nodes that cannot be deconstructed further.
new_chunks.append(create_chunk(current_context))
return new_chunks
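# Illustrative deconstruction (hypothetical record): an input such as
#   {"title": "Purchase of stores",
#    "delegation": {"Director": "₹50 lakh", "Registrar": "₹5 lakh"}}
# yields two chunks, one per delegated authority, each carrying its own
# limit_text and the parsed limit_inr in its metadata.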
def main():
"""Main function to read, process, and write."""
print(f"Starting to process '{INPUT_FILE}'...")
final_chunks = []
try:
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
for i, line in enumerate(f):
try:
data = json.loads(line)
processed = process_chunk(data, {})
final_chunks.extend(processed)
except json.JSONDecodeError:
print(f"Warning: Skipping malformed JSON on line {i+1}")
continue
except FileNotFoundError:
print(f"Error: Input file '{INPUT_FILE}' not found.")
return
print(f"Deconstructed into {len(final_chunks)} granular chunks.")
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
for chunk in final_chunks:
            f.write(json.dumps(chunk, ensure_ascii=False) + '\n')  # keep ₹ and other non-ASCII text readable in the UTF-8 output
print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")
if __name__ == "__main__":
    main()