# ChatbotDemo / create_granular_chunks.py
import json
import re
from typing import Dict, List, Optional
# Define the input and output filenames
INPUT_FILE = "combined_context.jsonl"
OUTPUT_FILE = "granular_chunks_improved.jsonl"
# Global counter to ensure all generated IDs are unique
chunk_counter = 0
def get_unique_id() -> int:
    """Returns a unique, incrementing ID."""
    global chunk_counter
    chunk_counter += 1
    return chunk_counter
def parse_value_to_int(value_str: str) -> int:
    """Converts a financial string like '₹5 crore' or '₹50 lakh' to an integer."""
    if not isinstance(value_str, str):
        return 0
    value_str = value_str.lower().replace('₹', '').strip()
    if value_str in ["nil", "---", ""]:
        return 0
    try:
        num_part = re.findall(r'[\d\.]+', value_str)
        if not num_part:
            return 0
        num = float(num_part[0])
        if 'crore' in value_str:
            return int(num * 1_00_00_000)  # 1 crore = 10,000,000
        if 'lakh' in value_str:
            return int(num * 1_00_000)  # 1 lakh = 100,000
        return int(num)
    except (ValueError, IndexError):
        return 0
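# A few sanity checks of the conversion rules (illustrative values, not taken
# from the source data; safe to delete):
assert parse_value_to_int("₹5 crore") == 50_000_000
assert parse_value_to_int("₹50 lakh") == 5_000_000
assert parse_value_to_int("₹2.5 crore") == 25_000_000
assert parse_value_to_int("Nil") == 0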
def create_chunk(context: Dict, text_override: Optional[str] = None, id_override: Optional[str] = None) -> Dict:
    """Helper function to create a standardized chunk with rich metadata."""
    chunk_id = id_override if id_override else f"chunk-{get_unique_id()}"
    # Determine the primary text for the chunk
    text = text_override
    if not text:
        # Create a sensible default text if none is provided
        text_parts = [context.get("title"), context.get("description")]
        text = ". ".join(filter(None, text_parts)) or str(context)
    metadata = {
        "section": context.get("section"),
        "clause": context.get("clause"),
        "subclause_id": context.get("id"),
        "title": context.get("title"),
        "description": context.get("description"),
        "authority": context.get("authority"),
        "limit_text": context.get("limit_text"),
        "limit_inr": parse_value_to_int(str(context.get("limit_text", ""))),
        "source": context.get("source"),
    }
    return {
        "id": chunk_id,
        "text": text,
        "metadata": {k: v for k, v in metadata.items() if v is not None}
    }
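# A sketch of the chunk shape this helper produces, for a hypothetical context
# (the "chunk-1" ID assumes a fresh run of the global counter):
#   create_chunk({"id": "1(a)", "title": "Purchase of stores", "limit_text": "₹50 lakh"})
#   -> {"id": "chunk-1",
#       "text": "Purchase of stores",
#       "metadata": {"subclause_id": "1(a)", "title": "Purchase of stores",
#                    "limit_text": "₹50 lakh", "limit_inr": 5000000}}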
def _process_authority_power(data: Dict, context: Dict) -> List[Dict]:
    """
    Specifically handles the complex "authority" and "extent_of_power" structures.
    This logic is complex because the data types for these keys vary.
    """
    chunks = []
    title = context.get("title", "this rule")
    # Case 1: Authority and Power are simple strings
    if isinstance(data.get("authority"), str) and isinstance(data.get("extent_of_power"), str):
        text = f"Regarding '{title}', the approving authority is {data['authority']} with '{data['extent_of_power']}'."
        chunk_context = context.copy()
        chunk_context["authority"] = data['authority']
        chunk_context["limit_text"] = data['extent_of_power']
        chunks.append(create_chunk(chunk_context, text_override=text))
    # Case 2: Authority and Power are lists of dictionaries (most complex case)
    elif isinstance(data.get("authority"), list) and isinstance(data.get("extent_of_power"), list):
        # Assuming the lists correspond to each other positionally; zip stops
        # at the shorter list, i.e. the same min-length pairing as before.
        for auth_item, power_item in zip(data["authority"], data["extent_of_power"]):
            # Extract descriptions from the dictionaries
            auth_desc = next(iter(auth_item.values())) if isinstance(auth_item, dict) else str(auth_item)
            power_desc = next(iter(power_item.values())) if isinstance(power_item, dict) else str(power_item)
            text = f"For '{title}', the authority for '{auth_desc}' is given '{power_desc}'."
            chunk_context = context.copy()
            chunk_context["authority"] = auth_desc
            chunk_context["limit_text"] = power_desc
            chunks.append(create_chunk(chunk_context, text_override=text))
    # Fallback for any other structure
    else:
        text = f"Regarding '{title}', the authority and power details are as follows: {json.dumps(data)}."
        chunks.append(create_chunk(context, text_override=text))
    return chunks
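# Illustrative Case 2 input (a hypothetical record, not from the source file):
#   {"authority": [{"i": "CMD"}, {"ii": "Director"}],
#    "extent_of_power": [{"i": "Full powers"}, {"ii": "Up to ₹1 crore"}]}
# yields one chunk pairing CMD with "Full powers" and another pairing
# Director with "Up to ₹1 crore".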
def process_chunk(data: Dict, context: Dict) -> List[Dict]:
    """
    Processes a dictionary from the source file and deconstructs it into granular chunks.
    """
    new_chunks = []
    # Update context with current data, giving preference to new keys
    current_context = context.copy()
    current_context.update(data)
    has_nested_chunks = False
    # --- Rule-based deconstruction ---
    # Rule 1: Handle "delegation" structure (most specific)
    if "delegation" in data and isinstance(data["delegation"], dict):
        for authority, limit_text in data["delegation"].items():
            desc = current_context.get('description') or current_context.get('title')
            text = f"Regarding '{desc}', the delegation for {authority} is '{limit_text}'."
            chunk_context = current_context.copy()
            chunk_context["authority"] = authority
            chunk_context["limit_text"] = str(limit_text)
            new_chunks.append(create_chunk(chunk_context, text_override=text))
        return new_chunks
    # Rule 2: Handle "authority" and "extent_of_power" structures
    if "authority" in data and "extent_of_power" in data:
        return _process_authority_power(data, current_context)
    # Rule 3: Recursively process nested lists of dictionaries or strings
    for key, value in data.items():
        if isinstance(value, list) and value:
            # Sub-rule 3a: List of dictionaries (e.g., subclauses, items)
            if all(isinstance(item, dict) for item in value):
                for item in value:
                    nested_results = process_chunk(item, current_context)
                    if nested_results:
                        new_chunks.extend(nested_results)
                        has_nested_chunks = True
            # Sub-rule 3b: List of simple strings (e.g., items in Annexure A)
            elif all(isinstance(item, str) for item in value):
                title = current_context.get('title')
                for item_text in value:
                    text = f"Regarding '{title}', a relevant item is: {item_text}."
                    new_chunks.append(create_chunk(current_context, text_override=text))
                has_nested_chunks = True
    # --- Finalization ---
    # If we created specific chunks from children, we don't need the generic parent.
    if has_nested_chunks:
        return new_chunks
    # Base case: If no specific rules were matched, create a single chunk for the item.
    # This happens for "leaf" nodes that cannot be deconstructed further.
    new_chunks.append(create_chunk(current_context))
    return new_chunks
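# Sketch of Rule 1 on a hypothetical record:
#   process_chunk({"title": "Hiring of vehicles",
#                  "delegation": {"CMD": "Full powers",
#                                 "Director": "₹10 lakh per annum"}}, {})
# returns immediately with one chunk per (authority, limit) pair, so no
# generic parent chunk is emitted for the record itself.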
def main():
    """Main function to read, process, and write."""
    print(f"Starting to process '{INPUT_FILE}'...")
    final_chunks = []
    try:
        with open(INPUT_FILE, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if not line.strip():
                    continue  # skip blank lines instead of logging them as malformed
                try:
                    data = json.loads(line)
                    processed = process_chunk(data, {})
                    final_chunks.extend(processed)
                except json.JSONDecodeError:
                    print(f"Warning: Skipping malformed JSON on line {i+1}")
                    continue
    except FileNotFoundError:
        print(f"Error: Input file '{INPUT_FILE}' not found.")
        return
    print(f"Deconstructed into {len(final_chunks)} granular chunks.")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        for chunk in final_chunks:
            # ensure_ascii=False keeps '₹' and other non-ASCII text readable in the output
            f.write(json.dumps(chunk, ensure_ascii=False) + '\n')
    print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")
if __name__ == "__main__":
    main()
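# Expected input: one JSON object per line in combined_context.jsonl, e.g. a
# hypothetical line such as
#   {"section": "IV", "title": "Works contracts", "delegation": {"CMD": "₹5 crore"}}
# Running `python create_granular_chunks.py` then writes
# granular_chunks_improved.jsonl with one chunk per line.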