import json
import re
from typing import Dict, List, Optional

# Define the input and output filenames
INPUT_FILE = "combined_context.jsonl"
OUTPUT_FILE = "granular_chunks_improved.jsonl"
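
# Each input line is expected to be a standalone JSON object. A hypothetical
# example (field names match what process_chunk looks for below; the real
# source data may differ):
#   {"section": "4", "clause": "4.2", "title": "Purchase of stores",
#    "delegation": {"Director": "₹5 crore", "Registrar": "₹50 lakh"}}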

# Global counter to ensure all generated IDs are unique
chunk_counter = 0


def get_unique_id() -> int:
    """Returns a unique, incrementing ID."""
    global chunk_counter
    chunk_counter += 1
    return chunk_counter


def parse_value_to_int(value_str: str) -> int:
    """Converts a financial string like '₹5 crore' or '₹50 lakh' to an integer."""
    if not isinstance(value_str, str):
        return 0
    value_str = value_str.lower().replace('₹', '').strip()
    if value_str in ["nil", "---", ""]:
        return 0
    try:
        num_part = re.findall(r'[\d.]+', value_str)
        if not num_part:
            return 0
        num = float(num_part[0])
        if 'crore' in value_str:
            return int(num * 1_00_00_000)  # 1 crore = 10,000,000 (Indian digit grouping)
        if 'lakh' in value_str:
            return int(num * 1_00_000)  # 1 lakh = 100,000
        return int(num)
    except (ValueError, IndexError):
        return 0
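
# Illustrative sanity checks for the conversion above (these inputs are
# assumptions, not taken from the source data):
#   parse_value_to_int("₹5 crore")    -> 50_000_000
#   parse_value_to_int("₹50 lakh")    -> 5_000_000
#   parse_value_to_int("Full powers") -> 0   (no digits found)
#   parse_value_to_int("Nil")         -> 0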


def create_chunk(context: Dict, text_override: Optional[str] = None,
                 id_override: Optional[str] = None) -> Dict:
    """Helper function to create a standardized chunk with rich metadata."""
    chunk_id = id_override if id_override else f"chunk-{get_unique_id()}"
    # Determine the primary text for the chunk
    text = text_override
    if not text:
        # Create a sensible default text if none is provided
        text_parts = [context.get("title"), context.get("description")]
        text = ". ".join(filter(None, text_parts)) or str(context)
    metadata = {
        "section": context.get("section"),
        "clause": context.get("clause"),
        "subclause_id": context.get("id"),
        "title": context.get("title"),
        "description": context.get("description"),
        "authority": context.get("authority"),
        "limit_text": context.get("limit_text"),
        "limit_inr": parse_value_to_int(str(context.get("limit_text", ""))),
        "source": context.get("source"),
    }
    return {
        "id": chunk_id,
        "text": text,
        # Drop keys whose value is None so the metadata stays compact
        "metadata": {k: v for k, v in metadata.items() if v is not None},
    }
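
# For reference, a chunk produced by create_chunk looks roughly like this
# (hypothetical values; None-valued keys have been dropped from metadata):
#   {"id": "chunk-12",
#    "text": "Regarding 'Purchase of stores', the delegation for Director is '₹5 crore'.",
#    "metadata": {"section": "4", "title": "Purchase of stores",
#                 "authority": "Director", "limit_text": "₹5 crore",
#                 "limit_inr": 50000000}}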


def _process_authority_power(data: Dict, context: Dict) -> List[Dict]:
    """
    Specifically handles the complex "authority" and "extent_of_power" structures.
    This logic is complex because the data types for these keys vary.
    """
    chunks = []
    title = context.get("title", "this rule")
    # Case 1: Authority and Power are simple strings
    if isinstance(data.get("authority"), str) and isinstance(data.get("extent_of_power"), str):
        text = f"Regarding '{title}', the approving authority is {data['authority']} with '{data['extent_of_power']}'."
        chunk_context = context.copy()
        chunk_context["authority"] = data['authority']
        chunk_context["limit_text"] = data['extent_of_power']
        chunks.append(create_chunk(chunk_context, text_override=text))
    # Case 2: Authority and Power are lists of dictionaries (most complex case)
    elif isinstance(data.get("authority"), list) and isinstance(data.get("extent_of_power"), list):
        # Assuming the lists correspond positionally; zip stops at the shorter one
        for auth_item, power_item in zip(data["authority"], data["extent_of_power"]):
            # Extract descriptions from the dictionaries
            auth_desc = next(iter(auth_item.values())) if isinstance(auth_item, dict) else str(auth_item)
            power_desc = next(iter(power_item.values())) if isinstance(power_item, dict) else str(power_item)
            text = f"For '{title}', the authority for '{auth_desc}' is given '{power_desc}'."
            chunk_context = context.copy()
            chunk_context["authority"] = auth_desc
            chunk_context["limit_text"] = power_desc
            chunks.append(create_chunk(chunk_context, text_override=text))
    # Fallback for any other structure
    else:
        text = f"Regarding '{title}', the authority and power details are as follows: {json.dumps(data)}."
        chunks.append(create_chunk(context, text_override=text))
    return chunks
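
# A hypothetical Case 2 input, to show how the positional pairing works
# (the i-th authority is matched with the i-th extent of power):
#   {"title": "Write-off of losses",
#    "authority": [{"a": "Director"}, {"b": "Registrar"}],
#    "extent_of_power": [{"a": "Full powers"}, {"b": "Up to ₹50 lakh"}]}
# would yield two chunks, e.g. "For 'Write-off of losses', the authority for
# 'Director' is given 'Full powers'."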


def process_chunk(data: Dict, context: Dict) -> List[Dict]:
    """
    Processes a dictionary from the source file and deconstructs it into granular chunks.
    """
    new_chunks = []
    # Update context with current data, giving preference to new keys
    current_context = context.copy()
    current_context.update(data)
    has_nested_chunks = False

    # --- Rule-based deconstruction ---
    # Rule 1: Handle "delegation" structure (most specific)
    if "delegation" in data and isinstance(data["delegation"], dict):
        for authority, limit_text in data["delegation"].items():
            desc = current_context.get('description') or current_context.get('title')
            text = f"Regarding '{desc}', the delegation for {authority} is '{limit_text}'."
            chunk_context = current_context.copy()
            chunk_context["authority"] = authority
            chunk_context["limit_text"] = str(limit_text)
            new_chunks.append(create_chunk(chunk_context, text_override=text))
        return new_chunks

    # Rule 2: Handle "authority" and "extent_of_power" structures
    if "authority" in data and "extent_of_power" in data:
        return _process_authority_power(data, current_context)

    # Rule 3: Recursively process nested lists of dictionaries or strings
    for value in data.values():
        if isinstance(value, list) and value:
            # Sub-rule 3a: List of dictionaries (e.g., subclauses, items)
            if all(isinstance(item, dict) for item in value):
                for item in value:
                    nested_results = process_chunk(item, current_context)
                    if nested_results:
                        new_chunks.extend(nested_results)
                        has_nested_chunks = True
            # Sub-rule 3b: List of simple strings (e.g., items in Annexure A)
            elif all(isinstance(item, str) for item in value):
                title = current_context.get('title')
                for item_text in value:
                    text = f"Regarding '{title}', a relevant item is: {item_text}."
                    new_chunks.append(create_chunk(current_context, text_override=text))
                has_nested_chunks = True

    # --- Finalization ---
    # If we created specific chunks from children, we don't need the generic parent.
    if has_nested_chunks:
        return new_chunks

    # Base case: If no specific rules were matched, create a single chunk for the item.
    # This happens for "leaf" nodes that cannot be deconstructed further.
    new_chunks.append(create_chunk(current_context))
    return new_chunks
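
# Worked example of Rule 1 (hypothetical input): the single object
#   {"title": "Purchase of stores",
#    "delegation": {"Director": "₹5 crore", "Registrar": "₹50 lakh"}}
# deconstructs into two chunks, one per (authority, limit) pair, each carrying
# "authority", "limit_text", and the parsed "limit_inr" in its metadata.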


def main():
    """Main function to read, process, and write."""
    print(f"Starting to process '{INPUT_FILE}'...")
    final_chunks = []
    try:
        with open(INPUT_FILE, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if not line.strip():
                    continue  # Tolerate blank lines in the JSONL
                try:
                    data = json.loads(line)
                    processed = process_chunk(data, {})
                    final_chunks.extend(processed)
                except json.JSONDecodeError:
                    print(f"Warning: Skipping malformed JSON on line {i + 1}")
                    continue
    except FileNotFoundError:
        print(f"Error: Input file '{INPUT_FILE}' not found.")
        return

    print(f"Deconstructed into {len(final_chunks)} granular chunks.")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        for chunk in final_chunks:
            # ensure_ascii=False keeps the '₹' symbol readable in the output
            f.write(json.dumps(chunk, ensure_ascii=False) + '\n')
    print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")


if __name__ == "__main__":
    main()