import json
import re
from typing import Dict, List, Optional

# Define the input and output filenames
INPUT_FILE = "combined_context.jsonl"
OUTPUT_FILE = "granular_chunks_improved.jsonl"
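
# Each input line is expected to be a standalone JSON object. A hypothetical
# example (field names match what process_chunk looks for below; the real
# source data may differ):
#   {"section": "4", "clause": "4.2", "title": "Purchase of stores",
#    "delegation": {"Director": "₹5 crore", "Registrar": "₹50 lakh"}}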

# Global counter to ensure all generated IDs are unique
chunk_counter = 0


def get_unique_id() -> int:
    """Returns a unique, incrementing ID."""
    global chunk_counter
    chunk_counter += 1
    return chunk_counter


def parse_value_to_int(value_str: str) -> int:
    """Converts a financial string like '₹5 crore' or '₹50 lakh' to an integer."""
    if not isinstance(value_str, str):
        return 0
    value_str = value_str.lower().replace('₹', '').strip()
    if value_str in ["nil", "---", ""]:
        return 0
    try:
        num_part = re.findall(r'[\d.]+', value_str)
        if not num_part:
            return 0
        num = float(num_part[0])
        if 'crore' in value_str:
            return int(num * 1_00_00_000)  # 1 crore = 10,000,000 (Indian digit grouping)
        if 'lakh' in value_str:
            return int(num * 1_00_000)  # 1 lakh = 100,000
        return int(num)
    except (ValueError, IndexError):
        return 0
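
# Illustrative sanity checks for the conversion above (these inputs are
# assumptions, not taken from the source data):
#   parse_value_to_int("₹5 crore")    -> 50_000_000
#   parse_value_to_int("₹50 lakh")    -> 5_000_000
#   parse_value_to_int("Full powers") -> 0   (no digits found)
#   parse_value_to_int("Nil")         -> 0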


def create_chunk(context: Dict, text_override: Optional[str] = None,
                 id_override: Optional[str] = None) -> Dict:
    """Helper function to create a standardized chunk with rich metadata."""
    chunk_id = id_override if id_override else f"chunk-{get_unique_id()}"
    # Determine the primary text for the chunk
    text = text_override
    if not text:
        # Create a sensible default text if none is provided
        text_parts = [context.get("title"), context.get("description")]
        text = ". ".join(filter(None, text_parts)) or str(context)
    metadata = {
        "section": context.get("section"),
        "clause": context.get("clause"),
        "subclause_id": context.get("id"),
        "title": context.get("title"),
        "description": context.get("description"),
        "authority": context.get("authority"),
        "limit_text": context.get("limit_text"),
        "limit_inr": parse_value_to_int(str(context.get("limit_text", ""))),
        "source": context.get("source"),
    }
    return {
        "id": chunk_id,
        "text": text,
        # Drop keys whose value is None so the metadata stays compact
        "metadata": {k: v for k, v in metadata.items() if v is not None},
    }
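
# For reference, a chunk produced by create_chunk looks roughly like this
# (hypothetical values; None-valued keys have been dropped from metadata):
#   {"id": "chunk-12",
#    "text": "Regarding 'Purchase of stores', the delegation for Director is '₹5 crore'.",
#    "metadata": {"section": "4", "title": "Purchase of stores",
#                 "authority": "Director", "limit_text": "₹5 crore",
#                 "limit_inr": 50000000}}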


def _process_authority_power(data: Dict, context: Dict) -> List[Dict]:
    """
    Specifically handles the complex "authority" and "extent_of_power" structures.
    This logic is complex because the data types for these keys vary.
    """
    chunks = []
    title = context.get("title", "this rule")
    # Case 1: Authority and Power are simple strings
    if isinstance(data.get("authority"), str) and isinstance(data.get("extent_of_power"), str):
        text = f"Regarding '{title}', the approving authority is {data['authority']} with '{data['extent_of_power']}'."
        chunk_context = context.copy()
        chunk_context["authority"] = data['authority']
        chunk_context["limit_text"] = data['extent_of_power']
        chunks.append(create_chunk(chunk_context, text_override=text))
    # Case 2: Authority and Power are lists of dictionaries (most complex case)
    elif isinstance(data.get("authority"), list) and isinstance(data.get("extent_of_power"), list):
        # Assuming the lists correspond positionally; zip stops at the shorter one
        for auth_item, power_item in zip(data["authority"], data["extent_of_power"]):
            # Extract descriptions from the dictionaries
            auth_desc = next(iter(auth_item.values())) if isinstance(auth_item, dict) else str(auth_item)
            power_desc = next(iter(power_item.values())) if isinstance(power_item, dict) else str(power_item)
            text = f"For '{title}', the authority for '{auth_desc}' is given '{power_desc}'."
            chunk_context = context.copy()
            chunk_context["authority"] = auth_desc
            chunk_context["limit_text"] = power_desc
            chunks.append(create_chunk(chunk_context, text_override=text))
    # Fallback for any other structure
    else:
        text = f"Regarding '{title}', the authority and power details are as follows: {json.dumps(data)}."
        chunks.append(create_chunk(context, text_override=text))
    return chunks
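
# A hypothetical Case 2 input, to show how the positional pairing works
# (the i-th authority is matched with the i-th extent of power):
#   {"title": "Write-off of losses",
#    "authority": [{"a": "Director"}, {"b": "Registrar"}],
#    "extent_of_power": [{"a": "Full powers"}, {"b": "Up to ₹50 lakh"}]}
# would yield two chunks, e.g. "For 'Write-off of losses', the authority for
# 'Director' is given 'Full powers'."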


def process_chunk(data: Dict, context: Dict) -> List[Dict]:
    """
    Processes a dictionary from the source file and deconstructs it into granular chunks.
    """
    new_chunks = []
    # Update context with current data, giving preference to new keys
    current_context = context.copy()
    current_context.update(data)
    has_nested_chunks = False

    # --- Rule-based deconstruction ---
    # Rule 1: Handle "delegation" structure (most specific)
    if "delegation" in data and isinstance(data["delegation"], dict):
        for authority, limit_text in data["delegation"].items():
            desc = current_context.get('description') or current_context.get('title')
            text = f"Regarding '{desc}', the delegation for {authority} is '{limit_text}'."
            chunk_context = current_context.copy()
            chunk_context["authority"] = authority
            chunk_context["limit_text"] = str(limit_text)
            new_chunks.append(create_chunk(chunk_context, text_override=text))
        return new_chunks

    # Rule 2: Handle "authority" and "extent_of_power" structures
    if "authority" in data and "extent_of_power" in data:
        return _process_authority_power(data, current_context)

    # Rule 3: Recursively process nested lists of dictionaries or strings
    for value in data.values():
        if isinstance(value, list) and value:
            # Sub-rule 3a: List of dictionaries (e.g., subclauses, items)
            if all(isinstance(item, dict) for item in value):
                for item in value:
                    nested_results = process_chunk(item, current_context)
                    if nested_results:
                        new_chunks.extend(nested_results)
                        has_nested_chunks = True
            # Sub-rule 3b: List of simple strings (e.g., items in Annexure A)
            elif all(isinstance(item, str) for item in value):
                title = current_context.get('title')
                for item_text in value:
                    text = f"Regarding '{title}', a relevant item is: {item_text}."
                    new_chunks.append(create_chunk(current_context, text_override=text))
                has_nested_chunks = True

    # --- Finalization ---
    # If we created specific chunks from children, we don't need the generic parent.
    if has_nested_chunks:
        return new_chunks

    # Base case: If no specific rules were matched, create a single chunk for the item.
    # This happens for "leaf" nodes that cannot be deconstructed further.
    new_chunks.append(create_chunk(current_context))
    return new_chunks
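
# Worked example of Rule 1 (hypothetical input): the single object
#   {"title": "Purchase of stores",
#    "delegation": {"Director": "₹5 crore", "Registrar": "₹50 lakh"}}
# deconstructs into two chunks, one per (authority, limit) pair, each carrying
# "authority", "limit_text", and the parsed "limit_inr" in its metadata.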


def main():
    """Main function to read, process, and write."""
    print(f"Starting to process '{INPUT_FILE}'...")
    final_chunks = []
    try:
        with open(INPUT_FILE, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if not line.strip():
                    continue  # Tolerate blank lines in the JSONL
                try:
                    data = json.loads(line)
                    processed = process_chunk(data, {})
                    final_chunks.extend(processed)
                except json.JSONDecodeError:
                    print(f"Warning: Skipping malformed JSON on line {i + 1}")
                    continue
    except FileNotFoundError:
        print(f"Error: Input file '{INPUT_FILE}' not found.")
        return

    print(f"Deconstructed into {len(final_chunks)} granular chunks.")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        for chunk in final_chunks:
            # ensure_ascii=False keeps the '₹' symbol readable in the output
            f.write(json.dumps(chunk, ensure_ascii=False) + '\n')
    print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")


if __name__ == "__main__":
    main()