File size: 12,970 Bytes
7602502
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
"""Processor for transforming standard sets into Pinecone-ready format."""

from __future__ import annotations

import json
from pathlib import Path
from typing import TYPE_CHECKING

from loguru import logger

from tools.config import get_settings
from tools.models import Standard, StandardSet, StandardSetResponse
from tools.pinecone_models import PineconeRecord, ProcessedStandardSet

if TYPE_CHECKING:
    from collections.abc import Mapping

settings = get_settings()


class StandardSetProcessor:
    """Processes standard sets into Pinecone-ready format.

    For each set, the processor first builds hierarchy lookup structures
    (id map, parent->children map, leaf/root sets) from every standard in
    the set, then transforms each standard into a flat PineconeRecord that
    embeds its full ancestor context.
    """

    def __init__(self) -> None:
        """Initialize empty hierarchy maps; populated per processed set."""
        # Map of standard ID -> standard dict (model_dump of Standard).
        self.id_to_standard: dict[str, dict] = {}
        # Map of parent ID (None for roots) -> child IDs sorted by position.
        self.parent_to_children: dict[str | None, list[str]] = {}
        # IDs that no other standard references as its parentId.
        self.leaf_nodes: set[str] = set()
        # IDs whose parentId is None.
        self.root_nodes: set[str] = set()

    def process_standard_set(self, standard_set: StandardSet) -> ProcessedStandardSet:
        """
        Process a standard set into Pinecone-ready records.

        Args:
            standard_set: The StandardSet model from the API

        Returns:
            ProcessedStandardSet with all records ready for Pinecone
        """
        # Build relationship maps up front: each record needs set-wide
        # hierarchy context (ancestors, children, leaf/root status).
        self._build_relationship_maps(standard_set.standards)

        # The dict keys duplicate each standard's own "id" field, so only
        # the values are needed here.
        records = [
            self._transform_standard(standard, standard_set)
            for standard in standard_set.standards.values()
        ]
        return ProcessedStandardSet(records=records)

    def _build_relationship_maps(self, standards: dict[str, Standard]) -> None:
        """
        Build helper data structures from all standards in the set.

        Args:
            standards: Dictionary mapping standard ID to Standard object
        """
        # Convert models to plain dicts once; all helpers operate on dicts.
        standards_dict = {
            std_id: standard.model_dump() for std_id, standard in standards.items()
        }

        self.id_to_standard = self._build_id_to_standard_map(standards_dict)
        self.parent_to_children = self._build_parent_to_children_map(standards_dict)
        self.leaf_nodes = self._identify_leaf_nodes(standards_dict)
        self.root_nodes = self._identify_root_nodes(standards_dict)

    def _build_id_to_standard_map(
        self, standards: dict[str, dict]
    ) -> dict[str, dict]:
        """Build map of id -> standard object (shallow copy of the input)."""
        # dict() performs the same shallow copy as the previous
        # identity comprehension, without the per-item loop.
        return dict(standards)

    def _build_parent_to_children_map(
        self, standards: dict[str, dict]
    ) -> dict[str | None, list[str]]:
        """
        Build map of parentId -> [child_ids], sorted by position ascending.

        Args:
            standards: Dictionary of standard ID to standard dict

        Returns:
            Dictionary mapping parent ID (or None for roots) to sorted list of child IDs
        """
        # Bucket (position, id) pairs per parent, then sort each bucket.
        buckets: dict[str | None, list[tuple[int, str]]] = {}
        for std_id, std in standards.items():
            buckets.setdefault(std.get("parentId"), []).append(
                (std.get("position", 0), std_id)
            )

        # Sort by position only (stable w.r.t. insertion order on ties,
        # matching the original behavior) and keep just the IDs.
        return {
            parent_id: [std_id for _, std_id in sorted(entries, key=lambda e: e[0])]
            for parent_id, entries in buckets.items()
        }

    def _identify_leaf_nodes(self, standards: dict[str, dict]) -> set[str]:
        """
        Identify leaf nodes: standards whose ID does NOT appear as any standard's parentId.

        Args:
            standards: Dictionary of standard ID to standard dict

        Returns:
            Set of standard IDs that are leaf nodes
        """
        all_ids = set(standards.keys())
        parent_ids = {
            std.get("parentId")
            for std in standards.values()
            if std.get("parentId") is not None
        }
        # Leaf nodes are IDs that nobody lists as their parent.
        return all_ids - parent_ids

    def _identify_root_nodes(self, standards: dict[str, dict]) -> set[str]:
        """
        Identify root nodes: standards where parentId is null.

        Args:
            standards: Dictionary of standard ID to standard dict

        Returns:
            Set of standard IDs that are root nodes
        """
        return {
            std_id
            for std_id, std in standards.items()
            if std.get("parentId") is None
        }

    def find_root_id(self, standard: dict, id_to_standard: dict[str, dict]) -> str:
        """
        Walk up the parent chain to find the root ancestor.

        Args:
            standard: The standard dict to find root for
            id_to_standard: Map of ID to standard dict

        Returns:
            The root ancestor's ID (or the highest reachable ancestor if the
            chain is broken or circular)
        """
        current = standard
        visited: set[str] = set()  # Prevent infinite loops from bad data

        while current.get("parentId") is not None:
            parent_id = current["parentId"]
            if parent_id in visited:
                break  # Circular reference protection
            visited.add(parent_id)

            if parent_id not in id_to_standard:
                break  # Parent not found, use current as root
            current = id_to_standard[parent_id]

        return current["id"]

    def build_ordered_ancestors(
        self, standard: dict, id_to_standard: dict[str, dict]
    ) -> list[str]:
        """
        Build ancestor list ordered from root (index 0) to immediate parent (last index).

        Args:
            standard: The standard dict to build ancestors for
            id_to_standard: Map of ID to standard dict

        Returns:
            List of ancestor IDs ordered root -> immediate parent
        """
        ancestors: list[str] = []
        current_id = standard.get("parentId")
        visited: set[str] = set()  # Circular reference protection

        # Collect immediate parent first, then its parent, and so on.
        while current_id is not None and current_id not in visited:
            visited.add(current_id)
            if current_id not in id_to_standard:
                break  # Dangling reference: stop at last known ancestor
            ancestors.append(current_id)
            current_id = id_to_standard[current_id].get("parentId")

        ancestors.reverse()  # Now ordered root -> immediate parent
        return ancestors

    def _compute_sibling_count(self, standard: dict) -> int:
        """
        Count standards with same parent_id, excluding self.

        Args:
            standard: The standard dict

        Returns:
            Number of siblings (excluding self)
        """
        siblings = self.parent_to_children.get(standard.get("parentId"), [])
        # Exclude self from the count; sum avoids building a throwaway list.
        return sum(1 for sibling_id in siblings if sibling_id != standard["id"])

    @staticmethod
    def _format_hierarchy_line(std: dict) -> str:
        """Render one 'Depth N (notation): description' line for a standard.

        The '(notation)' segment is omitted when statementNotation is
        missing or falsy (e.g. empty string).
        """
        depth = std.get("depth", 0)
        description = std.get("description", "")
        notation = std.get("statementNotation")

        if notation:
            return f"Depth {depth} ({notation}): {description}"
        return f"Depth {depth}: {description}"

    def _build_content_text(self, standard: dict) -> str:
        """
        Generate content text block with full hierarchy.

        Format: "Depth N (notation): description" for each ancestor and self.

        Args:
            standard: The standard dict

        Returns:
            Multi-line text block with full hierarchy, root first
        """
        ancestor_ids = self.build_ordered_ancestors(standard, self.id_to_standard)

        # One line per ancestor (root first), then the standard itself.
        chain = [self.id_to_standard[ancestor_id] for ancestor_id in ancestor_ids]
        chain.append(standard)
        return "\n".join(self._format_hierarchy_line(std) for std in chain)

    def _transform_standard(
        self, standard: Standard, standard_set: StandardSet
    ) -> PineconeRecord:
        """
        Transform a single standard into a PineconeRecord.

        Args:
            standard: The Standard object to transform
            standard_set: The parent StandardSet containing context

        Returns:
            PineconeRecord ready for Pinecone upsert
        """
        std_dict = standard.model_dump()

        # Compute hierarchy relationships
        is_root = std_dict.get("parentId") is None
        root_id = (
            std_dict["id"] if is_root else self.find_root_id(std_dict, self.id_to_standard)
        )
        ancestor_ids = self.build_ordered_ancestors(std_dict, self.id_to_standard)
        child_ids = self.parent_to_children.get(std_dict["id"], [])
        is_leaf = std_dict["id"] in self.leaf_nodes
        sibling_count = self._compute_sibling_count(std_dict)

        # Build content text embedding the full ancestor chain
        content = self._build_content_text(std_dict)

        # Extract standard set context
        parent_id = std_dict.get("parentId")  # Keep as None if null

        # Build record with all fields
        # Note: Use "id" not "_id" - Pydantic handles serialization alias automatically
        record_data = {
            "id": std_dict["id"],
            "content": content,
            "standard_set_id": standard_set.id,
            "standard_set_title": standard_set.title,
            "subject": standard_set.subject,
            "normalized_subject": standard_set.normalizedSubject,  # Optional, can be None
            "education_levels": standard_set.educationLevels,
            "document_id": standard_set.document.id,
            "document_valid": standard_set.document.valid,
            "publication_status": standard_set.document.publicationStatus,  # Optional, can be None
            "jurisdiction_id": standard_set.jurisdiction.id,
            "jurisdiction_title": standard_set.jurisdiction.title,
            "depth": std_dict.get("depth", 0),
            "is_leaf": is_leaf,
            "is_root": is_root,
            "parent_id": parent_id,
            "root_id": root_id,
            "ancestor_ids": ancestor_ids,
            "child_ids": child_ids,
            "sibling_count": sibling_count,
        }

        # Add optional fields only if present (falsy values are omitted)
        if std_dict.get("asnIdentifier"):
            record_data["asn_identifier"] = std_dict["asnIdentifier"]
        if std_dict.get("statementNotation"):
            record_data["statement_notation"] = std_dict["statementNotation"]
        if std_dict.get("statementLabel"):
            record_data["statement_label"] = std_dict["statementLabel"]

        return PineconeRecord(**record_data)


def process_and_save(standard_set_id: str) -> Path:
    """
    Load data.json, process it, and save processed.json.

    Args:
        standard_set_id: The ID of the standard set to process

    Returns:
        Path to the saved processed.json file

    Raises:
        FileNotFoundError: If data.json doesn't exist
        ValueError: If JSON is invalid
    """
    set_dir = settings.standard_sets_dir / standard_set_id

    # Locate the raw download; warn and raise if it was never fetched.
    data_file = set_dir / "data.json"
    if not data_file.exists():
        logger.warning(f"data.json not found for set {standard_set_id}, skipping")
        raise FileNotFoundError(f"data.json not found for set {standard_set_id}")

    # Load and parse JSON
    try:
        raw_data = json.loads(data_file.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON in {data_file}: {e}") from e

    # Validate against the Pydantic response model
    try:
        standard_set = StandardSetResponse(**raw_data).data
    except Exception as e:
        raise ValueError(f"Failed to parse standard set data: {e}") from e

    # Transform into Pinecone-ready records
    processed_set = StandardSetProcessor().process_standard_set(standard_set)

    # Persist processed.json next to the source data
    processed_file = set_dir / "processed.json"
    processed_file.parent.mkdir(parents=True, exist_ok=True)
    processed_file.write_text(
        json.dumps(processed_set.model_dump(mode="json"), indent=2),
        encoding="utf-8",
    )

    logger.info(
        f"Processed {standard_set_id}: {len(processed_set.records)} records"
    )

    return processed_file