File size: 2,807 Bytes
7602502
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3fac7d8
 
7602502
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""Pydantic models for Pinecone-processed standard records."""

from __future__ import annotations

from typing import Any

from pydantic import BaseModel, ConfigDict, Field, field_validator


class PineconeRecord(BaseModel):
    """A single standard record ready for Pinecone upsert.

    The record carries the text to embed (``content``), the context of the
    standard set it belongs to, and its position in the standards hierarchy.
    The identifier is exposed as ``id`` in Python and as ``_id`` (its alias)
    in input/serialized data.
    """

    model_config = ConfigDict(
        json_encoders={
            # Ensure parent_id null is serialized as null, not omitted.
            # NOTE(review): this encoder maps None to None, so it looks like a
            # no-op; `json_encoders` is also a deprecated carry-over option in
            # Pydantic v2 -- confirm whether it can be removed.
            type(None): lambda v: None,
        },
        # Allow populating aliased fields by their Python name as well as by
        # alias (i.e. both ``id=...`` and ``_id=...`` are accepted).
        populate_by_name=True,
    )

    # Core identifier - use alias to serialize as _id
    id: str = Field(alias="_id", serialization_alias="_id")

    # Content for embedding
    content: str

    # Standard Set Context
    standard_set_id: str
    standard_set_title: str
    subject: str
    normalized_subject: str | None = None
    education_levels: list[str]  # normalized by process_education_levels
    document_id: str | None = None
    document_valid: str | None = None
    publication_status: str | None = None
    jurisdiction_id: str
    jurisdiction_title: str

    # Standard Identity & Position
    asn_identifier: str | None = None
    statement_notation: str | None = None
    statement_label: str | None = None
    depth: int
    is_leaf: bool
    is_root: bool

    # Hierarchy Relationships
    parent_id: str | None = None  # null for root nodes
    root_id: str
    ancestor_ids: list[str]
    child_ids: list[str]
    sibling_count: int

    @field_validator("education_levels", mode="before")
    @classmethod
    def process_education_levels(cls, v: Any) -> list[str]:
        """
        Normalize education_levels: split comma-separated strings, flatten, dedupe.

        Handles cases where source data has comma-separated values within array
        elements (e.g., ["01,02"] instead of ["01", "02"]), and also a bare
        comma-separated string (e.g., "01,02") supplied in place of a list.

        Args:
            v: Raw input value. Expected to be a list of strings (each possibly
                comma-separated) or a single such string.

        Returns:
            Flattened list of grade level strings with surrounding whitespace
            stripped, empty pieces removed, and duplicates dropped while
            preserving first-seen order. Non-string list items are ignored;
            any input that is neither a list nor a string yields an empty list.
        """
        if isinstance(v, str):
            # A bare "01,02" string would otherwise be discarded as a non-list,
            # silently losing the grade levels it carries.
            v = [v]
        elif not isinstance(v, list):
            return []

        # A dict keeps insertion order, so it dedupes and preserves the
        # first-seen ordering in a single pass.
        ordered: dict[str, None] = {}
        for item in v:
            if not isinstance(item, str):
                continue  # non-string entries are ignored
            for part in item.split(","):
                token = part.strip()
                if token:
                    ordered.setdefault(token, None)

        return list(ordered)


class ProcessedStandardSet(BaseModel):
    """Wrapper model holding the Pinecone-ready records of one processed standard set."""

    # Every entry is validated as a PineconeRecord.
    records: list[PineconeRecord]