File size: 2,602 Bytes
d23adb1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env python3
"""
Load PDF/DOCX legal documents into the database with full text + sections.
"""

from __future__ import annotations

import argparse
import json
import os
import sys
from pathlib import Path
from typing import Any, Dict, List

# parents[2] of the resolved script path: the grandparent of the directory
# that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
BACKEND_DIR = PROJECT_ROOT / "backend"
# Only add BACKEND_DIR to sys.path (not hue_portal subdirectory)
# Django needs to find hue_portal package (which is in backend/hue_portal)
if str(BACKEND_DIR) not in sys.path:
    # Inserted at index 0 so BACKEND_DIR is searched before other entries.
    sys.path.insert(0, str(BACKEND_DIR))

# setdefault keeps a DJANGO_SETTINGS_MODULE that is already set in the
# environment and only falls back to this value when none is present.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "hue_portal.hue_portal.settings")

# Imported only after sys.path and the settings variable are prepared above.
# django.setup() runs at module import time, i.e. it is a module-level side
# effect of loading this script.
import django
django.setup()

from django.core.management import call_command  # noqa: E402


def parse_manifest(path: Path) -> List[Dict[str, Any]]:
    """Read the JSON manifest at *path* and return its top-level list.

    The file is decoded as UTF-8 and parsed with ``json.loads``. Individual
    entries are returned as parsed; they are not validated here.

    Raises:
        ValueError: If the top-level JSON value is not a list. Malformed
            JSON also surfaces as ``json.JSONDecodeError``, which is a
            ``ValueError`` subclass.
    """
    raw_text = path.read_text(encoding="utf-8")
    entries = json.loads(raw_text)
    if isinstance(entries, list):
        return entries
    raise ValueError("Manifest must be a list of document entries.")


def ingest_document(root: Path, entry: Dict[str, Any], dry_run: bool = False) -> None:
    """Load one manifest entry via the ``load_legal_document`` command.

    Args:
        root: Directory that ``entry["source_file"]`` is joined onto.
        entry: Manifest entry. ``source_file`` and ``code`` are required;
            ``title``, ``doc_type``, ``summary``, ``issued_by``,
            ``issued_at``, ``source_url`` and ``metadata`` are optional.
        dry_run: When true, only verify the source file and print what would
            be ingested; the management command is not called.

    Raises:
        KeyError: If ``source_file`` or ``code`` is missing from *entry*.
        FileNotFoundError: If the source path does not exist or is not a
            regular file.
    """
    source_file = root / entry["source_file"]
    # is_file() instead of exists(): a directory at this path is not a
    # loadable document and must be rejected just like a missing file.
    if not source_file.is_file():
        raise FileNotFoundError(source_file)

    if dry_run:
        print(f"▶ (dry-run) Would ingest {entry['code']} from {source_file}")
        return

    args = {
        "file": str(source_file),
        "code": entry["code"],
        "title": entry.get("title"),
        "doc_type": entry.get("doc_type", "other"),
        "summary": entry.get("summary", ""),
        "issued_by": entry.get("issued_by", ""),
        "issued_at": entry.get("issued_at"),
        "source_url": entry.get("source_url", ""),
        # The command receives metadata as a JSON string, not as a dict.
        "metadata": json.dumps(entry.get("metadata", {})),
    }
    print(f"▶ Loading {entry['code']} from {source_file}")
    call_command("load_legal_document", **args)


def main(argv: List[str] | None = None) -> None:
    """Parse command-line options and ingest every entry of the manifest.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour, so
            existing ``main()`` callers are unaffected.

    Entries are processed in manifest order. An exception raised while
    reading the manifest or ingesting an entry propagates to the caller and
    stops the run.
    """
    parser = argparse.ArgumentParser(description="Load legal documents into DB.")
    parser.add_argument(
        "--manifest",
        type=Path,
        default=Path(__file__).with_name("legal_documents_manifest.json"),
        help="Path to JSON manifest describing documents.",
    )
    parser.add_argument(
        "--root",
        type=Path,
        default=PROJECT_ROOT,
        help="Root directory for relative source_file paths.",
    )
    # Dry-run only verifies that each source file is present; nothing is
    # parsed or written, so the help text says exactly that.
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Check that source files exist without loading them into the DB.",
    )
    args = parser.parse_args(argv)

    for entry in parse_manifest(args.manifest):
        ingest_document(args.root, entry, dry_run=args.dry_run)


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()