Upload backend/core/services/legal_ingestion.py with huggingface_hub
backend/core/services/legal_ingestion.py
ADDED
@@ -0,0 +1,276 @@
"""
Utilities to ingest uploaded legal documents into persistent storage.
"""

from __future__ import annotations

import hashlib
from dataclasses import dataclass
from datetime import datetime, date
from io import BytesIO
from typing import BinaryIO, Dict, Optional
from pathlib import Path
import re

from django.conf import settings
from django.core.files.base import ContentFile
from django.db import transaction
from django.utils import timezone

from hue_portal.core.models import (
    LegalDocument,
    LegalSection,
    LegalDocumentImage,
    IngestionJob,
)
from hue_portal.core.etl.legal_document_loader import load_legal_document
from hue_portal.core.tasks import process_ingestion_job


@dataclass
class LegalIngestionResult:
    document: LegalDocument
    created: bool
    sections_count: int
    images_count: int


def _parse_date(value: Optional[str | date]) -> Optional[date]:
    if isinstance(value, date):
        return value
    if not value:
        return None
    for fmt in ("%Y-%m-%d", "%d/%m/%Y"):
        try:
            return datetime.strptime(value, fmt).date()
        except ValueError:
            continue
    return None
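# For reference: _parse_date accepts ISO-style "2024-03-05" or day-first "05/03/2024"
# strings, passes date objects through unchanged, and returns None for empty values
# or unrecognised formats.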


def _sha256(data: bytes) -> str:
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()


def _normalize_text(text: str) -> str:
    cleaned = re.sub(r"\s+", "", text or "")
    return cleaned.lower()


DOC_TYPE_KEYWORDS = {
    "decision": ["quyết định"],
    "circular": ["thông tư"],
    "guideline": ["hướng dẫn"],
    "plan": ["kế hoạch"],
}


def _auto_fill_metadata(
    *, text: str, title: str, issued_by: str, issued_at: Optional[date], doc_type: str
) -> tuple[str, str, Optional[date], str]:
    head = (text or "")[:2000]
    if not issued_by:
        match = re.search(r"(BỘ\s+[A-ZÂĂÊÔƠƯ\s]+|ỦY BAN\s+NHÂN DÂN\s+[^\n]+)", head, re.IGNORECASE)
        if match:
            issued_by = match.group(0).strip()

    if not issued_at:
        match = re.search(
            r"(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})", head,
        )
        if match:
            day, month, year = match.groups()
            issued_at = _parse_date(f"{year}-{int(month):02d}-{int(day):02d}")
        else:
            match = re.search(
                r"ngày\s+(\d{1,2})\s+tháng\s+(\d{1,2})\s+năm\s+(\d{4})",
                head,
                re.IGNORECASE,
            )
            if match:
                day, month, year = match.groups()
                issued_at = _parse_date(f"{year}-{int(month):02d}-{int(day):02d}")

    if doc_type == "other":
        lower = head.lower()
        for dtype, keywords in DOC_TYPE_KEYWORDS.items():
            if any(keyword in lower for keyword in keywords):
                doc_type = dtype
                break

    if not title or title == (DOC_TYPE_KEYWORDS.get(doc_type, [title])[0] if doc_type != "other" else ""):
        match = re.search(r"(QUYẾT ĐỊNH|THÔNG TƯ|HƯỚNG DẪN|KẾ HOẠCH)[^\n]+", head, re.IGNORECASE)
        if match:
            title = match.group(0).strip().title()

    return title, issued_by, issued_at, doc_type
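# Illustrative behaviour of the heuristics above, on a hypothetical first page such as
# "ỦY BAN NHÂN DÂN THÀNH PHỐ HUẾ ... QUYẾT ĐỊNH ... ngày 05 tháng 3 năm 2024":
# issued_by is taken from the "ỦY BAN NHÂN DÂN ..." line, issued_at becomes
# date(2024, 3, 5) via the "ngày ... tháng ... năm ..." pattern, and doc_type maps to
# "decision" because "quyết định" appears in the head of the text.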


def ingest_uploaded_document(
    *,
    file_obj: BinaryIO,
    filename: str,
    metadata: Dict,
) -> LegalIngestionResult:
    """
    Ingest an uploaded PDF/DOCX file, storing the raw file, its sections, and extracted images.

    Args:
        file_obj: Binary file-like object positioned at the start.
        filename: Original filename.
        metadata: Dict containing code, title, doc_type, summary, issued_by, issued_at,
            source_url, extra_metadata.
    """
    code = metadata.get("code", "").strip()
    if not code:
        raise ValueError("Document code is required.")

    title = metadata.get("title") or code
    doc_type = metadata.get("doc_type", "other")
    issued_at = _parse_date(metadata.get("issued_at"))
    summary = metadata.get("summary", "")
    issued_by = metadata.get("issued_by", "")
    source_url = metadata.get("source_url", "")
    extra_metadata = metadata.get("metadata") or {}

    file_bytes = file_obj.read()
    if hasattr(file_obj, "seek"):
        file_obj.seek(0)
    checksum = _sha256(file_bytes)
    mime_type = metadata.get("mime_type") or getattr(file_obj, "content_type", "")
    size = len(file_bytes)

    extracted = load_legal_document(BytesIO(file_bytes), filename=filename)
    title, issued_by, issued_at, doc_type = _auto_fill_metadata(
        text=extracted.text, title=title, issued_by=issued_by, issued_at=issued_at, doc_type=doc_type
    )
    normalized_text = _normalize_text(extracted.text)
    content_checksum = _sha256(normalized_text.encode("utf-8"))

    # Reject uploads whose normalized content matches another document
    # ("Nội dung trùng với văn bản hiện có" = "content duplicates an existing document").
    duplicate = (
        LegalDocument.objects.filter(content_checksum=content_checksum)
        .exclude(code=code)
        .first()
    )
    if duplicate:
        raise ValueError(f"Nội dung trùng với văn bản hiện có: {duplicate.code}")

    with transaction.atomic():
        doc, created = LegalDocument.objects.get_or_create(
            code=code,
            defaults={
                "title": title,
                "doc_type": doc_type,
                "summary": summary,
                "issued_by": issued_by,
                "issued_at": issued_at,
                "source_url": source_url,
                "metadata": extra_metadata,
            },
        )

        # Update metadata if the document already existed (keep the latest info)
        doc.title = title
        doc.doc_type = doc_type
        doc.summary = summary
        doc.issued_by = issued_by
        doc.issued_at = issued_at
        doc.source_url = source_url
        doc.metadata = extra_metadata
        doc.page_count = extracted.page_count
        doc.raw_text = extracted.text
        doc.raw_text_ocr = extracted.ocr_text or ""
        doc.file_checksum = checksum
        doc.content_checksum = content_checksum
        doc.file_size = size
        doc.mime_type = mime_type
        doc.original_filename = filename
        doc.updated_at = timezone.now()

        # Save the binary file
        content = ContentFile(file_bytes)
        storage_name = f"{code}/{filename}"
        doc.uploaded_file.save(storage_name, content, save=False)
        doc.source_file = doc.uploaded_file.name
        doc.save()

        # Replace sections
        doc.sections.all().delete()
        sections = []
        for idx, section in enumerate(extracted.sections, start=1):
            sections.append(
                LegalSection(
                    document=doc,
                    section_code=section.code,
                    section_title=section.title,
                    level=section.level,
                    order=idx,
                    content=section.content,
                    excerpt=section.content[:400],
                    page_start=section.page_start,
                    page_end=section.page_end,
                    is_ocr=section.is_ocr,
                    metadata=section.metadata or {},
                )
            )
        LegalSection.objects.bulk_create(sections, batch_size=200)

        # Replace images
        doc.images.all().delete()
        images = []
        for idx, image in enumerate(extracted.images, start=1):
            image_content = ContentFile(image.data)
            image_name = f"{code}/img_{idx}.{image.extension}"
            img_instance = LegalDocumentImage(
                document=doc,
                page_number=image.page_number,
                description=image.description,
                width=image.width,
                height=image.height,
                checksum=_sha256(image.data),
            )
            img_instance.image.save(image_name, image_content, save=False)
            images.append(img_instance)
        LegalDocumentImage.objects.bulk_create(images, batch_size=100)

    return LegalIngestionResult(
        document=doc,
        created=created,
        sections_count=len(sections),
        images_count=len(images),
    )


def enqueue_ingestion_job(*, file_obj, filename: str, metadata: Dict) -> IngestionJob:
    """
    Persist the uploaded file to a temporary job folder and enqueue Celery processing.
    """

    job = IngestionJob.objects.create(
        code=metadata.get("code", ""),
        filename=filename,
        metadata=metadata,
        status=IngestionJob.STATUS_PENDING,
    )

    temp_dir = Path(settings.MEDIA_ROOT) / "ingestion_jobs" / str(job.id)
    temp_dir.mkdir(parents=True, exist_ok=True)
    temp_path = temp_dir / filename

    if hasattr(file_obj, "seek"):
        file_obj.seek(0)
    if hasattr(file_obj, "chunks"):
        with temp_path.open("wb") as dest:
            for chunk in file_obj.chunks():
                dest.write(chunk)
    else:
        data = file_obj.read()
        with temp_path.open("wb") as dest:
            dest.write(data)

    job.storage_path = str(temp_path)
    job.save(update_fields=["storage_path"])
    process_ingestion_job.delay(str(job.id))
    return job
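A minimal usage sketch for the two entry points above, assuming a Django UploadedFile and an import path inferred from the hue_portal package layout; the handle_upload wrapper and the metadata values are illustrative, not part of the uploaded module:

# Hypothetical caller; only ingest_uploaded_document / enqueue_ingestion_job come from
# this module, and the import path is assumed from the hue_portal package layout.
from hue_portal.core.services.legal_ingestion import (
    enqueue_ingestion_job,
    ingest_uploaded_document,
)


def handle_upload(uploaded_file, *, asynchronous: bool = False) -> dict:
    metadata = {
        "code": "123/QD-UBND",      # required; ValueError is raised if missing
        "doc_type": "other",        # auto-detected from Vietnamese keywords when left as "other"
        "issued_at": "05/03/2024",  # accepts "%Y-%m-%d" or "%d/%m/%Y"
    }
    if asynchronous:
        # Copies the file to MEDIA_ROOT/ingestion_jobs/<job_id>/ and hands off to Celery.
        job = enqueue_ingestion_job(
            file_obj=uploaded_file, filename=uploaded_file.name, metadata=metadata
        )
        return {"job_id": str(job.id), "status": job.status}
    # Synchronous path: extracts text, deduplicates by content checksum, and persists
    # the document plus its sections and images in one transaction.
    result = ingest_uploaded_document(
        file_obj=uploaded_file, filename=uploaded_file.name, metadata=metadata
    )
    return {
        "code": result.document.code,
        "created": result.created,
        "sections": result.sections_count,
        "images": result.images_count,
    }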