davidtran999 committed on
Commit
be32d76
·
verified ·
1 Parent(s): 45dea2f

Upload backend/core/tests/test_legal_ingestion.py with huggingface_hub

Browse files
backend/core/tests/test_legal_ingestion.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import tempfile
4
+ from io import BytesIO
5
+
6
+ from django.test import TestCase, override_settings
7
+ from django.core.files.uploadedfile import SimpleUploadedFile
8
+ from PIL import Image as PILImage
9
+ from docx import Document
10
+
11
+ from hue_portal.core.services import ingest_uploaded_document, enqueue_ingestion_job
12
+ from hue_portal.core.models import LegalDocument, IngestionJob
13
+
14
+
15
class LegalIngestionServiceTests(TestCase):
    """Tests for the legal-document ingestion services using generated .docx input.

    Each test runs with ``MEDIA_ROOT`` overridden to a freshly created
    temporary directory, which is removed again once the test finishes.
    """

    def setUp(self):
        """Create a scratch directory and point ``MEDIA_ROOT`` at it."""
        self.media_dir = tempfile.mkdtemp(prefix="legal-media-")
        self.override = override_settings(MEDIA_ROOT=self.media_dir)
        self.override.enable()

    def tearDown(self):
        """Undo the settings override, then delete the scratch directory."""
        self.override.disable()
        # Best-effort cleanup: a failure to delete must not fail the test.
        shutil.rmtree(self.media_dir, ignore_errors=True)

    @staticmethod
    def _to_bytes(doc) -> bytes:
        """Save ``doc`` into an in-memory buffer and return the raw bytes."""
        sink = BytesIO()
        doc.save(sink)
        return sink.getvalue()

    def _make_docx_with_image(self) -> bytes:
        """Build a .docx with two paragraphs and one embedded 32x32 red PNG."""
        doc = Document()
        for text in ("Điều 1. Quy định chung", "Nội dung điều 1 được ghi rõ ràng."):
            doc.add_paragraph(text)

        # The picture is added from a path, so the PNG is written to a
        # temporary file that is removed as soon as it has been embedded.
        handle, png_path = tempfile.mkstemp(suffix=".png")
        os.close(handle)
        try:
            PILImage.new("RGB", (32, 32), color="red").save(png_path)
            doc.add_picture(png_path)
        finally:
            os.remove(png_path)

        return self._to_bytes(doc)

    def _make_docx_with_header(self, header: str, body: str) -> bytes:
        """Build a .docx whose first paragraph is ``header``, then one paragraph per ``body`` line."""
        doc = Document()
        for text in [header, *body.split("\n")]:
            doc.add_paragraph(text)
        return self._to_bytes(doc)

    def test_ingest_docx_extracts_sections_and_images(self):
        """Ingesting a .docx with one picture yields sections, one image and no OCR text."""
        payload = self._make_docx_with_image()
        doc_metadata = {
            "code": "TEST-DOC-1",
            "title": "Tài liệu thử nghiệm",
            "doc_type": "circular",
            "summary": "Tài liệu test",
            "issued_by": "Test Unit",
            "issued_at": "2025-11-18",
            "source_url": "",
            "metadata": {"tags": ["demo"]},
        }

        outcome = ingest_uploaded_document(
            file_obj=BytesIO(payload),
            filename="test.docx",
            metadata=doc_metadata,
        )

        # Checks on the object returned by the service.
        self.assertGreaterEqual(outcome.sections_count, 1)
        self.assertEqual(outcome.images_count, 1)
        self.assertTrue(outcome.document.raw_text.startswith("Điều 1"))
        self.assertTrue(outcome.document.file_checksum)
        self.assertEqual(outcome.document.raw_text_ocr, "")
        self.assertTrue(outcome.document.uploaded_file.name)
        self.assertTrue(outcome.document.images.exists())

        # Checks on what was actually persisted.
        persisted = LegalDocument.objects.get(code="TEST-DOC-1")
        self.assertGreaterEqual(persisted.sections.count(), 1)
        self.assertEqual(persisted.sections.filter(is_ocr=True).count(), 0)

    def test_enqueue_ingestion_job_runs_when_eager(self):
        """An enqueued job is expected to be completed and linked to a document after refresh."""
        docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        upload = SimpleUploadedFile(
            "test.docx",
            self._make_docx_with_image(),
            content_type=docx_mime,
        )
        job_metadata = {
            "code": "TEST-DOC-QUEUE",
            "title": "Hàng đợi",
            "doc_type": "decision",
        }

        job = enqueue_ingestion_job(
            file_obj=upload,
            filename=upload.name,
            metadata=job_metadata,
        )
        # Reload so the assertions see the state stored in the database.
        job.refresh_from_db()

        self.assertEqual(job.status, IngestionJob.STATUS_COMPLETED)
        self.assertIsNotNone(job.document)
        self.assertEqual(job.stats.get("sections"), job.document.sections.count())

    def test_auto_metadata_and_deduplication(self):
        """Blank metadata is filled from the header; re-ingesting the same bytes raises ValueError."""
        payload = self._make_docx_with_header(
            "QUYẾT ĐỊNH CỦA BỘ CÔNG AN\nNgày 01/02/2024",
            "Nội dung quyết định ...",
        )
        first_metadata = {
            "code": "AUTO-META",
            "title": "",
            "doc_type": "other",
            "issued_by": "",
            "issued_at": "",
        }

        outcome = ingest_uploaded_document(
            file_obj=BytesIO(payload),
            filename="auto.docx",
            metadata=first_metadata,
        )

        persisted = LegalDocument.objects.get(code="AUTO-META")
        self.assertEqual(persisted.doc_type, "decision")
        self.assertIsNotNone(persisted.issued_at)
        self.assertIn("Bộ Công An", persisted.issued_by.title())
        self.assertTrue(outcome.document.content_checksum)

        # Same bytes under a different code and filename: expected to be rejected.
        duplicate_metadata = {
            "code": "AUTO-META-2",
            "title": "",
            "doc_type": "other",
        }
        with self.assertRaises(ValueError):
            ingest_uploaded_document(
                file_obj=BytesIO(payload),
                filename="auto-copy.docx",
                metadata=duplicate_metadata,
            )
131
+