File size: 19,731 Bytes
765d69d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
"""
RAG (Retrieval-Augmented Generation) pipeline for answer generation.
"""
import re
import unicodedata
from typing import List, Dict, Any, Optional
from .hybrid_search import hybrid_search
from .models import Procedure, Fine, Office, Advisory, LegalSection
from hue_portal.chatbot.chatbot import format_fine_amount
from hue_portal.chatbot.llm_integration import get_llm_generator
from hue_portal.chatbot.structured_legal import format_structured_legal_answer


def retrieve_top_k_documents(
    query: str,
    content_type: str,
    top_k: int = 5
) -> List[Any]:
    """
    Fetch the best-matching records for a query through hybrid search.

    Args:
        query: Search query.
        content_type: One of 'procedure', 'fine', 'office', 'advisory' or
            'legal'. Any other value yields an empty list.
        top_k: Number of documents to retrieve.

    Returns:
        List of document objects. Empty when the content type is not
        supported or when the search itself raises.
    """
    # Each entry pairs a lazy queryset factory with the text fields that are
    # handed to hybrid_search for the exact match boost.
    sources = {
        'procedure': (
            lambda: Procedure.objects.all(),
            ['title', 'domain', 'conditions', 'dossier'],
        ),
        'fine': (
            lambda: Fine.objects.all(),
            ['name', 'code', 'article', 'decree', 'remedial'],
        ),
        'office': (
            lambda: Office.objects.all(),
            ['unit_name', 'address', 'district', 'service_scope'],
        ),
        'advisory': (
            lambda: Advisory.objects.all(),
            ['title', 'summary'],
        ),
        'legal': (
            lambda: LegalSection.objects.select_related("document").all(),
            ['section_title', 'section_code', 'content'],
        ),
    }

    source = sources.get(content_type)
    if source is None:
        return []

    make_queryset, text_fields = source
    queryset = make_queryset()

    # Search failures are reported and turned into an empty result.
    try:
        from .config.hybrid_search_config import get_config

        settings = get_config(content_type)
        return hybrid_search(
            queryset,
            query,
            top_k=top_k,
            bm25_weight=settings.bm25_weight,
            vector_weight=settings.vector_weight,
            min_hybrid_score=settings.min_hybrid_score,
            text_fields=text_fields
        )
    except Exception as e:
        print(f"Error in retrieval: {e}")
        return []


def generate_answer_template(
    query: str,
    documents: List[Any],
    content_type: str,
    context: Optional[List[Dict[str, Any]]] = None,
    use_llm: bool = True
) -> str:
    """
    Generate answer using LLM (if available) or template-based summarization.

    The LLM is tried first when ``use_llm`` is true or when ``content_type``
    is 'general'. If it is unavailable, fails, or returns nothing, the answer
    comes from a fixed "nothing found" message (no documents) or from the
    per-content-type template (documents present).

    Args:
        query: Original query.
        documents: Retrieved documents.
        content_type: Type of content.
        context: Optional conversation context.
        use_llm: Whether to try LLM generation first.

    Returns:
        Generated answer text.
    """
    def _invoke_llm(documents_for_prompt: List[Any]) -> Optional[str]:
        """Call the configured LLM provider; return None on any failure."""
        try:
            # Imported inside the try so that an import failure degrades to
            # the template path like any other LLM error.
            from hue_portal.chatbot.llm_integration import get_llm_generator

            llm = get_llm_generator()
            if not llm:
                print("[RAG] ⚠️ LLM not available, using template", flush=True)
                return None

            print(f"[RAG] Using LLM provider: {llm.provider}", flush=True)
            llm_answer = llm.generate_answer(
                query,
                context=context,
                documents=documents_for_prompt
            )
            if llm_answer:
                print(f"[RAG] ✅ LLM answer generated (length: {len(llm_answer)})", flush=True)
                return llm_answer

            print("[RAG] ⚠️ LLM returned None, using template", flush=True)
        except Exception as exc:
            import traceback

            error_trace = traceback.format_exc()
            print(f"[RAG] ❌ LLM generation failed, using template: {exc}", flush=True)
            print(f"[RAG] ❌ Trace: {error_trace}", flush=True)
        return None

    # General conversation always goes through the LLM, even if the caller
    # disabled it for data lookups.
    llm_enabled = use_llm or content_type == 'general'
    if llm_enabled:
        llm_documents = documents if documents else []
        llm_answer = _invoke_llm(llm_documents)
        if llm_answer:
            return llm_answer

    # If no documents, fall back gracefully
    if not documents:
        if content_type == 'general':
            return (
                f"Tôi chưa có dữ liệu pháp luật liên quan đến '{query}', "
                "nhưng vẫn sẵn sàng trò chuyện hoặc hỗ trợ bạn ở chủ đề khác. "
                "Bạn có thể mô tả cụ thể hơn để tôi giúp tốt hơn nhé!"
            )
        return (
            f"Xin lỗi, tôi không tìm thấy thông tin liên quan đến '{query}' trong cơ sở dữ liệu. "
            "Vui lòng thử lại với từ khóa khác hoặc liên hệ trực tiếp với Công an thành phố Huế để được tư vấn."
        )

    # Fallback to template-based generation
    if content_type == 'procedure':
        return _generate_procedure_answer(query, documents)
    elif content_type == 'fine':
        return _generate_fine_answer(query, documents)
    elif content_type == 'office':
        return _generate_office_answer(query, documents)
    elif content_type == 'advisory':
        return _generate_advisory_answer(query, documents)
    elif content_type == 'legal':
        return _generate_legal_answer(query, documents)
    else:
        return _generate_general_answer(query, documents)


def _generate_procedure_answer(query: str, documents: List[Procedure]) -> str:
    """Generate answer for procedure queries."""
    count = len(documents)
    answer = f"Tôi tìm thấy {count} thủ tục liên quan đến '{query}':\n\n"
    
    for i, doc in enumerate(documents[:5], 1):
        answer += f"{i}. {doc.title}\n"
        if doc.domain:
            answer += f"   Lĩnh vực: {doc.domain}\n"
        if doc.level:
            answer += f"   Cấp: {doc.level}\n"
        if doc.conditions:
            conditions_short = doc.conditions[:100] + "..." if len(doc.conditions) > 100 else doc.conditions
            answer += f"   Điều kiện: {conditions_short}\n"
        answer += "\n"
    
    if count > 5:
        answer += f"... và {count - 5} thủ tục khác.\n"
    
    return answer


def _generate_fine_answer(query: str, documents: List[Fine]) -> str:
    """Generate answer for fine queries."""
    count = len(documents)
    answer = f"Tôi tìm thấy {count} mức phạt liên quan đến '{query}':\n\n"
    
    # Highlight best match (first result) if available
    if documents:
        best_match = documents[0]
        answer += "Kết quả chính xác nhất:\n"
        answer += f"• {best_match.name}\n"
        if best_match.code:
            answer += f"  Mã vi phạm: {best_match.code}\n"
        
        # Format fine amount using helper function
        fine_amount = format_fine_amount(
            float(best_match.min_fine) if best_match.min_fine else None,
            float(best_match.max_fine) if best_match.max_fine else None
        )
        if fine_amount:
            answer += f"  Mức phạt: {fine_amount}\n"
        
        if best_match.article:
            answer += f"  Điều luật: {best_match.article}\n"
        answer += "\n"
        
        # Add other results if available
        if count > 1:
            answer += "Các mức phạt khác:\n"
            for i, doc in enumerate(documents[1:5], 2):
                answer += f"{i}. {doc.name}\n"
                if doc.code:
                    answer += f"   Mã vi phạm: {doc.code}\n"
                
                # Format fine amount
                fine_amount = format_fine_amount(
                    float(doc.min_fine) if doc.min_fine else None,
                    float(doc.max_fine) if doc.max_fine else None
                )
                if fine_amount:
                    answer += f"   Mức phạt: {fine_amount}\n"
                
                if doc.article:
                    answer += f"   Điều luật: {doc.article}\n"
                answer += "\n"
    else:
        # Fallback if no documents
        for i, doc in enumerate(documents[:5], 1):
            answer += f"{i}. {doc.name}\n"
            if doc.code:
                answer += f"   Mã vi phạm: {doc.code}\n"
            
            # Format fine amount
            fine_amount = format_fine_amount(
                float(doc.min_fine) if doc.min_fine else None,
                float(doc.max_fine) if doc.max_fine else None
            )
            if fine_amount:
                answer += f"   Mức phạt: {fine_amount}\n"
            
            if doc.article:
                answer += f"   Điều luật: {doc.article}\n"
            answer += "\n"
    
    if count > 5:
        answer += f"... và {count - 5} mức phạt khác.\n"
    
    return answer


def _generate_office_answer(query: str, documents: List[Office]) -> str:
    """Generate answer for office queries."""
    count = len(documents)
    answer = f"Tôi tìm thấy {count} đơn vị liên quan đến '{query}':\n\n"
    
    for i, doc in enumerate(documents[:5], 1):
        answer += f"{i}. {doc.unit_name}\n"
        if doc.address:
            answer += f"   Địa chỉ: {doc.address}\n"
        if doc.district:
            answer += f"   Quận/Huyện: {doc.district}\n"
        if doc.phone:
            answer += f"   Điện thoại: {doc.phone}\n"
        if doc.working_hours:
            answer += f"   Giờ làm việc: {doc.working_hours}\n"
        answer += "\n"
    
    if count > 5:
        answer += f"... và {count - 5} đơn vị khác.\n"
    
    return answer


def _generate_advisory_answer(query: str, documents: List[Advisory]) -> str:
    """Generate answer for advisory queries."""
    count = len(documents)
    answer = f"Tôi tìm thấy {count} cảnh báo liên quan đến '{query}':\n\n"
    
    for i, doc in enumerate(documents[:5], 1):
        answer += f"{i}. {doc.title}\n"
        if doc.summary:
            summary_short = doc.summary[:150] + "..." if len(doc.summary) > 150 else doc.summary
            answer += f"   {summary_short}\n"
        answer += "\n"
    
    if count > 5:
        answer += f"... và {count - 5} cảnh báo khác.\n"
    
    return answer


def _clean_text(value: str) -> str:
    """Normalize whitespace and strip noise for legal snippets."""
    if not value:
        return ""
    compressed = re.sub(r"\s+", " ", value)
    return compressed.strip()


def _summarize_section(
    section: LegalSection,
    max_sentences: int = 3,
    max_chars: int = 600
) -> str:
    """
    Build a short extractive summary from the section's stored content.

    Leading sentences are taken verbatim from the cleaned content until
    either the sentence limit or the character limit is reached; an over-long
    result is cut at the last space inside the limit and marked with "...".

    Args:
        section: Legal section whose ``content`` is summarized.
        max_sentences: Stop after this many sentences have been collected.
        max_chars: Stop collecting, and truncate, at this many characters.

    Returns:
        The summary text, or an empty string when the content is empty.
    """
    text = _clean_text(section.content)
    if not text:
        return ""

    # Sentences end at '.', '!' or '?' followed by whitespace.
    pieces = re.split(r"(?<=[.!?])\s+", text) or [text]

    chosen: List[str] = []
    for piece in filter(None, pieces):
        chosen.append(piece)
        if len(chosen) >= max_sentences or len(" ".join(chosen)) >= max_chars:
            break

    result = " ".join(chosen)
    if len(result) > max_chars:
        # Cut on a word boundary so the ellipsis does not follow half a word.
        result = result[:max_chars].rsplit(" ", 1)[0] + "..."
    return result.strip()


def _format_citation(section: LegalSection) -> str:
    citation = section.document.title
    if section.section_code:
        citation = f"{citation}{section.section_code}"
    page = ""
    if section.page_start:
        page = f" (trang {section.page_start}"
        if section.page_end and section.page_end != section.page_start:
            page += f"-{section.page_end}"
        page += ")"
    return f"{citation}{page}".strip()


def _build_legal_prefill(documents: List[LegalSection]) -> str:
    """
    Build a compact Vietnamese summary block that will be injected into the
    Guardrails prompt. The goal is to bias the model toward Vietnamese output.
    """
    if not documents:
        return ""

    lines = ["Bản tóm tắt tiếng Việt từ cơ sở dữ liệu:"]
    for idx, section in enumerate(documents[:3], start=1):
        summary = _summarize_section(section, max_sentences=2, max_chars=400)
        citation = _format_citation(section)
        if not summary:
            continue
        lines.append(f"{idx}. {summary} (Nguồn: {citation})")

    return "\n".join(lines)


def _generate_legal_citation_block(documents: List[LegalSection]) -> str:
    """Return formatted citation block reused by multiple answer modes."""
    if not documents:
        return ""

    lines: List[str] = []
    for idx, section in enumerate(documents[:5], start=1):
        summary = _summarize_section(section)
        snippet = _clean_text(section.content)[:350]
        if snippet and len(snippet) == 350:
            snippet = snippet.rsplit(" ", 1)[0] + "..."
        citation = _format_citation(section)

        lines.append(f"{idx}. {section.section_title or 'Nội dung'}{citation}")
        if summary:
            lines.append(f"   - Tóm tắt: {summary}")
        if snippet:
            lines.append(f"   - Trích dẫn: \"{snippet}\"")
        lines.append("")

    if len(documents) > 5:
        lines.append(f"... và {len(documents) - 5} trích đoạn khác trong cùng nguồn dữ liệu.")

    return "\n".join(lines).strip()


def _generate_legal_answer(query: str, documents: List[LegalSection]) -> str:
    count = len(documents)
    if count == 0:
        return (
            f"Tôi chưa tìm thấy trích dẫn pháp lý nào cho '{query}'. "
            "Bạn có thể cung cấp thêm ngữ cảnh để tôi tiếp tục hỗ trợ."
        )

    header = (
        f"Tôi đã tổng hợp {count} trích đoạn pháp lý liên quan đến '{query}'. "
        "Đây là bản tóm tắt tiếng Việt kèm trích dẫn:"
    )
    citation_block = _generate_legal_citation_block(documents)
    return f"{header}\n\n{citation_block}".strip()


def _generate_general_answer(query: str, documents: List[Any]) -> str:
    """Generate general answer."""
    count = len(documents)
    return f"Tôi tìm thấy {count} kết quả liên quan đến '{query}'. Vui lòng xem chi tiết bên dưới."


def _strip_accents(value: str) -> str:
    return "".join(
        char for char in unicodedata.normalize("NFD", value)
        if unicodedata.category(char) != "Mn"
    )


def _contains_markers(
    text_with_accents: str,
    text_without_accents: str,
    markers: List[str]
) -> bool:
    for marker in markers:
        marker_lower = marker.lower()
        marker_no_accents = _strip_accents(marker_lower)
        if marker_lower in text_with_accents or marker_no_accents in text_without_accents:
            return True
    return False


def _is_valid_legal_answer(answer: str, documents: List[LegalSection]) -> bool:
    """
    Validate that the LLM answer for legal intent references actual legal content.
    
    Criteria:
        - Must not contain denial phrases (already handled earlier) or "xin lỗi".
        - Must not introduce obvious monetary values (legal documents không có số tiền phạt).
        - Must have tối thiểu 40 ký tự để tránh câu trả lời quá ngắn.
    """
    if not answer:
        return False
    
    normalized_answer = answer.lower()
    normalized_answer_no_accents = _strip_accents(normalized_answer)
    
    denial_markers = [
        "xin lỗi",
        "thông tin trong cơ sở dữ liệu chưa đủ",
        "không thể giúp",
        "không tìm thấy thông tin",
        "không có dữ liệu",
    ]
    if _contains_markers(normalized_answer, normalized_answer_no_accents, denial_markers):
        return False
    
    money_markers = ["vnđ", "vnd", "đồng", "đ", "dong"]
    if _contains_markers(normalized_answer, normalized_answer_no_accents, money_markers):
        return False
    
    if len(answer.strip()) < 40:
        return False
    
    return True


def rag_pipeline(
    query: str,
    intent: str,
    top_k: int = 5,
    min_confidence: float = 0.3,
    context: Optional[List[Dict[str, Any]]] = None,
    use_llm: bool = True
) -> Dict[str, Any]:
    """
    Complete RAG pipeline: retrieval + answer generation.

    For the legal intent a structured LLM answer is tried first. Otherwise,
    or when that yields nothing, the generic LLM/template generator is used,
    and a free-form legal answer is validated before citations are appended.

    Args:
        query: User query.
        intent: Detected intent. Unrecognised intents are handled as
            procedure searches.
        top_k: Number of documents to retrieve.
        min_confidence: Minimum confidence threshold. Accepted for interface
            compatibility; not applied inside this function.
        context: Optional conversation context.
        use_llm: Whether to use LLM for answer generation. Always treated as
            enabled for the 'general_query' and 'greeting' intents.

    Returns:
        Dictionary with 'answer', 'documents', 'count', 'confidence', 'content_type'.
    """
    # Map intent to content type
    intent_to_type = {
        'search_procedure': 'procedure',
        'search_fine': 'fine',
        'search_office': 'office',
        'search_advisory': 'advisory',
        'search_legal': 'legal',
        'general_query': 'general',
        'greeting': 'general',
    }

    content_type = intent_to_type.get(intent, 'procedure')

    # Retrieve documents
    documents = retrieve_top_k_documents(query, content_type, top_k=top_k)

    # Enable LLM automatically for casual conversation intents
    llm_allowed = use_llm or intent in {"general_query", "greeting"}

    legal_with_documents = intent == "search_legal" and bool(documents)

    structured_used = False
    answer: Optional[str] = None

    # Structured legal answer; skipped when the caller disabled the LLM.
    if legal_with_documents and llm_allowed:
        structured = None
        try:
            llm = get_llm_generator()
            if llm:
                prefill_summary = _build_legal_prefill(documents)
                structured = llm.generate_structured_legal_answer(
                    query,
                    documents,
                    prefill_summary=prefill_summary,
                )
        except Exception as exc:
            # Same best-effort policy as the free-form LLM path: report the
            # failure and continue with the fallback generators.
            print(f"[RAG] ❌ Structured legal generation failed, using fallback: {exc}", flush=True)
            structured = None

        if structured:
            answer = format_structured_legal_answer(structured)
            structured_used = True
            citation_block = _generate_legal_citation_block(documents)
            if citation_block:
                answer = (
                    f"{answer.rstrip()}\n\nTrích dẫn chi tiết:\n{citation_block}"
                )

    if answer is None:
        answer = generate_answer_template(
            query,
            documents,
            content_type,
            context=context,
            use_llm=llm_allowed
        )

    # Fallback for the legal intent when the free-form answer fails validation
    if legal_with_documents and isinstance(answer, str) and not structured_used:
        template_answer = _generate_legal_answer(query, documents)
        # The template already embeds the citations, so it is returned as-is.
        if answer != template_answer:
            if not _is_valid_legal_answer(answer, documents):
                print("[RAG] ⚠️ Fallback: invalid legal answer detected", flush=True)
                answer = template_answer
            else:
                # Append only the citations, not the whole template answer
                # with its own intro line.
                citation_block = _generate_legal_citation_block(documents)
                if citation_block.strip():
                    answer = f"{answer.rstrip()}\n\nTrích dẫn chi tiết:\n{citation_block}"

    # Calculate confidence (simple: based on number of results and scores).
    # A non-positive top_k would divide by zero or give a negative ratio.
    confidence = min(1.0, len(documents) / top_k) if top_k > 0 else 0.0
    if documents and hasattr(documents[0], '_hybrid_score'):
        confidence = max(confidence, documents[0]._hybrid_score)

    return {
        'answer': answer,
        'documents': documents,
        'count': len(documents),
        'confidence': confidence,
        'content_type': content_type
    }