""" RAG (Retrieval-Augmented Generation) pipeline for answer generation. """ import re import unicodedata from typing import List, Dict, Any, Optional from .hybrid_search import hybrid_search from .models import Procedure, Fine, Office, Advisory, LegalSection from hue_portal.chatbot.chatbot import format_fine_amount from hue_portal.chatbot.llm_integration import get_llm_generator from hue_portal.chatbot.structured_legal import format_structured_legal_answer def retrieve_top_k_documents( query: str, content_type: str, top_k: int = 5 ) -> List[Any]: """ Retrieve top-k documents using hybrid search. Args: query: Search query. content_type: Type of content ('procedure', 'fine', 'office', 'advisory'). top_k: Number of documents to retrieve. Returns: List of document objects. """ # Get appropriate queryset if content_type == 'procedure': queryset = Procedure.objects.all() text_fields = ['title', 'domain', 'conditions', 'dossier'] elif content_type == 'fine': queryset = Fine.objects.all() text_fields = ['name', 'code', 'article', 'decree', 'remedial'] elif content_type == 'office': queryset = Office.objects.all() text_fields = ['unit_name', 'address', 'district', 'service_scope'] elif content_type == 'advisory': queryset = Advisory.objects.all() text_fields = ['title', 'summary'] elif content_type == 'legal': queryset = LegalSection.objects.select_related("document").all() text_fields = ['section_title', 'section_code', 'content'] else: return [] # Use hybrid search with text_fields for exact match boost try: from .config.hybrid_search_config import get_config config = get_config(content_type) results = hybrid_search( queryset, query, top_k=top_k, bm25_weight=config.bm25_weight, vector_weight=config.vector_weight, min_hybrid_score=config.min_hybrid_score, text_fields=text_fields ) return results except Exception as e: print(f"Error in retrieval: {e}") return [] def generate_answer_template( query: str, documents: List[Any], content_type: str, context: Optional[List[Dict[str, Any]]] = None, use_llm: bool = True ) -> str: """ Generate answer using LLM (if available) or template-based summarization. Args: query: Original query. documents: Retrieved documents. content_type: Type of content. context: Optional conversation context. use_llm: Whether to try LLM generation first. Returns: Generated answer text. """ def _invoke_llm(documents_for_prompt: List[Any]) -> Optional[str]: """Call configured LLM provider safely.""" try: import traceback from hue_portal.chatbot.llm_integration import get_llm_generator llm = get_llm_generator() if not llm: print("[RAG] ⚠️ LLM not available, using template", flush=True) return None print(f"[RAG] Using LLM provider: {llm.provider}", flush=True) llm_answer = llm.generate_answer( query, context=context, documents=documents_for_prompt ) if llm_answer: print(f"[RAG] ✅ LLM answer generated (length: {len(llm_answer)})", flush=True) return llm_answer print("[RAG] ⚠️ LLM returned None, using template", flush=True) except Exception as exc: import traceback error_trace = traceback.format_exc() print(f"[RAG] ❌ LLM generation failed, using template: {exc}", flush=True) print(f"[RAG] ❌ Trace: {error_trace}", flush=True) return None llm_enabled = use_llm or content_type == 'general' if llm_enabled: llm_documents = documents if documents else [] llm_answer = _invoke_llm(llm_documents) if llm_answer: return llm_answer # If no documents, fall back gracefully if not documents: if content_type == 'general': return ( f"Tôi chưa có dữ liệu pháp luật liên quan đến '{query}', " "nhưng vẫn sẵn sàng trò chuyện hoặc hỗ trợ bạn ở chủ đề khác. " "Bạn có thể mô tả cụ thể hơn để tôi giúp tốt hơn nhé!" ) return ( f"Xin lỗi, tôi không tìm thấy thông tin liên quan đến '{query}' trong cơ sở dữ liệu. " "Vui lòng thử lại với từ khóa khác hoặc liên hệ trực tiếp với Công an thành phố Huế để được tư vấn." ) # Fallback to template-based generation if content_type == 'procedure': return _generate_procedure_answer(query, documents) elif content_type == 'fine': return _generate_fine_answer(query, documents) elif content_type == 'office': return _generate_office_answer(query, documents) elif content_type == 'advisory': return _generate_advisory_answer(query, documents) elif content_type == 'legal': return _generate_legal_answer(query, documents) else: return _generate_general_answer(query, documents) def _generate_procedure_answer(query: str, documents: List[Procedure]) -> str: """Generate answer for procedure queries.""" count = len(documents) answer = f"Tôi tìm thấy {count} thủ tục liên quan đến '{query}':\n\n" for i, doc in enumerate(documents[:5], 1): answer += f"{i}. {doc.title}\n" if doc.domain: answer += f" Lĩnh vực: {doc.domain}\n" if doc.level: answer += f" Cấp: {doc.level}\n" if doc.conditions: conditions_short = doc.conditions[:100] + "..." if len(doc.conditions) > 100 else doc.conditions answer += f" Điều kiện: {conditions_short}\n" answer += "\n" if count > 5: answer += f"... và {count - 5} thủ tục khác.\n" return answer def _generate_fine_answer(query: str, documents: List[Fine]) -> str: """Generate answer for fine queries.""" count = len(documents) answer = f"Tôi tìm thấy {count} mức phạt liên quan đến '{query}':\n\n" # Highlight best match (first result) if available if documents: best_match = documents[0] answer += "Kết quả chính xác nhất:\n" answer += f"• {best_match.name}\n" if best_match.code: answer += f" Mã vi phạm: {best_match.code}\n" # Format fine amount using helper function fine_amount = format_fine_amount( float(best_match.min_fine) if best_match.min_fine else None, float(best_match.max_fine) if best_match.max_fine else None ) if fine_amount: answer += f" Mức phạt: {fine_amount}\n" if best_match.article: answer += f" Điều luật: {best_match.article}\n" answer += "\n" # Add other results if available if count > 1: answer += "Các mức phạt khác:\n" for i, doc in enumerate(documents[1:5], 2): answer += f"{i}. {doc.name}\n" if doc.code: answer += f" Mã vi phạm: {doc.code}\n" # Format fine amount fine_amount = format_fine_amount( float(doc.min_fine) if doc.min_fine else None, float(doc.max_fine) if doc.max_fine else None ) if fine_amount: answer += f" Mức phạt: {fine_amount}\n" if doc.article: answer += f" Điều luật: {doc.article}\n" answer += "\n" else: # Fallback if no documents for i, doc in enumerate(documents[:5], 1): answer += f"{i}. {doc.name}\n" if doc.code: answer += f" Mã vi phạm: {doc.code}\n" # Format fine amount fine_amount = format_fine_amount( float(doc.min_fine) if doc.min_fine else None, float(doc.max_fine) if doc.max_fine else None ) if fine_amount: answer += f" Mức phạt: {fine_amount}\n" if doc.article: answer += f" Điều luật: {doc.article}\n" answer += "\n" if count > 5: answer += f"... và {count - 5} mức phạt khác.\n" return answer def _generate_office_answer(query: str, documents: List[Office]) -> str: """Generate answer for office queries.""" count = len(documents) answer = f"Tôi tìm thấy {count} đơn vị liên quan đến '{query}':\n\n" for i, doc in enumerate(documents[:5], 1): answer += f"{i}. {doc.unit_name}\n" if doc.address: answer += f" Địa chỉ: {doc.address}\n" if doc.district: answer += f" Quận/Huyện: {doc.district}\n" if doc.phone: answer += f" Điện thoại: {doc.phone}\n" if doc.working_hours: answer += f" Giờ làm việc: {doc.working_hours}\n" answer += "\n" if count > 5: answer += f"... và {count - 5} đơn vị khác.\n" return answer def _generate_advisory_answer(query: str, documents: List[Advisory]) -> str: """Generate answer for advisory queries.""" count = len(documents) answer = f"Tôi tìm thấy {count} cảnh báo liên quan đến '{query}':\n\n" for i, doc in enumerate(documents[:5], 1): answer += f"{i}. {doc.title}\n" if doc.summary: summary_short = doc.summary[:150] + "..." if len(doc.summary) > 150 else doc.summary answer += f" {summary_short}\n" answer += "\n" if count > 5: answer += f"... và {count - 5} cảnh báo khác.\n" return answer def _clean_text(value: str) -> str: """Normalize whitespace and strip noise for legal snippets.""" if not value: return "" compressed = re.sub(r"\s+", " ", value) return compressed.strip() def _summarize_section( section: LegalSection, max_sentences: int = 3, max_chars: int = 600 ) -> str: """ Produce a concise Vietnamese summary directly from the stored content. This is used as the Vietnamese prefill before calling the LLM so we avoid English drift and keep the answer grounded. """ content = _clean_text(section.content) if not content: return "" # Split by sentence boundaries; fall back to chunks if delimiters missing. sentences = re.split(r"(?<=[.!?])\s+", content) if not sentences: sentences = [content] summary_parts = [] for sentence in sentences: if not sentence: continue summary_parts.append(sentence) joined = " ".join(summary_parts) if len(summary_parts) >= max_sentences or len(joined) >= max_chars: break summary = " ".join(summary_parts) if len(summary) > max_chars: summary = summary[:max_chars].rsplit(" ", 1)[0] + "..." return summary.strip() def _format_citation(section: LegalSection) -> str: citation = section.document.title if section.section_code: citation = f"{citation} – {section.section_code}" page = "" if section.page_start: page = f" (trang {section.page_start}" if section.page_end and section.page_end != section.page_start: page += f"-{section.page_end}" page += ")" return f"{citation}{page}".strip() def _build_legal_prefill(documents: List[LegalSection]) -> str: """ Build a compact Vietnamese summary block that will be injected into the Guardrails prompt. The goal is to bias the model toward Vietnamese output. """ if not documents: return "" lines = ["Bản tóm tắt tiếng Việt từ cơ sở dữ liệu:"] for idx, section in enumerate(documents[:3], start=1): summary = _summarize_section(section, max_sentences=2, max_chars=400) citation = _format_citation(section) if not summary: continue lines.append(f"{idx}. {summary} (Nguồn: {citation})") return "\n".join(lines) def _generate_legal_citation_block(documents: List[LegalSection]) -> str: """Return formatted citation block reused by multiple answer modes.""" if not documents: return "" lines: List[str] = [] for idx, section in enumerate(documents[:5], start=1): summary = _summarize_section(section) snippet = _clean_text(section.content)[:350] if snippet and len(snippet) == 350: snippet = snippet.rsplit(" ", 1)[0] + "..." citation = _format_citation(section) lines.append(f"{idx}. {section.section_title or 'Nội dung'} – {citation}") if summary: lines.append(f" - Tóm tắt: {summary}") if snippet: lines.append(f" - Trích dẫn: \"{snippet}\"") lines.append("") if len(documents) > 5: lines.append(f"... và {len(documents) - 5} trích đoạn khác trong cùng nguồn dữ liệu.") return "\n".join(lines).strip() def _generate_legal_answer(query: str, documents: List[LegalSection]) -> str: count = len(documents) if count == 0: return ( f"Tôi chưa tìm thấy trích dẫn pháp lý nào cho '{query}'. " "Bạn có thể cung cấp thêm ngữ cảnh để tôi tiếp tục hỗ trợ." ) header = ( f"Tôi đã tổng hợp {count} trích đoạn pháp lý liên quan đến '{query}'. " "Đây là bản tóm tắt tiếng Việt kèm trích dẫn:" ) citation_block = _generate_legal_citation_block(documents) return f"{header}\n\n{citation_block}".strip() def _generate_general_answer(query: str, documents: List[Any]) -> str: """Generate general answer.""" count = len(documents) return f"Tôi tìm thấy {count} kết quả liên quan đến '{query}'. Vui lòng xem chi tiết bên dưới." def _strip_accents(value: str) -> str: return "".join( char for char in unicodedata.normalize("NFD", value) if unicodedata.category(char) != "Mn" ) def _contains_markers( text_with_accents: str, text_without_accents: str, markers: List[str] ) -> bool: for marker in markers: marker_lower = marker.lower() marker_no_accents = _strip_accents(marker_lower) if marker_lower in text_with_accents or marker_no_accents in text_without_accents: return True return False def _is_valid_legal_answer(answer: str, documents: List[LegalSection]) -> bool: """ Validate that the LLM answer for legal intent references actual legal content. Criteria: - Must not contain denial phrases (already handled earlier) or "xin lỗi". - Must not introduce obvious monetary values (legal documents không có số tiền phạt). - Must have tối thiểu 40 ký tự để tránh câu trả lời quá ngắn. """ if not answer: return False normalized_answer = answer.lower() normalized_answer_no_accents = _strip_accents(normalized_answer) denial_markers = [ "xin lỗi", "thông tin trong cơ sở dữ liệu chưa đủ", "không thể giúp", "không tìm thấy thông tin", "không có dữ liệu", ] if _contains_markers(normalized_answer, normalized_answer_no_accents, denial_markers): return False money_markers = ["vnđ", "vnd", "đồng", "đ", "dong"] if _contains_markers(normalized_answer, normalized_answer_no_accents, money_markers): return False if len(answer.strip()) < 40: return False return True def rag_pipeline( query: str, intent: str, top_k: int = 5, min_confidence: float = 0.3, context: Optional[List[Dict[str, Any]]] = None, use_llm: bool = True ) -> Dict[str, Any]: """ Complete RAG pipeline: retrieval + answer generation. Args: query: User query. intent: Detected intent. top_k: Number of documents to retrieve. min_confidence: Minimum confidence threshold. context: Optional conversation context. use_llm: Whether to use LLM for answer generation. Returns: Dictionary with 'answer', 'documents', 'count', 'confidence', 'content_type'. """ # Map intent to content type intent_to_type = { 'search_procedure': 'procedure', 'search_fine': 'fine', 'search_office': 'office', 'search_advisory': 'advisory', 'search_legal': 'legal', 'general_query': 'general', 'greeting': 'general', } content_type = intent_to_type.get(intent, 'procedure') # Retrieve documents documents = retrieve_top_k_documents(query, content_type, top_k=top_k) # Enable LLM automatically for casual conversation intents llm_allowed = use_llm or intent in {"general_query", "greeting"} structured_used = False answer: Optional[str] = None if intent == "search_legal" and documents: llm = get_llm_generator() if llm: prefill_summary = _build_legal_prefill(documents) structured = llm.generate_structured_legal_answer( query, documents, prefill_summary=prefill_summary, ) if structured: answer = format_structured_legal_answer(structured) structured_used = True citation_block = _generate_legal_citation_block(documents) if citation_block: answer = ( f"{answer.rstrip()}\n\nTrích dẫn chi tiết:\n{citation_block}" ) if answer is None: answer = generate_answer_template( query, documents, content_type, context=context, use_llm=llm_allowed ) # Fallback nếu intent pháp luật nhưng câu LLM không đạt tiêu chí if ( intent == "search_legal" and documents and isinstance(answer, str) and not structured_used ): if not _is_valid_legal_answer(answer, documents): print("[RAG] ⚠️ Fallback: invalid legal answer detected", flush=True) answer = _generate_legal_answer(query, documents) else: citation_block = _generate_legal_answer(query, documents) if citation_block.strip(): answer = f"{answer.rstrip()}\n\nTrích dẫn chi tiết:\n{citation_block}" # Calculate confidence (simple: based on number of results and scores) confidence = min(1.0, len(documents) / top_k) if documents and hasattr(documents[0], '_hybrid_score'): confidence = max(confidence, documents[0]._hybrid_score) return { 'answer': answer, 'documents': documents, 'count': len(documents), 'confidence': confidence, 'content_type': content_type }