Spaces:

minhvtt
/

ChatbotRAG

Running

App Files Files Community

minhvtt committed 9 days ago

Commit

70413d7

verified ·

1 Parent(s): 885208c

Update intent_classifier.py

Browse files

Files changed (1) hide show

intent_classifier.py +201 -188

intent_classifier.py CHANGED Viewed

@@ -1,188 +1,201 @@
-"""
-Intent Classifier for Hybrid RAG + FSM Chatbot
-Detects user intent to route between scenario flows and RAG queries
-"""
-from typing import Dict, Optional, List
-import re
-class IntentClassifier:
-    """
-    Classify user intent using keyword matching
-    Routes to either:
-    - Scenario flows (scripted conversations)
-    - RAG queries (knowledge retrieval)
-    """
-    def __init__(self, scenarios_dir: str = "scenarios"):
-        """
-        Initialize with auto-loading triggers from scenario JSON files
-        Args:
-            scenarios_dir: Directory containing scenario JSON files
-        """
-        # Auto-load scenario patterns from JSON files
-        self.scenario_patterns = self._load_scenario_patterns(scenarios_dir)
-        # General question patterns (RAG)
-        self.general_patterns = [
-            # Location
-            "ở đâu", "địa điểm", "location", "where",
-            "chỗ nào", "tổ chức tại",
-            # Time
-            "mấy giờ", "khi nào", "when", "time",
-            "bao giờ", "thời gian", "ngày nào",
-            # Info
-            "thông tin", "info", "information",
-            "chi tiết", "details", "về",
-            # Parking
-            "đậu xe", "parking", "gửi xe",
-            # Contact
-            "liên hệ", "contact", "số điện thoại",
-            # Events/content - NEW (Bug fix #3)
-            "sự kiện", "event", "đâu", "show nào",
-            "line-up", "lineup", "performer"
-        ]
-    def _load_scenario_patterns(self, scenarios_dir: str) -> dict:
-        """
-        Auto-load triggers from all scenario JSON files
-        Returns:
-            {"scenario_id": ["trigger1", "trigger2", ...]}
-        """
-        import json
-        import os
-        patterns = {}
-        if not os.path.exists(scenarios_dir):
-            print(f"⚠ Scenarios directory not found: {scenarios_dir}")
-            return patterns
-        for filename in os.listdir(scenarios_dir):
-            if filename.endswith('.json'):
-                filepath = os.path.join(scenarios_dir, filename)
-                try:
-                    with open(filepath, 'r', encoding='utf-8') as f:
-                        scenario = json.load(f)
-                        scenario_id = scenario.get('scenario_id')
-                        triggers = scenario.get('triggers', [])
-                        if scenario_id and triggers:
-                            patterns[scenario_id] = triggers
-                            print(f"✓ Loaded triggers for: {scenario_id} ({len(triggers)} patterns)")
-                except Exception as e:
-                    print(f"⚠ Error loading {filename}: {e}")
-        return patterns
-    def classify(
-        self,
-        message: str,
-        conversation_state: Optional[Dict] = None
-    ) -> str:
-        """
-        Classify user intent with IMPROVED mid-scenario detection (Bug fix #3)
-        Returns:
-            - "scenario:{scenario_id}" - Trigger new scenario
-            - "scenario:continue" - Continue active scenario
-            - "rag:general" - General RAG query (no active scenario)
-            - "rag:with_resume" - RAG query mid-scenario (then resume)
-        """
-        message_lower = message.lower().strip()
-        # Check if user is in active scenario
-        active_scenario = conversation_state.get('active_scenario') if conversation_state else None
-        if active_scenario:
-            # User is in a scenario - check if this is off-topic or continuation
-            # IMPROVED: Detect off-topic questions better
-            # Check for question words + patterns
-            question_indicators = ["?", "đâu", "gì", "sao", "where", "what", "how", "when"]
-            has_question = any(q in message_lower for q in question_indicators)
-            # Check if matches general patterns
-            matches_general = self._matches_any_pattern(message_lower, self.general_patterns)
-            # Short messages with questions are likely off-topic
-            word_count = len(message_lower.split())
-            is_short_question = word_count <= 4 and has_question
-            # Decision logic
-            if matches_general or is_short_question:
-                # User asking off-topic question → RAG with resume
-                print(f"🔀 Off-topic detected: '{message}' → rag:with_resume")
-                return "rag:with_resume"
-            else:
-                # Normal scenario continuation
-                return "scenario:continue"
-        # Not in scenario - check for scenario triggers
-        for scenario_id, patterns in self.scenario_patterns.items():
-            for pattern in patterns:
-                if pattern.lower() in message_lower:
-                    return f"scenario:{scenario_id}"
-        # No scenario match - general RAG query
-        return "rag:general"
-    def _matches_any_pattern(self, message: str, patterns: List[str]) -> bool:
-        """
-        Check if message matches any pattern in list
-        """
-        for pattern in patterns:
-            # Simple substring match
-            if pattern in message:
-                return True
-            # Word boundary check
-            if re.search(rf'\b{re.escape(pattern)}\b', message, re.IGNORECASE):
-                return True
-        return False
-    def get_scenario_type(self, intent: str) -> Optional[str]:
-        """
-        Extract scenario type from intent string
-        Args:
-            intent: "scenario:price_inquiry" or "scenario:continue"
-        Returns:
-            "price_inquiry" or None
-        """
-        if not intent.startswith("scenario:"):
-            return None
-        parts = intent.split(":", 1)
-        if len(parts) < 2:
-            return None
-        scenario_type = parts[1]
-        if scenario_type == "continue":
-            return None
-        return scenario_type
-    def add_scenario_pattern(self, scenario_id: str, patterns: List[str]):
-        """
-        Dynamically add new scenario patterns
-        """
-        if scenario_id in self.scenario_patterns:
-            self.scenario_patterns[scenario_id].extend(patterns)
-        else:
-            self.scenario_patterns[scenario_id] = patterns
-    def add_general_pattern(self, patterns: List[str]):
-        """
-        Dynamically add new general question patterns
-        """
-        self.general_patterns.extend(patterns)

+"""
+Intent Classifier for Hybrid RAG + FSM Chatbot
+Detects user intent to route between scenario flows and RAG queries
+"""
+from typing import Dict, Optional, List
+import re
+class IntentClassifier:
+    """
+    Classify user intent using keyword matching.
+
+    Routes each message to either:
+    - Scenario flows (scripted conversations)
+    - RAG queries (knowledge retrieval)
+
+    Mid-scenario keyword lists are matched as whole words on NFC-normalized,
+    lower-cased text, so short keywords such as "no", "ai" or "1" do not fire
+    inside longer tokens ("know", "gmail", "10") and decomposed (NFD)
+    Vietnamese input still matches the keyword literals.
+    """
+    # Answers a user may legitimately give to a scenario question.
+    # Matching is whole-word, so English plural forms are listed explicitly.
+    CHOICE_KEYWORDS = (
+        # Event recommendation choices
+        'giá', 'price', 'prices', 'vé', 'ticket', 'tickets',
+        'lineup', 'line-up', 'nghệ sĩ', 'artist', 'artists',
+        'địa điểm', 'location', 'chỗ',
+        'thời gian', 'time', 'lịch',
+        # General answers
+        'có', 'yes', 'ok', 'được', 'không', 'no',
+        'chill', 'sôi động', 'hài', 'workshop',
+        '1', '2', '3', '4', '5',  # Ratings or choices
+    )
+    # Words that mark a mid-scenario message as a question.
+    QUESTION_WORDS = (
+        "gì", "sao", "thế nào", "bao nhiêu", "mấy giờ", "ai",
+        "how", "what", "why",
+    )
+
+    def __init__(self, scenarios_dir: str = "scenarios"):
+        """
+        Initialize with auto-loading triggers from scenario JSON files.
+
+        Args:
+            scenarios_dir: Directory containing scenario JSON files
+        """
+        # Auto-load scenario patterns from JSON files
+        self.scenario_patterns = self._load_scenario_patterns(scenarios_dir)
+        # General question patterns (RAG). classify() does not consult this
+        # list; it is kept for add_general_pattern() and
+        # _matches_any_pattern() callers.
+        self.general_patterns = [
+            # Location
+            "ở đâu", "địa điểm", "location", "where",
+            "chỗ nào", "tổ chức tại",
+            # Time
+            "mấy giờ", "khi nào", "when", "time",
+            "bao giờ", "thời gian", "ngày nào",
+            # Info
+            "thông tin", "info", "information",
+            "chi tiết", "details", "về",
+            # Parking
+            "đậu xe", "parking", "gửi xe",
+            # Contact
+            "liên hệ", "contact", "số điện thoại",
+            # Events/content
+            "sự kiện", "event", "đâu", "show nào",
+            "line-up", "lineup", "performer"
+        ]
+        # Compile the mid-scenario keyword lists once instead of per message.
+        self._choice_regex = self._compile_keywords(self.CHOICE_KEYWORDS)
+        self._question_regex = self._compile_keywords(self.QUESTION_WORDS)
+
+    @staticmethod
+    def _normalize(text: str) -> str:
+        """Return *text* NFC-normalized, lower-cased and stripped."""
+        import unicodedata
+        return unicodedata.normalize("NFC", text).lower().strip()
+
+    @classmethod
+    def _compile_keywords(cls, keywords):
+        """Compile *keywords* into one regex that matches any of them as a whole word."""
+        alternatives = [
+            re.escape(cls._normalize(keyword))
+            for keyword in keywords
+            if isinstance(keyword, str) and keyword.strip()
+        ]
+        if not alternatives:
+            # An empty alternation would match everything; never match instead.
+            return re.compile(r"(?!)")
+        # Lookarounds rather than \b so keywords that begin or end with a
+        # non-word character are still delimited correctly.
+        return re.compile(r"(?<!\w)(?:" + "|".join(alternatives) + r")(?!\w)")
+
+    def _load_scenario_patterns(self, scenarios_dir: str) -> dict:
+        """
+        Auto-load triggers from all scenario JSON files.
+
+        Files are read in sorted name order so that trigger priority in
+        classify() is deterministic. Unreadable or malformed files are
+        reported and skipped.
+
+        Returns:
+            {"scenario_id": ["trigger1", "trigger2", ...]}
+        """
+        import json
+        import os
+        patterns = {}
+        if not os.path.isdir(scenarios_dir):
+            print(f"⚠ Scenarios directory not found: {scenarios_dir}")
+            return patterns
+        for filename in sorted(os.listdir(scenarios_dir)):
+            if not filename.endswith('.json'):
+                continue
+            filepath = os.path.join(scenarios_dir, filename)
+            try:
+                with open(filepath, 'r', encoding='utf-8') as f:
+                    scenario = json.load(f)
+            except (OSError, ValueError) as e:
+                # ValueError covers both invalid JSON and invalid UTF-8.
+                print(f"⚠ Error loading {filename}: {e}")
+                continue
+            if not isinstance(scenario, dict):
+                print(f"⚠ Error loading {filename}: expected a JSON object")
+                continue
+            scenario_id = scenario.get('scenario_id')
+            triggers = scenario.get('triggers', [])
+            if isinstance(triggers, str):
+                # A bare string would otherwise be iterated character by character.
+                triggers = [triggers]
+            elif not isinstance(triggers, (list, tuple)):
+                triggers = []
+            # Drop empty / non-string triggers: "" is a substring of every message.
+            triggers = [t for t in triggers if isinstance(t, str) and t.strip()]
+            if scenario_id and triggers:
+                patterns[scenario_id] = triggers
+                print(f"✓ Loaded triggers for: {scenario_id} ({len(triggers)} patterns)")
+        return patterns
+
+    def classify(
+        self,
+        message: str,
+        conversation_state: Optional[Dict] = None
+    ) -> str:
+        """
+        Classify user intent, with mid-scenario off-topic detection.
+
+        Args:
+            message: Raw user message (None is treated as an empty message)
+            conversation_state: Optional state dict; a truthy
+                'active_scenario' entry means a scenario is in progress
+
+        Returns:
+            - "scenario:{scenario_id}" - Trigger new scenario
+            - "scenario:continue" - Continue active scenario
+            - "rag:general" - General RAG query (no active scenario)
+            - "rag:with_resume" - RAG query mid-scenario (then resume)
+        """
+        message_lower = self._normalize(message or "")
+        # Check if user is in active scenario
+        active_scenario = conversation_state.get('active_scenario') if conversation_state else None
+        if active_scenario:
+            # User is in a scenario - decide between off-topic and continuation
+            is_valid_answer = self._choice_regex.search(message_lower) is not None
+            has_question_mark = "?" in message_lower
+            has_question_word = self._question_regex.search(message_lower) is not None
+            # Off-topic ONLY if the message looks like a question (question
+            # mark OR question word) AND contains no valid answer keyword.
+            is_off_topic = (has_question_mark or has_question_word) and not is_valid_answer
+            if is_off_topic:
+                print(f"🔀 Off-topic question detected: '{message}' → rag:with_resume")
+                return "rag:with_resume"
+            # Normal scenario continuation
+            return "scenario:continue"
+        # Not in scenario - check for scenario triggers (substring match)
+        for scenario_id, patterns in self.scenario_patterns.items():
+            for pattern in patterns:
+                trigger = self._normalize(pattern) if isinstance(pattern, str) else ""
+                # Skip empty triggers: "" would match every message.
+                if trigger and trigger in message_lower:
+                    return f"scenario:{scenario_id}"
+        # No scenario match - general RAG query
+        return "rag:general"
+
+    def _matches_any_pattern(self, message: str, patterns: List[str]) -> bool:
+        """
+        Check if message matches any pattern in list.
+
+        A pattern matches when it is a case-sensitive substring of message,
+        or when it occurs in message as a whole word ignoring case. Empty
+        patterns are ignored.
+        """
+        return any(
+            bool(pattern) and (
+                pattern in message
+                or re.search(rf'\b{re.escape(pattern)}\b', message, re.IGNORECASE) is not None
+            )
+            for pattern in patterns
+        )
+
+    def get_scenario_type(self, intent: str) -> Optional[str]:
+        """
+        Extract scenario type from intent string.
+
+        Args:
+            intent: "scenario:price_inquiry" or "scenario:continue"
+
+        Returns:
+            "price_inquiry", or None when intent is not a scenario intent,
+            is "scenario:continue", or carries an empty scenario id
+        """
+        prefix = "scenario:"
+        if not intent or not intent.startswith(prefix):
+            return None
+        scenario_type = intent[len(prefix):]
+        if not scenario_type or scenario_type == "continue":
+            return None
+        return scenario_type
+
+    def add_scenario_pattern(self, scenario_id: str, patterns: List[str]):
+        """
+        Dynamically add new scenario patterns.
+
+        The patterns are copied, so the caller's list is never mutated by
+        later additions. A single string is treated as one pattern.
+        """
+        if isinstance(patterns, str):
+            patterns = [patterns]
+        self.scenario_patterns.setdefault(scenario_id, []).extend(patterns)
+
+    def add_general_pattern(self, patterns: List[str]):
+        """
+        Dynamically add new general question patterns.
+
+        A single string is treated as one pattern.
+        """
+        if isinstance(patterns, str):
+            patterns = [patterns]
+        self.general_patterns.extend(patterns)