davidtran999 committed on
Commit
dd22707
·
verified ·
1 Parent(s): e1ecd91

Upload backend/chatbot/entity_extraction.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. backend/chatbot/entity_extraction.py +252 -0
backend/chatbot/entity_extraction.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Entity extraction utilities for extracting fine codes, procedure names, and resolving pronouns.
3
+ """
4
+ import re
5
+ from typing import List, Dict, Any, Optional, Tuple
6
+ from hue_portal.core.models import Fine, Procedure, Office
7
+
8
+
9
def extract_fine_code(text: str) -> Optional[str]:
    """
    Extract the first fine code (V001, V002, etc.) from text.

    Matching is case-insensitive; the code is returned upper-cased.

    Args:
        text: Input text. ``None`` or an empty string yields ``None``.

    Returns:
        Fine code string (e.g. "V001") or None if not found.
    """
    # Guard against None/empty input so callers may pass message content
    # straight through without pre-validating it.
    if not text:
        return None

    # Pattern: "V" followed by exactly 3 digits, delimited by word boundaries
    # (so "V0012" or "XV001" are not treated as fine codes).
    match = re.search(r'\bV\d{3}\b', text, re.IGNORECASE)
    return match.group(0).upper() if match else None
25
+
26
+
27
def extract_procedure_name(text: str) -> Optional[str]:
    """
    Extract procedure name from text by matching against database.

    A procedure matches when its title is contained in the text or the text
    is contained in its title (both compared case-insensitively). The first
    matching procedure returned by ``Procedure.objects.all()`` wins.

    Args:
        text: Input text. ``None``, empty or whitespace-only text yields
            ``None`` without querying the database.

    Returns:
        Procedure title or None if not found.
    """
    # An empty string is a substring of every title, so without this guard
    # blank input would "match" whichever procedure happens to come first.
    if not text or not text.strip():
        return None

    text_lower = text.lower()

    for procedure in Procedure.objects.all():
        title = procedure.title
        # Skip missing/blank titles: they would either crash on .lower()
        # or match any text trivially.
        if not title or not title.strip():
            continue

        title_lower = title.lower()
        if title_lower in text_lower or text_lower in title_lower:
            return title

    return None
48
+
49
+
50
def extract_office_name(text: str) -> Optional[str]:
    """
    Extract office/unit name from text by matching against database.

    An office matches when its ``unit_name`` is contained in the text or the
    text is contained in its ``unit_name`` (both compared case-insensitively).
    The first matching office returned by ``Office.objects.all()`` wins.

    Args:
        text: Input text. ``None``, empty or whitespace-only text yields
            ``None`` without querying the database.

    Returns:
        Office name or None if not found.
    """
    # An empty string is a substring of every name, so without this guard
    # blank input would "match" whichever office happens to come first.
    if not text or not text.strip():
        return None

    text_lower = text.lower()

    for office in Office.objects.all():
        unit_name = office.unit_name
        # Skip missing/blank names: they would either crash on .lower()
        # or match any text trivially.
        if not unit_name or not unit_name.strip():
            continue

        unit_name_lower = unit_name.lower()
        if unit_name_lower in text_lower or text_lower in unit_name_lower:
            return unit_name

    return None
71
+
72
+
73
def extract_reference_pronouns(text: str, context: Optional[List[Dict[str, Any]]] = None) -> List[str]:
    """
    Extract reference pronouns from text.

    Pronouns are matched case-insensitively as whole words/phrases, so a
    pronoun embedded in a longer word (e.g. "nó" inside "nói", "đó" inside
    "đóng") is not reported. A multi-word pronoun and the single-word
    pronoun it contains are both reported (e.g. "cái đó" and "đó").

    Args:
        text: Input text. ``None`` or an empty string yields an empty list.
        context: Optional context from recent messages. Currently unused;
            kept for interface compatibility.

    Returns:
        List of pronouns found, in the order they are declared below.
    """
    if not text:
        return []

    # Vietnamese reference pronouns
    pronouns = [
        "cái đó", "cái này", "cái kia",
        "như vậy", "như thế",
        "thủ tục đó", "thủ tục này",
        "mức phạt đó", "mức phạt này",
        "đơn vị đó", "đơn vị này",
        "nó", "đó", "này", "kia",
    ]

    text_lower = text.lower()

    # Lookarounds on \w act as Unicode-aware word boundaries, so Vietnamese
    # letters adjacent to the pronoun prevent a match.
    return [
        pronoun
        for pronoun in pronouns
        if re.search(r'(?<!\w)' + re.escape(pronoun) + r'(?!\w)', text_lower)
    ]
102
+
103
+
104
def resolve_pronouns(query: str, recent_messages: List[Dict[str, Any]]) -> str:
    """
    Resolve pronouns in query by replacing them with actual entities from context.

    Typed references are resolved with the entity of the matching type only:
    "thủ tục đó/này" -> procedure_name, "mức phạt đó/này" -> fine_name,
    "đơn vị đó/này" -> office_name. Generic references ("cái đó", "cái này",
    "nó", "đó") are resolved with the first available of fine_name,
    procedure_name, office_name. References with no usable entity are left
    untouched.

    Args:
        query: Current query with pronouns.
        recent_messages: List of recent messages with role, content, intent, entities.

    Returns:
        Enhanced query with pronouns resolved, or the original query when
        there is no context or no reference pronoun.
    """
    if not recent_messages:
        return query

    # Nothing to resolve when the query contains no reference pronoun
    if not extract_reference_pronouns(query):
        return query

    entities_found = _collect_context_entities(recent_messages)
    return _substitute_references(query, entities_found)


def _collect_context_entities(recent_messages: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Gather one value per entity key, scanning recent_messages in reverse list order."""
    entities_found: Dict[str, Any] = {}

    # Reverse order: entries at the end of the list take precedence
    for msg in reversed(recent_messages):
        # "content" may be present but None; treat that like missing content
        content = msg.get("content") or ""

        # Only run the extractors on non-blank content, and only for entity
        # types still missing (the procedure/office extractors hit the database).
        if content.strip():
            if "fine_code" not in entities_found:
                fine_code = extract_fine_code(content)
                if fine_code:
                    entities_found["fine_code"] = fine_code

            if "procedure_name" not in entities_found:
                procedure_name = extract_procedure_name(content)
                if procedure_name:
                    entities_found["procedure_name"] = procedure_name

            if "office_name" not in entities_found:
                office_name = extract_office_name(content)
                if office_name:
                    entities_found["office_name"] = office_name

        # Check entities field; skip empty values so they cannot shadow a
        # usable value from another message.
        for key, value in (msg.get("entities") or {}).items():
            if value and key not in entities_found:
                entities_found[key] = value

        # Check intent to infer entity type from well-known keywords
        intent = msg.get("intent", "")
        content_lower = content.lower()

        if intent == "search_fine" and "fine_name" not in entities_found:
            # Look for patterns like "Vượt đèn đỏ", "Không đội mũ bảo hiểm"
            fine_keywords = ["vượt đèn đỏ", "mũ bảo hiểm", "nồng độ cồn", "tốc độ"]
            for keyword in fine_keywords:
                if keyword in content_lower:
                    entities_found["fine_name"] = keyword
                    break

        if intent == "search_procedure" and "procedure_name" not in entities_found:
            procedure_keywords = ["đăng ký", "thủ tục", "cư trú", "antt", "pccc"]
            for keyword in procedure_keywords:
                if keyword in content_lower:
                    entities_found["procedure_name"] = keyword
                    break

    return entities_found


def _substitute_references(query: str, entities_found: Dict[str, Any]) -> str:
    """Replace typed and generic reference phrases in query with entities_found values."""
    typed_entities = {
        "thủ tục": entities_found.get("procedure_name"),
        "mức phạt": entities_found.get("fine_name"),
        "đơn vị": entities_found.get("office_name"),
    }
    # Generic pronouns fall back through the entity types in this priority
    generic_entity = (
        entities_found.get("fine_name")
        or entities_found.get("procedure_name")
        or entities_found.get("office_name")
    )

    # Typed phrases are listed first so that, in a single left-to-right pass,
    # "thủ tục đó" is consumed as a whole before the bare "đó" can match.
    pattern = r'\b(?:(thủ tục|mức phạt|đơn vị) (?:đó|này)|cái đó|cái này|nó|đó)\b'

    def _replacement(match):
        head = match.group(1)
        if head is not None:
            entity = typed_entities.get(head.lower())
        else:
            entity = generic_entity
        # Keep the original wording when no suitable entity is known
        return str(entity) if entity else match.group(0)

    # A callable replacement inserts the entity text literally, so backslashes
    # in entity names are not interpreted as regex group references.
    return re.sub(pattern, _replacement, query, flags=re.IGNORECASE)
217
+
218
+
219
def extract_all_entities(text: str) -> Dict[str, Any]:
    """
    Run every entity extractor on text and collect the non-empty results.

    Args:
        text: Input text.

    Returns:
        Dictionary that may contain "fine_code", "procedure_name",
        "office_name" and "pronouns"; a key is present only when its
        extractor returned a truthy value.
    """
    # (result key, extractor) pairs, evaluated in this order
    extractors = (
        ("fine_code", extract_fine_code),
        ("procedure_name", extract_procedure_name),
        ("office_name", extract_office_name),
        ("pronouns", extract_reference_pronouns),
    )

    collected: Dict[str, Any] = {}
    for label, extractor in extractors:
        value = extractor(text)
        if value:
            collected[label] = value

    return collected
252
+