# ContactSearchAssistant / name_extraction_service.py
# Muhammed Essam
# Initial commit: Voice Assistant demo
# 8ef276c
# name_extraction_service.py
import logging
from typing import List, Dict, Any, Optional
from gliner import GLiNER
# Set up logging at import time: basicConfig() configures the root logger at
# INFO level (it does nothing if the root logger already has handlers), then
# create the module-level logger used by everything below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class NameExtractor:
    """
    Service for extracting person names from text using GLiNER.

    GLiNER is a zero-shot NER model that can extract entities without
    being limited to predefined entity types. It's especially good for:
    - Multilingual name extraction (English + Arabic)
    - Flexible entity extraction
    - Lightweight and fast (~100-200ms)

    Size: ~150MB model
    Speed: ~100-200ms per query
    """

    # Division/department keywords recognised by extract_from_query(),
    # listed in the order in which matches are reported.
    DIVISION_KEYWORDS = (
        "IT", "HR", "Finance", "Legal", "Accounting",
        "Marketing", "Sales", "Operations", "Engineering",
        "Security", "Facilities", "Purchasing", "Audit",
    )

    def __init__(self, model_name: str = "urchade/gliner_small-v2.1"):
        """
        Initialize the name extraction service.

        Args:
            model_name: GLiNER model to use. Options:
                - "urchade/gliner_small-v2.1" (150MB, balanced)
                - "urchade/gliner_multi-v2.1" (multilingual, better for Arabic)
                - "urchade/gliner_large-v2.1" (larger, more accurate)
        """
        logger.info("Loading GLiNER model: %s", model_name)

        # Load the pre-trained model.
        # This downloads the model on first run (~150MB).
        self.model = GLiNER.from_pretrained(model_name)

        # Entity labels passed to the model on every prediction.
        self.labels = ["person", "name", "employee"]

        logger.info("✓ GLiNER model loaded successfully")
        logger.info("Entity labels: %s", self.labels)

    def extract_names(self, text: str, threshold: float = 0.3) -> List[str]:
        """
        Extract person names from text.

        Args:
            text: Input text (e.g., "find Ahmed in IT")
            threshold: Confidence threshold (0-1). Lower = more names but
                less precise. Default 0.3 is good for most cases.

        Returns:
            List of extracted names, de-duplicated, in order of first
            appearance. Empty or whitespace-only input yields an empty
            list without running the model.

        Example:
            >>> extractor.extract_names("find Ahmed Hassan in IT")
            ['Ahmed Hassan']
            >>> extractor.extract_names("connect me with Sarah from HR")
            ['Sarah']
        """
        # Nothing to extract from blank input; skip the model call entirely.
        if not text or not text.strip():
            return []

        logger.info("Extracting names from: %s", text)

        # Predict entities using GLiNER
        entities = self.model.predict_entities(
            text,
            self.labels,
            threshold=threshold,
        )

        # Keep only the entity text; dict.fromkeys removes duplicates while
        # preserving first-seen order.
        unique_names = list(dict.fromkeys(entity["text"] for entity in entities))

        logger.info("✓ Found %d name(s): %s", len(unique_names), unique_names)
        return unique_names

    def extract_names_with_context(
        self,
        text: str,
        threshold: float = 0.3
    ) -> List[Dict[str, Any]]:
        """
        Extract person names with additional context (position, confidence).

        Args:
            text: Input text
            threshold: Confidence threshold (0-1)

        Returns:
            List of dictionaries with name details (one per predicted
            entity, not de-duplicated):
            [
                {
                    "name": "Ahmed Hassan",
                    "start": 5,
                    "end": 17,
                    "confidence": 0.95,
                    "label": "person"
                }
            ]
            Empty or whitespace-only input yields an empty list without
            running the model.
        """
        # Nothing to extract from blank input; skip the model call entirely.
        if not text or not text.strip():
            return []

        logger.info("Extracting names with context from: %s", text)

        # Predict entities
        entities = self.model.predict_entities(
            text,
            self.labels,
            threshold=threshold,
        )

        # Re-key the model output into the documented result shape; the
        # score is rounded to two decimals for readability.
        results = [
            {
                "name": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                "confidence": round(entity["score"], 2),
                "label": entity["label"],
            }
            for entity in entities
        ]

        logger.info("✓ Found %d name(s) with context", len(results))
        return results

    def extract_from_query(
        self,
        query: str,
        extract_divisions: bool = False
    ) -> Dict[str, Any]:
        """
        Extract names and optionally division keywords from a query.

        Args:
            query: User query text
            extract_divisions: Whether to also extract division/department
                mentions

        Returns:
            Dictionary with extracted information:
            {
                "names": ["Ahmed", "Sarah"],
                "has_names": True,
                "count": 2,
                "divisions": ["IT", "HR"],   (only if extract_divisions=True)
                "has_divisions": True        (only if extract_divisions=True)
            }
        """
        # Extract names
        names = self.extract_names(query)
        result: Dict[str, Any] = {
            "names": names,
            "has_names": bool(names),
            "count": len(names),
        }

        # Optionally extract division keywords
        if extract_divisions:
            found_divisions = self._find_divisions(query)
            result["divisions"] = found_divisions
            result["has_divisions"] = bool(found_divisions)

        return result

    @classmethod
    def _find_divisions(cls, query: str) -> List[str]:
        """Return the DIVISION_KEYWORDS that occur as whole words in query.

        Matching is case-insensitive and word-based, so "finance" matches
        "Finance" while "with" does not match "IT". Results use the
        keyword's canonical spelling and follow DIVISION_KEYWORDS order.
        """
        # Turn every non-alphanumeric character into a separator so that
        # punctuation ("IT," / "HR's") does not hide a keyword.
        normalized = "".join(
            ch if ch.isalnum() else " " for ch in (query or "")
        )
        words = set(normalized.casefold().split())
        return [kw for kw in cls.DIVISION_KEYWORDS if kw.casefold() in words]