ContactSearchAssistant / contact_search_service.py
Muhammed Essam
Initial commit: Voice Assistant demo
8ef276c
# contact_search_service.py
"""
Contact search service with intelligent matching:
- Name-based search (exact and fuzzy matching)
- Division-based search
- Combined search (name + division)
- Confidence scoring
"""
import logging
from typing import List, Dict, Optional, Tuple
from difflib import SequenceMatcher
import re
from contacts_data import (
get_all_contacts,
get_contacts_by_division,
get_contact_by_name
)
from name_extraction_service import NameExtractor
from embedding_service import EmbeddingService
# Set up logging.
# NOTE: basicConfig() runs at import time, so merely importing this module
# configures the root logger at INFO level (unless the application has
# already installed root handlers, in which case basicConfig() is a no-op).
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by ContactSearchService below.
logger = logging.getLogger(__name__)
class ContactSearchService:
    """
    Service for searching contacts with intelligent matching.

    Features:
    - Exact name matching (100% confidence)
    - Fuzzy name matching (partial names, typos)
    - Division-based matching
    - Combined search (name + division)
    - Multi-language support (English and Arabic)
    """

    # Added to max(name confidence, division confidence) when a contact
    # matches on both; the sum is capped at 1.0.
    COMBINED_MATCH_BOOST = 0.15
    # Multiplier applied to a name match that falls outside a confidently
    # requested division (i.e. a 70% penalty).
    WRONG_DIVISION_FACTOR = 0.3
    # Division confidence at or above which the query is treated as really
    # naming a division. Tuning note carried over from the original code:
    # a stray word like "Find" scores ~0.56, while "App Dev" scores ~0.59.
    STRONG_DIVISION_CONFIDENCE = 0.58
    # An English candidate whose first letter differs from the query is only
    # accepted when its similarity reaches this value.
    HIGH_SIMILARITY_OVERRIDE = 0.85
    # Contact fields compared against the query during fuzzy matching.
    _NAME_FIELDS = (
        "full_name_en",
        "full_name_ar",
        "first_name_en",
        "first_name_ar",
        "last_name_en",
        "last_name_ar",
    )

    def __init__(
        self,
        name_extractor: "NameExtractor",
        embedding_service: "EmbeddingService"
    ):
        """
        Initialize the contact search service.

        Loads the full contact list once via get_all_contacts(); fuzzy
        matching and statistics operate on that in-memory copy.

        Args:
            name_extractor: NameExtractor service for extracting names from queries
            embedding_service: EmbeddingService for division matching
        """
        self.name_extractor = name_extractor
        self.embedding_service = embedding_service
        self.all_contacts = get_all_contacts()
        logger.info(
            "ContactSearchService initialized with %d contacts",
            len(self.all_contacts),
        )

    def search_contacts(
        self,
        query: str,
        top_k: int = 10,
        min_confidence: float = 0.3
    ) -> List[Dict]:
        """
        Search for contacts based on query.

        Process:
        1. Extract names from query
        2. Find matching divisions
        3. Match contacts by:
           - Exact name match (if name found) → confidence = 1.0
           - Fuzzy name match → confidence based on similarity
           - Division match → confidence from embedding service
           - Combined match (name + division) → boosted confidence
        4. Drop results below min_confidence
        5. Sort by confidence (descending); ties keep a deterministic order
           (name matches in discovery order, then division-only matches)
        6. Truncate to top_k

        Args:
            query: Search query (English or Arabic)
            top_k: Maximum number of results to return (values <= 0 yield [])
            min_confidence: Minimum confidence threshold (0.0-1.0)

        Returns:
            List of contact dict copies, each extended with "confidence",
            "match_reason" and the per-signal confidence keys that applied.
        """
        # A blank query carries no signal; skip the extractor/embedding calls.
        if not query or not query.strip():
            logger.info("Empty query; returning no contacts")
            return []

        logger.info("Searching contacts for query: '%s'", query)

        # Step 1: Extract names from query
        extracted_names = self.name_extractor.extract_names(query) or []
        logger.info("Extracted names: %s", extracted_names)

        # Step 2: Find matching divisions
        division_matches = self.embedding_service.find_division(query, top_k=3) or []
        logger.info("Found %d division matches", len(division_matches))

        # Step 3: Match contacts by each signal, then merge per contact.
        name_matches = self._match_by_name(extracted_names)
        division_matches_by_id = self._match_by_division(division_matches)
        matched_contacts = self._combine_matches(
            name_matches, division_matches_by_id, division_matches
        )

        # Step 4: Filter by minimum confidence
        matched_contacts = [
            c for c in matched_contacts if c["confidence"] >= min_confidence
        ]

        # Step 5: Sort by confidence (descending). list.sort is stable, so
        # equal-confidence results keep the order _combine_matches produced.
        matched_contacts.sort(key=lambda c: c["confidence"], reverse=True)

        # Step 6: Limit to top_k (a negative top_k would otherwise slice
        # from the end of the list).
        matched_contacts = matched_contacts[:max(top_k, 0)]

        logger.info("✓ Returning %d matched contacts", len(matched_contacts))
        return matched_contacts

    def _match_by_name(self, extracted_names: List[str]) -> Dict:
        """Return the best name match per contact ID for the extracted names.

        Each name is first looked up exactly via get_contact_by_name(); only
        when that finds nothing is the fuzzy search used.
        """
        name_matches: Dict = {}
        for name in extracted_names:
            exact_match = get_contact_by_name(name)
            if exact_match:
                name_matches[exact_match["id"]] = {
                    "contact": exact_match,
                    "confidence": 1.0,
                    "similarity": 1.0,
                    "match_type": "exact"
                }
                logger.info("✓ Exact name match: %s", exact_match["full_name_en"])
                continue

            for contact, similarity in self._fuzzy_name_search(name, top_k=10):
                contact_id = contact["id"]
                previous = name_matches.get(contact_id)
                # Only keep the best match for each contact.
                if previous is not None and similarity <= previous["similarity"]:
                    continue
                name_matches[contact_id] = {
                    "contact": contact,
                    # Maps similarity in [0, 1] onto confidence in [0.5, 0.95],
                    # keeping fuzzy matches below an exact match's 1.0.
                    "confidence": round(0.5 + (similarity * 0.45), 2),
                    "similarity": round(similarity, 2),
                    "match_type": "fuzzy"
                }
                logger.info(
                    "Fuzzy name match: %s (similarity: %.2f)",
                    contact["full_name_en"],
                    similarity,
                )
        return name_matches

    def _match_by_division(self, division_matches) -> Dict:
        """Return the highest-confidence division match per contact ID."""
        matches_by_id: Dict = {}
        for div_match in division_matches:
            division = div_match.division
            division_confidence = div_match.confidence
            for contact in get_contacts_by_division(division):
                contact_id = contact["id"]
                previous = matches_by_id.get(contact_id)
                # Only keep the best division match for each contact.
                if previous is not None and division_confidence <= previous["confidence"]:
                    continue
                matches_by_id[contact_id] = {
                    "contact": contact,
                    "confidence": division_confidence,
                    "division": division
                }
        return matches_by_id

    def _combine_matches(
        self,
        name_matches: Dict,
        division_matches_by_id: Dict,
        division_matches
    ) -> List[Dict]:
        """Merge name and division matches into one scored result per contact.

        Priority when BOTH name and division are specified:
        1. Name + correct division = highest (both match, boosted)
        2. Correct division only  = division confidence
        3. Name + wrong division  = penalized name confidence

        Results are emitted in insertion order (name matches first, then
        division-only matches) so the later stable sort is deterministic.
        """
        requested_divisions = [dm.division for dm in division_matches]
        # Loop-invariant: did the query name a division with enough confidence
        # to penalize name matches found elsewhere? any() avoids relying on
        # the embedding service returning matches sorted by confidence.
        has_strong_division_match = any(
            dm.confidence >= self.STRONG_DIVISION_CONFIDENCE
            for dm in division_matches
        )

        # dict.fromkeys de-duplicates IDs while preserving insertion order,
        # unlike a set union whose iteration order is arbitrary.
        ordered_ids = dict.fromkeys([*name_matches, *division_matches_by_id])

        results: List[Dict] = []
        for contact_id in ordered_ids:
            name_data = name_matches.get(contact_id)
            div_data = division_matches_by_id.get(contact_id)

            if name_data is not None and div_data is not None:
                # BOTH name and division match - best case. Take the MAX of
                # the two confidences plus a boost so that name + division
                # always beats division alone.
                contact = name_data["contact"]
                combined_confidence = min(
                    1.0,
                    max(name_data["confidence"], div_data["confidence"])
                    + self.COMBINED_MATCH_BOOST,
                )
                contact_result = contact.copy()
                contact_result["confidence"] = round(combined_confidence, 2)
                contact_result["match_reason"] = "name_and_division_match"
                contact_result["name_confidence"] = name_data["confidence"]
                contact_result["division_confidence"] = div_data["confidence"]
                logger.info(
                    "✓ BOTH match: %s in %s (final confidence: %s)",
                    contact["full_name_en"],
                    div_data["division"],
                    contact_result["confidence"],
                )
            elif div_data is not None:
                # Division match only (no name given, or the name did not
                # match this person).
                contact_result = div_data["contact"].copy()
                contact_result["confidence"] = div_data["confidence"]
                contact_result["match_reason"] = "division_match"
                contact_result["division_confidence"] = div_data["confidence"]
            elif has_strong_division_match:
                # Name match, but the query confidently asked for a division
                # this contact is not in: apply a heavy penalty.
                contact = name_data["contact"]
                penalized_confidence = (
                    name_data["confidence"] * self.WRONG_DIVISION_FACTOR
                )
                contact_result = contact.copy()
                contact_result["confidence"] = round(penalized_confidence, 2)
                contact_result["match_reason"] = "name_match_wrong_division"
                contact_result["name_confidence"] = name_data["confidence"]
                contact_result["requested_division"] = ", ".join(requested_divisions[:2])
                logger.info(
                    "Name match with WRONG division: %s in %s "
                    "(wanted: %s, confidence: %s)",
                    contact["full_name_en"],
                    contact["division"],
                    requested_divisions[0],
                    contact_result["confidence"],
                )
            else:
                # No division specified OR only a weak division match:
                # use the name confidence as-is.
                contact_result = name_data["contact"].copy()
                contact_result["confidence"] = name_data["confidence"]
                contact_result["match_reason"] = f"{name_data['match_type']}_name_match"
                contact_result["name_confidence"] = name_data["confidence"]

            results.append(contact_result)
        return results

    def _fuzzy_name_search(
        self,
        name: str,
        top_k: int = 5,
        min_similarity: float = 0.75  # Increased from 0.6 to avoid false matches
    ) -> List[Tuple[Dict, float]]:
        """
        Fuzzy name matching using string similarity with stricter rules.

        The query is compared against each contact's full, first and last
        names in English and Arabic; the contact's score is the best
        similarity among the candidates that pass the first-letter rule.
        Missing or empty name fields are skipped.

        Args:
            name: Name to search for
            top_k: Number of top matches to return
            min_similarity: Minimum similarity threshold (0.0-1.0)

        Returns:
            List of (contact, similarity_score) tuples, best first
        """
        name_normalized = self._normalize_name(name)
        # First letter of the query ('' when the query is empty).
        name_first_letter = name_normalized[:1]

        matches = []
        for contact in self.all_contacts:
            best_similarity = 0.0
            for field in self._NAME_FIELDS:
                candidate_name = self._normalize_name(contact.get(field))
                if not candidate_name:
                    continue

                similarity = self._string_similarity(name_normalized, candidate_name)

                # Stricter rule for English names: require the same first
                # letter OR a very high similarity, to avoid false positives.
                if (
                    field.endswith("_en")
                    and candidate_name[0] != name_first_letter
                    and similarity < self.HIGH_SIMILARITY_OVERRIDE
                ):
                    continue

                best_similarity = max(best_similarity, similarity)

            if best_similarity >= min_similarity:
                matches.append((contact, best_similarity))

        # Sort by similarity (descending)
        matches.sort(key=lambda m: m[1], reverse=True)
        return matches[:top_k]

    def _normalize_name(self, name: Optional[str]) -> str:
        """Normalize name for comparison (lowercase, collapse whitespace).

        Returns "" for None or an empty string so callers can skip the field.
        """
        if not name:
            return ""
        return re.sub(r'\s+', ' ', name.strip().lower())

    def _string_similarity(self, s1: str, s2: str) -> float:
        """
        Calculate string similarity using SequenceMatcher.

        Returns:
            Similarity score between 0.0 and 1.0
        """
        return SequenceMatcher(None, s1, s2).ratio()

    def get_contact_stats(self) -> Dict:
        """Get statistics about the contact database.

        Returns:
            Dict with the total contact count, the number of distinct
            departments and divisions, and per-department / per-division
            contact counts.
        """
        from collections import Counter

        dept_counts = Counter(contact["department"] for contact in self.all_contacts)
        div_counts = Counter(contact["division"] for contact in self.all_contacts)
        return {
            "total_contacts": len(self.all_contacts),
            "departments": len(dept_counts),
            "divisions": len(div_counts),
            "contacts_by_department": dict(dept_counts),
            "contacts_by_division": dict(div_counts),
        }
if __name__ == "__main__":
    # Manual smoke test: build the real services, print database statistics,
    # then run a few sample queries and print the top results for each.
    from name_extraction_service import NameExtractor
    from embedding_service import EmbeddingService

    print("Initializing services...")
    name_extractor = NameExtractor()
    embedding_service = EmbeddingService()
    search_service = ContactSearchService(name_extractor, embedding_service)

    print("\nContact Database Stats:")
    stats = search_service.get_contact_stats()
    print(f"Total contacts: {stats['total_contacts']}")
    print(f"Departments: {stats['departments']}")
    print(f"Divisions: {stats['divisions']}")

    # Test queries
    test_queries = [
        "Find Ahmed in IT",
        "I need to talk to someone in HR",
        # The literal had been mis-encoded (UTF-8 bytes decoded with a legacy
        # code page); restored so the Arabic path is actually exercised.
        "محمد في المالية",  # "Mohammed in Finance" in Arabic
        "Finance accounting help",
    ]

    print("\n" + "="*80)
    print("Testing Contact Search")
    print("="*80)

    for query in test_queries:
        print(f"\nQuery: '{query}'")
        print("-" * 80)
        results = search_service.search_contacts(query, top_k=3)
        if results:
            for i, contact in enumerate(results, 1):
                print(f"{i}. {contact['full_name_en']} ({contact['full_name_ar']})")
                print(f" {contact['title_en']} - {contact['division']}")
                print(f" {contact['email']} | Ext: {contact['extension']}")
                print(f" Confidence: {contact['confidence']:.2f} | Reason: {contact['match_reason']}")
        else:
            print("No matches found.")