# Source: Hugging Face Space "MEssamOrg / ContactSearchAssistant" (Space status when captured: Sleeping)
# File: ContactSearchAssistant / contact_search_service.py (17.1 kB)
# Last commit: 8ef276c, "Initial commit: Voice Assistant demo", by Muhammed Essam, 20 days ago

	# contact_search_service.py
	"""
	Contact search service with intelligent matching:
	- Name-based search (exact and fuzzy matching)
	- Division-based search
	- Combined search (name + division)
	- Confidence scoring
	"""

	import logging
	from typing import List, Dict, Optional, Tuple
	from difflib import SequenceMatcher
	import re

	from contacts_data import (
	get_all_contacts,
	get_contacts_by_division,
	get_contact_by_name
	)
	from name_extraction_service import NameExtractor
	from embedding_service import EmbeddingService

	# Set up logging.
	# NOTE(review): basicConfig() runs at import time and configures the root
	# logger at INFO level (it is a no-op if the root logger already has
	# handlers), so any application importing this module inherits that setup.
	logging.basicConfig(level=logging.INFO)
	# Module-level logger named after this module; used by ContactSearchService.
	logger = logging.getLogger(__name__)


	class ContactSearchService:
	    """
	    Service for searching contacts with intelligent matching.

	    Features:
	    - Exact name matching (100% confidence)
	    - Fuzzy name matching (partial names, typos)
	    - Division-based matching
	    - Combined search (name + division)
	    - Multi-language support (English and Arabic)
	    """

	    # Confidence of the top division match at or above which the query is
	    # treated as explicitly naming a division. Chosen so that filler words
	    # such as "Find" (~0.56) are ignored while abbreviations such as
	    # "App Dev" (~0.59) still count.
	    STRONG_DIVISION_THRESHOLD = 0.58
	    # Multiplier applied to a name match that sits outside the requested
	    # division (i.e. a 70% penalty).
	    WRONG_DIVISION_FACTOR = 0.3
	    # Bonus added when a contact matches on both name and division, so that
	    # "name + division" always outranks "division only".
	    NAME_AND_DIVISION_BOOST = 0.15
	    # Similarity an English candidate must reach when its first letter
	    # differs from the first letter of the searched name.
	    INITIAL_MISMATCH_SIMILARITY = 0.85
	    # Contact fields compared against the searched name, in lookup order.
	    _NAME_FIELDS = (
	        "full_name_en",
	        "full_name_ar",
	        "first_name_en",
	        "first_name_ar",
	        "last_name_en",
	        "last_name_ar",
	    )
	    _WHITESPACE_RE = re.compile(r'\s+')

	    def __init__(
	        self,
	        name_extractor: NameExtractor,
	        embedding_service: EmbeddingService
	    ):
	        """
	        Initialize the contact search service.

	        Args:
	            name_extractor: NameExtractor service for extracting names from queries
	            embedding_service: EmbeddingService for division matching
	        """
	        self.name_extractor = name_extractor
	        self.embedding_service = embedding_service
	        # Snapshot of the contact list, used by fuzzy search and statistics.
	        self.all_contacts = get_all_contacts()

	        logger.info(
	            "ContactSearchService initialized with %s contacts",
	            len(self.all_contacts),
	        )

	    def search_contacts(
	        self,
	        query: str,
	        top_k: int = 10,
	        min_confidence: float = 0.3
	    ) -> List[Dict]:
	        """
	        Search for contacts based on query.

	        Process:
	        1. Extract names from query
	        2. Find matching divisions
	        3. Match contacts by:
	           - Exact name match (if name found) -> confidence = 1.0
	           - Fuzzy name match -> confidence based on similarity
	           - Division match -> confidence from embedding service
	           - Combined match (name + division) -> boosted confidence
	        4. Filter by ``min_confidence``, sort by confidence (descending) and
	           keep the first ``top_k`` results. Ties keep a deterministic order:
	           name matches first, then division-only matches.

	        Args:
	            query: Search query (English or Arabic)
	            top_k: Maximum number of results to return (values below 0 are
	                treated as 0)
	            min_confidence: Minimum confidence threshold (0.0-1.0)

	        Returns:
	            List of matched contacts (copies of the contact dicts) with
	            ``confidence`` and ``match_reason`` keys added. A blank query
	            yields an empty list.
	        """
	        # A blank query carries no name or division signal; skip the services.
	        if not query or not query.strip():
	            return []

	        logger.info("Searching contacts for query: '%s'", query)

	        # Step 1: Extract names from query
	        extracted_names = self.name_extractor.extract_names(query)
	        logger.info("Extracted names: %s", extracted_names)

	        # Step 2: Find matching divisions
	        division_matches = self.embedding_service.find_division(query, top_k=3) or []
	        logger.info("Found %s division matches", len(division_matches))

	        requested_divisions = [dm.division for dm in division_matches]
	        # NOTE(review): reads the first entry as the best division match,
	        # i.e. assumes find_division() returns results best-first - confirm.
	        penalize_wrong_division = (
	            bool(division_matches)
	            and division_matches[0].confidence >= self.STRONG_DIVISION_THRESHOLD
	        )

	        # Step 3: Match contacts by name, by division, then combine
	        name_matches = self._collect_name_matches(extracted_names)
	        division_hits = self._collect_division_matches(division_matches)
	        matched_contacts = self._merge_matches(
	            name_matches,
	            division_hits,
	            requested_divisions,
	            penalize_wrong_division,
	        )

	        # Step 4: Filter, sort and truncate
	        matched_contacts = self._rank_matches(matched_contacts, top_k, min_confidence)

	        logger.info("✓ Returning %s matched contacts", len(matched_contacts))

	        return matched_contacts

	    def _collect_name_matches(self, extracted_names) -> Dict:
	        """Map contact id -> best name match (exact or fuzzy) for the given names."""
	        name_matches = {}
	        for name in extracted_names or []:
	            # Try exact match first
	            exact_match = get_contact_by_name(name)
	            if exact_match:
	                name_matches[exact_match["id"]] = {
	                    "contact": exact_match,
	                    "confidence": 1.0,
	                    "similarity": 1.0,
	                    "match_type": "exact",
	                }
	                logger.info("✓ Exact name match: %s", exact_match["full_name_en"])
	                continue

	            # Fuzzy name matching
	            for contact, similarity in self._fuzzy_name_search(name, top_k=10):
	                contact_id = contact["id"]
	                previous = name_matches.get(contact_id)
	                # Only keep the best match for each contact
	                if previous is not None and similarity <= previous["similarity"]:
	                    continue
	                name_matches[contact_id] = {
	                    "contact": contact,
	                    # Maps the similarity range 0..1 onto confidence 0.50..0.95,
	                    # keeping fuzzy hits below an exact match (1.0).
	                    "confidence": round(0.5 + (similarity * 0.45), 2),
	                    "similarity": round(similarity, 2),
	                    "match_type": "fuzzy",
	                }
	                logger.info(
	                    "Fuzzy name match: %s (similarity: %.2f)",
	                    contact["full_name_en"],
	                    similarity,
	                )
	        return name_matches

	    def _collect_division_matches(self, division_matches) -> Dict:
	        """Map contact id -> highest-confidence division match covering that contact."""
	        division_hits = {}
	        for div_match in division_matches:
	            division = div_match.division
	            division_confidence = div_match.confidence

	            for contact in get_contacts_by_division(division):
	                contact_id = contact["id"]
	                current = division_hits.get(contact_id)
	                # Only keep the best division match for each contact
	                if current is None or division_confidence > current["confidence"]:
	                    division_hits[contact_id] = {
	                        "contact": contact,
	                        "confidence": division_confidence,
	                        "division": division,
	                    }
	        return division_hits

	    def _merge_matches(
	        self,
	        name_matches: Dict,
	        division_hits: Dict,
	        requested_divisions: List,
	        penalize_wrong_division: bool
	    ) -> List[Dict]:
	        """
	        Combine name and division matches into one result per contact.

	        Priority when BOTH name and division are specified:
	        1. Name + correct division = highest (max of both confidences + boost)
	        2. Correct division only = division confidence
	        3. Name + wrong division = name confidence scaled down, but only when
	           ``penalize_wrong_division`` is set; otherwise name confidence as-is

	        Args:
	            name_matches: contact id -> name match data
	            division_hits: contact id -> division match data
	            requested_divisions: divisions found in the query, best first
	            penalize_wrong_division: whether the query named a division with
	                enough confidence to penalize name matches outside it

	        Returns:
	            Contact copies annotated with confidence and match reason, ordered
	            name matches first, then division-only matches.
	        """
	        # Insertion-ordered union of ids: a set here would make the order of
	        # equal-confidence results depend on hashing.
	        ordered_ids = list(dict.fromkeys([*name_matches, *division_hits]))

	        results = []
	        for contact_id in ordered_ids:
	            name_data = name_matches.get(contact_id)
	            div_data = division_hits.get(contact_id)

	            if name_data is not None and div_data is not None:
	                # BOTH name and division match - best case
	                contact = name_data["contact"]
	                combined_confidence = min(
	                    1.0,
	                    max(name_data["confidence"], div_data["confidence"])
	                    + self.NAME_AND_DIVISION_BOOST,
	                )

	                contact_result = contact.copy()
	                contact_result["confidence"] = round(combined_confidence, 2)
	                contact_result["match_reason"] = "name_and_division_match"
	                contact_result["name_confidence"] = name_data["confidence"]
	                contact_result["division_confidence"] = div_data["confidence"]

	                logger.info(
	                    "✓ BOTH match: %s in %s (final confidence: %s)",
	                    contact["full_name_en"],
	                    div_data["division"],
	                    contact_result["confidence"],
	                )

	            elif div_data is not None:
	                # Division match only (no name specified, or name didn't match this person)
	                contact_result = div_data["contact"].copy()
	                contact_result["confidence"] = div_data["confidence"]
	                contact_result["match_reason"] = "division_match"
	                contact_result["division_confidence"] = div_data["confidence"]

	            elif penalize_wrong_division:
	                # Name match, but the query confidently asked for another division
	                contact = name_data["contact"]
	                penalized_confidence = name_data["confidence"] * self.WRONG_DIVISION_FACTOR

	                contact_result = contact.copy()
	                contact_result["confidence"] = round(penalized_confidence, 2)
	                contact_result["match_reason"] = "name_match_wrong_division"
	                contact_result["name_confidence"] = name_data["confidence"]
	                contact_result["requested_division"] = ", ".join(requested_divisions[:2])

	                logger.info(
	                    "Name match with WRONG division: %s in %s (wanted: %s, confidence: %s)",
	                    contact["full_name_en"],
	                    contact["division"],
	                    requested_divisions[0],
	                    contact_result["confidence"],
	                )

	            else:
	                # No division specified OR weak division match - use name confidence as-is
	                contact_result = name_data["contact"].copy()
	                contact_result["confidence"] = name_data["confidence"]
	                contact_result["match_reason"] = f"{name_data['match_type']}_name_match"
	                contact_result["name_confidence"] = name_data["confidence"]

	            results.append(contact_result)

	        return results

	    def _rank_matches(
	        self,
	        matches: List[Dict],
	        top_k: int,
	        min_confidence: float
	    ) -> List[Dict]:
	        """Drop results below ``min_confidence``, sort best-first, keep ``top_k``."""
	        kept = [m for m in matches if m["confidence"] >= min_confidence]
	        # list.sort is stable, so ties keep the order produced by _merge_matches.
	        kept.sort(key=lambda m: m["confidence"], reverse=True)
	        # Clamp: a negative top_k would otherwise slice results off the tail.
	        return kept[:max(top_k, 0)]

	    def _fuzzy_name_search(
	        self,
	        name: str,
	        top_k: int = 5,
	        min_similarity: float = 0.75  # Increased from 0.6 to avoid false matches
	    ) -> List[Tuple[Dict, float]]:
	        """
	        Fuzzy name matching using string similarity with stricter rules.

	        Each contact is scored by its best-matching name field (full, first
	        and last name, English and Arabic). Missing or empty fields are
	        skipped.

	        Args:
	            name: Name to search for
	            top_k: Number of top matches to return
	            min_similarity: Minimum similarity threshold (0.0-1.0)

	        Returns:
	            List of (contact, similarity_score) tuples, best first
	        """
	        name_normalized = self._normalize_name(name)
	        # First letter for initial matching (helps avoid false positives)
	        name_first_letter = name_normalized[:1]

	        matches = []
	        for contact in self.all_contacts:
	            best_similarity = 0.0

	            for field in self._NAME_FIELDS:
	                candidate_name = self._normalize_name(contact.get(field))
	                if not candidate_name:
	                    continue

	                similarity = self._string_similarity(name_normalized, candidate_name)

	                # Stricter rule for English names: require the same first
	                # letter OR a very high similarity.
	                if (
	                    field.endswith('_en')
	                    and candidate_name[0] != name_first_letter
	                    and similarity < self.INITIAL_MISMATCH_SIMILARITY
	                ):
	                    continue

	                if similarity > best_similarity:
	                    best_similarity = similarity

	            if best_similarity >= min_similarity:
	                matches.append((contact, best_similarity))

	        # Sort by similarity (descending)
	        matches.sort(key=lambda x: x[1], reverse=True)

	        return matches[:top_k]

	    def _normalize_name(self, name: Optional[str]) -> str:
	        """Normalize name for comparison (lowercase, collapse whitespace); empty/None -> ''."""
	        if not name:
	            return ""
	        return self._WHITESPACE_RE.sub(' ', name.strip().lower())

	    def _string_similarity(self, s1: str, s2: str) -> float:
	        """
	        Calculate string similarity using SequenceMatcher.

	        Returns:
	            Similarity score between 0.0 and 1.0
	        """
	        return SequenceMatcher(None, s1, s2).ratio()

	    def get_contact_stats(self) -> Dict:
	        """
	        Get statistics about the contact database.

	        Returns:
	            Dict with the total contact count, the number of distinct
	            departments and divisions, and per-department / per-division
	            contact counts.
	        """
	        from collections import Counter

	        dept_counts = Counter(contact["department"] for contact in self.all_contacts)
	        div_counts = Counter(contact["division"] for contact in self.all_contacts)

	        return {
	            "total_contacts": len(self.all_contacts),
	            "departments": len(dept_counts),
	            "divisions": len(div_counts),
	            "contacts_by_department": dict(dept_counts),
	            "contacts_by_division": dict(div_counts),
	        }


	if __name__ == "__main__":
	    # Manual smoke test: build the real services, print database statistics
	    # and run a few sample queries in English and Arabic.
	    # NameExtractor and EmbeddingService come from the module-level imports.
	    print("Initializing services...")
	    name_extractor = NameExtractor()
	    embedding_service = EmbeddingService()
	    search_service = ContactSearchService(name_extractor, embedding_service)

	    print("\nContact Database Stats:")
	    stats = search_service.get_contact_stats()
	    print(f"Total contacts: {stats['total_contacts']}")
	    print(f"Departments: {stats['departments']}")
	    print(f"Divisions: {stats['divisions']}")

	    # Test queries
	    test_queries = [
	        "Find Ahmed in IT",
	        "I need to talk to someone in HR",
	        "محمد في المالية",  # "Mohammed in Finance" in Arabic
	        "Finance accounting help",
	    ]

	    print("\n" + "="*80)
	    print("Testing Contact Search")
	    print("="*80)

	    for query in test_queries:
	        print(f"\nQuery: '{query}'")
	        print("-" * 80)

	        results = search_service.search_contacts(query, top_k=3)

	        if not results:
	            print("No matches found.")
	            continue

	        for i, contact in enumerate(results, 1):
	            print(f"{i}. {contact['full_name_en']} ({contact['full_name_ar']})")
	            print(f" {contact['title_en']} - {contact['division']}")
	            # Plain "|" separators: the escaped "\|" form printed a stray backslash.
	            print(f" {contact['email']} | Ext: {contact['extension']}")
	            print(f" Confidence: {contact['confidence']:.2f} | Reason: {contact['match_reason']}")