Spaces:
Sleeping
Sleeping
| # contact_search_service.py | |
| """ | |
| Contact search service with intelligent matching: | |
| - Name-based search (exact and fuzzy matching) | |
| - Division-based search | |
| - Combined search (name + division) | |
| - Confidence scoring | |
| """ | |
| import logging | |
| from typing import List, Dict, Optional, Tuple | |
| from difflib import SequenceMatcher | |
| import re | |
| from contacts_data import ( | |
| get_all_contacts, | |
| get_contacts_by_division, | |
| get_contact_by_name | |
| ) | |
| from name_extraction_service import NameExtractor | |
| from embedding_service import EmbeddingService | |
| # Set up logging | |
| # NOTE(review): basicConfig() runs at import time and configures the root logger at INFO level; | |
| # per the logging docs it does nothing if the root logger already has handlers. Confirm that | |
| # configuring logging from inside this module (rather than the application entry point) is intended. | |
| logging.basicConfig(level=logging.INFO) | |
| # Module-level logger named after this module; used by everything below. | |
| logger = logging.getLogger(__name__) | |
class ContactSearchService:
    """
    Service for searching contacts with intelligent matching.

    Features:
    - Exact name matching (confidence 1.0)
    - Fuzzy name matching (partial names, typos)
    - Division-based matching
    - Combined search (name + division)
    - Multi-language support (English and Arabic)

    Contacts are plain dicts. This class reads the keys ``id``,
    ``full_name_en``, ``division``, ``department`` and the six name fields
    listed in ``_NAME_FIELDS``.
    """

    # --- Scoring parameters (previously inlined magic numbers) -------------
    # Fuzzy name confidence = FUZZY_BASE_CONFIDENCE + similarity * FUZZY_SIMILARITY_WEIGHT,
    # i.e. a fuzzy hit scores between 0.5 and 0.95 and never reaches an exact hit.
    FUZZY_BASE_CONFIDENCE = 0.5
    FUZZY_SIMILARITY_WEIGHT = 0.45
    # Added on top of max(name, division) confidence when both match.
    COMBINED_MATCH_BOOST = 0.15
    # Multiplier applied to a name hit whose contact is outside the requested division.
    WRONG_DIVISION_FACTOR = 0.3
    # Below this the top division match is treated as noise and the query is
    # handled as name-only. The original comment cites "Find" scoring ~0.56
    # and "App Dev" scoring ~0.59 as the reason for 0.58.
    STRONG_DIVISION_THRESHOLD = 0.58
    # English candidates with a different first letter need at least this similarity.
    CROSS_INITIAL_MIN_SIMILARITY = 0.85
    # How many divisions to request from the embedding service per query.
    DIVISION_TOP_K = 3
    # How many fuzzy candidates to consider per extracted name.
    FUZZY_CANDIDATES_PER_NAME = 10

    # Contact fields compared during fuzzy matching (English and Arabic).
    _NAME_FIELDS = (
        "full_name_en",
        "full_name_ar",
        "first_name_en",
        "first_name_ar",
        "last_name_en",
        "last_name_ar",
    )
    # Compiled once instead of on every _normalize_name() call.
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(
        self,
        name_extractor: "NameExtractor",
        embedding_service: "EmbeddingService",
    ):
        """
        Initialize the contact search service.

        Args:
            name_extractor: NameExtractor service for extracting names from queries
            embedding_service: EmbeddingService for division matching
        """
        # Annotations are quoted so defining the class does not itself require
        # the collaborator classes to be importable.
        self.name_extractor = name_extractor
        self.embedding_service = embedding_service
        # Snapshot of the contact list, taken once at construction time.
        self.all_contacts = get_all_contacts()
        logger.info(
            "ContactSearchService initialized with %d contacts",
            len(self.all_contacts),
        )

    def search_contacts(
        self,
        query: str,
        top_k: int = 10,
        min_confidence: float = 0.3
    ) -> List[Dict]:
        """
        Search for contacts based on query.

        Process:
        1. Extract names from query
        2. Find matching divisions
        3. Match contacts by:
           - Exact name match (if name found) -> confidence = 1.0
           - Fuzzy name match -> confidence based on similarity
           - Division match -> confidence from embedding service
           - Combined match (name + division) -> boosted confidence
        4. Drop results below ``min_confidence``, sort by confidence
           (descending) and keep the first ``top_k``

        Args:
            query: Search query (English or Arabic)
            top_k: Maximum number of results to return
            min_confidence: Minimum confidence threshold (0.0-1.0)

        Returns:
            List of contact dict copies, each extended with ``confidence``,
            ``match_reason`` and the per-signal confidences that applied.
            Empty for a blank query or a non-positive ``top_k``.
        """
        logger.info("Searching contacts for query: '%s'", query)

        # Nothing meaningful can be matched for a blank query, and a
        # non-positive top_k can only yield an empty (or mis-sliced) result.
        if not query or not query.strip() or top_k <= 0:
            return []

        # Step 1: Extract names from query
        extracted_names = self.name_extractor.extract_names(query) or []
        logger.info("Extracted names: %s", extracted_names)

        # Step 2: Find matching divisions
        division_matches = self.embedding_service.find_division(
            query, top_k=self.DIVISION_TOP_K
        ) or []
        logger.info("Found %d division matches", len(division_matches))

        # Step 3: Collect per-signal hits, then merge them per contact
        name_matches = self._collect_name_matches(extracted_names)
        division_hits = self._collect_division_matches(division_matches)
        matched_contacts = self._merge_matches(
            name_matches, division_hits, division_matches
        )

        # Step 4: Filter by minimum confidence
        matched_contacts = [
            c for c in matched_contacts if c["confidence"] >= min_confidence
        ]

        # Step 5: Sort by confidence (descending). list.sort is stable, so
        # ties keep the deterministic merge order (name hits first).
        matched_contacts.sort(key=lambda c: c["confidence"], reverse=True)

        # Step 6: Limit to top_k
        matched_contacts = matched_contacts[:top_k]

        # NOTE(review): the leading "โ" in this and two other log messages
        # looks like a mis-decoded status glyph - confirm the intended character.
        logger.info("โ Returning %d matched contacts", len(matched_contacts))
        return matched_contacts

    def _collect_name_matches(self, extracted_names: List[str]) -> Dict:
        """Return ``{contact_id: best name hit}`` for the extracted names."""
        name_matches: Dict = {}
        for name in extracted_names:
            # Try exact match first
            exact_match = get_contact_by_name(name)
            if exact_match:
                name_matches[exact_match["id"]] = {
                    "contact": exact_match,
                    "confidence": 1.0,
                    "similarity": 1.0,
                    "match_type": "exact",
                }
                logger.info("โ Exact name match: %s", exact_match["full_name_en"])
                continue

            # No exact hit: fall back to fuzzy matching for this name
            fuzzy_matches = self._fuzzy_name_search(
                name, top_k=self.FUZZY_CANDIDATES_PER_NAME
            )
            for contact, similarity in fuzzy_matches:
                contact_id = contact["id"]
                previous = name_matches.get(contact_id)
                # Only keep the best match for each contact. An exact hit
                # stores similarity 1.0, so a fuzzy hit never replaces it.
                if previous is not None and similarity <= previous["similarity"]:
                    continue
                name_matches[contact_id] = {
                    "contact": contact,
                    "confidence": round(
                        self.FUZZY_BASE_CONFIDENCE
                        + (similarity * self.FUZZY_SIMILARITY_WEIGHT),
                        2,
                    ),
                    "similarity": round(similarity, 2),
                    "match_type": "fuzzy",
                }
                logger.info(
                    "Fuzzy name match: %s (similarity: %.2f)",
                    contact["full_name_en"],
                    similarity,
                )
        return name_matches

    def _collect_division_matches(self, division_matches) -> Dict:
        """Return ``{contact_id: best division hit}`` for the matched divisions."""
        division_hits: Dict = {}
        for div_match in division_matches:
            division = div_match.division
            division_confidence = div_match.confidence
            # Get contacts in this division
            for contact in get_contacts_by_division(division):
                contact_id = contact["id"]
                previous = division_hits.get(contact_id)
                # Only keep the best division match for each contact
                if previous is None or division_confidence > previous["confidence"]:
                    division_hits[contact_id] = {
                        "contact": contact,
                        "confidence": division_confidence,
                        "division": division,
                    }
        return division_hits

    def _merge_matches(
        self,
        name_matches: Dict,
        division_hits: Dict,
        division_matches,
    ) -> List[Dict]:
        """
        Combine name and division hits into one scored result per contact.

        Priority when BOTH name and division are specified:
        1. Name + correct division = HIGHEST (both match)
        2. Correct division only   = HIGH
        3. Name + wrong division   = LOW (penalized)
        """
        requested_divisions = [dm.division for dm in division_matches]
        # Loop-invariant: was the division part of the query convincing enough
        # to penalize name hits that sit in a different division? Uses the
        # first division match, as the original code did.
        has_strong_division_match = (
            bool(division_matches)
            and division_matches[0].confidence >= self.STRONG_DIVISION_THRESHOLD
        )

        results: List[Dict] = []
        # dict.fromkeys de-duplicates while keeping insertion order, so the
        # order of equally-scored results is reproducible (a set is not).
        # Each id is visited exactly once, so no later de-duplication is needed.
        for contact_id in dict.fromkeys([*name_matches, *division_hits]):
            name_data = name_matches.get(contact_id)
            div_data = division_hits.get(contact_id)

            if name_data is not None and div_data is not None:
                # BOTH name and division match - BEST CASE.
                # Take the MAX of the two confidences plus a boost, capped at 1.0,
                # so name + division ranks at least as high as division alone.
                contact = name_data["contact"]
                combined_confidence = min(
                    1.0,
                    max(name_data["confidence"], div_data["confidence"])
                    + self.COMBINED_MATCH_BOOST,
                )
                contact_result = contact.copy()
                contact_result["confidence"] = round(combined_confidence, 2)
                contact_result["match_reason"] = "name_and_division_match"
                contact_result["name_confidence"] = name_data["confidence"]
                contact_result["division_confidence"] = div_data["confidence"]
                logger.info(
                    "โ BOTH match: %s in %s (final confidence: %s)",
                    contact["full_name_en"],
                    div_data["division"],
                    contact_result["confidence"],
                )
            elif div_data is not None:
                # Division match only (no name specified, or name didn't match this person)
                contact_result = div_data["contact"].copy()
                contact_result["confidence"] = div_data["confidence"]
                contact_result["match_reason"] = "division_match"
                contact_result["division_confidence"] = div_data["confidence"]
            elif has_strong_division_match:
                # Name match but WRONG division, and the division was requested
                # with confidence: apply a heavy penalty.
                contact = name_data["contact"]
                penalized_confidence = (
                    name_data["confidence"] * self.WRONG_DIVISION_FACTOR
                )
                contact_result = contact.copy()
                contact_result["confidence"] = round(penalized_confidence, 2)
                contact_result["match_reason"] = "name_match_wrong_division"
                contact_result["name_confidence"] = name_data["confidence"]
                contact_result["requested_division"] = ", ".join(requested_divisions[:2])
                logger.info(
                    "Name match with WRONG division: %s in %s (wanted: %s, confidence: %s)",
                    contact["full_name_en"],
                    contact["division"],
                    requested_divisions[0],
                    contact_result["confidence"],
                )
            else:
                # No division specified OR weak division match - use name confidence as-is
                contact_result = name_data["contact"].copy()
                contact_result["confidence"] = name_data["confidence"]
                contact_result["match_reason"] = f"{name_data['match_type']}_name_match"
                contact_result["name_confidence"] = name_data["confidence"]

            results.append(contact_result)
        return results

    def _fuzzy_name_search(
        self,
        name: str,
        top_k: int = 5,
        min_similarity: float = 0.75  # Increased from 0.6 to avoid false matches
    ) -> List[Tuple[Dict, float]]:
        """
        Fuzzy name matching using string similarity with stricter rules.

        Each contact is scored by its best-matching name field (full, first
        or last name, English or Arabic). An English candidate that starts
        with a different letter than the query is ignored unless its
        similarity reaches ``CROSS_INITIAL_MIN_SIMILARITY``.

        Args:
            name: Name to search for
            top_k: Number of top matches to return
            min_similarity: Minimum similarity threshold (0.0-1.0)

        Returns:
            List of (contact, similarity_score) tuples, best first. Empty
            when ``name`` is blank or ``top_k`` is not positive.
        """
        name_normalized = self._normalize_name(name)
        if not name_normalized or top_k <= 0:
            return []
        name_first_letter = name_normalized[0]

        matches = []
        for contact in self.all_contacts:
            best_similarity = 0.0
            for field in self._NAME_FIELDS:
                # .get(): a contact lacking one of the name fields is skipped
                # for that field instead of aborting the whole search.
                candidate_name = self._normalize_name(contact.get(field))
                if not candidate_name:
                    continue
                similarity = self._string_similarity(name_normalized, candidate_name)
                # English names: require the same first letter OR very high similarity
                if (
                    field.endswith("_en")
                    and candidate_name[0] != name_first_letter
                    and similarity < self.CROSS_INITIAL_MIN_SIMILARITY
                ):
                    continue
                if similarity > best_similarity:
                    best_similarity = similarity
            if best_similarity >= min_similarity:
                matches.append((contact, best_similarity))

        # Sort by similarity (descending)
        matches.sort(key=lambda match: match[1], reverse=True)
        return matches[:top_k]

    def _normalize_name(self, name: Optional[str]) -> str:
        """Normalize name for comparison (lowercase, collapse whitespace); '' for None/empty."""
        if not name:
            return ""
        return self._WHITESPACE_RE.sub(" ", name.strip().lower())

    def _string_similarity(self, s1: str, s2: str) -> float:
        """
        Calculate string similarity using SequenceMatcher.

        Returns:
            Similarity score between 0.0 and 1.0
        """
        return SequenceMatcher(None, s1, s2).ratio()

    def get_contact_stats(self) -> Dict:
        """Get statistics about the contact database (totals and per-department/division counts)."""
        from collections import Counter

        dept_counts = Counter(contact["department"] for contact in self.all_contacts)
        div_counts = Counter(contact["division"] for contact in self.all_contacts)
        return {
            "total_contacts": len(self.all_contacts),
            "departments": len(dept_counts),
            "divisions": len(div_counts),
            "contacts_by_department": dict(dept_counts),
            "contacts_by_division": dict(div_counts),
        }
def _run_demo() -> None:
    """Build the services, print contact-database stats and run a few sample queries."""
    # Imported here so the demo stays self-contained when run as a script.
    from name_extraction_service import NameExtractor
    from embedding_service import EmbeddingService

    print("Initializing services...")
    name_extractor = NameExtractor()
    embedding_service = EmbeddingService()
    search_service = ContactSearchService(name_extractor, embedding_service)

    print("\nContact Database Stats:")
    stats = search_service.get_contact_stats()
    print(f"Total contacts: {stats['total_contacts']}")
    print(f"Departments: {stats['departments']}")
    print(f"Divisions: {stats['divisions']}")

    # Test queries
    test_queries = [
        "Find Ahmed in IT",
        "I need to talk to someone in HR",
        # Restored from a mis-decoded (mojibake) literal so the Arabic path is
        # actually exercised.
        "محمد في المالية",  # "Mohammed in Finance" in Arabic
        "Finance accounting help",
    ]

    print("\n" + "="*80)
    print("Testing Contact Search")
    print("="*80)
    for query in test_queries:
        print(f"\nQuery: '{query}'")
        print("-" * 80)
        results = search_service.search_contacts(query, top_k=3)
        if not results:
            print("No matches found.")
            continue
        for i, contact in enumerate(results, 1):
            print(f"{i}. {contact['full_name_en']} ({contact['full_name_ar']})")
            print(f" {contact['title_en']} - {contact['division']}")
            print(f" {contact['email']} | Ext: {contact['extension']}")
            print(f" Confidence: {contact['confidence']:.2f} | Reason: {contact['match_reason']}")


if __name__ == "__main__":
    # Test the contact search service
    _run_demo()