# NOTE(review): removed non-code paste residue ("Spaces: / Sleeping / Sleeping")
# name_extraction_service.py
"""Name extraction service built on the GLiNER zero-shot NER model."""

import logging
from typing import Any, Dict, List, Optional

from gliner import GLiNER

# Set up logging. basicConfig is a no-op if the host application has
# already configured handlers, so this is safe to import anywhere.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class NameExtractor:
    """
    Service for extracting person names from text using GLiNER.

    GLiNER is a zero-shot NER model that can extract entities without
    being limited to predefined entity types. It's especially good for:
    - Multilingual name extraction (English + Arabic)
    - Flexible entity extraction
    - Lightweight and fast (~100-200ms per query)

    Size: ~150MB model
    Speed: ~100-200ms per query
    """

    # Class-level logger: resolves to the same underlying logger as the
    # module-level one (both keyed on __name__), but keeps the class
    # self-contained.
    _LOG = logging.getLogger(__name__)

    # Common division/department keywords recognized by extract_from_query().
    _DIVISION_KEYWORDS = (
        "IT", "HR", "Finance", "Legal", "Accounting",
        "Marketing", "Sales", "Operations", "Engineering",
        "Security", "Facilities", "Purchasing", "Audit",
    )

    def __init__(self, model_name: str = "urchade/gliner_small-v2.1"):
        """
        Initialize the name extraction service.

        Args:
            model_name: GLiNER model to use. Options:
                - "urchade/gliner_small-v2.1" (150MB, balanced)
                - "urchade/gliner_multi-v2.1" (multilingual, better for Arabic)
                - "urchade/gliner_large-v2.1" (larger, more accurate)
        """
        self._LOG.info("Loading GLiNER model: %s", model_name)
        # Load the pre-trained model. Downloads on first run (~150MB).
        self.model = GLiNER.from_pretrained(model_name)
        # Entity labels we ask GLiNER to extract (zero-shot).
        self.labels = ["person", "name", "employee"]
        # "✓" repairs the mojibake "β" that appeared in the original logs.
        self._LOG.info("✓ GLiNER model loaded successfully")
        self._LOG.info("Entity labels: %s", self.labels)

    def extract_names(self, text: str, threshold: float = 0.3) -> List[str]:
        """
        Extract person names from text.

        Args:
            text: Input text (e.g., "find Ahmed in IT")
            threshold: Confidence threshold (0-1). Lower = more names but
                less precise. Default 0.3 is good for most cases.

        Returns:
            List of extracted names, de-duplicated, in order of first
            appearance.

        Example:
            >>> extractor.extract_names("find Ahmed Hassan in IT")
            ['Ahmed Hassan']
            >>> extractor.extract_names("connect me with Sarah from HR")
            ['Sarah']
        """
        self._LOG.info("Extracting names from: %s", text)
        # Predict entities using GLiNER.
        entities = self.model.predict_entities(
            text,
            self.labels,
            threshold=threshold,
        )
        # Keep just the entity surface text.
        names = [entity["text"] for entity in entities]
        # Remove duplicates while preserving order (dict preserves
        # insertion order in Python 3.7+).
        unique_names = list(dict.fromkeys(names))
        self._LOG.info("✓ Found %d name(s): %s", len(unique_names), unique_names)
        return unique_names

    def extract_names_with_context(
        self,
        text: str,
        threshold: float = 0.3,
    ) -> List[Dict[str, Any]]:
        """
        Extract person names with additional context (position, confidence).

        Args:
            text: Input text
            threshold: Confidence threshold (0-1)

        Returns:
            List of dictionaries with name details, e.g.:
            [
                {
                    "name": "Ahmed Hassan",
                    "start": 5,
                    "end": 17,
                    "confidence": 0.95,
                    "label": "person"
                }
            ]
            Note: unlike extract_names(), duplicates are NOT removed here,
            since each hit carries distinct position/confidence data.
        """
        self._LOG.info("Extracting names with context from: %s", text)
        entities = self.model.predict_entities(
            text,
            self.labels,
            threshold=threshold,
        )
        # Reshape GLiNER's entity dicts into our public result format.
        results = [
            {
                "name": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                "confidence": round(entity["score"], 2),
                "label": entity["label"],
            }
            for entity in entities
        ]
        self._LOG.info("✓ Found %d name(s) with context", len(results))
        return results

    def extract_from_query(
        self,
        query: str,
        extract_divisions: bool = False,
        threshold: float = 0.3,
    ) -> Dict[str, Any]:
        """
        Extract names and optionally division keywords from a query.

        Args:
            query: User query text
            extract_divisions: Whether to also extract division/department
                mentions
            threshold: Confidence threshold (0-1) forwarded to
                extract_names(). Added for parity with the other methods;
                defaults to the previous hard-coded value.

        Returns:
            Dictionary with extracted information:
            {
                "names": ["Ahmed", "Sarah"],
                "has_names": True,
                "count": 2,
                "divisions": ["IT", "HR"]   (only if extract_divisions=True)
            }
        """
        names = self.extract_names(query, threshold=threshold)
        result: Dict[str, Any] = {
            "names": names,
            "has_names": len(names) > 0,
            "count": len(names),
        }
        if extract_divisions:
            # Tokenize on non-alphanumeric characters and match whole
            # tokens. The previous substring check ("IT" in query.upper())
            # produced false positives inside words like "WITH" or
            # "SECURITY".
            tokens = set(
                "".join(
                    ch if ch.isalnum() else " " for ch in query.upper()
                ).split()
            )
            found_divisions = [
                kw for kw in self._DIVISION_KEYWORDS if kw.upper() in tokens
            ]
            result["divisions"] = found_divisions
            result["has_divisions"] = len(found_divisions) > 0
        return result