# NOTE(review): removed non-code paste residue ("Spaces: / Sleeping / Sleeping")
# name_extraction_service.py
"""Name extraction service built on the GLiNER zero-shot NER model."""

import logging
from typing import Any, Dict, List, Optional

from gliner import GLiNER

# Set up logging. basicConfig is a no-op if the host application has
# already configured handlers, so this is safe to import anywhere.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class NameExtractor:
    """
    Service for extracting person names from text using GLiNER.

    GLiNER is a zero-shot NER model that can extract entities without
    being limited to predefined entity types. It's especially good for:
    - Multilingual name extraction (English + Arabic)
    - Flexible entity extraction
    - Lightweight and fast (~100-200ms per query)

    Size: ~150MB model
    Speed: ~100-200ms per query
    """

    # Class-level logger: resolves to the same underlying logger as the
    # module-level one (both keyed on __name__), but keeps the class
    # self-contained.
    _LOG = logging.getLogger(__name__)

    # Common division/department keywords recognized by extract_from_query().
    _DIVISION_KEYWORDS = (
        "IT", "HR", "Finance", "Legal", "Accounting",
        "Marketing", "Sales", "Operations", "Engineering",
        "Security", "Facilities", "Purchasing", "Audit",
    )

    def __init__(self, model_name: str = "urchade/gliner_small-v2.1"):
        """
        Initialize the name extraction service.

        Args:
            model_name: GLiNER model to use. Options:
                - "urchade/gliner_small-v2.1" (150MB, balanced)
                - "urchade/gliner_multi-v2.1" (multilingual, better for Arabic)
                - "urchade/gliner_large-v2.1" (larger, more accurate)
        """
        self._LOG.info("Loading GLiNER model: %s", model_name)
        # Load the pre-trained model. Downloads on first run (~150MB).
        self.model = GLiNER.from_pretrained(model_name)
        # Entity labels we ask GLiNER to extract (zero-shot).
        self.labels = ["person", "name", "employee"]
        # "✓" repairs the mojibake "β" that appeared in the original logs.
        self._LOG.info("✓ GLiNER model loaded successfully")
        self._LOG.info("Entity labels: %s", self.labels)

    def extract_names(self, text: str, threshold: float = 0.3) -> List[str]:
        """
        Extract person names from text.

        Args:
            text: Input text (e.g., "find Ahmed in IT")
            threshold: Confidence threshold (0-1). Lower = more names but
                less precise. Default 0.3 is good for most cases.

        Returns:
            List of extracted names, de-duplicated, in order of first
            appearance.

        Example:
            >>> extractor.extract_names("find Ahmed Hassan in IT")
            ['Ahmed Hassan']
            >>> extractor.extract_names("connect me with Sarah from HR")
            ['Sarah']
        """
        self._LOG.info("Extracting names from: %s", text)
        # Predict entities using GLiNER.
        entities = self.model.predict_entities(
            text,
            self.labels,
            threshold=threshold,
        )
        # Keep just the entity surface text.
        names = [entity["text"] for entity in entities]
        # Remove duplicates while preserving order (dict preserves
        # insertion order in Python 3.7+).
        unique_names = list(dict.fromkeys(names))
        self._LOG.info("✓ Found %d name(s): %s", len(unique_names), unique_names)
        return unique_names

    def extract_names_with_context(
        self,
        text: str,
        threshold: float = 0.3,
    ) -> List[Dict[str, Any]]:
        """
        Extract person names with additional context (position, confidence).

        Args:
            text: Input text
            threshold: Confidence threshold (0-1)

        Returns:
            List of dictionaries with name details, e.g.:
            [
                {
                    "name": "Ahmed Hassan",
                    "start": 5,
                    "end": 17,
                    "confidence": 0.95,
                    "label": "person"
                }
            ]
            Note: unlike extract_names(), duplicates are NOT removed here,
            since each hit carries distinct position/confidence data.
        """
        self._LOG.info("Extracting names with context from: %s", text)
        entities = self.model.predict_entities(
            text,
            self.labels,
            threshold=threshold,
        )
        # Reshape GLiNER's entity dicts into our public result format.
        results = [
            {
                "name": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                "confidence": round(entity["score"], 2),
                "label": entity["label"],
            }
            for entity in entities
        ]
        self._LOG.info("✓ Found %d name(s) with context", len(results))
        return results

    def extract_from_query(
        self,
        query: str,
        extract_divisions: bool = False,
        threshold: float = 0.3,
    ) -> Dict[str, Any]:
        """
        Extract names and optionally division keywords from a query.

        Args:
            query: User query text
            extract_divisions: Whether to also extract division/department
                mentions
            threshold: Confidence threshold (0-1) forwarded to
                extract_names(). Added for parity with the other methods;
                defaults to the previous hard-coded value.

        Returns:
            Dictionary with extracted information:
            {
                "names": ["Ahmed", "Sarah"],
                "has_names": True,
                "count": 2,
                "divisions": ["IT", "HR"]   (only if extract_divisions=True)
            }
        """
        names = self.extract_names(query, threshold=threshold)
        result: Dict[str, Any] = {
            "names": names,
            "has_names": len(names) > 0,
            "count": len(names),
        }
        if extract_divisions:
            # Tokenize on non-alphanumeric characters and match whole
            # tokens. The previous substring check ("IT" in query.upper())
            # produced false positives inside words like "WITH" or
            # "SECURITY".
            tokens = set(
                "".join(
                    ch if ch.isalnum() else " " for ch in query.upper()
                ).split()
            )
            found_divisions = [
                kw for kw in self._DIVISION_KEYWORDS if kw.upper() in tokens
            ]
            result["divisions"] = found_divisions
            result["has_divisions"] = len(found_divisions) > 0
        return result