"""Regex-based helpers for pulling dates, amounts, emails and phone numbers out of text."""

import re
from typing import Iterable, List

# Compiled once at import time so the extractors do not rebuild them per call.
_DATE_PATTERNS = [
    # MM/DD/YYYY or DD-MM-YYYY
    re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', re.IGNORECASE),
    # YYYY-MM-DD
    re.compile(r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b', re.IGNORECASE),
    # Month DD, YYYY
    re.compile(
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b',
        re.IGNORECASE,
    ),
]

_AMOUNT_PATTERN = re.compile(r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?')

# The TLD class is [A-Za-z]; a '|' inside a character class is a literal pipe,
# not alternation, so it must not appear here.
_EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

_PHONE_PATTERNS = [
    # 123-456-7890
    re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'),
    # (123) 456-7890
    re.compile(r'\(\d{3}\)\s*\d{3}[-.]?\d{4}\b'),
]

_WHITESPACE_RUN = re.compile(r'\s+')
_DISALLOWED_CHARS = re.compile(r'[^\w\s.,!?;:()\-\'\"]')


def _unique_in_order(items: Iterable[str]) -> List[str]:
    """Return *items* without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(items))


def extract_dates(text: str) -> List[str]:
    """Extract date-like substrings from *text*.

    Three textual shapes are recognised: ``MM/DD/YYYY`` / ``DD-MM-YYYY``
    (``/`` or ``-`` separators, 2-4 digit year), ``YYYY-MM-DD``, and
    ``Month DD, YYYY`` (month name matched case-insensitively, comma
    optional). Matching is purely textual; values are not checked for
    calendar validity.

    Returns:
        The unique matches, ordered by pattern (in the order listed above)
        and then by position in *text*.
    """
    found: List[str] = []
    for pattern in _DATE_PATTERNS:
        found.extend(pattern.findall(text))
    return _unique_in_order(found)


def extract_amounts(text: str) -> List[str]:
    """Extract dollar amounts from *text*.

    Matches a ``$`` sign, at most one whitespace character, digits with
    optional ``,ddd`` groups, and an optional two-digit decimal part.

    Returns:
        Every match in order of appearance; duplicates are kept.
    """
    return _AMOUNT_PATTERN.findall(text)


def extract_emails(text: str) -> List[str]:
    """Extract email addresses from *text*.

    The top-level domain must consist of at least two ASCII letters.

    Returns:
        Every match in order of appearance; duplicates are kept.
    """
    return _EMAIL_PATTERN.findall(text)


def extract_phone_numbers(text: str) -> List[str]:
    """Extract 10-digit phone numbers from *text*.

    Recognises ``123-456-7890`` (``-`` or ``.`` separators, both optional)
    and ``(123) 456-7890``.

    Returns:
        The unique matches, ordered by pattern (in the order listed above)
        and then by position in *text*.
    """
    found: List[str] = []
    for pattern in _PHONE_PATTERNS:
        found.extend(pattern.findall(text))
    return _unique_in_order(found)


def clean_text(text: str) -> str:
    """Clean and normalize *text*.

    Characters other than word characters, whitespace and the punctuation
    ``. , ! ? ; : ( ) - ' "`` are removed, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is stripped.
    """
    # Strip disallowed characters first: removing a symbol that sat between
    # two spaces would otherwise leave a double space behind.
    text = _DISALLOWED_CHARS.sub('', text)
    text = _WHITESPACE_RUN.sub(' ', text)
    return text.strip()