LifeAdmin-AI / utils /text_utils.py
Maheen001's picture
Create utils/text_utils.py
401cf22 verified
import re
from typing import List
def extract_dates(text: str) -> List[str]:
    """Extract date-like substrings from text.

    Three shapes are recognised, all matched case-insensitively:

    * numeric ``N/N/N`` or ``N-N-N`` with a 1-2 digit first and second
      field and a 2-4 digit last field (e.g. ``01/02/2024``),
    * year-first numeric dates with a 4-digit first field
      (e.g. ``2024-03-04``),
    * a word starting with a three-letter English month abbreviation,
      followed by a day, an optional comma and a 4-digit year
      (e.g. ``Jan 5, 2024``).

    The numbers are not validated, so something like ``99/99/9999`` is
    returned as well.

    Args:
        text: The text to search.

    Returns:
        The unique matched strings exactly as they appear in ``text``.
        Matches are grouped by pattern (in the order listed above) and,
        within a pattern, kept in order of first appearance.
    """
    patterns = (
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',  # MM/DD/YYYY or DD-MM-YYYY
        r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',  # YYYY-MM-DD
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b',  # Month DD, YYYY
    )
    matches = []
    for pattern in patterns:
        matches.extend(re.findall(pattern, text, re.IGNORECASE))
    # dict.fromkeys drops duplicates while keeping insertion order; a plain
    # set() would make the result order vary between interpreter runs.
    return list(dict.fromkeys(matches))
def extract_amounts(text: str) -> List[str]:
    """Find dollar amounts in text.

    A match is a ``$`` sign, at most one whitespace character, a run of
    digits, any number of ``,ddd`` thousands groups and an optional
    two-digit decimal part.

    Args:
        text: The text to search.

    Returns:
        Every matched amount as it appears in ``text``, in order of
        appearance. Duplicates are kept.
    """
    amount_re = re.compile(r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?')
    found = []
    for hit in amount_re.finditer(text):
        found.append(hit.group(0))
    return found
def extract_emails(text: str) -> List[str]:
    """Extract email addresses from text.

    An address is a local part made of letters, digits and ``._%+-``, an
    ``@``, a domain made of letters, digits, dots and hyphens, and a final
    dot-separated label of at least two ASCII letters.

    Args:
        text: The text to search.

    Returns:
        Every matched address as it appears in ``text``, in order of
        appearance. Duplicates are kept.
    """
    # The final label is letters only. A "|" inside a character class is a
    # literal pipe, not alternation, so it must not appear there; otherwise
    # pipe-separated addresses get glued into one bogus match.
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.findall(pattern, text)
def extract_phone_numbers(text: str) -> List[str]:
    """Extract 10-digit phone numbers from text.

    Two layouts are recognised: ``ddd ddd dddd`` with optional ``-`` or
    ``.`` separators between the groups, and ``(ddd)`` followed by
    optional whitespace and ``ddd dddd`` with an optional ``-`` or ``.``
    separator.

    Args:
        text: The text to search.

    Returns:
        The unique matched strings exactly as they appear in ``text``.
        Matches of the plain layout come first, then the parenthesised
        layout; within each layout, order of first appearance is kept.
    """
    patterns = (
        r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',  # 123-456-7890
        r'\(\d{3}\)\s*\d{3}[-.]?\d{4}\b',  # (123) 456-7890
    )
    matches = []
    for pattern in patterns:
        matches.extend(re.findall(pattern, text))
    # dict.fromkeys drops duplicates while keeping insertion order; a plain
    # set() would make the result order vary between interpreter runs.
    return list(dict.fromkeys(matches))
def clean_text(text: str) -> str:
    """Clean and normalize text.

    Characters other than word characters, whitespace and the punctuation
    ``. , ! ? ; : ( ) - ' "`` are removed, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is
    stripped.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.
    """
    # Remove special characters (keep punctuation). This runs before the
    # whitespace collapse: deleting a symbol that sits between two spaces
    # would otherwise leave a double space in the output.
    text = re.sub(r'[^\w\s.,!?;:()\-\'\"]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()