Spaces:
Running
Running
| import re | |
| from typing import List | |
def extract_dates(text: str) -> List[str]:
    """Extract the distinct date-like substrings from text.

    Three textual shapes are recognised, matched case-insensitively:

    * numeric dates with the year last, e.g. ``01/15/2023`` or ``15-01-23``
    * numeric dates with a four-digit year first, e.g. ``2023-01-15``
    * month-name dates, e.g. ``Jan 5, 2024`` or ``March 3 2021``

    Only the shape is checked; the digits are not validated as a real
    calendar date. The month-name pattern accepts any run of letters after
    the three-letter abbreviation, so a word such as ``Marble`` followed by
    a day and year also matches.

    Args:
        text: The text to search.

    Returns:
        The unique matches. Results are grouped by pattern in the order
        listed above and, within a pattern, by position in ``text``.
    """
    patterns = (
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',  # MM/DD/YYYY or DD-MM-YYYY
        r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',  # YYYY-MM-DD
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b',  # Month DD, YYYY
    )
    found = [
        match
        for pattern in patterns
        for match in re.findall(pattern, text, re.IGNORECASE)
    ]
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is deterministic across runs (a set gives no ordering guarantee).
    return list(dict.fromkeys(found))
def extract_amounts(text: str) -> List[str]:
    """Return every dollar amount found in text, in order of appearance.

    An amount is a ``$`` sign, optionally followed by a single whitespace
    character, then one or more digits, then any number of ``,ddd``
    thousands groups, and finally an optional two-digit decimal part
    (``.dd``). Repeated amounts are returned once per occurrence.

    Args:
        text: The text to search.

    Returns:
        The matched substrings, including the leading ``$``.
    """
    amount_re = re.compile(r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?')
    # The pattern has no capturing groups, so group(0) is the whole match.
    return [hit.group(0) for hit in amount_re.finditer(text)]
def extract_emails(text: str) -> List[str]:
    """Extract email addresses from text.

    An address is matched as a local part (letters, digits and ``._%+-``),
    an ``@``, a domain (letters, digits, ``.`` and ``-``) and a final dot
    followed by a top-level domain of at least two ASCII letters.

    Args:
        text: The text to search.

    Returns:
        Every matched address in order of appearance; duplicates are kept.
    """
    # The TLD class is [A-Za-z]; writing it as [A-Z|a-z] would also accept a
    # literal "|" and glue pipe-separated addresses together.
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.findall(pattern, text)
def extract_phone_numbers(text: str) -> List[str]:
    """Extract the distinct ten-digit phone numbers from text.

    Two shapes are recognised:

    * ``123-456-7890``, where each separator is ``-``, ``.`` or absent
    * ``(123) 456-7890``, with optional whitespace after the closing
      parenthesis and an optional ``-`` or ``.`` before the last four digits

    Args:
        text: The text to search.

    Returns:
        The unique matches. Plain-format numbers come first, followed by
        parenthesised ones, each group in order of appearance in ``text``.
    """
    patterns = (
        r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',  # 123-456-7890
        r'\(\d{3}\)\s*\d{3}[-.]?\d{4}\b',  # (123) 456-7890
    )
    found = [
        match
        for pattern in patterns
        for match in re.findall(pattern, text)
    ]
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is deterministic across runs (a set gives no ordering guarantee).
    return list(dict.fromkeys(found))
def clean_text(text: str) -> str:
    """Clean and normalize text.

    Removes every character that is not a word character (letter, digit or
    underscore), whitespace, or one of the punctuation marks
    ``. , ! ? ; : ( ) - ' "``. Each run of whitespace is then collapsed to
    a single space and leading/trailing whitespace is trimmed.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.
    """
    # Remove special characters (keep punctuation). This runs before the
    # whitespace pass so a removed symbol that sat between two spaces does
    # not leave a double space behind.
    text = re.sub(r'[^\w\s.,!?;:()\-\'\"]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()