Spaces:
Running
Running
File size: 1,534 Bytes
401cf22 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
import re
from typing import List
def extract_dates(text: str) -> List[str]:
    """Extract date-like substrings from text.

    Matching is purely lexical (no calendar validation) and recognises
    three shapes, with month names matched case-insensitively:

    * ``MM/DD/YYYY`` or ``DD-MM-YYYY`` (1-2 digits, 1-2 digits, 2-4 digits)
    * ``YYYY-MM-DD`` (also with ``/`` as the separator)
    * ``Month DD, YYYY`` (abbreviated or full month name, optional comma)

    Args:
        text: The text to scan.

    Returns:
        The unique matches. Results are grouped by pattern in the order
        listed above, and within a pattern by first occurrence in ``text``.
    """
    patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',  # MM/DD/YYYY or DD-MM-YYYY
        r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',  # YYYY-MM-DD
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b',  # Month DD, YYYY
    ]
    dates = []
    for pattern in patterns:
        dates.extend(re.findall(pattern, text, re.IGNORECASE))
    # dict.fromkeys drops duplicates while keeping first-seen order; a set
    # would make the result order change from run to run (hash randomization).
    return list(dict.fromkeys(dates))
def extract_amounts(text: str) -> List[str]:
    """Return every dollar amount found in text, in order of appearance.

    An amount is a ``$`` sign, at most one whitespace character, a run of
    digits, any number of ``,ddd`` thousands groups, and an optional
    two-digit decimal part. Duplicates are kept.

    Args:
        text: The text to scan.

    Returns:
        The matched substrings exactly as they appear in ``text``.
    """
    amount_re = re.compile(r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?')
    return [match.group(0) for match in amount_re.finditer(text)]
def extract_emails(text: str) -> List[str]:
    """Extract email addresses from text.

    An address is matched as a local part (letters, digits and ``._%+-``),
    an ``@``, a domain (letters, digits, ``.`` and ``-``), and a final
    dot-separated label of at least two letters.

    Args:
        text: The text to scan.

    Returns:
        The matched addresses in order of appearance; duplicates are kept.
    """
    # The final label is [A-Za-z], not [A-Z|a-z]: inside a character class
    # "|" is a literal pipe, which let "a@b.com|c@d.org" match as one address.
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.findall(pattern, text)
def extract_phone_numbers(text: str) -> List[str]:
    """Extract 10-digit phone numbers from text.

    Two shapes are recognised:

    * ``123-456-7890`` with ``-`` or ``.`` separators, or none at all
    * ``(123) 456-7890`` with optional whitespace after the parenthesis

    Args:
        text: The text to scan.

    Returns:
        The unique matches. Results are grouped by pattern in the order
        listed above, and within a pattern by first occurrence in ``text``.
    """
    patterns = [
        r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',  # 123-456-7890
        r'\(\d{3}\)\s*\d{3}[-.]?\d{4}\b',  # (123) 456-7890
    ]
    numbers = []
    for pattern in patterns:
        numbers.extend(re.findall(pattern, text))
    # dict.fromkeys drops duplicates while keeping first-seen order; a set
    # would make the result order change from run to run (hash randomization).
    return list(dict.fromkeys(numbers))
def clean_text(text: str) -> str:
    """Clean and normalize text.

    Removes every character that is not a word character, whitespace, or
    one of ``. , ! ? ; : ( ) - ' "``, then collapses each run of whitespace
    to a single space and trims both ends.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text, with no leading, trailing, or repeated whitespace.
    """
    # Remove special characters first (keep punctuation). Doing this before
    # whitespace collapsing matters: deleting a symbol that sat between two
    # spaces ("a @ b") would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s.,!?;:()\-\'\"]', '', text)
    # Collapse runs of whitespace, including those created by the removal.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()