Spaces:
Running
Running
Create utils/text_utils.py
Browse files- utils/text_utils.py +52 -0
utils/text_utils.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import List
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def extract_dates(text: str) -> List[str]:
    """Extract date-like substrings from ``text``.

    Three textual formats are recognised (matching is case-insensitive):

    * numeric, day/month first, e.g. ``12/25/2023`` or ``25-12-23``
    * numeric, year first, e.g. ``2023-12-25`` or ``2023/1/5``
    * month name, day, year, e.g. ``Jan 5, 2023`` or ``march 14 2021``

    Matching is purely lexical: no calendar validation is performed, so a
    string such as ``99/99/9999`` is returned as well.

    Args:
        text: The text to scan.

    Returns:
        The unique matched strings, ordered by the position at which each
        one first appears in ``text``.
    """
    patterns = [
        # MM/DD/YYYY or DD-MM-YYYY
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
        # YYYY-MM-DD
        r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
        # Month DD, YYYY
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b',
    ]

    # Record the start offset of every hit so results from the separate
    # patterns can be merged back into document order.
    found = []
    for pattern in patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            found.append((match.start(), match.group()))
    found.sort(key=lambda item: item[0])

    # dict.fromkeys removes duplicates while keeping first-seen order; a plain
    # set() would return the dates in an arbitrary, run-dependent order.
    return list(dict.fromkeys(date for _, date in found))
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def extract_amounts(text: str) -> List[str]:
    """Extract dollar amounts from ``text``.

    An amount is a ``$`` sign, at most one whitespace character, a run of
    digits, any number of ``,ddd`` thousands groups and an optional
    two-digit decimal part (``.dd``).

    A fractional part that is not exactly two digits is not included in the
    match: ``$5.5`` yields ``$5`` and ``$12.345`` yields ``$12.34``.

    Args:
        text: The text to scan.

    Returns:
        Every match, in order of appearance. Duplicates are kept.
    """
    amount_pattern = r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?'
    return re.findall(amount_pattern, text)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def extract_emails(text: str) -> List[str]:
    """Extract email addresses from ``text``.

    The match is a pragmatic approximation, not a full RFC 5322 parser: a
    local part of letters, digits and ``._%+-``, an ``@``, a domain of
    letters, digits, dots and hyphens, and a final label of at least two
    ASCII letters.

    Args:
        text: The text to scan.

    Returns:
        Every matched address, in order of appearance. Duplicates are kept.
    """
    # The final label is [A-Za-z] only. Writing it as [A-Z|a-z] would put a
    # literal '|' into the character class, so "x@y.com|z" would match whole.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.findall(email_pattern, text)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def extract_phone_numbers(text: str) -> List[str]:
    """Extract 10-digit phone numbers from ``text``.

    Two layouts are recognised:

    * ``123-456-7890``, ``123.456.7890`` or ``1234567890``
    * ``(123) 456-7890``, with optional whitespace after the parenthesis

    Args:
        text: The text to scan.

    Returns:
        The unique matched strings, ordered by the position at which each
        one first appears in ``text``.
    """
    patterns = [
        # 123-456-7890
        r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
        # (123) 456-7890
        r'\(\d{3}\)\s*\d{3}[-.]?\d{4}\b',
    ]

    # Record the start offset of every hit so results from both patterns can
    # be merged back into document order.
    found = []
    for pattern in patterns:
        for match in re.finditer(pattern, text):
            found.append((match.start(), match.group()))
    found.sort(key=lambda item: item[0])

    # dict.fromkeys removes duplicates while keeping first-seen order; a plain
    # set() would return the numbers in an arbitrary, run-dependent order.
    return list(dict.fromkeys(number for _, number in found))
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def clean_text(text: str) -> str:
    """Clean and normalize ``text``.

    Characters other than word characters, whitespace and the punctuation
    ``. , ! ? ; : ( ) - ' "`` are removed. Every run of whitespace is then
    collapsed to a single space, and leading and trailing whitespace is
    stripped.

    Symbols such as ``$``, ``@``, ``/`` and ``%`` are among the characters
    removed.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.
    """
    # Remove special characters (keep punctuation)
    text = re.sub(r'[^\w\s.,!?;:()\-\'\"]', '', text)
    # Collapse whitespace only after the removal above: deleting a symbol that
    # sits between two spaces ("a @ b") would otherwise leave a double space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
|