Maheen001 committed on
Commit
401cf22
·
verified ·
1 Parent(s): d411b46

Create utils/text_utils.py

Browse files
Files changed (1) hide show
  1. utils/text_utils.py +52 -0
utils/text_utils.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List
3
+
4
+
5
def extract_dates(text: str) -> List[str]:
    """Extract date-like substrings from text.

    Three formats are recognised, matched case-insensitively:

    * numeric day/month first, e.g. ``01/02/2023`` or ``1-2-23``
    * numeric year first, e.g. ``2023-01-02``
    * month name first, e.g. ``Jan 2, 2023`` or ``January 2 2023``

    Args:
        text: The text to search.

    Returns:
        The unique matches, ordered by where each first appears in
        ``text``. An empty list is returned when nothing matches.
    """
    patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',  # MM/DD/YYYY or DD-MM-YYYY
        r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',  # YYYY-MM-DD
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b',  # Month DD, YYYY
    ]

    # Each pattern is scanned independently (so overlapping matches from
    # different patterns are all kept), remembering where each hit starts.
    hits = []
    for pattern in patterns:
        hits.extend(
            (match.start(), match.group(0))
            for match in re.finditer(pattern, text, re.IGNORECASE)
        )

    # A plain set() would return the dates in arbitrary, run-dependent order;
    # sort by position and de-duplicate with an insertion-ordered dict instead.
    hits.sort(key=lambda hit: hit[0])
    return list(dict.fromkeys(found for _, found in hits))
18
+
19
+
20
def extract_amounts(text: str) -> List[str]:
    """Return every dollar amount found in ``text``, in order of appearance.

    An amount is a ``$`` sign, at most one whitespace character, a run of
    digits, any number of ``,ddd`` thousands groups, and an optional
    two-digit decimal part. Duplicates are kept.
    """
    amount_re = re.compile(r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?')
    return [hit.group(0) for hit in amount_re.finditer(text)]
24
+
25
+
26
def extract_emails(text: str) -> List[str]:
    """Extract email addresses from text.

    Args:
        text: The text to search.

    Returns:
        Every address found, in order of appearance (duplicates are kept).
    """
    # The top-level domain class is [A-Za-z]; writing it as [A-Z|a-z] would
    # also accept a literal '|' because '|' is not alternation inside [...].
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.findall(pattern, text)
30
+
31
+
32
def extract_phone_numbers(text: str) -> List[str]:
    """Extract 10-digit phone numbers from text.

    Two layouts are recognised:

    * ``123-456-7890`` (``-`` or ``.`` separators, each optional)
    * ``(123) 456-7890`` (parenthesised area code, optional whitespace)

    Args:
        text: The text to search.

    Returns:
        The unique matches, ordered by where each first appears in
        ``text``. An empty list is returned when nothing matches.
    """
    patterns = [
        r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',  # 123-456-7890
        r'\(\d{3}\)\s*\d{3}[-.]?\d{4}\b',  # (123) 456-7890
    ]

    # Record the start offset of every hit so results can follow text order.
    hits = []
    for pattern in patterns:
        hits.extend(
            (match.start(), match.group(0))
            for match in re.finditer(pattern, text)
        )

    # A plain set() would return the numbers in arbitrary, run-dependent
    # order; sort by position and de-duplicate with an ordered dict instead.
    hits.sort(key=lambda hit: hit[0])
    return list(dict.fromkeys(found for _, found in hits))
44
+
45
+
46
def clean_text(text: str) -> str:
    """Clean and normalize text.

    Characters other than word characters, whitespace, and the punctuation
    ``. , ! ? ; : ( ) - ' "`` are removed; every run of whitespace is then
    collapsed to a single space and the result is stripped.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text with single-space separation and no leading or
        trailing whitespace.
    """
    # Remove special characters first (keep punctuation). Doing this before
    # whitespace normalization matters: deleting a symbol that sits between
    # two spaces ("a @ b") would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s.,!?;:()\-\'\"]', '', text)
    # Collapse whitespace runs, including any created by the removal above.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()