princemaxp committed on
Commit
3bb3451
·
verified ·
1 Parent(s): 49f1a98

Create parse_email.py

Browse files
Files changed (1) hide show
  1. parse_email.py +27 -0
parse_email.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import email
2
+ import re
3
+ from bs4 import BeautifulSoup
4
+
5
def _decode_part(part):
    """Return the text payload of *part* decoded with its declared charset.

    Returns an empty string when the part carries no decodable payload
    (``get_payload(decode=True)`` yields ``None``).
    """
    payload = part.get_payload(decode=True)
    if payload is None:
        return ""
    # Honour the charset from the Content-Type header; UTF-8 is only the
    # default when the part does not declare one.
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, errors="ignore")
    except LookupError:
        # Unknown or misspelled charset label: fall back to UTF-8 rather
        # than failing the whole parse.
        return payload.decode("utf-8", errors="ignore")


def parse_email(file_path):
    """Parse an email file and extract its headers, text body and URLs.

    Args:
        file_path: Path of the raw email message; it is opened in binary
            mode.

    Returns:
        A ``(headers, body, urls)`` tuple:

        * ``headers`` -- ``dict`` built from the message's header items;
          when a header name repeats, the last value wins.
        * ``body`` -- for multipart messages, the concatenation (in walk
          order) of every ``text/plain`` part and the visible text of every
          ``text/html`` part; for single-part messages, the decoded payload
          as-is.
        * ``urls`` -- every ``http://`` / ``https://`` match found in
          ``body``, in order of appearance.
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f)

    headers = dict(msg.items())

    # Extract body (handle plain + HTML)
    if msg.is_multipart():
        chunks = []
        for part in msg.walk():
            content_type = part.get_content_type()
            if content_type == "text/plain":
                chunks.append(_decode_part(part))
            elif content_type == "text/html":
                # Keep only the rendered text of HTML parts.
                soup = BeautifulSoup(_decode_part(part), "html.parser")
                chunks.append(soup.get_text())
        body = "".join(chunks)
    else:
        body = _decode_part(msg)

    # Extract URLs
    urls = re.findall(r'(https?://\S+)', body)

    return headers, body, urls