CySecGuardians / parse_email.py
princemaxp's picture
Update parse_email.py
cdb486e verified
raw
history blame
1.49 kB
import email
from email import policy
from bs4 import BeautifulSoup
import re
# Matches http(s) URLs in free text. Besides whitespace, the match stops at
# angle brackets and double quotes, which delimit URLs in mail bodies
# (e.g. "<https://example.com>") and are not valid unescaped URL characters.
_URL_PATTERN = re.compile(r'https?://[^\s<>"]+')

# Characters that, at the very end of a match, almost always belong to the
# surrounding sentence rather than to the URL itself.
_URL_TRAILING_PUNCTUATION = ".,;:!?)]}'"


def _read_text_part(part):
    """Return the decoded content of *part* as ``str``, or ``""`` if unavailable."""
    try:
        content = part.get_content()
    except (LookupError, UnicodeError, ValueError, TypeError):
        # Best-effort: an unknown charset, a missing content handler or a
        # malformed charset parameter in one part must not abort the parsing
        # of the whole message.
        return ""
    # Non-text parts (application/*, image/*, ...) decode to bytes; they carry
    # no body text and would break the str-based URL regex later on.
    return content if isinstance(content, str) else ""


def _html_text_and_links(html):
    """Return ``(visible_text, hrefs)`` extracted from the HTML string *html*."""
    try:
        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text(" ", strip=True)
        links = [anchor["href"] for anchor in soup.find_all("a", href=True)]
    except Exception:
        # The markup comes from untrusted mail; a parser failure on one part
        # is deliberately treated as "no content" instead of a crash.
        return "", []
    return text, links


def parse_email(file_path):
    """Parse an e-mail file into its headers, text body and URLs.

    Args:
        file_path: Path of the raw message file; it is opened in binary mode.

    Returns:
        A ``(headers, body, urls)`` tuple:

        * ``headers`` -- ``dict`` built from the message's header items; when
          a header name occurs more than once, the last occurrence wins.
        * ``body`` -- ``str`` made of every ``text/plain`` part plus the
          visible text of every ``text/html`` part, in message order. A
          single-part message of another textual type is returned as is;
          non-text content yields an empty string.
        * ``urls`` -- ``list`` of unique URLs in first-seen order: http(s)
          URLs found in ``body`` followed by the ``href`` of every ``<a>``
          tag in the HTML parts.
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    # --- 1. Extract headers ---
    headers = dict(msg.items())

    # --- 2. Extract body (text + html) ---
    # walk() yields the message itself when it is not multipart, so a single
    # loop covers both layouts and each HTML part is parsed only once.
    single_part = not msg.is_multipart()
    fragments = []
    hrefs = []
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type == "text/html":
            text, links = _html_text_and_links(_read_text_part(part))
            fragments.append(text)
            hrefs.extend(links)
        elif content_type == "text/plain" or single_part:
            fragments.append(_read_text_part(part))
    body = "".join(fragments)

    # --- 3. Extract URLs ---
    found = []
    for match in _URL_PATTERN.findall(body):
        url = match.rstrip(_URL_TRAILING_PUNCTUATION)
        # Skip matches that were nothing but a scheme plus punctuation.
        if not url.endswith("://"):
            found.append(url)
    found.extend(hrefs)

    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return headers, body, list(dict.fromkeys(found))