Spaces:
Running
Running
| import email | |
| from email import policy | |
| from bs4 import BeautifulSoup | |
| import re | |
def parse_email(file_path):
    """Parse a raw RFC 822/MIME email file into headers, body text, and URLs.

    Parameters
    ----------
    file_path : str or path-like
        Path to the raw email (.eml) file; read in binary mode so the
        email package handles charset decoding via the modern API.

    Returns
    -------
    tuple[dict, str, list[str]]
        ``(headers, body, urls)`` where:

        * ``headers`` maps header names to values. NOTE: ``dict()`` keeps
          only the last value of repeated headers (e.g. ``Received``).
        * ``body`` is the concatenated text of every ``text/plain`` part
          plus every ``text/html`` part stripped to plain text.
        * ``urls`` is a de-duplicated (unordered) list of URLs found in
          the body text and in HTML ``<a href>`` attributes.
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    # --- 1. Extract headers ---
    headers = dict(msg.items())

    # --- 2 & 3. Extract body text and URLs in a single pass ---
    # walk() yields the message itself when it is not multipart, so one
    # loop covers both cases.  This also fixes two defects of the old
    # two-pass version: HTML parts are no longer parsed twice, and a
    # non-multipart text/html message now yields stripped text in `body`
    # instead of raw HTML markup.
    body_parts = []
    urls = set()
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type == "text/plain":
            try:
                body_parts.append(part.get_content())
            except Exception:
                # Deliberate best-effort: skip parts with broken charsets
                # or malformed transfer encodings rather than failing the
                # whole parse.  (Narrowed from a bare `except:` so that
                # KeyboardInterrupt/SystemExit still propagate.)
                continue
        elif content_type == "text/html":
            try:
                html_body = part.get_content()
            except Exception:
                continue  # same best-effort policy as above
            soup = BeautifulSoup(html_body, "html.parser")
            body_parts.append(soup.get_text(" ", strip=True))
            # Harvest explicit hyperlinks while we already have the tree.
            for link in soup.find_all("a", href=True):
                urls.add(link["href"])

    # str.join avoids the quadratic cost of repeated `body += ...`.
    body = "".join(body_parts)
    # Also catch bare URLs typed into the text itself.
    urls.update(re.findall(r"https?://[^\s]+", body))
    return headers, body, list(urls)