```python
# parse_email.py
import base64
import email
import re
from email import policy

from bs4 import BeautifulSoup


def _extract_inline_images_from_html(html):
    """Decode images embedded in <img> tags as base64 data URIs."""
    images = []
    soup = BeautifulSoup(html or "", "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src", "")
        # e.g. data:image/png;base64,iVBORw0...
        if src.startswith("data:image/"):
            try:
                _header, b64 = src.split(",", 1)
                images.append(base64.b64decode(b64))
            except Exception:
                # Malformed data URI: skip it rather than fail the whole parse.
                continue
    return images
```
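As a quick illustration of what the helper returns, here is a minimal check. It is not part of the module: it assumes the file above is saved as `parse_email.py` on the import path, and the embedded payload is a stock 1x1 transparent PNG used purely for demonstration.

```python
from parse_email import _extract_inline_images_from_html

# A 1x1 transparent PNG as a base64 data URI (illustrative payload only).
html = (
    "<p>Hello</p>"
    '<img src="data:image/png;base64,'
    "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJ"
    'AAAADUlEQVR42mP8z8BQDwAEhQGAhKmMIQAAAABJRU5ErkJggg==">'
)

images = _extract_inline_images_from_html(html)
print(len(images))    # 1
print(images[0][:4])  # b'\x89PNG' -- the PNG magic bytes
```

The main entry point below walks a parsed message and aggregates headers, body text, URLs, and image bytes.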
```python
# parse_email.py (continued)

def parse_email(file_path):
    """Parse an .eml file.

    Returns:
        headers (dict), subject (str), body (str),
        urls (list of str), images (list of bytes)
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    # Note: dict() keeps only the last value of any repeated header
    # (e.g. multiple Received lines).
    headers = dict(msg.items())
    subject = headers.get("Subject", "") or ""
    body = ""
    images = []

    if msg.is_multipart():
        # Walk every part: collect image parts and non-attachment body text.
        for part in msg.walk():
            ctype = part.get_content_type()
            disp = part.get_content_disposition()  # 'attachment', 'inline', or None
            if ctype.startswith("image/"):
                # Image parts, whether attached or inline.
                try:
                    data = part.get_payload(decode=True)
                    if data:
                        images.append(data)
                except Exception:
                    pass
            elif ctype == "text/plain" and disp != "attachment":
                try:
                    body += part.get_content()
                except Exception:
                    pass
            elif ctype == "text/html" and disp != "attachment":
                # Pull data-URI images out of the HTML, then flatten it to text.
                try:
                    html_body = part.get_content()
                    images += _extract_inline_images_from_html(html_body)
                    soup = BeautifulSoup(html_body, "html.parser")
                    body += soup.get_text(" ", strip=True)
                except Exception:
                    pass
    else:
        # Single-part message. Only text content goes into `body`, so it
        # stays a str as the docstring promises (get_content() returns
        # bytes for non-text types).
        try:
            ctype = msg.get_content_type()
            if ctype == "text/html":
                html_body = msg.get_content()
                images += _extract_inline_images_from_html(html_body)
                soup = BeautifulSoup(html_body, "html.parser")
                body = soup.get_text(" ", strip=True)
            elif ctype.startswith("image/"):
                data = msg.get_payload(decode=True)
                if data:
                    images.append(data)
            elif ctype.startswith("text/"):
                body = msg.get_content()
        except Exception:
            body = ""

    # Extract URLs from the combined body text, then from header values
    # (e.g. List-Unsubscribe often carries a URL).
    url_re = re.compile(r"https?://[^\s\"'<>]+")
    urls = set(url_re.findall(body))
    for value in headers.values():
        urls.update(url_re.findall(str(value)))

    return headers, subject, body, list(urls), images
```
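For an end-to-end check, the sketch below builds a small multipart message in memory with the standard library's `EmailMessage`, writes it to a temporary file, and runs the parser on it. The addresses, URLs, and temporary path are invented for illustration.

```python
import tempfile
from email.message import EmailMessage

from parse_email import parse_email

# Build a two-part (plain + HTML) test message entirely in memory.
test = EmailMessage()
test["Subject"] = "Test"
test["From"] = "sender@example.com"
test["To"] = "recipient@example.com"
test.set_content("Plain text with a link: https://example.com/page")
test.add_alternative(
    "<p>HTML body, visit https://example.org today</p>", subtype="html"
)

# Write it out as bytes so parse_email can read it back.
with tempfile.NamedTemporaryFile(suffix=".eml", delete=False) as f:
    f.write(test.as_bytes())
    path = f.name

headers, subject, body, urls, images = parse_email(path)
print(subject)       # Test
print(sorted(urls))  # ['https://example.com/page', 'https://example.org']
print(len(images))   # 0 -- no image parts in this message
```

Note one limitation of this design: only URLs visible in the rendered text are captured. A URL that appears solely in an `href` attribute is dropped when `get_text()` flattens the HTML, since attributes are not part of the text content.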