# NOTE: the lines below replaced web-viewer scraping residue (HF Spaces
# status chrome, blob hashes, and gutter line numbers) that was not part
# of the module and made the file unparseable as Python.
# parse_email.py
import email
from email import policy
from bs4 import BeautifulSoup
import re
import base64
import io
def _extract_inline_images_from_html(html):
    """Extract decoded image bytes from base64 ``data:`` URIs in *html*.

    Parameters
    ----------
    html : str or None
        HTML fragment to scan; ``None`` is treated as an empty document.

    Returns
    -------
    list of bytes
        The decoded payload of every ``<img src="data:image/...;base64,...">``
        element. Non-base64 data URIs and undecodable payloads are skipped.
    """
    images = []
    soup = BeautifulSoup(html or "", "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src", "")
        if not src.startswith("data:image/"):
            continue
        # e.g. data:image/png;base64,iVBORw0...
        header, _, payload = src.partition(",")
        # The data: scheme also allows percent-encoded (non-base64) bodies,
        # e.g. data:image/svg+xml,<svg...>; those are not raw image bytes,
        # so only decode when the ;base64 flag is present.
        if not header.endswith(";base64") or not payload:
            continue
        try:
            images.append(base64.b64decode(payload))
        except Exception:
            # Malformed base64 — extraction is best-effort, skip this image.
            continue
    return images
def parse_email(file_path):
    """Parse an .eml file into headers, subject, body text, URLs and images.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path to an RFC 822 message file, read in binary mode.

    Returns
    -------
    tuple
        ``(headers, subject, body, urls, images)`` where *headers* is a dict
        snapshot of the message headers (a later duplicate header overwrites
        an earlier one), *subject* is a str, *body* is the concatenated text
        of all non-attachment text parts (HTML converted to plain text),
        *urls* is a list of unique http(s) URLs found in the body and header
        values, and *images* is a list of raw image bytes collected from
        image parts and inline ``data:`` URIs in HTML parts.
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    headers = dict(msg.items())
    # msg.get() matches header names case-insensitively, unlike dict.get()
    # on the snapshot above ("subject:" in the raw mail would be missed).
    subject = str(msg.get("Subject", "") or "")

    body_parts = []
    images = []

    if msg.is_multipart():
        for part in msg.walk():
            ctype = part.get_content_type()
            disp = str(part.get("Content-Disposition") or "").lower()
            if ctype.startswith("image/"):
                # Image parts, whether inline or attachments: keep raw bytes.
                try:
                    data = part.get_payload(decode=True)
                except Exception:
                    data = None
                if data:
                    images.append(data)
            elif ctype == "text/plain" and "attachment" not in disp:
                try:
                    body_parts.append(part.get_content())
                except Exception:
                    pass  # undecodable charset — best-effort, skip the part
            elif ctype == "text/html" and "attachment" not in disp:
                try:
                    html_body = part.get_content()
                    # Inline images embedded as data: URIs inside the HTML.
                    images += _extract_inline_images_from_html(html_body)
                    soup = BeautifulSoup(html_body, "html.parser")
                    body_parts.append(soup.get_text(" ", strip=True))
                except Exception:
                    pass
    else:
        ctype = msg.get_content_type()
        try:
            if ctype == "text/html":
                html_body = msg.get_content()
                images += _extract_inline_images_from_html(html_body)
                soup = BeautifulSoup(html_body, "html.parser")
                body_parts.append(soup.get_text(" ", strip=True))
            elif ctype.startswith("image/"):
                # Single-part image message: no text body, keep the bytes.
                data = msg.get_payload(decode=True)
                if data:
                    images.append(data)
            else:
                content = msg.get_content()
                # get_content() returns bytes for non-text content types;
                # body must stay a str for the regex scan below.
                if isinstance(content, str):
                    body_parts.append(content)
        except Exception:
            pass

    body = "".join(body_parts)

    # Compile once: the pattern is reused for the body and every header.
    url_re = re.compile(r"https?://[^\s\"'<>]+")
    urls = set(url_re.findall(body))
    # Headers such as List-Unsubscribe often carry URLs too.
    for value in headers.values():
        try:
            urls.update(url_re.findall(str(value)))
        except Exception:
            pass

    return headers, subject, body, list(urls), images