```python
# parse_email.py
import base64
import email
import re
from email import policy

from bs4 import BeautifulSoup


def _extract_inline_images_from_html(html):
    """Decode images embedded in <img> tags as base64 data URIs."""
    images = []
    soup = BeautifulSoup(html or "", "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src", "")
        # e.g. data:image/png;base64,iVBORw0...
        if src.startswith("data:image/"):
            try:
                _header, b64 = src.split(",", 1)
                images.append(base64.b64decode(b64))
            except Exception:
                # Malformed data URI: skip it rather than fail the whole parse.
                continue
    return images
```
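As a quick illustration of what the helper returns, here is a minimal check. It is not part of the module: it assumes the file above is saved as `parse_email.py` on the import path, and the embedded payload is a stock 1x1 transparent PNG used purely for demonstration.

```python
from parse_email import _extract_inline_images_from_html

# A 1x1 transparent PNG as a base64 data URI (illustrative payload only).
html = (
    "<p>Hello</p>"
    '<img src="data:image/png;base64,'
    "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJ"
    'AAAADUlEQVR42mP8z8BQDwAEhQGAhKmMIQAAAABJRU5ErkJggg==">'
)

images = _extract_inline_images_from_html(html)
print(len(images))    # 1
print(images[0][:4])  # b'\x89PNG' -- the PNG magic bytes
```

The main entry point below walks a parsed message and aggregates headers, body text, URLs, and image bytes.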
```python
# parse_email.py (continued)

def parse_email(file_path):
    """Parse an .eml file.

    Returns:
        headers (dict), subject (str), body (str),
        urls (list of str), images (list of bytes)
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    # Note: dict() keeps only the last value of any repeated header
    # (e.g. multiple Received lines).
    headers = dict(msg.items())
    subject = headers.get("Subject", "") or ""
    body = ""
    images = []

    if msg.is_multipart():
        # Walk every part: collect image parts and non-attachment body text.
        for part in msg.walk():
            ctype = part.get_content_type()
            disp = part.get_content_disposition()  # 'attachment', 'inline', or None
            if ctype.startswith("image/"):
                # Image parts, whether attached or inline.
                try:
                    data = part.get_payload(decode=True)
                    if data:
                        images.append(data)
                except Exception:
                    pass
            elif ctype == "text/plain" and disp != "attachment":
                try:
                    body += part.get_content()
                except Exception:
                    pass
            elif ctype == "text/html" and disp != "attachment":
                # Pull data-URI images out of the HTML, then flatten it to text.
                try:
                    html_body = part.get_content()
                    images += _extract_inline_images_from_html(html_body)
                    soup = BeautifulSoup(html_body, "html.parser")
                    body += soup.get_text(" ", strip=True)
                except Exception:
                    pass
    else:
        # Single-part message. Only text content goes into `body`, so it
        # stays a str as the docstring promises (get_content() returns
        # bytes for non-text types).
        try:
            ctype = msg.get_content_type()
            if ctype == "text/html":
                html_body = msg.get_content()
                images += _extract_inline_images_from_html(html_body)
                soup = BeautifulSoup(html_body, "html.parser")
                body = soup.get_text(" ", strip=True)
            elif ctype.startswith("image/"):
                data = msg.get_payload(decode=True)
                if data:
                    images.append(data)
            elif ctype.startswith("text/"):
                body = msg.get_content()
        except Exception:
            body = ""

    # Extract URLs from the combined body text, then from header values
    # (e.g. List-Unsubscribe often carries a URL).
    url_re = re.compile(r"https?://[^\s\"'<>]+")
    urls = set(url_re.findall(body))
    for value in headers.values():
        urls.update(url_re.findall(str(value)))

    return headers, subject, body, list(urls), images
```
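For an end-to-end check, the sketch below builds a small multipart message in memory with the standard library's `EmailMessage`, writes it to a temporary file, and runs the parser on it. The addresses, URLs, and temporary path are invented for illustration.

```python
import tempfile
from email.message import EmailMessage

from parse_email import parse_email

# Build a two-part (plain + HTML) test message entirely in memory.
test = EmailMessage()
test["Subject"] = "Test"
test["From"] = "sender@example.com"
test["To"] = "recipient@example.com"
test.set_content("Plain text with a link: https://example.com/page")
test.add_alternative(
    "<p>HTML body, visit https://example.org today</p>", subtype="html"
)

# Write it out as bytes so parse_email can read it back.
with tempfile.NamedTemporaryFile(suffix=".eml", delete=False) as f:
    f.write(test.as_bytes())
    path = f.name

headers, subject, body, urls, images = parse_email(path)
print(subject)       # Test
print(sorted(urls))  # ['https://example.com/page', 'https://example.org']
print(len(images))   # 0 -- no image parts in this message
```

Note one limitation of this design: only URLs visible in the rendered text are captured. A URL that appears solely in an `href` attribute is dropped when `get_text()` flattens the HTML, since attributes are not part of the text content.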