Spaces:
Running
Running
| import email | |
| from email import policy | |
| from bs4 import BeautifulSoup | |
| import re | |
def parse_email(file_path):
    """Parse a raw RFC 822/MIME email file into headers, body text, and URLs.

    Parameters
    ----------
    file_path : str or path-like
        Path to the raw email (.eml) file; read in binary mode so the
        email package handles charset decoding via the modern API.

    Returns
    -------
    tuple[dict, str, list[str]]
        ``(headers, body, urls)`` where:

        * ``headers`` maps header names to values. NOTE: ``dict()`` keeps
          only the last value of repeated headers (e.g. ``Received``).
        * ``body`` is the concatenated text of every ``text/plain`` part
          plus every ``text/html`` part stripped to plain text.
        * ``urls`` is a de-duplicated (unordered) list of URLs found in
          the body text and in HTML ``<a href>`` attributes.
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    # --- 1. Extract headers ---
    headers = dict(msg.items())

    # --- 2 & 3. Extract body text and URLs in a single pass ---
    # walk() yields the message itself when it is not multipart, so one
    # loop covers both cases.  This also fixes two defects of the old
    # two-pass version: HTML parts are no longer parsed twice, and a
    # non-multipart text/html message now yields stripped text in `body`
    # instead of raw HTML markup.
    body_parts = []
    urls = set()
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type == "text/plain":
            try:
                body_parts.append(part.get_content())
            except Exception:
                # Deliberate best-effort: skip parts with broken charsets
                # or malformed transfer encodings rather than failing the
                # whole parse.  (Narrowed from a bare `except:` so that
                # KeyboardInterrupt/SystemExit still propagate.)
                continue
        elif content_type == "text/html":
            try:
                html_body = part.get_content()
            except Exception:
                continue  # same best-effort policy as above
            soup = BeautifulSoup(html_body, "html.parser")
            body_parts.append(soup.get_text(" ", strip=True))
            # Harvest explicit hyperlinks while we already have the tree.
            for link in soup.find_all("a", href=True):
                urls.add(link["href"])

    # str.join avoids the quadratic cost of repeated `body += ...`.
    body = "".join(body_parts)
    # Also catch bare URLs typed into the text itself.
    urls.update(re.findall(r"https?://[^\s]+", body))
    return headers, body, list(urls)