Spaces:
Sleeping
Sleeping
| import email | |
| import re | |
| from bs4 import BeautifulSoup | |
| def parse_email(file_path): | |
| with open(file_path, "rb") as f: | |
| msg = email.message_from_binary_file(f) | |
| headers = dict(msg.items()) | |
| # Extract body (handle plain + HTML) | |
| body = "" | |
| if msg.is_multipart(): | |
| for part in msg.walk(): | |
| if part.get_content_type() == "text/plain": | |
| body += part.get_payload(decode=True).decode(errors="ignore") | |
| elif part.get_content_type() == "text/html": | |
| html = part.get_payload(decode=True).decode(errors="ignore") | |
| soup = BeautifulSoup(html, "html.parser") | |
| body += soup.get_text() | |
| else: | |
| body = msg.get_payload(decode=True).decode(errors="ignore") | |
| # Extract URLs | |
| urls = re.findall(r'(https?://\S+)', body) | |
| return headers, body, urls |