File size: 861 Bytes
3bb3451
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import email
import re
from bs4 import BeautifulSoup

def parse_email(file_path):
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f)

    headers = dict(msg.items())
    
    # Extract body (handle plain + HTML)
    body = ""
    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() == "text/plain":
                body += part.get_payload(decode=True).decode(errors="ignore")
            elif part.get_content_type() == "text/html":
                html = part.get_payload(decode=True).decode(errors="ignore")
                soup = BeautifulSoup(html, "html.parser")
                body += soup.get_text()
    else:
        body = msg.get_payload(decode=True).decode(errors="ignore")

    # Extract URLs
    urls = re.findall(r'(https?://\S+)', body)

    return headers, body, urls