import re
import difflib
import whois
from datetime import datetime

# Brand keyword -> list of domains that legitimately belong to that brand.
# analyze_headers() uses the keys for a substring test against the sender
# domain and the values both as the allow-list for that test and as the
# targets of the fuzzy look-alike comparison. Extend as needed.
BRAND_OFFICIAL = {
    "paypal": ["paypal.com"],
    "amazon": ["amazon.com"],
    "google": ["google.com", "gmail.com"],
    "microsoft": ["microsoft.com", "outlook.com", "live.com"],
    "apple": ["apple.com"],
    "flowtoscale": ["flowtoscale.com"],  # Example entry kept from the original use case
}

# Cheap top-level domains that are frequently abused. Entries are compared
# against the last dot-separated label of the sender domain, so they must be
# lower-case and written without a leading dot.
SUSPICIOUS_TLDS = {"info", "xyz", "top", "click", "work", "loan", "tk"}


def get_domain_age_days(domain: str):
    """Return the age of *domain* in whole days, or None if it is unknown.

    The creation date is read from a WHOIS lookup. The lookup result may
    carry the date as a ``datetime``, an ISO-8601 string, or a list of
    either; for a list the first entry is used.

    Args:
        domain: Domain name to look up (e.g. ``"example.com"``).

    Returns:
        Number of days between the creation date and now, or ``None`` when
        the lookup fails or no usable creation date is present.

    This function never raises: WHOIS is a best-effort network call, so any
    failure is reported on stdout and mapped to ``None``.
    """
    try:
        record = whois.whois(domain)
        created = record.creation_date

        # Normalise the different shapes the creation date can arrive in.
        if isinstance(created, list) and created:
            created = created[0]
        if isinstance(created, str):
            try:
                created = datetime.fromisoformat(created)
            except ValueError:
                created = None

        if isinstance(created, datetime):
            # Subtracting a timezone-aware datetime from a naive one raises
            # TypeError, which would be swallowed below and silently disable
            # the age check. Taking "now" in the creation date's own tzinfo
            # (None for naive values) keeps both operands comparable.
            now = datetime.now(created.tzinfo)
            return (now - created).days
    except Exception as e:
        # Deliberately broad: do not crash if WHOIS fails (e.g. on Hugging Face)
        print(f"[WHOIS ERROR] Could not fetch age for {domain}: {e}")
        return None
    return None


def parse_auth_results(auth_header: str):
    """
    Summarise an Authentication-Results header in plain language.

    The header is matched case-insensitively. For each of SPF, DKIM and
    DMARC a "pass" token wins over a "fail" token; DKIM "permerror" counts
    as a failure. Returns a comma-separated summary, or a fixed message when
    the header is empty or contains none of the recognised tokens.
    """
    normalized = auth_header.lower() if auth_header else ""
    if not normalized:
        return "No Authentication-Results header found"

    # (pass tokens, pass message, fail tokens, fail message) per mechanism,
    # listed in the order the summary reports them.
    mechanisms = (
        (("spf=pass",), "SPF passed", ("spf=fail",), "SPF failed"),
        (("dkim=pass",), "DKIM passed", ("dkim=fail", "dkim=permerror"), "DKIM failed"),
        (("dmarc=pass",), "DMARC passed", ("dmarc=fail",), "DMARC failed"),
    )

    summary = []
    for pass_tokens, pass_msg, fail_tokens, fail_msg in mechanisms:
        if any(token in normalized for token in pass_tokens):
            summary.append(pass_msg)
        elif any(token in normalized for token in fail_tokens):
            summary.append(fail_msg)

    if summary:
        return ", ".join(summary)
    return "Authentication results unclear or missing"


# Captures the domain part that follows "@" in an e-mail address.
_ADDR_DOMAIN_RE = re.compile(r'@([a-zA-Z0-9.-]+)')


def _get_header(headers, name):
    """Return the value of header *name* (matched case-insensitively), or ''."""
    value = headers.get(name)
    if value:
        return value
    wanted = name.lower()
    for key, candidate in headers.items():
        if candidate and isinstance(key, str) and key.lower() == wanted:
            return candidate
    return ""


def _mailbox_domain(header_value):
    """Return the lower-cased domain of the mailbox in an address header, or ''.

    The real address is parsed out first so that an "@" inside the display
    name (e.g. '"support@paypal.com" <x@evil.xyz>') cannot masquerade as the
    sender domain. If parsing yields nothing, the raw text is searched.
    """
    from email.utils import parseaddr

    raw = str(header_value) if header_value else ""
    _, address = parseaddr(raw)
    match = _ADDR_DOMAIN_RE.search(address) or _ADDR_DOMAIN_RE.search(raw)
    # Strip stray dots so "example.com." still compares equal to "example.com".
    return match.group(1).lower().strip(".") if match else ""


def _is_official_domain(domain):
    """Return True if *domain* is, or is a subdomain of, any BRAND_OFFICIAL domain."""
    return any(
        domain == official or domain.endswith("." + official)
        for official_list in BRAND_OFFICIAL.values()
        for official in official_list
    )


def analyze_headers(headers, body=""):
    """
    Score an e-mail's headers for phishing indicators.

    Input: headers dict (header names are matched case-insensitively),
           optional body text
    Output: (findings: list[str], score: int, auth_summary: str)

    Each triggered heuristic appends a human-readable finding and adds its
    weight to the cumulative score. When nothing triggers, the findings list
    holds a single "no issues" message and the score is 0. The auth summary
    is always derived from the Authentication-Results header actually
    present.
    """
    findings = []
    score = 0
    headers = headers or {}

    auth_results = str(_get_header(headers, "Authentication-Results")).lower()

    # Strict auth failures
    if "dkim=fail" in auth_results or "dkim=permerror" in auth_results:
        findings.append("Header: DKIM check failed")
        score += 30
    if "spf=fail" in auth_results:
        findings.append("Header: SPF check failed")
        score += 30
    if "dmarc=fail" in auth_results:
        findings.append("Header: DMARC check failed")
        score += 30

    # Softer auth problems
    if any(x in auth_results for x in ["spf=softfail", "spf=neutral", "spf=none"]):
        findings.append("Header: SPF not properly aligned")
        score += 10
    if any(x in auth_results for x in ["dmarc=temperror", "dkim=temperror"]):
        findings.append("Header: Temporary auth errors (DKIM/DMARC)")
        score += 5

    # From and Reply-To domain compare (only when both are present)
    from_domain = _mailbox_domain(_get_header(headers, "From"))
    reply_domain = _mailbox_domain(_get_header(headers, "Reply-To"))
    if from_domain and reply_domain and from_domain != reply_domain:
        findings.append(f"Header: Reply-To domain mismatch (From: {from_domain}, Reply-To: {reply_domain})")
        score += 20

    # Sender domain analysis
    if from_domain:
        parts = from_domain.split('.')
        tld = parts[-1]

        # free provider detection
        if from_domain in ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com"]:
            findings.append(f"Header: Free email provider used ({from_domain})")
            score += 8

        # suspicious domain structure: very deep nesting or digits in the first label
        if len(parts) > 4 or any(ch.isdigit() for ch in parts[0]):
            findings.append(f"Header: Suspicious-looking domain structure ({from_domain})")
            score += 15

        # suspicious TLD
        if tld in SUSPICIOUS_TLDS:
            findings.append(f"Header: Suspicious/abused TLD used ({tld})")
            score += 20

        # Domain age check (get_domain_age_days returns None when WHOIS fails)
        age_days = get_domain_age_days(from_domain)
        if age_days is not None and age_days < 90:
            findings.append(f"Header: Domain {from_domain} is very new ({age_days} days old)")
            score += 35

        # brand-squatting / look-alike check.
        # Skipped entirely for genuine official domains and their subdomains:
        # otherwise e.g. apple.com scores as a look-alike of paypal.com and
        # mail.google.com as a look-alike of google.com.
        if not _is_official_domain(from_domain):
            for brand, official_list in BRAND_OFFICIAL.items():
                if brand in from_domain:
                    findings.append(f"Header: Domain contains brand '{brand}' but is not official ({from_domain})")
                    score += 30

                # fuzzy look-alike
                for legit in official_list:
                    ratio = difflib.SequenceMatcher(None, from_domain, legit).ratio()
                    if ratio > 0.7:
                        findings.append(f"Header: Possible look-alike spoofing ({from_domain} vs {legit})")
                        score += 40

        # Content-to-domain mismatch (organization spoofing)
        if body and "ravenmail" in body.lower() and "ravenmail" not in from_domain:
            findings.append("Header/Content: Possible spoofing — mentions RavenMail but sender domain is unrelated")
            score += 40

    # Bcc usage
    if _get_header(headers, "Bcc"):
        findings.append("Header: Email sent with BCC (common in mass phishing)")
        score += 12

    # Always summarise the real header; a clean message with passing
    # SPF/DKIM/DMARC must not be reported as having no header at all.
    auth_summary = parse_auth_results(auth_results)

    if not findings:
        return ["No suspicious issues found in headers."], 0, auth_summary

    # Return findings, cumulative score, and parsed authentication summary
    return findings, score, auth_summary