Spaces:

princemaxp
/

CySecGuardians

Sleeping

App Files Files Community

princemaxp committed on Sep 24

Commit

d76bad0

verified ·

1 Parent(s): 1258978

Update header_analyzer.py

Browse files

Files changed (1) hide show

header_analyzer.py +67 -22

header_analyzer.py CHANGED Viewed

@@ -1,19 +1,37 @@
-# header_analyzer.py
 import re
-# Simple brand list and their official domains for brand-squatting detection
 BRAND_OFFICIAL = {
     "paypal": ["paypal.com"],
     "amazon": ["amazon.com"],
     "google": ["google.com", "gmail.com"],
     "microsoft": ["microsoft.com", "outlook.com", "live.com"],
     "apple": ["apple.com"],
-    "bank": [],  # generic bank keyword detection (no official domain)
 }
-def analyze_headers(headers):
     """
-    Input: headers dict
     Output: (findings: list[str], score: int)
     """
     findings = []
@@ -41,7 +59,7 @@ def analyze_headers(headers):
         findings.append("Header: Temporary auth errors (DKIM/DMARC)")
         score += 5
-    # From and Reply-To domain compare (domain-level)
     from_addr = headers.get("From", "") or ""
     reply_to = headers.get("Reply-To", "") or ""
     from_domain_m = re.search(r'@([a-zA-Z0-9.-]+)', from_addr)
@@ -52,32 +70,59 @@ def analyze_headers(headers):
         if from_domain != reply_domain:
             findings.append(f"Header: Reply-To domain mismatch (From: {from_domain}, Reply-To: {reply_domain})")
             score += 20
-    # Suspicious looking sender domain & brand-squatting detection
-    sender = from_addr or ""
-    match = re.search(r'@([a-zA-Z0-9.-]+)', sender)
-    if match:
-        domain = match.group(1).lower()
-        parts = domain.split('.')
         # free provider detection
-        if any(free in domain for free in ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com"]):
-            findings.append(f"Header: Free email provider used ({domain})")
             score += 8
-        # suspicious structural domain heuristics
         if len(parts) > 4 or (parts and any(ch.isdigit() for ch in parts[0])):
-            findings.append(f"Header: Suspicious-looking domain structure ({domain})")
             score += 15
-        # brand-squatting: if domain contains a known brand but isn't exactly an official brand domain
         for brand, official_list in BRAND_OFFICIAL.items():
-            if brand in domain:
-                # check if domain is exactly an official domain or subdomain of official
-                is_official = any(domain.endswith("." + off) or domain == off for off in official_list) if official_list else False
                 if not is_official:
-                    findings.append(f"Header: Domain contains brand '{brand}' but is not official ({domain})")
                     score += 30
-    # Bcc usage detection
     if headers.get("Bcc") or headers.get("bcc"):
         findings.append("Header: Email sent with BCC (common in mass phishing)")
         score += 12

 import re
+import difflib
+import whois
+from datetime import datetime
+# Official brand domains (extend as needed)
 BRAND_OFFICIAL = {
     "paypal": ["paypal.com"],
     "amazon": ["amazon.com"],
     "google": ["google.com", "gmail.com"],
     "microsoft": ["microsoft.com", "outlook.com", "live.com"],
     "apple": ["apple.com"],
+    "flowtoscale": ["flowtoscale.com"],  # Example from your case
 }
+# Suspicious / cheap TLDs often abused
+SUSPICIOUS_TLDS = {"info", "xyz", "top", "click", "work", "loan", "tk"}
+def get_domain_age_days(domain: str):
+    """Return domain age in days (or None if lookup fails)."""
+    try:
+        w = whois.whois(domain)
+        creation_date = w.creation_date
+        if isinstance(creation_date, list):  # sometimes returns list
+            creation_date = creation_date[0]
+        if creation_date:
+            return (datetime.now() - creation_date).days
+    except Exception:
+        return None
+    return None
+def analyze_headers(headers, body=""):
     """
+    Input: headers dict, optional body text
     Output: (findings: list[str], score: int)
     """
     findings = []
         findings.append("Header: Temporary auth errors (DKIM/DMARC)")
         score += 5
+    # From and Reply-To domain compare
     from_addr = headers.get("From", "") or ""
     reply_to = headers.get("Reply-To", "") or ""
     from_domain_m = re.search(r'@([a-zA-Z0-9.-]+)', from_addr)
         if from_domain != reply_domain:
             findings.append(f"Header: Reply-To domain mismatch (From: {from_domain}, Reply-To: {reply_domain})")
             score += 20
+    else:
+        from_domain = from_domain_m.group(1).lower() if from_domain_m else ""
+    # Sender domain analysis
+    if from_domain:
+        parts = from_domain.split('.')
+        tld = parts[-1]
         # free provider detection
+        if from_domain in ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com"]:
+            findings.append(f"Header: Free email provider used ({from_domain})")
             score += 8
+        # suspicious domain structure
         if len(parts) > 4 or (parts and any(ch.isdigit() for ch in parts[0])):
+            findings.append(f"Header: Suspicious-looking domain structure ({from_domain})")
             score += 15
+        # suspicious TLD
+        if tld in SUSPICIOUS_TLDS:
+            findings.append(f"Header: Suspicious/abused TLD used ({tld})")
+            score += 20
+        # Domain age check
+        age_days = get_domain_age_days(from_domain)
+        if age_days is not None and age_days < 90:
+            findings.append(f"Header: Domain {from_domain} is very new ({age_days} days old)")
+            score += 35
+        # brand-squatting / look-alike check
         for brand, official_list in BRAND_OFFICIAL.items():
+            if brand in from_domain:
+                is_official = any(
+                    from_domain.endswith("." + off) or from_domain == off
+                    for off in official_list
+                )
                 if not is_official:
+                    findings.append(f"Header: Domain contains brand '{brand}' but is not official ({from_domain})")
                     score += 30
+            # fuzzy look-alike
+            for legit in official_list:
+                ratio = difflib.SequenceMatcher(None, from_domain, legit).ratio()
+                if ratio > 0.7 and from_domain != legit:
+                    findings.append(f"Header: Possible look-alike spoofing ({from_domain} vs {legit})")
+                    score += 40
+        # Content-to-domain mismatch (organization spoofing)
+        if body and "ravenmail" in body.lower() and "ravenmail" not in from_domain:
+            findings.append("Header/Content: Possible spoofing — mentions RavenMail but sender domain is unrelated")
+            score += 40
+    # Bcc usage
     if headers.get("Bcc") or headers.get("bcc"):
         findings.append("Header: Email sent with BCC (common in mass phishing)")
         score += 12