princemaxp committed on
Commit
d76bad0
·
verified ·
1 Parent(s): 1258978

Update header_analyzer.py

Browse files
Files changed (1) hide show
  1. header_analyzer.py +67 -22
header_analyzer.py CHANGED
@@ -1,19 +1,37 @@
1
- # header_analyzer.py
2
  import re
 
 
 
3
 
4
- # Simple brand list and their official domains for brand-squatting detection
5
  BRAND_OFFICIAL = {
6
  "paypal": ["paypal.com"],
7
  "amazon": ["amazon.com"],
8
  "google": ["google.com", "gmail.com"],
9
  "microsoft": ["microsoft.com", "outlook.com", "live.com"],
10
  "apple": ["apple.com"],
11
- "bank": [], # generic bank keyword detection (no official domain)
12
  }
13
 
14
- def analyze_headers(headers):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  """
16
- Input: headers dict
17
  Output: (findings: list[str], score: int)
18
  """
19
  findings = []
@@ -41,7 +59,7 @@ def analyze_headers(headers):
41
  findings.append("Header: Temporary auth errors (DKIM/DMARC)")
42
  score += 5
43
 
44
- # From and Reply-To domain compare (domain-level)
45
  from_addr = headers.get("From", "") or ""
46
  reply_to = headers.get("Reply-To", "") or ""
47
  from_domain_m = re.search(r'@([a-zA-Z0-9.-]+)', from_addr)
@@ -52,32 +70,59 @@ def analyze_headers(headers):
52
  if from_domain != reply_domain:
53
  findings.append(f"Header: Reply-To domain mismatch (From: {from_domain}, Reply-To: {reply_domain})")
54
  score += 20
 
 
 
 
 
 
 
55
 
56
- # Suspicious looking sender domain & brand-squatting detection
57
- sender = from_addr or ""
58
- match = re.search(r'@([a-zA-Z0-9.-]+)', sender)
59
- if match:
60
- domain = match.group(1).lower()
61
- parts = domain.split('.')
62
  # free provider detection
63
- if any(free in domain for free in ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com"]):
64
- findings.append(f"Header: Free email provider used ({domain})")
65
  score += 8
66
- # suspicious structural domain heuristics
 
67
  if len(parts) > 4 or (parts and any(ch.isdigit() for ch in parts[0])):
68
- findings.append(f"Header: Suspicious-looking domain structure ({domain})")
69
  score += 15
70
 
71
- # brand-squatting: if domain contains a known brand but isn't exactly an official brand domain
 
 
 
 
 
 
 
 
 
 
 
72
  for brand, official_list in BRAND_OFFICIAL.items():
73
- if brand in domain:
74
- # check if domain is exactly an official domain or subdomain of official
75
- is_official = any(domain.endswith("." + off) or domain == off for off in official_list) if official_list else False
 
 
76
  if not is_official:
77
- findings.append(f"Header: Domain contains brand '{brand}' but is not official ({domain})")
78
  score += 30
79
 
80
- # Bcc usage detection
 
 
 
 
 
 
 
 
 
 
 
 
81
  if headers.get("Bcc") or headers.get("bcc"):
82
  findings.append("Header: Email sent with BCC (common in mass phishing)")
83
  score += 12
 
 
1
  import re
2
+ import difflib
3
+ import whois
4
+ from datetime import datetime
5
 
6
# Brand keyword -> domains legitimately operated by that brand.
# A sender domain that contains the keyword but is neither one of the listed
# domains nor a subdomain of one is reported as brand-squatting.
# Extend as needed.
BRAND_OFFICIAL = {
    "paypal": ["paypal.com"],
    "amazon": ["amazon.com"],
    "google": ["google.com", "gmail.com"],
    "microsoft": ["microsoft.com", "outlook.com", "live.com"],
    "apple": ["apple.com"],
    "flowtoscale": ["flowtoscale.com"],  # example entry
}

# Cheap TLDs that are frequently abused; compared against the last label of
# the sender domain.
SUSPICIOUS_TLDS = {
    "info",
    "xyz",
    "top",
    "click",
    "work",
    "loan",
    "tk",
}
18
+
19
def get_domain_age_days(domain: str):
    """Return the age of *domain* in whole days, or None if it is unknown.

    The creation date is taken from a WHOIS lookup (``whois.whois``). The
    lookup is best-effort: any failure yields None so the caller can simply
    skip the domain-age heuristic instead of aborting the whole analysis.

    Args:
        domain: Domain name to look up, e.g. ``"example.com"``.

    Returns:
        Non-negative number of days since the recorded creation date, or
        None when the domain is empty, the lookup fails, or the record holds
        no usable creation date.
    """
    # Function-scope import: the module header only imports ``datetime``.
    from datetime import timezone

    if not domain:
        # Nothing to look up; avoid a pointless network round-trip.
        return None

    try:
        created = whois.whois(domain).creation_date
    except Exception:
        # Deliberately broad: the lookup can fail with socket errors, parser
        # errors or missing fields, and none of them should break scoring.
        return None

    # Some registries report several creation dates; use the first entry
    # that is an actual datetime (entries may also be unparsed strings).
    if isinstance(created, (list, tuple)):
        created = next((c for c in created if isinstance(c, datetime)), None)

    if not isinstance(created, datetime):
        return None

    # Compare aware with aware: subtracting an aware datetime from a naive
    # ``datetime.now()`` raises TypeError, which previously was swallowed and
    # silently disabled the age check.
    # NOTE(review): naive WHOIS dates are assumed to be UTC - confirm.
    if created.tzinfo is None:
        created = created.replace(tzinfo=timezone.utc)

    age_days = (datetime.now(timezone.utc) - created).days

    # A creation date in the future (bad record / clock skew) counts as 0.
    return max(age_days, 0)
31
+
32
+ def analyze_headers(headers, body=""):
33
  """
34
+ Input: headers dict, optional body text
35
  Output: (findings: list[str], score: int)
36
  """
37
  findings = []
 
59
  findings.append("Header: Temporary auth errors (DKIM/DMARC)")
60
  score += 5
61
 
62
+ # From and Reply-To domain compare
63
  from_addr = headers.get("From", "") or ""
64
  reply_to = headers.get("Reply-To", "") or ""
65
  from_domain_m = re.search(r'@([a-zA-Z0-9.-]+)', from_addr)
 
70
  if from_domain != reply_domain:
71
  findings.append(f"Header: Reply-To domain mismatch (From: {from_domain}, Reply-To: {reply_domain})")
72
  score += 20
73
+ else:
74
+ from_domain = from_domain_m.group(1).lower() if from_domain_m else ""
75
+
76
+ # Sender domain analysis
77
+ if from_domain:
78
+ parts = from_domain.split('.')
79
+ tld = parts[-1]
80
 
 
 
 
 
 
 
81
  # free provider detection
82
+ if from_domain in ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com"]:
83
+ findings.append(f"Header: Free email provider used ({from_domain})")
84
  score += 8
85
+
86
+ # suspicious domain structure
87
  if len(parts) > 4 or (parts and any(ch.isdigit() for ch in parts[0])):
88
+ findings.append(f"Header: Suspicious-looking domain structure ({from_domain})")
89
  score += 15
90
 
91
+ # suspicious TLD
92
+ if tld in SUSPICIOUS_TLDS:
93
+ findings.append(f"Header: Suspicious/abused TLD used ({tld})")
94
+ score += 20
95
+
96
+ # Domain age check
97
+ age_days = get_domain_age_days(from_domain)
98
+ if age_days is not None and age_days < 90:
99
+ findings.append(f"Header: Domain {from_domain} is very new ({age_days} days old)")
100
+ score += 35
101
+
102
+ # brand-squatting / look-alike check
103
  for brand, official_list in BRAND_OFFICIAL.items():
104
+ if brand in from_domain:
105
+ is_official = any(
106
+ from_domain.endswith("." + off) or from_domain == off
107
+ for off in official_list
108
+ )
109
  if not is_official:
110
+ findings.append(f"Header: Domain contains brand '{brand}' but is not official ({from_domain})")
111
  score += 30
112
 
113
+ # fuzzy look-alike
114
+ for legit in official_list:
115
+ ratio = difflib.SequenceMatcher(None, from_domain, legit).ratio()
116
+ if ratio > 0.7 and from_domain != legit:
117
+ findings.append(f"Header: Possible look-alike spoofing ({from_domain} vs {legit})")
118
+ score += 40
119
+
120
+ # Content-to-domain mismatch (organization spoofing)
121
+ if body and "ravenmail" in body.lower() and "ravenmail" not in from_domain:
122
+ findings.append("Header/Content: Possible spoofing — mentions RavenMail but sender domain is unrelated")
123
+ score += 40
124
+
125
+ # Bcc usage
126
  if headers.get("Bcc") or headers.get("bcc"):
127
  findings.append("Header: Email sent with BCC (common in mass phishing)")
128
  score += 12