Spaces:

princemaxp
/

CySecGuardians

Sleeping

App Files Community

princemaxp committed on Sep 24

Commit

4038d7a

verified ·

1 Parent(s): e99affa

Update header_analyzer.py

Browse files

Files changed (1) hide show

header_analyzer.py +158 -156

header_analyzer.py CHANGED Viewed

@@ -1,165 +1,167 @@
-# body_analyzer.py
-import os
 import re
-import requests
-from typing import List
-HF_API_KEY = os.getenv("HF_API_KEY")
-HF_HEADERS = {"Authorization": f"Bearer {HF_API_KEY}"} if HF_API_KEY else {}
-HF_TIMEOUT = 20  # seconds
-# ML model names
-PHISHING_MODEL = "cybersectony/phishing-email-detection-distilbert_v2.4.1"
-ZERO_SHOT_MODEL = "facebook/bart-large-mnli"  # for intent/behavior
-# Suspicious phrase patterns
-SUSPICIOUS_PATTERNS = [
-    "verify your account",
-    "urgent action",
-    "click here",
-    "reset password",
-    "confirm your identity",
-    "bank account",
-    "invoice",
-    "payment required",
-    "unauthorized login",
-    "compromised",
-    "final reminder",
-    "account suspended",
-    "account deactivated",
-    "update your information",
-    "legal action",
-    "limited time offer",
-    "claim your prize",
-    "verify immediately",
-    "verify now",
-    "verify your credentials",
-]
-# Zero-shot candidate labels for intent/behavior
-BEHAVIOR_LABELS = [
-    "credential harvesting",
-    "invoice/payment fraud",
-    "marketing",
-    "benign",
-    "malware",
-    "account takeover",
-]
-def _call_hf_text_model(model_name: str, text: str):
-    if not HF_API_KEY:
-        return None
-    try:
-        payload = {"inputs": text}
-        res = requests.post(
-            f"https://api-inference.huggingface.co/models/{model_name}",
-            headers=HF_HEADERS,
-            json=payload,
-            timeout=HF_TIMEOUT,
-        )
-        return res.json()
-    except Exception:
-        return None
-def _call_hf_zero_shot(text: str, candidate_labels: List[str]):
-    if not HF_API_KEY:
-        return None
     try:
-        payload = {"inputs": text, "parameters": {"candidate_labels": candidate_labels}}
-        res = requests.post(
-            f"https://api-inference.huggingface.co/models/{ZERO_SHOT_MODEL}",
-            headers=HF_HEADERS,
-            json=payload,
-            timeout=HF_TIMEOUT,
-        )
-        return res.json()
     except Exception:
         return None
-def _parse_hf_phishing_model_output(result):
-    if not result:
-        return None, 0.0, {}
-    if isinstance(result, list) and result and isinstance(result[0], dict):
-        r0 = result[0]
-        label = r0.get("label")
-        score = r0.get("score", 0.0)
-        return label, float(score), {label: float(score)}
-    if isinstance(result, dict):
-        labels = result.get("labels") or result.get("label") or []
-        scores = result.get("scores") or result.get("score") or []
-        if isinstance(labels, list) and isinstance(scores, list) and labels and scores:
-            all_probs = {lab: float(sc) for lab, sc in zip(labels, scores)}
-            max_lab = max(all_probs.items(), key=lambda x: x[1])
-            return max_lab[0], float(max_lab[1]), all_probs
-    return None, 0.0, {}
-def analyze_body(subject: str, body: str, urls: list, images: list):
     findings = []
     score = 0
-    highlighted_body = (body or "")
-    combined_lower = ((subject or "") + "\n" + (body or "")).lower()
-    for pattern in SUSPICIOUS_PATTERNS:
-        if pattern in combined_lower:
-            findings.append(f"Suspicious phrase detected: \"{pattern}\"")
-            score += 18
-            try:
-                highlighted_body = re.sub(re.escape(pattern), f"<mark>{pattern}</mark>", highlighted_body, flags=re.IGNORECASE)
-            except Exception:
-                pass
-    # URL checks
-    for u in urls or []:
-        findings.append(f"Suspicious URL detected: {u}")
         score += 10
-        try:
-            highlighted_body = re.sub(re.escape(u), f"<mark>{u}</mark>", highlighted_body, flags=re.IGNORECASE)
-        except Exception:
-            pass
-    # ML phishing model
-    ml_label = None
-    ml_conf = 0.0
-    model_input = "\n".join([subject or "", body or "", "\n".join(urls or [])]).strip()
-    if model_input and HF_API_KEY:
-        raw = _call_hf_text_model(PHISHING_MODEL, model_input)
-        label, conf, _ = _parse_hf_phishing_model_output(raw)
-        if label:
-            ml_label = label
-            ml_conf = conf
-            findings.append(f"HuggingFace phishing model → {label} (conf {conf:.2f})")
-            score += int(conf * 100 * 0.9)
-    # Zero-shot behavior
-    behavior = None
-    behavior_conf = 0.0
-    if HF_API_KEY and model_input:
-        zs = _call_hf_zero_shot(model_input, BEHAVIOR_LABELS)
-        try:
-            if isinstance(zs, dict) and "labels" in zs and "scores" in zs:
-                behavior = zs["labels"][0]
-                behavior_conf = float(zs["scores"][0])
-                findings.append(f"Behavior inference → {behavior} (conf {behavior_conf:.2f})")
-                if behavior_conf >= 0.7:
-                    score += int(behavior_conf * 30)
-        except Exception:
-            pass
-    if ml_conf >= 0.8 and ("phishing" in (ml_label or "").lower()):
-        score = max(score, 80)
-    score = int(max(0, min(score, 100)))
-    # Verdict
-    if score >= 70:
-        verdict = "🚨 Malicious"
-    elif 50 <= score < 70:
-        verdict = "⚠️ Suspicious"
-    elif 30 <= score < 50:
-        verdict = "📩 Spam"
     else:
-        verdict = "✅ Safe"
-        findings.append("No strong phishing signals detected by models/heuristics.")
-    # Return exactly 4 values
-    return findings, score, highlighted_body, verdict

 import re
+import difflib
+import whois
+from datetime import datetime
+# Official brand domains (extend as needed)
+BRAND_OFFICIAL = {
+    "paypal": ["paypal.com"],
+    "amazon": ["amazon.com"],
+    "google": ["google.com", "gmail.com"],
+    "microsoft": ["microsoft.com", "outlook.com", "live.com"],
+    "apple": ["apple.com"],
+    "flowtoscale": ["flowtoscale.com"],  # Example from your case
+}
+# Suspicious / cheap TLDs often abused
+SUSPICIOUS_TLDS = {"info", "xyz", "top", "click", "work", "loan", "tk"}
+def get_domain_age_days(domain: str):
+    """Return domain age in days (or None if lookup fails)."""
     try:
+        w = whois.whois(domain)
+        creation_date = w.creation_date
+        if isinstance(creation_date, list):  # sometimes returns list
+            creation_date = creation_date[0]
+        if creation_date:
+            return (datetime.now() - creation_date).days
     except Exception:
         return None
+    return None
+def parse_auth_results(auth_header: str):
+    """
+    Parse the Authentication-Results header and return a readable summary.
+    """
+    auth_header = (auth_header or "").lower()
+    findings = []
+    if not auth_header:
+        return "No Authentication-Results header found"
+    # SPF
+    if "spf=pass" in auth_header:
+        findings.append("SPF passed")
+    elif "spf=fail" in auth_header:
+        findings.append("SPF failed")
+    # DKIM
+    if "dkim=pass" in auth_header:
+        findings.append("DKIM passed")
+    elif "dkim=fail" in auth_header or "dkim=permerror" in auth_header:
+        findings.append("DKIM failed")
+    # DMARC
+    if "dmarc=pass" in auth_header:
+        findings.append("DMARC passed")
+    elif "dmarc=fail" in auth_header:
+        findings.append("DMARC failed")
+    if not findings:
+        return "Authentication results unclear or missing"
+    return ", ".join(findings)
+def analyze_headers(headers, body=""):
+    """
+    Input: headers dict, optional body text
+    Output: (findings: list[str], score: int, auth_summary: str)
+    """
     findings = []
     score = 0
+    headers = headers or {}
+    auth_results = (headers.get("Authentication-Results") or headers.get("Authentication-results") or "").lower()
+    # Strict auth failures
+    if "dkim=fail" in auth_results or "dkim=permerror" in auth_results:
+        findings.append("Header: DKIM check failed")
+        score += 30
+    if "spf=fail" in auth_results:
+        findings.append("Header: SPF check failed")
+        score += 30
+    if "dmarc=fail" in auth_results:
+        findings.append("Header: DMARC check failed")
+        score += 30
+    # Softer auth problems
+    if any(x in auth_results for x in ["spf=softfail", "spf=neutral", "spf=none"]):
+        findings.append("Header: SPF not properly aligned")
         score += 10
+    if any(x in auth_results for x in ["dmarc=temperror", "dkim=temperror"]):
+        findings.append("Header: Temporary auth errors (DKIM/DMARC)")
+        score += 5
+    # From and Reply-To domain compare
+    from_addr = headers.get("From", "") or ""
+    reply_to = headers.get("Reply-To", "") or ""
+    from_domain_m = re.search(r'@([a-zA-Z0-9.-]+)', from_addr)
+    reply_domain_m = re.search(r'@([a-zA-Z0-9.-]+)', reply_to)
+    if from_domain_m and reply_domain_m:
+        from_domain = from_domain_m.group(1).lower()
+        reply_domain = reply_domain_m.group(1).lower()
+        if from_domain != reply_domain:
+            findings.append(f"Header: Reply-To domain mismatch (From: {from_domain}, Reply-To: {reply_domain})")
+            score += 20
     else:
+        from_domain = from_domain_m.group(1).lower() if from_domain_m else ""
+    # Sender domain analysis
+    if from_domain:
+        parts = from_domain.split('.')
+        tld = parts[-1]
+        # free provider detection
+        if from_domain in ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com"]:
+            findings.append(f"Header: Free email provider used ({from_domain})")
+            score += 8
+        # suspicious domain structure
+        if len(parts) > 4 or (parts and any(ch.isdigit() for ch in parts[0])):
+            findings.append(f"Header: Suspicious-looking domain structure ({from_domain})")
+            score += 15
+        # suspicious TLD
+        if tld in SUSPICIOUS_TLDS:
+            findings.append(f"Header: Suspicious/abused TLD used ({tld})")
+            score += 20
+        # Domain age check
+        age_days = get_domain_age_days(from_domain)
+        if age_days is not None and age_days < 90:
+            findings.append(f"Header: Domain {from_domain} is very new ({age_days} days old)")
+            score += 35
+        # brand-squatting / look-alike check
+        for brand, official_list in BRAND_OFFICIAL.items():
+            if brand in from_domain:
+                is_official = any(
+                    from_domain.endswith("." + off) or from_domain == off
+                    for off in official_list
+                )
+                if not is_official:
+                    findings.append(f"Header: Domain contains brand '{brand}' but is not official ({from_domain})")
+                    score += 30
+            # fuzzy look-alike
+            for legit in official_list:
+                ratio = difflib.SequenceMatcher(None, from_domain, legit).ratio()
+                if ratio > 0.7 and from_domain != legit:
+                    findings.append(f"Header: Possible look-alike spoofing ({from_domain} vs {legit})")
+                    score += 40
+        # Content-to-domain mismatch (organization spoofing)
+        if body and "ravenmail" in body.lower() and "ravenmail" not in from_domain:
+            findings.append("Header/Content: Possible spoofing — mentions RavenMail but sender domain is unrelated")
+            score += 40
+    # Bcc usage
+    if headers.get("Bcc") or headers.get("bcc"):
+        findings.append("Header: Email sent with BCC (common in mass phishing)")
+        score += 12
+    if not findings:
+        return ["No suspicious issues found in headers."], 0, "No Authentication-Results header found"
+    # Return findings, cumulative score, and parsed authentication summary
+    return findings, score, parse_auth_results(auth_results)