Spaces:

princemaxp
/

CySecGuardians

Sleeping

App Files Files Community

princemaxp committed on Sep 5

Commit

9b5168a

verified ·

1 Parent(s): da36e3d

Update body_analyzer.py

Browse files

Files changed (1) hide show

body_analyzer.py +249 -96

body_analyzer.py CHANGED Viewed

@@ -1,127 +1,280 @@
-import requests
 import os
 import re
 HF_API_KEY = os.getenv("HF_API_KEY")
 HF_HEADERS = {"Authorization": f"Bearer {HF_API_KEY}"} if HF_API_KEY else {}
-MODELS = {
-    "ai_detector": "roberta-base-openai-detector",
-    "sentiment": "finiteautomata/bertweet-base-sentiment-analysis",
-    "spam": "mrm8488/bert-tiny-finetuned-sms-spam-detection",
-}
 SUSPICIOUS_PATTERNS = [
-    r"verify your account",
-    r"urgent action",
-    r"click here",
-    r"reset (your )?password",
-    r"confirm (your )?identity",
-    r"bank account",
-    r"invoice",
-    r"payment (required|overdue|failed|method expired)",
-    r"unauthorized login",
-    r"compromised",
-    r"final reminder",
-    r"account (suspended|deactivated|locked)",
-    r"update your (information|details|billing)",
-    r"legal action",
-    r"free trial",
-    r"limited time offer",
-    r"click below",
-    r"winner",
-    r"congratulations",
-    r"urgent response",
-    r"claim your prize",
-    r"act now",
-    r"unsubscribe",
-    r"lottery",
-    r"risk-free",
 ]
-def query_hf(model, text):
     if not HF_API_KEY:
         return None
     try:
         res = requests.post(
-            f"https://api-inference.huggingface.co/models/{model}",
             headers=HF_HEADERS,
-            json={"inputs": text[:1000]},
-            timeout=15,
         )
         return res.json()
     except Exception:
         return None
-def parse_hf_result(result):
-    # Common shapes: [{"label": "...", "score": ...}] or {"labels":[...], "scores":[...]}
     if not result:
-        return None, None
-    if isinstance(result, list) and result and isinstance(result[0], dict):
-        if "label" in result[0] and "score" in result[0]:
-            return result[0]["label"], result[0]["score"]
     if isinstance(result, dict):
-        labels = result.get("labels") or []
-        scores = result.get("scores") or []
-        if labels and scores:
-            return labels[0], scores[0]
-    return None, None
-def analyze_body(text):
     findings = []
     score = 0
-    body_lower = (text or "").lower()
-    highlighted_body = text or ""
-    # 1) Suspicious phrases
     for pattern in SUSPICIOUS_PATTERNS:
-        matches = re.findall(pattern, body_lower)
-        for match in matches:
-            display = match if isinstance(match, str) else (match[0] if match else "")
-            if not display:
-                continue
-            findings.append(f'Suspicious phrase detected: "{display}"')
-            score += 15  # tuned down to reduce instant Malicious
-            highlighted_body = re.sub(
-                re.escape(display),
-                f"<mark>{display}</mark>",
-                highlighted_body,
-                flags=re.IGNORECASE,
-            )
-    # 2) URLs
-    urls = re.findall(r'https?://[^\s]+', body_lower)
-    for url in urls:
-        findings.append(f"Suspicious URL detected: {url}")
         score += 10
-        highlighted_body = re.sub(re.escape(url), f"<mark>{url}</mark>", highlighted_body, flags=re.IGNORECASE)
-    # 3) AI text detector
-    label, confidence = parse_hf_result(query_hf(MODELS["ai_detector"], text or ""))
-    if label:
-        findings.append(f"Body: AI Detector → {label} (confidence {confidence:.2f})")
-    # 4) Sentiment
-    label, confidence = parse_hf_result(query_hf(MODELS["sentiment"], text or ""))
-    if label:
-        findings.append(f"Body: Sentiment → {label} (confidence {confidence:.2f})")
-        if label.lower() == "negative":
-            score += 10
-    # 5) Spam detector
-    label, confidence = parse_hf_result(query_hf(MODELS["spam"], text or ""))
-    if label:
-        findings.append(f"Body: Spam Detector → {label} (confidence {confidence:.2f})")
-        if label.lower() == "spam":
-            score += 25
-    # 6) Verdict
-    if score >= 50:
-        verdict = "Malicious / Spam"
-    elif score >= 20:
-        verdict = "Suspicious"
     else:
-        verdict = "Safe"
-        findings.append("No suspicious content detected in body.")
     return findings, score, highlighted_body, verdict

+# body_analyzer.py
 import os
 import re
+import requests
+import base64
+import io
+from typing import List
 HF_API_KEY = os.getenv("HF_API_KEY")
 HF_HEADERS = {"Authorization": f"Bearer {HF_API_KEY}"} if HF_API_KEY else {}
+HF_TIMEOUT = 20  # seconds
+# ML model names
+PHISHING_MODEL = "cybersectony/phishing-email-detection-distilbert_v2.4.1"
+ZERO_SHOT_MODEL = "facebook/bart-large-mnli"  # for intent/behavior
+# Suspicious phrase patterns (lowercased when matching)
 SUSPICIOUS_PATTERNS = [
+    "verify your account",
+    "urgent action",
+    "click here",
+    "reset password",
+    "confirm your identity",
+    "bank account",
+    "invoice",
+    "payment required",
+    "unauthorized login",
+    "compromised",
+    "final reminder",
+    "account suspended",
+    "account deactivated",
+    "update your information",
+    "legal action",
+    "limited time offer",
+    "claim your prize",
+    "verify immediately",
+    "verify now",
+    "verify your credentials",
 ]
+# zero-shot candidate labels for message behavior
+BEHAVIOR_LABELS = [
+    "credential harvesting",
+    "invoice/payment fraud",
+    "marketing",
+    "benign",
+    "malware",
+    "account takeover",
+]
def _call_hf_text_model(model_name: str, text: str):
    """Send ``text`` to a Hugging Face Inference API text model.

    The decoded JSON body is returned unchanged; the HTTP status is not
    inspected, so an API error payload comes back like any other JSON value.
    Returns ``None`` when ``HF_API_KEY`` is not set or when the request or
    the JSON decoding raises.
    """
    if not HF_API_KEY:
        return None

    try:
        response = requests.post(
            f"https://api-inference.huggingface.co/models/{model_name}",
            headers=HF_HEADERS,
            json={"inputs": text},
            timeout=HF_TIMEOUT,
        )
        decoded = response.json()
    except Exception:
        # Best-effort call: any failure simply means "no model result".
        return None
    return decoded
def _call_hf_zero_shot(text: str, candidate_labels: List[str]):
    """Classify ``text`` against ``candidate_labels`` with the zero-shot model.

    Posts to the ``ZERO_SHOT_MODEL`` endpoint of the Hugging Face Inference
    API and returns the decoded JSON body unchanged. Returns ``None`` when
    ``HF_API_KEY`` is not set or when the request or JSON decoding raises.
    """
    if not HF_API_KEY:
        return None

    try:
        response = requests.post(
            f"https://api-inference.huggingface.co/models/{ZERO_SHOT_MODEL}",
            headers=HF_HEADERS,
            json={
                "inputs": text,
                "parameters": {"candidate_labels": candidate_labels},
            },
            timeout=HF_TIMEOUT,
        )
        decoded = response.json()
    except Exception:
        # Best-effort call: any failure simply means "no model result".
        return None
    return decoded
def _call_hf_image_ocr(model_name: str, image_bytes: bytes):
    """Upload raw image bytes to a Hugging Face model and pull text out of the reply.

    The image is sent as an ``application/octet-stream`` request body.
    Recognized reply shapes:

    * a dict containing ``"generated_text"`` -> that value;
    * a non-empty list whose first item is a dict -> the first truthy value
      among its ``"generated_text"``, ``"text"`` and ``"caption"`` keys
      (or the ``"caption"`` value when none is truthy);
    * a plain string -> the string itself.

    Returns ``None`` for any other shape, when ``HF_API_KEY`` is not set, or
    when the request or JSON decoding raises.
    """
    if not HF_API_KEY:
        return None

    try:
        upload_headers = dict(HF_HEADERS)
        upload_headers["Content-Type"] = "application/octet-stream"
        response = requests.post(
            f"https://api-inference.huggingface.co/models/{model_name}",
            headers=upload_headers,
            data=image_bytes,
            timeout=HF_TIMEOUT,
        )
        payload = response.json()

        if isinstance(payload, str):
            return payload
        if isinstance(payload, dict) and "generated_text" in payload:
            return payload["generated_text"]
        if isinstance(payload, list) and payload and isinstance(payload[0], dict):
            first = payload[0]
            for key in ("generated_text", "text"):
                value = first.get(key)
                if value:
                    return value
            # Last candidate is returned as-is, even when missing or empty.
            return first.get("caption")
    except Exception:
        # Best-effort: network, decoding or shape errors all mean "no text".
        pass
    return None
# Local OCR fallback, used when the hosted OCR call yields nothing.
def _ocr_local_pytesseract(image_bytes):
    """Run pytesseract on ``image_bytes`` and return the recognized text.

    Pillow and pytesseract are imported lazily inside the function. Any
    failure (a missing package, undecodable image data, an OCR error) is
    swallowed and reported as ``None``.
    """
    try:
        import io

        import pytesseract
        from PIL import Image

        rgb_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        return pytesseract.image_to_string(rgb_image)
    except Exception:
        return None
def _parse_hf_phishing_model_output(result):
    """Normalize a classification response into ``(label, confidence, all_probs)``.

    Accepted shapes:

    * ``[{"label": ..., "score": ...}, ...]`` - flat list of predictions;
    * ``[[{"label": ..., "score": ...}, ...]]`` - the same list wrapped in an
      outer per-input list (only the first input is read);
    * ``{"labels": [...], "scores": [...]}`` - parallel lists;
    * ``{"label": ..., "score": ...}`` - a single prediction.

    Returns:
        A tuple ``(label, confidence, all_probs)`` where ``label`` is the
        highest-scoring label (the earliest one on ties), ``confidence`` is
        its score as a float and ``all_probs`` maps every parsed label to its
        score. ``(None, 0.0, {})`` is returned for empty, malformed or
        unrecognized input (for example an ``{"error": ...}`` payload).
    """
    empty = (None, 0.0, {})
    if not result:
        return empty

    pairs = []
    if isinstance(result, list):
        entries = result
        # Unwrap the outer per-input list when present.
        if isinstance(entries[0], list):
            entries = entries[0]
        for entry in entries:
            if isinstance(entry, dict):
                pairs.append((entry.get("label"), entry.get("score")))
    elif isinstance(result, dict):
        labels = result.get("labels") or result.get("label") or []
        scores = result.get("scores") or result.get("score") or []
        # Promote the single-prediction form to one-element lists.
        if isinstance(labels, str):
            labels = [labels]
        if isinstance(scores, (int, float)) and not isinstance(scores, bool):
            scores = [scores]
        if isinstance(labels, list) and isinstance(scores, list):
            pairs = list(zip(labels, scores))

    all_probs = {}
    for label, raw_score in pairs:
        if label is None:
            continue
        try:
            all_probs[label] = float(raw_score or 0.0)
        except (TypeError, ValueError):
            # Skip entries whose score is not numeric instead of failing.
            continue

    if not all_probs:
        return empty

    # max() keeps the first maximum, so API ordering decides ties.
    best_label = max(all_probs, key=all_probs.get)
    return best_label, all_probs[best_label], all_probs
# Zero-shot labels that describe ordinary mail; a confident hit on one of
# these must not raise the risk score.
_NON_THREAT_BEHAVIORS = frozenset({"benign", "marketing"})


def _mark_occurrences(text, needle):
    """Wrap every case-insensitive occurrence of ``needle`` in ``<mark>`` tags.

    The matched text is re-emitted verbatim, so the body's original casing is
    kept. A function replacement is used so backslashes in ``needle`` are
    never interpreted as ``re.sub`` template escapes.
    """
    if not text or not needle:
        return text
    return re.sub(
        re.escape(needle),
        lambda m: f"<mark>{m.group(0)}</mark>",
        text,
        flags=re.IGNORECASE,
    )


def _is_legitimate_model_label(label):
    """Return True when a classifier label names a non-malicious class.

    NOTE(review): assumes the phishing model's negative classes are named
    like ``legitimate_email`` / ``legitimate_url`` - confirm against the label
    names the Inference API actually returns for PHISHING_MODEL.
    """
    return str(label or "").lower().startswith(("legit", "benign", "safe"))


def analyze_body(subject: str, body: str, urls: list, images: list):
    """Score an email for phishing signals using heuristics, OCR and HF models.

    Args:
        subject: Email subject; ``None`` is treated as empty.
        body: Plain-text body; ``None`` is treated as empty.
        urls: URLs found in the message. Entries that are not non-empty
            strings are ignored.
        images: Image payloads as bytes; each is OCR'd (hosted model first
            when ``HF_API_KEY`` is set, then local pytesseract).

    Returns:
        ``(findings, score, highlighted_body, verdict)`` where ``findings`` is
        a list of human-readable strings, ``score`` is an int clamped to
        0..100, ``highlighted_body`` is ``body`` with matched phrases and URLs
        wrapped in ``<mark>`` tags, and ``verdict`` is one of the four verdict
        strings chosen from the score thresholds 70 / 50 / 30.

    NOTE(review): ``body`` is copied into ``highlighted_body`` without HTML
    escaping. If the caller renders it as HTML, markup in an untrusted email
    body will be rendered too - escape at the rendering boundary.
    """
    findings = []
    score = 0
    subject_lower = (subject or "").lower()
    body_text = body or ""
    highlighted_body = body_text
    url_list = [u for u in (urls or []) if isinstance(u, str) and u]

    # 1) Phrase heuristics over subject + body.
    combined_lower = subject_lower + "\n" + body_text.lower()
    for pattern in SUSPICIOUS_PATTERNS:
        if pattern not in combined_lower:
            continue
        findings.append(f'Suspicious phrase detected: "{pattern}"')
        # A phrase in the subject line weighs more than one in the body only.
        score += 30 if pattern in subject_lower else 18
        highlighted_body = _mark_occurrences(highlighted_body, pattern)

    # 2) URL heuristics: every URL counts, odd-looking hosts count twice.
    for u in url_list:
        findings.append(f"Suspicious URL detected: {u}")
        score += 10
        highlighted_body = _mark_occurrences(highlighted_body, u)
        domain_match = re.search(r"https?://([^/]+)/?", u)
        if domain_match:
            domain = domain_match.group(1)
            # Long host, or digits in the left-most host label.
            if len(domain) > 25 or any(ch.isdigit() for ch in domain.split(".")[0]):
                findings.append(f"URL: suspicious-looking domain {domain}")
                score += 10

    # 3) OCR each image and run the phrase heuristics on the extracted text.
    ocr_texts = []
    for img_bytes in images or []:
        text = None
        if HF_API_KEY:
            text = _call_hf_image_ocr("microsoft/trocr-base-stage1", img_bytes)
        if not text:
            text = _ocr_local_pytesseract(img_bytes)
        if not text or not isinstance(text, str):
            continue
        ocr_texts.append(text)
        findings.append("OCR: extracted text from image.")
        lower = text.lower()
        for pat in SUSPICIOUS_PATTERNS:
            if pat in lower:
                findings.append(f'OCR: suspicious phrase in image -> "{pat}"')
                score += 20

    # 4) Phishing classifier (only when an API key is configured).
    ml_label = None
    ml_conf = 0.0
    model_input = "\n".join(
        [subject or "", body_text, "\n".join(url_list), "\n".join(ocr_texts)]
    ).strip()
    if model_input and HF_API_KEY:
        raw = _call_hf_text_model(PHISHING_MODEL, model_input)
        label, conf, _ = _parse_hf_phishing_model_output(raw)
        if label:
            ml_label, ml_conf = label, conf
            findings.append(f"HuggingFace phishing model → {label} (conf {conf:.2f})")
            # A confident "legitimate" classification must not raise the risk
            # score; only non-legitimate labels contribute (scaled by 0.9).
            if not _is_legitimate_model_label(label):
                score += int(conf * 100 * 0.9)

    # 5) Zero-shot behavior inference (only when an API key is configured).
    if HF_API_KEY and model_input:
        zs = _call_hf_zero_shot(model_input, BEHAVIOR_LABELS)
        if isinstance(zs, dict) and "labels" in zs and "scores" in zs:
            try:
                behavior = zs["labels"][0]
                behavior_conf = float(zs["scores"][0])
            except (TypeError, ValueError, IndexError, KeyError):
                # Malformed response: skip the behavior signal.
                pass
            else:
                findings.append(f"Behavior inference → {behavior} (conf {behavior_conf:.2f})")
                # Boost only for threat behaviors; a confident "benign" or
                # "marketing" result is not evidence of phishing.
                is_threat = str(behavior).lower() not in _NON_THREAT_BEHAVIORS
                if is_threat and behavior_conf >= 0.7:
                    score += int(behavior_conf * 30)

    # 6) A strong phishing classification guarantees a high score.
    if ml_conf >= 0.8 and "phishing" in str(ml_label or "").lower():
        score = max(score, 80)

    score = max(0, min(int(score), 100))

    # Verdict thresholds (tunable).
    if score >= 70:
        verdict = "🚨 Malicious"
    elif score >= 50:
        verdict = "⚠️ Suspicious"
    elif score >= 30:
        verdict = "📩 Spam"
    else:
        verdict = "✅ Safe"
        findings.append("No strong phishing signals detected by models/heuristics.")

    return findings, score, highlighted_body, verdict