File size: 2,668 Bytes
d32abd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import streamlit as st
import re
from urllib.parse import urlparse
import csv
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Hugging Face Hub id of the fine-tuned BERT phishing classifier.
model_name = "najla45/phishing_detection_fine_tuned_bert"


# Streamlit re-executes this script on every user interaction, so plain
# module-level loading would rebuild the model on each rerun. Caching the
# loader as a resource keeps a single shared instance per server process.
# The spinner is disabled so that no UI element is emitted while loading.
@st.cache_resource(show_spinner=False)
def _load_phishing_model(name):
    """Load the tokenizer, model and text-classification pipeline for *name*.

    Args:
        name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model, pipeline)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    loaded_model = AutoModelForSequenceClassification.from_pretrained(name)
    classifier = pipeline(
        "text-classification", model=loaded_model, tokenizer=loaded_tokenizer
    )
    return loaded_tokenizer, loaded_model, classifier


# Same module-level names as before, now backed by the cached loader.
tokenizer, model, bert_classifier = _load_phishing_model(model_name)

def is_phishing_url(url):
    """Score a URL against a set of simple phishing heuristics.

    Each heuristic that matches adds a fixed weight to the score:

    * URL starts with ``http(s)://`` followed by a dotted quad ... +2
    * hyphen in the network location ........................... +1
    * URL does not start with ``https://`` ..................... +3
    * URL contains one of the suspicious keywords .............. +2
    * URL is longer than 75 characters ......................... +1
    * URL contains ``@`` ....................................... +2

    Args:
        url: The URL string to inspect.

    Returns:
        The integer sum of the weights of all heuristics that matched;
        ``0`` means none of them fired.
    """
    suspicious_keywords = ('secure', 'account', 'update', 'free', 'login', 'verify', 'banking')

    # URL schemes are case-insensitive, so compare against a lowered copy;
    # otherwise "HTTPS://..." would be penalised as a non-HTTPS link.
    lowered = url.lower()

    try:
        domain = urlparse(url).netloc
    except ValueError:
        # urlparse rejects malformed bracketed hosts (e.g. "http://[abc").
        # Keep scoring with the remaining heuristics instead of crashing
        # on untrusted input.
        domain = ''

    score = 0
    if re.match(r'https?://\d{1,3}(\.\d{1,3}){3}', lowered):
        score += 2
    if '-' in domain:
        score += 1
    if not lowered.startswith("https://"):
        score += 3
    if any(keyword in lowered for keyword in suspicious_keywords):
        score += 2
    if len(url) > 75:
        score += 1
    if '@' in url:
        score += 2

    return score

def log_to_csv(input_text, rule_score, bert_label, bert_score, final_decision):
    """Append one detection record to ``phishing_log.csv``.

    The file is opened in append mode in the current working directory and
    receives a single row:
    ``input_text, rule_score, bert_label, bert_score (2 decimals), final_decision``.

    Args:
        input_text: The text or URL that was analysed.
        rule_score: Score produced by the rule-based check.
        bert_label: Label derived from the classifier output.
        bert_score: Classifier confidence; formatted with two decimals.
        final_decision: The combined verdict.
    """
    record = [input_text, rule_score, bert_label, f"{bert_score:.2f}", final_decision]
    # Explicit UTF-8: the input is free-form pasted text, and the platform
    # default encoding may be unable to represent it (UnicodeEncodeError).
    with open("phishing_log.csv", "a", newline='', encoding="utf-8") as f:
        csv.writer(f).writerow(record)

def combined_phishing_detector(url):
    """Classify *url* (a URL or free-form message) as "Phishing" or "Safe".

    The verdict combines two signals:

    * the rule-based score from ``is_phishing_url`` (only for input that
      starts with ``http://`` or ``https://``; a score of 3 or more counts
      as phishing), and
    * the BERT classifier, which counts as phishing when its label maps to
      "phishing" with a score above 0.75.

    For URLs both signals must agree on phishing. For non-URL input the
    rules are not applicable, so the classifier decides alone.

    Every call is recorded through ``log_to_csv``.

    Args:
        url: The text to analyse.

    Returns:
        ``"Phishing"`` or ``"Safe"``.
    """
    rule_score = 0
    # Schemes are case-insensitive; require a real scheme prefix so that
    # ordinary text that merely begins with "http" is not treated as a URL.
    if url.lower().startswith(("http://", "https://")):
        rule_score = is_phishing_url(url)
        rule_result = "Phishing" if rule_score >= 3 else "Safe"
    else:
        rule_result = "Not Applicable"

    # Truncate to the model's maximum length: pasted e-mails can be longer
    # than the model accepts, which would otherwise raise at inference time.
    bert_result = bert_classifier(url, truncation=True)[0]
    label_map = {"LABEL_0": "safe", "LABEL_1": "phishing"}
    bert_label = label_map.get(bert_result["label"].upper(), "unknown")
    bert_score = bert_result["score"]

    bert_flags_phishing = bert_label == "phishing" and bert_score > 0.75
    if rule_result == "Not Applicable":
        # No URL heuristics to consult: previously this path could never
        # return "Phishing", which silently discarded the classifier verdict.
        is_phishing = bert_flags_phishing
    else:
        is_phishing = rule_result == "Phishing" and bert_flags_phishing
    final_decision = "Phishing" if is_phishing else "Safe"

    log_to_csv(url, rule_score, bert_label, bert_score, final_decision)
    return final_decision

# ---------------- STREAMLIT UI ----------------

# Page metadata and heading. The emoji literals were previously stored as
# mis-decoded byte sequences (mojibake) and are restored to real emoji here.
# NOTE(review): the original icon glyph was not recoverable from the garbled
# text; a magnifying glass is used — confirm the intended icon.
st.set_page_config(page_title="Phishing Detector", page_icon="🔍")
st.title("🔍 Phishing URL & Message Detector")

user_input = st.text_area("Paste a URL or email message below:")

# Run the detector only when the button is pressed and the input is non-blank.
if st.button("Check"):
    if user_input.strip():
        result = combined_phishing_detector(user_input.strip())
        if result == "Phishing":
            st.error(f"🚨 Detected as: {result}")
        else:
            st.success(f"✅ Detected as: {result}")
    else:
        st.warning("⚠️ Please enter a valid URL or message.")