toxic-comment-classifier_rlhf

Paused

App Files Files Community

JanviMl committed on Mar 25

Commit

c91906e

verified ·

1 Parent(s): 829572e

Update classifier.py

Browse files

Files changed (1) hide show

classifier.py +17 -4

classifier.py CHANGED Viewed

@@ -2,15 +2,16 @@
 import torch
 from model_loader import classifier_model, classifier_tokenizer
 from paraphraser import paraphrase_comment
 def classify_toxic_comment(comment):
     """
     Classify a comment as toxic or non-toxic using the fine-tuned XLM-RoBERTa model.
-    If toxic, paraphrase the comment and re-evaluate.
     Returns the prediction label, confidence, color, toxicity score, bias score, paraphrased comment (if applicable), and its metrics.
     """
     if not comment.strip():
-        return "Error: Please enter a comment.", None, None, None, None, None, None, None, None, None
     # Tokenize the input comment
     inputs = classifier_tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=512)
@@ -34,13 +35,18 @@ def classify_toxic_comment(comment):
     bias_score = 0.01 if label == "Non-Toxic" else 0.15
     bias_score = round(bias_score, 2)
-    # If the comment is toxic, paraphrase it
     paraphrased_comment = None
     paraphrased_prediction = None
     paraphrased_confidence = None
     paraphrased_color = None
     paraphrased_toxicity_score = None
     paraphrased_bias_score = None
     if label == "Toxic":
         # Paraphrase the comment
@@ -61,8 +67,15 @@ def classify_toxic_comment(comment):
         paraphrased_bias_score = 0.01 if paraphrased_label == "Non-Toxic" else 0.15  # Placeholder
         paraphrased_bias_score = round(paraphrased_bias_score, 2)
     return (
         f"Prediction: {label}", confidence, label_color, toxicity_score, bias_score,
         paraphrased_comment, f"Prediction: {paraphrased_label}" if paraphrased_comment else None,
-        paraphrased_confidence, paraphrased_color, paraphrased_toxicity_score, paraphrased_bias_score
     )

 import torch
 from model_loader import classifier_model, classifier_tokenizer
 from paraphraser import paraphrase_comment
+from metrics import compute_semantic_similarity, compute_emotion_shift, compute_empathy_score
 def classify_toxic_comment(comment):
     """
     Classify a comment as toxic or non-toxic using the fine-tuned XLM-RoBERTa model.
+    If toxic, paraphrase the comment, re-evaluate, and compute additional Stage 3 metrics.
     Returns the prediction label, confidence, color, toxicity score, bias score, paraphrased comment (if applicable), and its metrics.
     """
     if not comment.strip():
+        return "Error: Please enter a comment.", None, None, None, None, None, None, None, None, None, None, None, None, None
     # Tokenize the input comment
     inputs = classifier_tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=512)
     bias_score = 0.01 if label == "Non-Toxic" else 0.15
     bias_score = round(bias_score, 2)
+    # If the comment is toxic, paraphrase it and compute additional metrics
     paraphrased_comment = None
     paraphrased_prediction = None
     paraphrased_confidence = None
     paraphrased_color = None
     paraphrased_toxicity_score = None
     paraphrased_bias_score = None
+    semantic_similarity = None
+    original_emotion = None
+    paraphrased_emotion = None
+    emotion_shift_positive = None
+    empathy_score = None
     if label == "Toxic":
         # Paraphrase the comment
         paraphrased_bias_score = 0.01 if paraphrased_label == "Non-Toxic" else 0.15  # Placeholder
         paraphrased_bias_score = round(paraphrased_bias_score, 2)
+        # Compute additional Stage 3 metrics
+        semantic_similarity = compute_semantic_similarity(comment, paraphrased_comment)
+        original_emotion, paraphrased_emotion, emotion_shift_positive = compute_emotion_shift(comment, paraphrased_comment)
+        empathy_score = compute_empathy_score(paraphrased_comment)
     return (
         f"Prediction: {label}", confidence, label_color, toxicity_score, bias_score,
         paraphrased_comment, f"Prediction: {paraphrased_label}" if paraphrased_comment else None,
+        paraphrased_confidence, paraphrased_color, paraphrased_toxicity_score, paraphrased_bias_score,
+        semantic_similarity, f"Original: {original_emotion}, Paraphrased: {paraphrased_emotion}, Positive Shift: {emotion_shift_positive}" if original_emotion else None,
+        empathy_score
     )