Spaces:

ApsidalSolid4
/

CITProjectAIDetector

Running

App Files Community

ApsidalSolid4 committed on Feb 20

Commit

25f2b88

verified ·

1 Parent(s): 1bb7d9d

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -28

app.py CHANGED Viewed

@@ -3,11 +3,14 @@ import numpy as np
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch.nn.functional as F
 import spacy
-from typing import List, Dict
 import logging
 import os
 import gradio as gr
 from fastapi.middleware.cors import CORSMiddleware
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -18,7 +21,8 @@ MODEL_NAME = "microsoft/deberta-v3-small"
 WINDOW_SIZE = 17
 WINDOW_OVERLAP = 2
 CONFIDENCE_THRESHOLD = 0.65
-BATCH_SIZE = 16
 class TextWindowProcessor:
     def __init__(self):
@@ -34,13 +38,15 @@ class TextWindowProcessor:
         disabled_pipes = [pipe for pipe in self.nlp.pipe_names if pipe != 'sentencizer']
         self.nlp.disable_pipes(*disabled_pipes)
     def split_into_sentences(self, text: str) -> List[str]:
         doc = self.nlp(text)
         return [str(sent).strip() for sent in doc.sents]
     def create_windows(self, sentences: List[str], window_size: int, overlap: int) -> List[str]:
-        """Create overlapping windows for quick scan mode."""
         if len(sentences) < window_size:
             return [" ".join(sentences)]
@@ -51,21 +57,18 @@ class TextWindowProcessor:
             windows.append(" ".join(window))
         return windows
-    def create_centered_windows(self, sentences: List[str], window_size: int) -> tuple[List[str], List[List[int]]]:
-        """Create centered windows for detailed analysis mode."""
         windows = []
         window_sentence_indices = []
         for i in range(len(sentences)):
             half_window = window_size // 2
             start_idx = max(0, i - half_window)
             end_idx = min(len(sentences), i + half_window + 1)
-            if start_idx == 0:
-                end_idx = min(len(sentences), window_size)
-            elif end_idx == len(sentences):
-                start_idx = max(0, len(sentences) - window_size)
             window = sentences[start_idx:end_idx]
             windows.append(" ".join(window))
             window_sentence_indices.append(list(range(start_idx, end_idx)))
@@ -75,12 +78,17 @@ class TextWindowProcessor:
 class TextClassifier:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model_name = MODEL_NAME
         self.tokenizer = None
         self.model = None
         self.processor = TextWindowProcessor()
         self.initialize_model()
     def initialize_model(self):
         """Initialize the model and tokenizer."""
         logger.info("Initializing model and tokenizer...")
@@ -90,15 +98,19 @@ class TextClassifier:
         self.tokenizer = DebertaV2TokenizerFast.from_pretrained(
             self.model_name,
             model_max_length=MAX_LENGTH,
-            use_fast=False,
-            from_slow=True
         )
         self.model = AutoModelForSequenceClassification.from_pretrained(
             self.model_name,
-            num_labels=2
         ).to(self.device)
         model_path = "model_20250209_184929_acc1.0000.pt"
         if os.path.exists(model_path):
             logger.info(f"Loading custom model from {model_path}")
@@ -123,7 +135,7 @@ class TextClassifier:
         predictions = []
-        # Process windows in batches
         for i in range(0, len(windows), BATCH_SIZE):
             batch_windows = windows[i:i + BATCH_SIZE]
@@ -148,7 +160,11 @@ class TextClassifier:
                     }
                     predictions.append(prediction)
-        # Calculate aggregate prediction
         if not predictions:
             return {
                 'prediction': 'unknown',
@@ -166,7 +182,7 @@ class TextClassifier:
         }
     def detailed_scan(self, text: str) -> Dict:
-        """Perform a detailed scan with sentence-level analysis."""
         if not text.strip():
             return {
                 'sentence_predictions': [],
@@ -207,18 +223,51 @@ class TextClassifier:
                 outputs = self.model(**inputs)
                 probs = F.softmax(outputs.logits, dim=-1)
                 for window_idx, indices in enumerate(batch_indices):
-                    for sent_idx in indices:
-                        sentence_appearances[sent_idx] += 1
-                        sentence_scores[sent_idx]['human_prob'] += probs[window_idx][1].item()
-                        sentence_scores[sent_idx]['ai_prob'] += probs[window_idx][0].item()
-        # Average the scores and create final sentence-level predictions
         sentence_predictions = []
         for i in range(len(sentences)):
             if sentence_appearances[i] > 0:
                 human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
                 ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
                 sentence_predictions.append({
                     'sentence': sentences[i],
                     'human_prob': human_prob,
@@ -282,7 +331,6 @@ class TextClassifier:
 def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
     """Analyze text using specified mode and return formatted results."""
     if mode == "quick":
-        # Quick scan
         result = classifier.quick_scan(text)
         quick_analysis = f"""
@@ -297,10 +345,8 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
             quick_analysis
         )
     else:
-        # Detailed scan
         analysis = classifier.detailed_scan(text)
-        # Format sentence-by-sentence analysis
         detailed_analysis = []
         for pred in analysis['sentence_predictions']:
             confidence = pred['confidence'] * 100
@@ -309,7 +355,6 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
             detailed_analysis.append(f"Confidence: {confidence:.1f}%")
             detailed_analysis.append("-" * 50)
-        # Format overall prediction
         final_pred = analysis['overall_prediction']
         overall_result = f"""
         FINAL PREDICTION: {final_pred['prediction'].upper()}
@@ -354,7 +399,7 @@ demo = gr.Interface(
         ["This is a sample text written by a human. It contains multiple sentences with different ideas. The analysis will show how each sentence is classified.", "detailed"],
     ],
     api_name="predict",
-    flagging_mode="never"  # Updated from allow_flagging
 )
 app = demo.app

 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch.nn.functional as F
 import spacy
+from typing import List, Dict, Tuple
 import logging
 import os
 import gradio as gr
 from fastapi.middleware.cors import CORSMiddleware
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 WINDOW_SIZE = 17
 WINDOW_OVERLAP = 2
 CONFIDENCE_THRESHOLD = 0.65
+BATCH_SIZE = 8  # Reduced batch size for CPU
+MAX_WORKERS = 4  # Number of worker threads for processing
 class TextWindowProcessor:
     def __init__(self):
         disabled_pipes = [pipe for pipe in self.nlp.pipe_names if pipe != 'sentencizer']
         self.nlp.disable_pipes(*disabled_pipes)
+        # Initialize thread pool for parallel processing
+        self.executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
     def split_into_sentences(self, text: str) -> List[str]:
         doc = self.nlp(text)
         return [str(sent).strip() for sent in doc.sents]
     def create_windows(self, sentences: List[str], window_size: int, overlap: int) -> List[str]:
         if len(sentences) < window_size:
             return [" ".join(sentences)]
             windows.append(" ".join(window))
         return windows
+    def create_centered_windows(self, sentences: List[str], window_size: int) -> Tuple[List[str], List[List[int]]]:
+        """Create windows with better boundary handling"""
         windows = []
         window_sentence_indices = []
         for i in range(len(sentences)):
+            # Calculate window boundaries centered on current sentence
             half_window = window_size // 2
             start_idx = max(0, i - half_window)
             end_idx = min(len(sentences), i + half_window + 1)
+            # Create the window
             window = sentences[start_idx:end_idx]
             windows.append(" ".join(window))
             window_sentence_indices.append(list(range(start_idx, end_idx)))
 class TextClassifier:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        if self.device.type == 'cpu':
+            # Enable CPU optimizations
+            torch.set_num_threads(MAX_WORKERS)
+            torch.set_num_interop_threads(MAX_WORKERS)
         self.model_name = MODEL_NAME
         self.tokenizer = None
         self.model = None
         self.processor = TextWindowProcessor()
         self.initialize_model()
     def initialize_model(self):
         """Initialize the model and tokenizer."""
         logger.info("Initializing model and tokenizer...")
         self.tokenizer = DebertaV2TokenizerFast.from_pretrained(
             self.model_name,
             model_max_length=MAX_LENGTH,
+            use_fast=True
         )
         self.model = AutoModelForSequenceClassification.from_pretrained(
             self.model_name,
+            num_labels=2,
+            torchscript=True  # Enable TorchScript optimization
         ).to(self.device)
+        if self.device.type == 'cpu':
+            self.model.eval()  # Ensure model is in eval mode for optimization
+            self.model = torch.jit.optimize_for_inference(torch.jit.script(self.model))
         model_path = "model_20250209_184929_acc1.0000.pt"
         if os.path.exists(model_path):
             logger.info(f"Loading custom model from {model_path}")
         predictions = []
+        # Process windows in smaller batches for CPU efficiency
         for i in range(0, len(windows), BATCH_SIZE):
             batch_windows = windows[i:i + BATCH_SIZE]
                     }
                     predictions.append(prediction)
+            # Clean up GPU memory if available
+            del inputs, outputs, probs
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
         if not predictions:
             return {
                 'prediction': 'unknown',
         }
     def detailed_scan(self, text: str) -> Dict:
+        """Perform a detailed scan with improved sentence-level analysis."""
         if not text.strip():
             return {
                 'sentence_predictions': [],
                 outputs = self.model(**inputs)
                 probs = F.softmax(outputs.logits, dim=-1)
+                # Attribute predictions with weighted scoring
                 for window_idx, indices in enumerate(batch_indices):
+                    center_idx = len(indices) // 2
+                    center_weight = 0.7  # Higher weight for center sentence
+                    edge_weight = 0.3 / (len(indices) - 1)  # Distribute remaining weight
+                    for pos, sent_idx in enumerate(indices):
+                        # Apply higher weight to center sentence
+                        weight = center_weight if pos == center_idx else edge_weight
+                        sentence_appearances[sent_idx] += weight
+                        sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
+                        sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
+            # Clean up memory
+            del inputs, outputs, probs
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        # Calculate final predictions with boundary smoothing
         sentence_predictions = []
         for i in range(len(sentences)):
             if sentence_appearances[i] > 0:
                 human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
                 ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
+                # Apply minimal smoothing at prediction boundaries
+                if i > 0 and i < len(sentences) - 1:
+                    prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
+                    prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
+                    next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
+                    next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
+                    # Check if we're at a prediction boundary
+                    current_pred = 'human' if human_prob > ai_prob else 'ai'
+                    prev_pred = 'human' if prev_human > prev_ai else 'ai'
+                    next_pred = 'human' if next_human > next_ai else 'ai'
+                    if current_pred != prev_pred or current_pred != next_pred:
+                        # Small adjustment at boundaries
+                        smooth_factor = 0.1
+                        human_prob = (human_prob * (1 - smooth_factor) +
+                                    (prev_human + next_human) * smooth_factor / 2)
+                        ai_prob = (ai_prob * (1 - smooth_factor) +
+                                (prev_ai + next_ai) * smooth_factor / 2)
                 sentence_predictions.append({
                     'sentence': sentences[i],
                     'human_prob': human_prob,
 def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
     """Analyze text using specified mode and return formatted results."""
     if mode == "quick":
         result = classifier.quick_scan(text)
         quick_analysis = f"""
             quick_analysis
         )
     else:
         analysis = classifier.detailed_scan(text)
         detailed_analysis = []
         for pred in analysis['sentence_predictions']:
             confidence = pred['confidence'] * 100
             detailed_analysis.append(f"Confidence: {confidence:.1f}%")
             detailed_analysis.append("-" * 50)
         final_pred = analysis['overall_prediction']
         overall_result = f"""
         FINAL PREDICTION: {final_pred['prediction'].upper()}
         ["This is a sample text written by a human. It contains multiple sentences with different ideas. The analysis will show how each sentence is classified.", "detailed"],
     ],
     api_name="predict",
+    flagging_mode="never"
 )
 app = demo.app