Spaces:

ApsidalSolid4
/

CITProjectAIDetector

Running

App Files Files Community

ApsidalSolid4 committed on Feb 20

Commit

13fd1cb

verified ·

1 Parent(s): 97a3e71

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -26

app.py CHANGED Viewed

@@ -51,25 +51,22 @@ class TextWindowProcessor:
             windows.append(" ".join(window))
         return windows
-    def create_centered_windows(self, sentences: List[str], window_size: int) -> tuple[List[str], List[List[int]]]:
-        """Create centered windows for detailed analysis mode."""
         windows = []
         window_sentence_indices = []
         for i in range(len(sentences)):
             half_window = window_size // 2
             start_idx = max(0, i - half_window)
             end_idx = min(len(sentences), i + half_window + 1)
-            if start_idx == 0:
-                end_idx = min(len(sentences), window_size)
-            elif end_idx == len(sentences):
-                start_idx = max(0, len(sentences) - window_size)
             window = sentences[start_idx:end_idx]
             windows.append(" ".join(window))
             window_sentence_indices.append(list(range(start_idx, end_idx)))
         return windows, window_sentence_indices
 class TextClassifier:
@@ -166,7 +163,7 @@ class TextClassifier:
         }
     def detailed_scan(self, text: str) -> Dict:
-        """Perform a detailed scan with sentence-level analysis."""
         if not text.strip():
             return {
                 'sentence_predictions': [],
@@ -178,23 +175,23 @@ class TextClassifier:
                     'num_sentences': 0
                 }
             }
         sentences = self.processor.split_into_sentences(text)
         if not sentences:
             return {}
         # Create centered windows for each sentence
         windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
         # Track scores for each sentence
         sentence_appearances = {i: 0 for i in range(len(sentences))}
         sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
         # Process windows in batches
         for i in range(0, len(windows), BATCH_SIZE):
             batch_windows = windows[i:i + BATCH_SIZE]
-            batch_indices = window_sentence_indices[i:i + BATCH_SIZE]
             inputs = self.tokenizer(
                 batch_windows,
                 truncation=True,
@@ -202,23 +199,51 @@ class TextClassifier:
                 max_length=MAX_LENGTH,
                 return_tensors="pt"
             ).to(self.device)
             with torch.no_grad():
                 outputs = self.model(**inputs)
                 probs = F.softmax(outputs.logits, dim=-1)
                 for window_idx, indices in enumerate(batch_indices):
-                    for sent_idx in indices:
-                        sentence_appearances[sent_idx] += 1
-                        sentence_scores[sent_idx]['human_prob'] += probs[window_idx][1].item()
-                        sentence_scores[sent_idx]['ai_prob'] += probs[window_idx][0].item()
-        # Average the scores and create final sentence-level predictions
         sentence_predictions = []
         for i in range(len(sentences)):
             if sentence_appearances[i] > 0:
                 human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
                 ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
                 sentence_predictions.append({
                     'sentence': sentences[i],
                     'human_prob': human_prob,
@@ -226,7 +251,7 @@ class TextClassifier:
                     'prediction': 'human' if human_prob > ai_prob else 'ai',
                     'confidence': max(human_prob, ai_prob)
                 })
         return {
             'sentence_predictions': sentence_predictions,
             'highlighted_text': self.format_predictions_html(sentence_predictions),

             windows.append(" ".join(window))
         return windows
+    def create_centered_windows(self, sentences: List[str], window_size: int) -> Tuple[List[str], List[List[int]]]:
+        """Create windows with better boundary handling"""
         windows = []
         window_sentence_indices = []
         for i in range(len(sentences)):
+            # Calculate window boundaries centered on current sentence
             half_window = window_size // 2
             start_idx = max(0, i - half_window)
             end_idx = min(len(sentences), i + half_window + 1)
+            # Create the window
             window = sentences[start_idx:end_idx]
             windows.append(" ".join(window))
             window_sentence_indices.append(list(range(start_idx, end_idx)))
         return windows, window_sentence_indices
 class TextClassifier:
         }
     def detailed_scan(self, text: str) -> Dict:
+        """Perform a detailed scan with sentence-level analysis and improved boundary handling."""
         if not text.strip():
             return {
                 'sentence_predictions': [],
                     'num_sentences': 0
                 }
             }
         sentences = self.processor.split_into_sentences(text)
         if not sentences:
             return {}
         # Create centered windows for each sentence
         windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
         # Track scores for each sentence
         sentence_appearances = {i: 0 for i in range(len(sentences))}
         sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
         # Process windows in batches
         for i in range(0, len(windows), BATCH_SIZE):
             batch_windows = windows[i:i + BATCH_SIZE]
+            batch_indices = window_sentence_indices[i:i + batch_size]
             inputs = self.tokenizer(
                 batch_windows,
                 truncation=True,
                 max_length=MAX_LENGTH,
                 return_tensors="pt"
             ).to(self.device)
             with torch.no_grad():
                 outputs = self.model(**inputs)
                 probs = F.softmax(outputs.logits, dim=-1)
+                # Attribute predictions with center-weighted approach
                 for window_idx, indices in enumerate(batch_indices):
+                    center_idx = len(indices) // 2
+                    center_weight = 0.7  # Higher weight for center sentence
+                    edge_weight = 0.3 / (len(indices) - 1)  # Distribute remaining weight
+                    for pos, sent_idx in enumerate(indices):
+                        # Apply higher weight to center sentence
+                        weight = center_weight if pos == center_idx else edge_weight
+                        sentence_appearances[sent_idx] += weight
+                        sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
+                        sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
+        # Calculate final predictions with boundary smoothing
         sentence_predictions = []
         for i in range(len(sentences)):
             if sentence_appearances[i] > 0:
                 human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
                 ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
+                # Apply minimal smoothing at prediction boundaries
+                if i > 0 and i < len(sentences) - 1:
+                    prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
+                    prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
+                    next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
+                    next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
+                    # Check if we're at a prediction boundary
+                    current_pred = 'human' if human_prob > ai_prob else 'ai'
+                    prev_pred = 'human' if prev_human > prev_ai else 'ai'
+                    next_pred = 'human' if next_human > next_ai else 'ai'
+                    if current_pred != prev_pred or current_pred != next_pred:
+                        # Small adjustment at boundaries
+                        smooth_factor = 0.1
+                        human_prob = (human_prob * (1 - smooth_factor) +
+                                    (prev_human + next_human) * smooth_factor / 2)
+                        ai_prob = (ai_prob * (1 - smooth_factor) +
+                                 (prev_ai + next_ai) * smooth_factor / 2)
                 sentence_predictions.append({
                     'sentence': sentences[i],
                     'human_prob': human_prob,
                     'prediction': 'human' if human_prob > ai_prob else 'ai',
                     'confidence': max(human_prob, ai_prob)
                 })
         return {
             'sentence_predictions': sentence_predictions,
             'highlighted_text': self.format_predictions_html(sentence_predictions),