Update app.py
app.py CHANGED
@@ -11,11 +11,10 @@ from fastapi.middleware.cors import CORSMiddleware
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 import time
-import …
+import csv
 from datetime import datetime
 import threading
 import random
-from openpyxl import load_workbook

 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -30,53 +29,54 @@ CONFIDENCE_THRESHOLD = 0.65
 BATCH_SIZE = 8 # Reduced batch size for CPU
 MAX_WORKERS = 4 # Number of worker threads for processing

-class ExcelLogger:
-    def __init__(self, log_dir="…
-        """Initialize the …
+class CSVLogger:
+    def __init__(self, log_dir="."):
+        """Initialize the CSV logger.

         Args:
-            log_dir: Directory to store log files
-            excel_file: Specific Excel file name (defaults to predictions_YYYY-MM.xlsx)
+            log_dir: Directory to store CSV log files
         """
         self.log_dir = log_dir
         os.makedirs(log_dir, exist_ok=True)

-        # …
-        …
-        if not os.path.exists(self.excel_path):
-            self._create_excel_file()
-
-        # Create a lock for thread safety
-        self.file_lock = threading.Lock()
-
-    def _create_excel_file(self):
-        """Create a new Excel file with appropriate sheets and headers."""
-        # Create DataFrame for metrics
-        metrics_df = pd.DataFrame(columns=[
-            'timestamp', 'word_count', 'mode', 'prediction',
-            'confidence', 'prediction_time_seconds', 'num_sentences'
-        ]
-
-        text_df = pd.DataFrame(columns=[
-            'entry_id', 'timestamp', 'text'
-        ])
-
-        # …
-        metrics_df.to_excel(writer, sheet_name='Metrics', index=False)
-        text_df.to_excel(writer, sheet_name='TextData', index=False)
+        # Create monthly CSV files
+        current_month = datetime.now().strftime('%Y-%m')
+        self.metrics_path = os.path.join(log_dir, f"metrics_{current_month}.csv")
+        self.text_path = os.path.join(log_dir, f"text_data_{current_month}.csv")
+
+        # Define headers
+        self.metrics_headers = [
+            'entry_id', 'timestamp', 'word_count', 'mode', 'prediction',
+            'confidence', 'prediction_time_seconds', 'num_sentences'
+        ]
+
+        self.text_headers = ['entry_id', 'timestamp', 'text']
+
+        # Initialize the files if they don't exist
+        self._initialize_files()
+
+        # Create locks for thread safety
+        self.metrics_lock = threading.Lock()
+        self.text_lock = threading.Lock()
+
+        print(f"CSV logger initialized with files at: {os.path.abspath(self.metrics_path)}")
+
+    def _initialize_files(self):
+        """Create the CSV files with headers if they don't exist."""
+        # Initialize metrics file
+        if not os.path.exists(self.metrics_path):
+            with open(self.metrics_path, 'w', newline='') as f:
+                writer = csv.writer(f)
+                writer.writerow(self.metrics_headers)
+
+        # Initialize text data file
+        if not os.path.exists(self.text_path):
+            with open(self.text_path, 'w', newline='') as f:
+                writer = csv.writer(f)
+                writer.writerow(self.text_headers)

     def log_prediction(self, prediction_data, store_text=True):
-        """Log prediction data to …
+        """Log prediction data to CSV files.

         Args:
             prediction_data: Dictionary containing prediction metrics
@@ -92,89 +92,95 @@ class ExcelLogger:
         if 'timestamp' not in prediction_data:
             prediction_data['timestamp'] = datetime.now().isoformat()

-        # Add entry_id to …
+        # Add entry_id to metrics data
         metrics_data = prediction_data.copy()
         metrics_data['entry_id'] = entry_id

-        # Start a thread to write data
+        # Start a thread to write data
         thread = threading.Thread(
-            target=self.…
+            target=self._write_to_csv,
             args=(metrics_data, text, entry_id, store_text)
         )
         thread.daemon = True
         thread.start()

-    def …
-        """Write data to …
+    def _write_to_csv(self, metrics_data, text, entry_id, store_text):
+        """Write data to CSV files with retry mechanism."""
         max_retries = 5
         retry_delay = 0.5

+        # Write metrics data
         for attempt in range(max_retries):
             try:
-                with self.…
-                …
-                        'timestamp': metrics_data['timestamp'],
-                        'text': text
-                    }])
-                    text_df = pd.concat([text_df, new_text], ignore_index=True)
-                except:
-                    # If TextData sheet doesn't exist or can't be read
-                    text_df = pd.DataFrame([{
-                        'entry_id': entry_id,
-                        'timestamp': metrics_data['timestamp'],
-                        'text': text
-                    }])
-
-                # Write back to Excel
-                with pd.ExcelWriter(self.excel_path, engine='openpyxl', mode='a',
-                                    if_sheet_exists='replace') as writer:
-                    metrics_df.to_excel(writer, sheet_name='Metrics', index=False)
-                    if store_text and text:
-                        text_df.to_excel(writer, sheet_name='TextData', index=False)
-
-                # Successfully wrote to file
+                with self.metrics_lock:
+                    with open(self.metrics_path, 'a', newline='') as f:
+                        writer = csv.writer(f)
+                        # Prepare row in the correct order based on headers
+                        row = [
+                            metrics_data.get('entry_id', ''),
+                            metrics_data.get('timestamp', ''),
+                            metrics_data.get('word_count', 0),
+                            metrics_data.get('mode', ''),
+                            metrics_data.get('prediction', ''),
+                            metrics_data.get('confidence', 0.0),
+                            metrics_data.get('prediction_time_seconds', 0.0),
+                            metrics_data.get('num_sentences', 0)
+                        ]
+                        writer.writerow(row)
+                print(f"Successfully wrote metrics to CSV, entry_id: {entry_id}")
                 break
-
             except Exception as e:
-                …
-                time.sleep(retry_delay * (attempt + 1)) # Progressive backoff
+                print(f"Error writing metrics to CSV (attempt {attempt+1}/{max_retries}): {e}")
+                time.sleep(retry_delay * (attempt + 1))
         else:
-            # If all retries fail, …
-            …
+            # If all retries fail, write to backup file
+            backup_path = os.path.join(self.log_dir, f"metrics_backup_{datetime.now().strftime('%Y%m%d%H%M%S')}.csv")
+            try:
+                with open(backup_path, 'w', newline='') as f:
+                    writer = csv.writer(f)
+                    writer.writerow(self.metrics_headers)
+                    row = [
+                        metrics_data.get('entry_id', ''),
+                        metrics_data.get('timestamp', ''),
+                        metrics_data.get('word_count', 0),
+                        metrics_data.get('mode', ''),
+                        metrics_data.get('prediction', ''),
+                        metrics_data.get('confidence', 0.0),
+                        metrics_data.get('prediction_time_seconds', 0.0),
+                        metrics_data.get('num_sentences', 0)
+                    ]
+                    writer.writerow(row)
+                print(f"Wrote metrics backup to {backup_path}")
+            except Exception as e:
+                print(f"Error writing metrics backup: {e}")

-        # …
+        # Write text data if requested
         if store_text and text:
-            …
+            for attempt in range(max_retries):
+                try:
+                    with self.text_lock:
+                        with open(self.text_path, 'a', newline='') as f:
+                            writer = csv.writer(f)
+                            # Handle potential newlines in text by replacing them
+                            safe_text = text.replace('\n', ' ').replace('\r', ' ') if text else ''
+                            writer.writerow([entry_id, metrics_data.get('timestamp', ''), safe_text])
+                    print(f"Successfully wrote text data to CSV, entry_id: {entry_id}")
+                    break
+                except Exception as e:
+                    print(f"Error writing text data to CSV (attempt {attempt+1}/{max_retries}): {e}")
+                    time.sleep(retry_delay * (attempt + 1))
+            else:
+                # If all retries fail, write to backup file
+                backup_path = os.path.join(self.log_dir, f"text_backup_{datetime.now().strftime('%Y%m%d%H%M%S')}.csv")
+                try:
+                    with open(backup_path, 'w', newline='') as f:
+                        writer = csv.writer(f)
+                        writer.writerow(self.text_headers)
+                        safe_text = text.replace('\n', ' ').replace('\r', ' ') if text else ''
+                        writer.writerow([entry_id, metrics_data.get('timestamp', ''), safe_text])
+                    print(f"Wrote text data backup to {backup_path}")
+                except Exception as e:
+                    print(f"Error writing text data backup: {e}")


 class TextWindowProcessor:
@@ -480,7 +486,16 @@ class TextClassifier:
     }

 # Initialize the logger
-…
+csv_logger = CSVLogger(log_dir=".")
+
+# Add file listing endpoint for debugging
+def list_files():
+    """List all files in the current directory and subdirectories."""
+    all_files = []
+    for root, dirs, files in os.walk('.'):
+        for file in files:
+            all_files.append(os.path.join(root, file))
+    return all_files

 def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
     """Analyze text using specified mode and return formatted results."""
@@ -532,7 +547,10 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
             "num_sentences": 0, # No sentence analysis in quick mode
             "text": text
         }
-        …
+
+        # Log to CSV
+        print(f"Logging prediction data: word_count={word_count}, mode={mode}, prediction={prediction}")
+        csv_logger.log_prediction(log_data)

     else:
         analysis = classifier.detailed_scan(text)
@@ -576,14 +594,17 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
             "num_sentences": num_sentences,
             "text": text
         }
-        …
+
+        # Log to CSV
+        print(f"Logging prediction data: word_count={word_count}, mode={mode}, prediction={prediction}")
+        csv_logger.log_prediction(log_data)

     return output

 # Initialize the classifier globally
 classifier = TextClassifier()

-# Create Gradio interface
+# Create Gradio interface
 demo = gr.Interface(
     fn=lambda text, mode: analyze_text(text, mode, classifier),
     inputs=[
@@ -619,8 +640,26 @@ app.add_middleware(
     allow_headers=["*"],
 )

+# Add file listing endpoint for debugging
+@app.get("/list_files")
+async def get_files():
+    return {"files": list_files()}
+
 # Ensure CORS is applied before launching
 if __name__ == "__main__":
+    # Create empty CSV files if they don't exist
+    current_month = datetime.now().strftime('%Y-%m')
+    metrics_path = f"metrics_{current_month}.csv"
+    text_path = f"text_data_{current_month}.csv"
+
+    print(f"Current directory: {os.getcwd()}")
+    print(f"Looking for CSV files: {metrics_path}, {text_path}")
+
+    if not os.path.exists(metrics_path):
+        print(f"Creating metrics CSV file: {metrics_path}")
+    if not os.path.exists(text_path):
+        print(f"Creating text data CSV file: {text_path}")
+
     demo.queue()
     demo.launch(
         server_name="0.0.0.0",
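For reference, the new logger can be exercised on its own. The sketch below is illustrative, not part of the commit: the sample field values are invented, importing app executes the module's top level (model load included), and the extraction of entry_id and `text` from `prediction_data` happens in lines elided from this diff, so the dict simply carries a 'text' key the way the call sites in analyze_text do.

    # Hypothetical standalone check of CSVLogger; run next to app.py.
    import time
    from app import CSVLogger

    logger = CSVLogger(log_dir=".")
    logger.log_prediction({
        'word_count': 42,                 # sample values, invented
        'mode': 'quick',
        'prediction': 'example-label',
        'confidence': 0.91,
        'prediction_time_seconds': 0.37,
        'num_sentences': 3,
        'text': 'Example input text.'
    }, store_text=True)

    time.sleep(1)  # writes happen on a daemon thread; give it a moment before exiting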
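Both retry loops in _write_to_csv rely on Python's for/else: the else suite runs only when the loop finishes without hitting break, which here means every attempt failed and the backup file is written. A minimal illustration of that shape (the helper is a made-up stand-in for one write attempt):

    def attempt_write(n):      # stand-in for one CSV write attempt
        return n == 2          # pretend the third attempt succeeds

    for attempt in range(5):
        if attempt_write(attempt):
            print(f"succeeded on attempt {attempt + 1}")
            break
    else:
        print("all retries failed; fall back to a backup file")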
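Because the metrics file is plain CSV with a fixed header row, it can be read back with the standard library alone. A minimal sketch using the same monthly filename pattern as __init__ (it assumes the current month's file already exists):

    import csv
    from datetime import datetime

    path = f"metrics_{datetime.now().strftime('%Y-%m')}.csv"
    with open(path, newline='') as f:
        for row in csv.DictReader(f):
            print(row['entry_id'], row['prediction'], row['confidence'])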
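The /list_files route is registered on the FastAPI app, and how that app is served alongside the Gradio demo is not shown in this diff; the launch port is also truncated here. Assuming the route is reachable on Gradio's default port 7860, it could be probed like this:

    import json
    from urllib.request import urlopen

    with urlopen("http://localhost:7860/list_files") as resp:
        print(json.load(resp)["files"])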