Spaces:

ApsidalSolid4
/

CITProjectAIDetector

Running

App Files Community

ApsidalSolid4 committed on Apr 18

Commit

5f61427

verified ·

1 Parent(s): 937af4d

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -414

app.py CHANGED Viewed

@@ -18,10 +18,6 @@ from openpyxl.utils import get_column_letter
 from io import BytesIO
 import base64
 import hashlib
-import requests
-import tempfile
-from pathlib import Path
-import mimetypes
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -36,17 +32,6 @@ CONFIDENCE_THRESHOLD = 0.65
 BATCH_SIZE = 8  # Reduced batch size for CPU
 MAX_WORKERS = 4  # Number of worker threads for processing
-# IMPORTANT: Set PyTorch thread configuration at the module level
-# before any parallel work starts
-if not torch.cuda.is_available():
-    # Set thread configuration only once at the beginning
-    torch.set_num_threads(MAX_WORKERS)
-    try:
-        # Only set interop threads if it hasn't been set already
-        torch.set_num_interop_threads(MAX_WORKERS)
-    except RuntimeError as e:
-        logger.warning(f"Could not set interop threads: {str(e)}")
 # Get password hash from environment variable (more secure)
 ADMIN_PASSWORD_HASH = os.environ.get('ADMIN_PASSWORD_HASH')
@@ -56,168 +41,10 @@ if not ADMIN_PASSWORD_HASH:
 # Excel file path for logs
 EXCEL_LOG_PATH = "/tmp/prediction_logs.xlsx"
-# OCR API settings
-OCR_API_KEY = "9e11346f1288957"  # Now using the complete key
-OCR_API_ENDPOINT = "https://api.ocr.space/parse/image"
-OCR_MAX_PDF_PAGES = 3
-OCR_MAX_FILE_SIZE_MB = 1
-# Configure logging for OCR module
-ocr_logger = logging.getLogger("ocr_module")
-ocr_logger.setLevel(logging.INFO)
-class OCRProcessor:
-    """
-    Handles OCR processing of image and document files using OCR.space API
-    """
-    def __init__(self, api_key: str = OCR_API_KEY):
-        self.api_key = api_key
-        self.endpoint = OCR_API_ENDPOINT
-    def process_file(self, file_path: str) -> Dict:
-        """
-        Process a file using OCR.space API
-        """
-        start_time = time.time()
-        ocr_logger.info(f"Starting OCR processing for file: {os.path.basename(file_path)}")
-        # Validate file size
-        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
-        if file_size_mb > OCR_MAX_FILE_SIZE_MB:
-            ocr_logger.warning(f"File size ({file_size_mb:.2f} MB) exceeds limit of {OCR_MAX_FILE_SIZE_MB} MB")
-            return {
-                "success": False,
-                "error": f"File size ({file_size_mb:.2f} MB) exceeds limit of {OCR_MAX_FILE_SIZE_MB} MB",
-                "text": ""
-            }
-        # Determine file type and handle accordingly
-        file_type = self._get_file_type(file_path)
-        ocr_logger.info(f"Detected file type: {file_type}")
-        # Set up API parameters
-        payload = {
-            'isOverlayRequired': 'false',
-            'language': 'eng',
-            'OCREngine': '2',  # Use more accurate engine
-            'scale': 'true',
-            'detectOrientation': 'true',
-        }
-        # For PDF files, check page count limitations
-        if file_type == 'application/pdf':
-            ocr_logger.info("PDF document detected, enforcing page limit")
-            payload['filetype'] = 'PDF'
-        # Prepare file for OCR API - using file data as bytes to avoid file handle issues
-        with open(file_path, 'rb') as f:
-            file_data = f.read()
-        files = {
-            'file': (os.path.basename(file_path), file_data, file_type)
-        }
-        headers = {
-            'apikey': self.api_key,
-        }
-        # Make the OCR API request
-        try:
-            ocr_logger.info(f"Sending request to OCR.space API for file: {os.path.basename(file_path)}")
-            response = requests.post(
-                self.endpoint,
-                files=files,
-                data=payload,
-                headers=headers,
-                timeout=60  # Add 60 second timeout
-            )
-            ocr_logger.info(f"OCR API status code: {response.status_code}")
-            # Log response text for debugging (first 200 chars)
-            response_preview = response.text[:200] if hasattr(response, 'text') else "No text content"
-            ocr_logger.info(f"OCR API response preview: {response_preview}...")
-            try:
-                response.raise_for_status()
-            except Exception as e:
-                ocr_logger.error(f"HTTP Error: {str(e)}")
-                return {
-                    "success": False,
-                    "error": f"OCR API HTTP Error: {str(e)}",
-                    "text": ""
-                }
-            try:
-                result = response.json()
-                ocr_logger.info(f"OCR API exit code: {result.get('OCRExitCode')}")
-                # Process the OCR results
-                if result.get('OCRExitCode') in [1, 2]:  # Success or partial success
-                    extracted_text = self._extract_text_from_result(result)
-                    processing_time = time.time() - start_time
-                    ocr_logger.info(f"OCR processing completed in {processing_time:.2f} seconds")
-                    ocr_logger.info(f"Extracted text word count: {len(extracted_text.split())}")
-                    return {
-                        "success": True,
-                        "text": extracted_text,
-                        "word_count": len(extracted_text.split()),
-                        "processing_time_ms": int(processing_time * 1000)
-                    }
-                else:
-                    error_msg = result.get('ErrorMessage', 'OCR processing failed')
-                    ocr_logger.error(f"OCR API error: {error_msg}")
-                    return {
-                        "success": False,
-                        "error": error_msg,
-                        "text": ""
-                    }
-            except ValueError as e:
-                ocr_logger.error(f"Invalid JSON response: {str(e)}")
-                return {
-                    "success": False,
-                    "error": f"Invalid response from OCR API: {str(e)}",
-                    "text": ""
-                }
-        except requests.exceptions.RequestException as e:
-            ocr_logger.error(f"OCR API request failed: {str(e)}")
-            return {
-                "success": False,
-                "error": f"OCR API request failed: {str(e)}",
-                "text": ""
-            }
-        finally:
-            # No need to close file handle as we're using bytes directly
-            pass
-    def _extract_text_from_result(self, result: Dict) -> str:
-        """
-        Extract all text from the OCR API result
-        """
-        extracted_text = ""
-        if 'ParsedResults' in result and result['ParsedResults']:
-            for parsed_result in result['ParsedResults']:
-                if parsed_result.get('ParsedText'):
-                    extracted_text += parsed_result['ParsedText']
-        return extracted_text
-    def _get_file_type(self, file_path: str) -> str:
-        """
-        Determine MIME type of a file
-        """
-        mime_type, _ = mimetypes.guess_type(file_path)
-        if mime_type is None:
-            # Default to binary if MIME type can't be determined
-            return 'application/octet-stream'
-        return mime_type
 def is_admin_password(input_text: str) -> bool:
     """
     Check if the input text matches the admin password using secure hash comparison.
     """
     # Hash the input text
     input_hash = hashlib.sha256(input_text.strip().encode()).hexdigest()
@@ -278,6 +105,11 @@ class TextWindowProcessor:
 class TextClassifier:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model_name = MODEL_NAME
         self.tokenizer = None
@@ -421,7 +253,7 @@ class TextClassifier:
                 for window_idx, indices in enumerate(batch_indices):
                     center_idx = len(indices) // 2
                     center_weight = 0.7  # Higher weight for center sentence
-                    edge_weight = 0.3 / (len(indices) - 1) if len(indices) > 1 else 0  # Distribute remaining weight
                     for pos, sent_idx in enumerate(indices):
                         # Apply higher weight to center sentence
@@ -444,10 +276,10 @@ class TextClassifier:
                 # Apply minimal smoothing at prediction boundaries
                 if i > 0 and i < len(sentences) - 1:
-                    prev_human = sentence_scores[i-1]['human_prob'] / max(sentence_appearances[i-1], 1e-10)
-                    prev_ai = sentence_scores[i-1]['ai_prob'] / max(sentence_appearances[i-1], 1e-10)
-                    next_human = sentence_scores[i+1]['human_prob'] / max(sentence_appearances[i+1], 1e-10)
-                    next_ai = sentence_scores[i+1]['ai_prob'] / max(sentence_appearances[i+1], 1e-10)
                     # Check if we're at a prediction boundary
                     current_pred = 'human' if human_prob > ai_prob else 'ai'
@@ -522,105 +354,6 @@ class TextClassifier:
             'num_sentences': num_sentences
         }
-# Function to handle file upload, OCR processing, and text analysis
-def handle_file_upload_and_analyze(file_obj, mode: str) -> tuple:
-    """
-    Handle file upload, OCR processing, and text analysis
-    """
-    # Use the global classifier
-    global classifier
-    classifier_to_use = classifier
-    if file_obj is None:
-        return (
-            "No file uploaded",
-            "Please upload a file to analyze",
-            "No file uploaded for analysis"
-        )
-    # Log the type of file object received
-    logger.info(f"Received file upload of type: {type(file_obj)}")
-    try:
-        # Create a temporary file with an appropriate extension based on content
-        if isinstance(file_obj, bytes):
-            content_start = file_obj[:20]  # Look at the first few bytes
-            # Default to .bin extension
-            file_ext = ".bin"
-            # Try to detect PDF files
-            if content_start.startswith(b'%PDF'):
-                file_ext = ".pdf"
-            # For images, detect by common magic numbers
-            elif content_start.startswith(b'\xff\xd8'):  # JPEG
-                file_ext = ".jpg"
-            elif content_start.startswith(b'\x89PNG'):  # PNG
-                file_ext = ".png"
-            elif content_start.startswith(b'GIF'):      # GIF
-                file_ext = ".gif"
-            # Create a temporary file with the detected extension
-            with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as temp_file:
-                temp_file_path = temp_file.name
-                # Write uploaded file data to the temporary file
-                temp_file.write(file_obj)
-                logger.info(f"Saved uploaded file to {temp_file_path}")
-        else:
-            # Handle other file object types (should not typically happen with Gradio)
-            logger.error(f"Unexpected file object type: {type(file_obj)}")
-            return (
-                "File upload error",
-                "Unexpected file format",
-                "Unable to process this file format"
-            )
-        # Process the file with OCR
-        ocr_processor = OCRProcessor()
-        logger.info(f"Starting OCR processing for file: {temp_file_path}")
-        ocr_result = ocr_processor.process_file(temp_file_path)
-        if not ocr_result["success"]:
-            logger.error(f"OCR processing failed: {ocr_result['error']}")
-            return (
-                "OCR Processing Error",
-                ocr_result["error"],
-                "Failed to extract text from the uploaded file"
-            )
-        # Get the extracted text
-        extracted_text = ocr_result["text"]
-        logger.info(f"OCR processing complete. Extracted {len(extracted_text.split())} words")
-        # If no text was extracted
-        if not extracted_text.strip():
-            logger.warning("No text extracted from file")
-            return (
-                "No text extracted",
-                "The OCR process did not extract any text from the uploaded file.",
-                "No text was found in the uploaded file"
-            )
-        # Call the original text analysis function with the extracted text
-        logger.info("Proceeding with text analysis")
-        return analyze_text(extracted_text, mode, classifier_to_use)
-    except Exception as e:
-        logger.error(f"Error in file upload processing: {str(e)}")
-        return (
-            "Error Processing File",
-            f"An error occurred while processing the file: {str(e)}",
-            "File processing error. Please try again or try a different file."
-        )
-    finally:
-        # Clean up the temporary file
-        if 'temp_file_path' in locals() and os.path.exists(temp_file_path):
-            try:
-                os.remove(temp_file_path)
-                logger.info(f"Removed temporary file: {temp_file_path}")
-            except Exception as e:
-                logger.warning(f"Could not remove temporary file: {str(e)}")
 def initialize_excel_log():
     """Initialize the Excel log file if it doesn't exist."""
     if not os.path.exists(EXCEL_LOG_PATH):
@@ -648,7 +381,6 @@ def initialize_excel_log():
         wb.save(EXCEL_LOG_PATH)
         logger.info(f"Initialized Excel log file at {EXCEL_LOG_PATH}")
 def log_prediction_data(input_text, word_count, prediction, confidence, execution_time, mode):
     """Log prediction data to an Excel file in the /tmp directory."""
     # Initialize the Excel file if it doesn't exist
@@ -691,7 +423,6 @@ def log_prediction_data(input_text, word_count, prediction, confidence, executio
         logger.error(f"Error logging prediction data to Excel: {str(e)}")
         return False
 def get_logs_as_base64():
     """Read the Excel logs file and return as base64 for downloading."""
     if not os.path.exists(EXCEL_LOG_PATH):
@@ -710,7 +441,6 @@ def get_logs_as_base64():
         logger.error(f"Error reading Excel logs: {str(e)}")
         return None
 def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
     """Analyze text using specified mode and return formatted results."""
     # Check if the input text matches the admin password using secure comparison
@@ -833,143 +563,51 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
 # Initialize the classifier globally
 classifier = TextClassifier()
-# Create Gradio interface with a file upload button matched to the radio buttons
-def create_interface():
-    # Custom CSS for the interface
-    css = """
-    #analyze-btn {
-        background-color: #FF8C00 !important;
-        border-color: #FF8C00 !important;
-        color: white !important;
-    }
-    /* Style the file upload to be more compact */
-    .file-upload {
-        width: 150px !important;
-        margin-left: 15px !important;
-    }
-    /* Hide file preview elements */
-    .file-upload .file-preview,
-    .file-upload p:not(.file-upload p:first-child),
-    .file-upload svg,
-    .file-upload [data-testid="chunkFileDropArea"],
-    .file-upload .file-drop {
-        display: none !important;
-    }
-    /* Style the upload button */
-    .file-upload button {
-        height: 40px !important;
-        width: 100% !important;
-        background-color: #f0f0f0 !important;
-        border: 1px solid #d9d9d9 !important;
-        border-radius: 4px !important;
-        color: #333 !important;
-        font-size: 14px !important;
-        display: flex !important;
-        align-items: center !important;
-        justify-content: center !important;
-        margin: 0 !important;
-        padding: 0 !important;
-    }
-    /* Hide the "or" text */
-    .file-upload .or {
-        display: none !important;
-    }
-    /* Make the container compact */
-    .file-upload [data-testid="block"] {
-        margin: 0 !important;
-        padding: 0 !important;
-    }
-    """
-    with gr.Blocks(css=css, title="AI Text Detector") as demo:
-        gr.Markdown("# AI Text Detector")
-        gr.Markdown("Analyze text to detect if it was written by a human or AI. Choose between quick scan and detailed sentence-level analysis. 200+ words suggested for accurate predictions.")
-        with gr.Row():
-            # Left column - Input
-            with gr.Column(scale=1):
-                # Text input area
-                text_input = gr.Textbox(
-                    lines=8,
-                    placeholder="Enter text to analyze...",
-                    label="Input Text"
-                )
-                # Analysis Mode section
-                gr.Markdown("Analysis Mode")
-                gr.Markdown("Quick mode for faster analysis. Detailed mode for sentence-level analysis.")
-                # Simple row layout for radio buttons and file upload
-                with gr.Row():
-                    mode_selection = gr.Radio(
-                        choices=["quick", "detailed"],
-                        value="quick",
-                        label="",
-                        show_label=False
-                    )
-                    # Revert to File component but with better styling
-                    file_upload = gr.File(
-                        file_types=["image", "pdf", "doc", "docx"],
-                        elem_classes=["file-upload"]
-                    )
-                # Analyze button
-                analyze_btn = gr.Button("Analyze Text", elem_id="analyze-btn")
-            # Right column - Results
-            with gr.Column(scale=1):
-                output_html = gr.HTML(label="Highlighted Analysis")
-                output_sentences = gr.Textbox(label="Sentence-by-Sentence Analysis", lines=10)
-                output_result = gr.Textbox(label="Overall Result", lines=4)
-        # Connect components
-        analyze_btn.click(
-            fn=lambda text, mode: analyze_text(text, mode, classifier),
-            inputs=[text_input, mode_selection],
-            outputs=[output_html, output_sentences, output_result]
         )
-        # Use the file upload handler without passing classifier (will use global)
-        file_upload.change(
-            fn=handle_file_upload_and_analyze,
-            inputs=[file_upload, mode_selection],
-            outputs=[output_html, output_sentences, output_result]
-        )
-    return demo
-# Setup the app with CORS middleware
-def setup_app():
-    demo = create_interface()
-    # Get the FastAPI app from Gradio
-    app = demo.app
-    # Add CORS middleware
-    app.add_middleware(
-        CORSMiddleware,
-        allow_origins=["*"],  # For development
-        allow_credentials=True,
-        allow_methods=["GET", "POST", "OPTIONS"],
-        allow_headers=["*"],
-    )
-    return demo
-# Initialize the application
 if __name__ == "__main__":
-    demo = setup_app()
-    # Start the server
     demo.queue()
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=True
-    )

 from io import BytesIO
 import base64
 import hashlib
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 BATCH_SIZE = 8  # Reduced batch size for CPU
 MAX_WORKERS = 4  # Number of worker threads for processing
 # Get password hash from environment variable (more secure)
 ADMIN_PASSWORD_HASH = os.environ.get('ADMIN_PASSWORD_HASH')
 # Excel file path for logs
 EXCEL_LOG_PATH = "/tmp/prediction_logs.xlsx"
 def is_admin_password(input_text: str) -> bool:
     """
     Check if the input text matches the admin password using secure hash comparison.
+    This prevents the password from being visible in the source code.
     """
     # Hash the input text
     input_hash = hashlib.sha256(input_text.strip().encode()).hexdigest()
 class TextClassifier:
     def __init__(self):
+        # Set thread configuration before any model loading or parallel work
+        if not torch.cuda.is_available():
+            torch.set_num_threads(MAX_WORKERS)
+            torch.set_num_interop_threads(MAX_WORKERS)
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model_name = MODEL_NAME
         self.tokenizer = None
                 for window_idx, indices in enumerate(batch_indices):
                     center_idx = len(indices) // 2
                     center_weight = 0.7  # Higher weight for center sentence
+                    edge_weight = 0.3 / (len(indices) - 1)  # Distribute remaining weight
                     for pos, sent_idx in enumerate(indices):
                         # Apply higher weight to center sentence
                 # Apply minimal smoothing at prediction boundaries
                 if i > 0 and i < len(sentences) - 1:
+                    prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
+                    prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
+                    next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
+                    next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
                     # Check if we're at a prediction boundary
                     current_pred = 'human' if human_prob > ai_prob else 'ai'
             'num_sentences': num_sentences
         }
 def initialize_excel_log():
     """Initialize the Excel log file if it doesn't exist."""
     if not os.path.exists(EXCEL_LOG_PATH):
         wb.save(EXCEL_LOG_PATH)
         logger.info(f"Initialized Excel log file at {EXCEL_LOG_PATH}")
 def log_prediction_data(input_text, word_count, prediction, confidence, execution_time, mode):
     """Log prediction data to an Excel file in the /tmp directory."""
     # Initialize the Excel file if it doesn't exist
         logger.error(f"Error logging prediction data to Excel: {str(e)}")
         return False
 def get_logs_as_base64():
     """Read the Excel logs file and return as base64 for downloading."""
     if not os.path.exists(EXCEL_LOG_PATH):
         logger.error(f"Error reading Excel logs: {str(e)}")
         return None
 def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
     """Analyze text using specified mode and return formatted results."""
     # Check if the input text matches the admin password using secure comparison
 # Initialize the classifier globally
 classifier = TextClassifier()
+# Create Gradio interface
+demo = gr.Interface(
+    fn=lambda text, mode: analyze_text(text, mode, classifier),
+    inputs=[
+        gr.Textbox(
+            lines=8,
+            placeholder="Enter text to analyze...",
+            label="Input Text"
+        ),
+        gr.Radio(
+            choices=["quick", "detailed"],
+            value="quick",
+            label="Analysis Mode",
+            info="Quick mode for faster analysis, Detailed mode for sentence-level analysis"
         )
+    ],
+    outputs=[
+        gr.HTML(label="Highlighted Analysis"),
+        gr.Textbox(label="Sentence-by-Sentence Analysis", lines=10),
+        gr.Textbox(label="Overall Result", lines=4)
+    ],
+    title="AI Text Detector",
+    description="Analyze text to detect if it was written by a human or AI. Choose between quick scan and detailed sentence-level analysis. 200+ words suggested for accurate predictions.",
+    api_name="predict",
+    flagging_mode="never"
+)
+# Get the FastAPI app from Gradio
+app = demo.app
+# Add CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # For development
+    allow_credentials=True,
+    allow_methods=["GET", "POST", "OPTIONS"],
+    allow_headers=["*"],
+)
+# Ensure CORS is applied before launching
 if __name__ == "__main__":
     demo.queue()
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=True
+    )