# Hugging Face Space: mippia / AI-Music-Detection-FST
# Running on Zero
# File size: 7,748 Bytes

import spaces
import gradio as gr
import torch
import librosa
import numpy as np
from inference import inference
from huggingface_hub import hf_hub_download
import os
from pathlib import Path


def download_models_from_hub():
    """
    Ensure the model checkpoints exist locally, downloading missing ones
    from the Hugging Face Model Hub.

    Checkpoints are stored in a ``checkpoints`` directory relative to the
    current working directory (created if it does not exist). A checkpoint
    that is already present is not downloaded again.

    Returns:
        dict: Maps each model name ("main", "backup") to the local
        checkpoint path as a string. A model whose download fails is left
        out of the mapping, so callers can detect it with ``.get(name)``.
    """
    model_dir = Path("checkpoints")
    model_dir.mkdir(exist_ok=True)

    # Original checkpoint filenames on HF Hub
    models = {
        "main": "EmbeddingModel_MERT_768-epoch=0073-val_loss=0.1058-val_acc=0.9585-val_f1=0.9366-val_precision=0.9936-val_recall=0.8857.ckpt",
        "backup": "step=007000-val_loss=0.1831-val_acc=0.9278.ckpt",
    }

    downloaded_models = {}

    for model_name, filename in models.items():
        local_path = model_dir / filename

        if local_path.exists():
            print(f"✅ {model_name} model already exists locally")
            downloaded_models[model_name] = str(local_path)
            continue

        print(f"📥 Downloading {model_name} model from Hugging Face Hub...")
        try:
            hf_hub_download(
                repo_id="mippia/FST-checkpoints",
                filename=filename,
                local_dir=str(model_dir),
                local_dir_use_symlinks=False,
            )
        except Exception as exc:
            # Startup boundary: report the failure and keep going so the
            # remaining checkpoints are still fetched and the caller can
            # warn about whichever model is missing instead of crashing.
            print(f"⚠️ Failed to download {model_name} model: {exc}")
            continue

        print(f"✅ {model_name} model downloaded successfully!")
        downloaded_models[model_name] = str(local_path)

    return downloaded_models

def _looks_ai_generated(result):
    """Return True when the raw inference output labels the audio as AI-made."""
    import re

    lowered = str(result).lower()
    # Match "ai" only as a standalone token (not flanked by letters), so
    # words that merely contain those letters, e.g. "certainty", do not
    # flip the verdict to "AI Generated".
    if re.search(r"(?<![a-z])ai(?![a-z])", lowered):
        return True
    return "artificial" in lowered or "fake" in lowered


@spaces.GPU
def detect_ai_audio(audio_file):
    """
    Detect whether the uploaded audio file was generated by AI.

    Args:
        audio_file: Value passed to ``inference``; ``None`` when nothing
            was uploaded.

    Returns:
        str: An HTML snippet. It is a prompt to upload a file when
        ``audio_file`` is ``None``, a result card ("AI Generated" or
        "Human Generated") when inference succeeds, or an error card when
        inference raises.
    """
    import html

    if audio_file is None:
        return """
        <div style="text-align: center; padding: 20px; border-radius: 10px; background: linear-gradient(135deg, #ff6b6b22, #ff6b6b11);">
            <div style="font-size: 18px; color: #ff6b6b;">⚠️ Please upload an audio file</div>
        </div>
        """

    try:
        result = inference(audio_file)

        # Format result with better styling
        if _looks_ai_generated(result):
            status = "AI Generated"
            color = "#ff6b6b"
            confidence = "High confidence this audio was generated by AI"
        else:
            status = "Human Generated"
            color = "#51cf66"
            confidence = "High confidence this audio was created by humans"

        # The raw output is embedded in markup, so escape it first.
        safe_result = html.escape(str(result))

        formatted_result = f"""
        <div style="text-align: center; padding: 25px; border-radius: 15px; background: linear-gradient(135deg, {color}22, {color}11); border: 2px solid {color}33;">
            <div style="font-size: 28px; font-weight: bold; color: {color}; margin-bottom: 10px;">{status}</div>
            <div style="font-size: 16px; color: #666; margin-bottom: 8px;">{confidence}</div>
            <div style="font-size: 14px; color: #888;">Raw output: {safe_result}</div>
        </div>
        """

        return formatted_result

    except Exception as e:
        # UI boundary: show the failure in the page rather than raising.
        # The message is escaped because it is inserted into HTML.
        safe_message = html.escape(str(e))
        error_result = f"""
        <div style="text-align: center; padding: 20px; border-radius: 10px; background: linear-gradient(135deg, #ff6b6b22, #ff6b6b11);">
            <div style="font-size: 20px; font-weight: bold; color: #ff6b6b; margin-bottom: 8px;">Error</div>
            <div style="font-size: 14px; color: #666;">Failed to process audio: {safe_message}</div>
        </div>
        """
        return error_result

# Custom CSS for modern design.
# This stylesheet is handed to gr.Interface through its css= argument below.
# The .upload-container, .output-container and .main-container selectors
# correspond to the elem_classes given to the audio input, the HTML output
# and the interface itself.
custom_css = """
/* Global background gradient */
.gradio-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    min-height: 100vh;
}

/* Main container styling */
.main-container {
    background: rgba(255, 255, 255, 0.95) !important;
    backdrop-filter: blur(10px) !important;
    border-radius: 20px !important;
    box-shadow: 0 20px 40px rgba(0,0,0,0.1) !important;
    margin: 20px !important;
    padding: 30px !important;
}

/* Title styling */
h1 {
    background: linear-gradient(135deg, #667eea, #764ba2) !important;
    -webkit-background-clip: text !important;
    -webkit-text-fill-color: transparent !important;
    text-align: center !important;
    font-size: 3em !important;
    font-weight: 800 !important;
    margin-bottom: 10px !important;
}

/* Description text */
.gradio-markdown p {
    text-align: center !important;
    font-size: 1.2em !important;
    color: #555 !important;
    margin-bottom: 30px !important;
}

/* Audio upload component */
.upload-container {
    background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%) !important;
    border-radius: 15px !important;
    padding: 20px !important;
    border: none !important;
    box-shadow: 0 10px 30px rgba(240, 147, 251, 0.3) !important;
    transition: all 0.3s ease !important;
}

.upload-container:hover {
    transform: translateY(-5px) !important;
    box-shadow: 0 15px 40px rgba(240, 147, 251, 0.4) !important;
}

/* Output container */
.output-container {
    background: linear-gradient(135deg, #a8edea 0%, #fed6e3 100%) !important;
    border-radius: 15px !important;
    padding: 20px !important;
    border: none !important;
    box-shadow: 0 10px 30px rgba(168, 237, 234, 0.3) !important;
    min-height: 150px !important;
}

/* Button styling */
.gr-button {
    background: linear-gradient(135deg, #667eea, #764ba2) !important;
    border: none !important;
    border-radius: 25px !important;
    padding: 12px 30px !important;
    font-weight: 600 !important;
    color: white !important;
    box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4) !important;
    transition: all 0.3s ease !important;
}

.gr-button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 8px 25px rgba(102, 126, 234, 0.6) !important;
}

/* Animation */
@keyframes fadeInUp {
    from {
        opacity: 0;
        transform: translateY(30px);
    }
    to {
        opacity: 1;
        transform: translateY(0);
    }
}

.gradio-container > div {
    animation: fadeInUp 0.8s ease-out !important;
}

/* Responsive design */
@media (max-width: 768px) {
    h1 {
        font-size: 2em !important;
    }
    
    .main-container {
        margin: 10px !important;
        padding: 20px !important;
    }
}
"""

# Startup: announce the app, then make sure the checkpoints are on disk.
print(
    "🚀 Starting FST AI Audio Detection App...",
    "📦 Initializing models...",
    sep="\n",
)

# Fetch (or locate) the checkpoints before the interface is built.
models = download_models_from_hub()

# Report whether the main checkpoint made it into the returned mapping.
print(
    "✅ Main model ready for inference"
    if models.get("main")
    else "⚠️ Warning: Main model not available, app may not work properly"
)

# Build the Gradio interface from named pieces: input, output, copy, theme.

# Audio upload widget; the handler receives the upload as a file path.
_audio_input = gr.Audio(
    type="filepath",
    label="Upload Audio File",
    elem_classes=["upload-container"],
)

# HTML panel that displays whatever markup the handler returns.
_result_panel = gr.HTML(
    label="Detection Result",
    elem_classes=["output-container"],
)

# Introductory blurb rendered under the title.
_description_html = """
    <div style="text-align: center; font-size: 1.2em; color: #555; margin: 20px 0;">
        <p><strong>Advanced AI technology</strong> to accurately detect whether uploaded audio was generated by AI!</p>
        <p>Supported formats: MP3, WAV, M4A, FLAC and various audio formats</p>
        <p>Powered by Fusion Segment Transformer (FST) - ICASSP 2026</p>
    </div>
    """

# Soft theme with a blue/purple palette and the Inter font (with fallbacks).
_soft_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="purple",
    neutral_hue="gray",
    font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"],
)

demo = gr.Interface(
    fn=detect_ai_audio,
    inputs=_audio_input,
    outputs=_result_panel,
    title="AI Audio Detector",
    description=_description_html,
    examples=[],
    css=custom_css,
    theme=_soft_theme,
    elem_classes=["main-container"],
)

if __name__ == "__main__":
    # Launch settings gathered in one place: listen on every interface at
    # port 7860, request a share link, hide the API docs, surface errors.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
        "show_api": False,
        "show_error": True,
    }
    demo.launch(**launch_options)