"""Gradio app that reports whether an uploaded audio file looks AI-generated.

At import time the module makes sure the model checkpoints are present in
``checkpoints/`` (fetching them from the Hugging Face Hub when missing) and
builds the ``demo`` interface; running it as a script launches the server.
"""

import html
import os
import re
from pathlib import Path

# NOTE(review): `spaces` stays the first third-party import, as in the
# original file; ZeroGPU setups generally expect it before torch — confirm.
import spaces
import gradio as gr
import torch
import librosa
import numpy as np
from huggingface_hub import hf_hub_download

from inference import inference

# "AI" as a standalone token in any letter case: it must not be preceded or
# followed by another ASCII letter, so words such as "certain" or "failed"
# do not count as a match.
_AI_TOKEN = re.compile(r"(?<![A-Za-z])AI(?![A-Za-z])", re.IGNORECASE)

# Substrings (matched case-insensitively) that also mark a result as AI-made.
_AI_KEYWORDS = ("artificial", "fake")


def download_models_from_hub():
    """Ensure the model checkpoints exist locally, downloading missing ones.

    Each checkpoint is looked up under ``checkpoints/``; a file that is not
    there yet is fetched from the ``mippia/FST-checkpoints`` repository.
    A failed download is reported on stdout and that model is left out of
    the result instead of aborting startup, so the caller can check which
    models are actually available.

    Returns:
        dict[str, str]: Maps the model name (``"main"``, ``"backup"``) to
        the local checkpoint path, for every checkpoint that is present.
    """
    model_dir = Path("checkpoints")
    model_dir.mkdir(exist_ok=True)

    # Original checkpoint filenames on HF Hub
    models = {
        "main": "EmbeddingModel_MERT_768-epoch=0073-val_loss=0.1058-val_acc=0.9585-val_f1=0.9366-val_precision=0.9936-val_recall=0.8857.ckpt",
        "backup": "step=007000-val_loss=0.1831-val_acc=0.9278.ckpt",
    }

    downloaded_models = {}
    for model_name, filename in models.items():
        local_path = model_dir / filename
        if local_path.exists():
            print(f"✅ {model_name} model already exists locally")
        else:
            print(f"📥 Downloading {model_name} model from Hugging Face Hub...")
            try:
                hf_hub_download(
                    repo_id="mippia/FST-checkpoints",
                    filename=filename,
                    local_dir=str(model_dir),
                    local_dir_use_symlinks=False,
                )
            except Exception as exc:
                # Startup boundary: report the failure and skip this model so
                # the availability check after the call can warn about it.
                print(f"❌ Failed to download {model_name} model: {exc}")
                continue
            print(f"✅ {model_name} model downloaded successfully!")
        downloaded_models[model_name] = str(local_path)

    return downloaded_models


def _looks_ai_generated(result):
    """Return True when the textual form of *result* signals AI-made audio."""
    text = str(result)
    if _AI_TOKEN.search(text):
        return True
    lowered = text.lower()
    return any(keyword in lowered for keyword in _AI_KEYWORDS)


@spaces.GPU
def detect_ai_audio(audio_file):
    """Classify an uploaded audio file and format the verdict for display.

    Args:
        audio_file: Value delivered by the ``gr.Audio(type="filepath")``
            input, or ``None`` when nothing was uploaded.

    Returns:
        str: Text for the ``gr.HTML`` output: an upload prompt, the verdict
        together with the raw model output, or an error message. The raw
        output and the exception text are HTML-escaped before embedding.
    """
    if audio_file is None:
        return "\n⚠️ Please upload an audio file\n"

    try:
        result = inference(audio_file)
    except Exception as e:
        # UI boundary: surface the failure to the user instead of raising.
        return f"\nError\nFailed to process audio: {html.escape(str(e))}\n"

    # The keyword check runs on the raw text; escaping is only for display.
    if _looks_ai_generated(result):
        status = "AI Generated"
        confidence = "High confidence this audio was generated by AI"
    else:
        status = "Human Generated"
        confidence = "High confidence this audio was created by humans"

    safe_result = html.escape(str(result))
    return f"\n{status}\n{confidence}\nRaw output: {safe_result}\n"


# Custom CSS for modern design.
# Adjacent literals concatenate to exactly the original stylesheet text.
custom_css = (
    " /* Global background gradient */ "
    ".gradio-container { "
    "background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important; "
    "min-height: 100vh; "
    "} "
    "/* Main container styling */ "
    ".main-container { "
    "background: rgba(255, 255, 255, 0.95) !important; "
    "backdrop-filter: blur(10px) !important; "
    "border-radius: 20px !important; "
    "box-shadow: 0 20px 40px rgba(0,0,0,0.1) !important; "
    "margin: 20px !important; "
    "padding: 30px !important; "
    "} "
    "/* Title styling */ "
    "h1 { "
    "background: linear-gradient(135deg, #667eea, #764ba2) !important; "
    "-webkit-background-clip: text !important; "
    "-webkit-text-fill-color: transparent !important; "
    "text-align: center !important; "
    "font-size: 3em !important; "
    "font-weight: 800 !important; "
    "margin-bottom: 10px !important; "
    "} "
    "/* Description text */ "
    ".gradio-markdown p { "
    "text-align: center !important; "
    "font-size: 1.2em !important; "
    "color: #555 !important; "
    "margin-bottom: 30px !important; "
    "} "
    "/* Audio upload component */ "
    ".upload-container { "
    "background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%) !important; "
    "border-radius: 15px !important; "
    "padding: 20px !important; "
    "border: none !important; "
    "box-shadow: 0 10px 30px rgba(240, 147, 251, 0.3) !important; "
    "transition: all 0.3s ease !important; "
    "} "
    ".upload-container:hover { "
    "transform: translateY(-5px) !important; "
    "box-shadow: 0 15px 40px rgba(240, 147, 251, 0.4) !important; "
    "} "
    "/* Output container */ "
    ".output-container { "
    "background: linear-gradient(135deg, #a8edea 0%, #fed6e3 100%) !important; "
    "border-radius: 15px !important; "
    "padding: 20px !important; "
    "border: none !important; "
    "box-shadow: 0 10px 30px rgba(168, 237, 234, 0.3) !important; "
    "min-height: 150px !important; "
    "} "
    "/* Button styling */ "
    ".gr-button { "
    "background: linear-gradient(135deg, #667eea, #764ba2) !important; "
    "border: none !important; "
    "border-radius: 25px !important; "
    "padding: 12px 30px !important; "
    "font-weight: 600 !important; "
    "color: white !important; "
    # The one line break present in the original stylesheet text.
    "box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4) !important; \n"
    "transition: all 0.3s ease !important; "
    "} "
    ".gr-button:hover { "
    "transform: translateY(-2px) !important; "
    "box-shadow: 0 8px 25px rgba(102, 126, 234, 0.6) !important; "
    "} "
    "/* Animation */ "
    "@keyframes fadeInUp { "
    "from { opacity: 0; transform: translateY(30px); } "
    "to { opacity: 1; transform: translateY(0); } "
    "} "
    ".gradio-container > div { "
    "animation: fadeInUp 0.8s ease-out !important; "
    "} "
    "/* Responsive design */ "
    "@media (max-width: 768px) { "
    "h1 { font-size: 2em !important; } "
    ".main-container { margin: 10px !important; padding: 20px !important; } "
    "} "
)

# Initialize the app
print("🚀 Starting FST AI Audio Detection App...")
print("📦 Initializing models...")

# Download models at startup
models = download_models_from_hub()

# Check if main model is available
if models.get("main"):
    print("✅ Main model ready for inference")
else:
    print("⚠️ Warning: Main model not available, app may not work properly")

# Create Gradio interface
demo = gr.Interface(
    fn=detect_ai_audio,
    inputs=gr.Audio(
        type="filepath",
        label="Upload Audio File",
        elem_classes=["upload-container"],
    ),
    outputs=gr.HTML(
        label="Detection Result",
        elem_classes=["output-container"],
    ),
    title="AI Audio Detector",
    description="""

Advanced AI technology to accurately detect whether uploaded audio was generated by AI!

Supported formats: MP3, WAV, M4A, FLAC and various audio formats

Powered by Fusion Segment Transformer (FST) - ICASSP 2026

""",
    examples=[],
    css=custom_css,
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="purple",
        neutral_hue="gray",
        font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"],
    ),
    elem_classes=["main-container"],
)

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_api=False,
        show_error=True,
    )