import gradio as gr
import spaces
import torchaudio
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import tempfile
import os
import shutil
import torch
from gradio_client import Client, handle_file
import random
import time
import io
from pydub import AudioSegment
from pydub.utils import ratio_to_db

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# MelodyFlow variation mapping - map your semantic variations to text prompts
VARIATION_PROMPTS = {
    'accordion_folk': 'folk accordion melody with traditional folk instruments',
    'banjo_bluegrass': 'bluegrass banjo with country folk instruments',
    'piano_classical': 'classical piano with orchestral arrangement',
    'celtic': 'celtic harp and flute with traditional irish instruments',
    'strings_quartet': 'string quartet with violin, viola, cello arrangement',
    'synth_retro': 'retro 80s synthesizer with vintage electronic sounds',
    'synth_modern': 'modern synthesizer with contemporary electronic production',
    'synth_edm': 'edm synthesizer with dance electronic beats',
    'lofi_chill': 'lo-fi chill with relaxed jazz hip-hop elements',
    'synth_bass': 'heavy bass synthesizer with sub-bass frequencies',
    'rock_band': 'rock band with electric guitar, bass, and drums',
    'cinematic_epic': 'cinematic epic orchestral with dramatic strings and brass',
    'retro_rpg': 'retro rpg chiptune with 8-bit game music elements',
    'chiptune': '8-bit chiptune with retro video game sounds',
    'steel_drums': 'steel drums with caribbean tropical percussion',
    'gamelan_fusion': 'gamelan fusion with indonesian percussion instruments',
    'music_box': 'music box with delicate mechanical melody',
    'trap_808': 'trap beats with heavy 808 drums and hi-hats',
    'lo_fi_drums': 'lo-fi drums with vinyl crackle and jazz samples',
    'boom_bap': 'boom bap hip-hop with classic drum breaks',
    'percussion_ensemble': 'percussion ensemble with varied drum instruments',
    'future_bass': 'future bass with melodic drops and vocal chops',
    'synthwave_retro': 'synthwave retro with neon 80s aesthetic',
    'melodic_techno': 'melodic techno with driving beats and emotional melodies',
    'dubstep_wobble': 'dubstep with heavy wobble bass and electronic drops',
    'glitch_hop': 'glitch hop with broken beats and digital artifacts',
    'digital_disruption': 'digital disruption with glitchy electronic effects',
    'circuit_bent': 'circuit bent with broken electronic hardware sounds',
    'orchestral_glitch': 'orchestral glitch with classical instruments and digital errors',
    'vapor_drums': 'vaporwave drums with slowed down nostalgic beats',
    'industrial_textures': 'industrial textures with harsh mechanical sounds',
    'jungle_breaks': 'jungle breaks with fast drum and bass rhythms'
}
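
# Example lookup: VARIATION_PROMPTS['lofi_chill'] ->
# 'lo-fi chill with relaxed jazz hip-hop elements'. Unknown keys fall back to a
# generic "transform this audio to <variation> style" prompt inside
# transform_with_melodyflow_api() below.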

def preprocess_audio(waveform):
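    """Detach to CPU, drop singleton dimensions, and return the waveform as a
    batched tensor on the active device for MusicGen's generate_continuation."""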
    waveform_np = waveform.cpu().squeeze().numpy()
    return torch.from_numpy(waveform_np).unsqueeze(0).to(device)

# ========== MUSICGEN FUNCTIONS (Local ZeroGPU) ==========

@spaces.GPU(duration=10)
def generate_drum_sample():
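    """Generate a 10-second unconditional jungle-drum sample and save it as a WAV."""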
    model = MusicGen.get_pretrained('pharoAIsanders420/micro-musicgen-jungle')
    model.set_generation_params(duration=10)
    wav = model.generate_unconditional(1).squeeze(0)
    
    filename_without_extension = 'jungle'
    filename_with_extension = f'{filename_without_extension}.wav'

    audio_write(filename_without_extension, wav.cpu(), model.sample_rate, strategy="loudness", loudness_compressor=True)
    return filename_with_extension

@spaces.GPU
def continue_drum_sample(existing_audio_path):
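    """Extend the drum audio: prompt the jungle model with the last 2 seconds
    and append the generated continuation. Assumes the input's sample rate
    matches the model's output rate, which holds for audio produced by
    generate_drum_sample()."""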
    if existing_audio_path is None:
        return None
        
    existing_audio, sr = torchaudio.load(existing_audio_path)
    existing_audio = existing_audio.to(device)
    
    prompt_duration = 2
    output_duration = 10

    num_samples = int(prompt_duration * sr)
    if existing_audio.shape[1] < num_samples:
        raise ValueError("The existing audio is too short for the specified prompt duration.")

    start_sample = existing_audio.shape[1] - num_samples
    prompt_waveform = existing_audio[..., start_sample:]

    model = MusicGen.get_pretrained('pharoAIsanders420/micro-musicgen-jungle')
    model.set_generation_params(duration=output_duration)

    output = model.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
    output = output.to(device)

    if output.dim() == 3:
        output = output.squeeze(0)
    if output.dim() == 1:
        output = output.unsqueeze(0)

    combined_audio = torch.cat((existing_audio, output), dim=1)
    combined_audio = combined_audio.cpu()

    combined_file_path = f'./continued_jungle_{random.randint(1000, 9999)}.wav'
    torchaudio.save(combined_file_path, combined_audio, sr)
    return combined_file_path

@spaces.GPU(duration=30)
def generate_music(wav_filename, prompt_duration, musicgen_model, output_duration):
    """Generate music using the BEGINNING of the audio as prompt"""
    if wav_filename is None:
        return None
        
    song, sr = torchaudio.load(wav_filename)
    song = song.to(device)

    model_name = musicgen_model.split(" ")[0]
    model_continue = MusicGen.get_pretrained(model_name)

    model_continue.set_generation_params(
        use_sampling=True,
        top_k=250,
        top_p=0.0,
        temperature=1.0,
        duration=output_duration,
        cfg_coef=3
    )

    prompt_waveform = song[..., :int(prompt_duration * sr)]
    prompt_waveform = preprocess_audio(prompt_waveform)
    
    output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
    output = output.cpu()
    
    if len(output.size()) > 2:
        output = output.squeeze()
    
    filename_without_extension = 'continued_music'
    filename_with_extension = f'{filename_without_extension}.wav'
    audio_write(filename_without_extension, output, model_continue.sample_rate, strategy="loudness", loudness_compressor=True)

    return filename_with_extension

@spaces.GPU(duration=30)
def continue_music(input_audio_path, prompt_duration, musicgen_model, output_duration):
    """Continue music using the END of the audio as prompt - extends the audio"""
    if input_audio_path is None:
        return None
        
    song, sr = torchaudio.load(input_audio_path)
    song = song.to(device)

    model_name = musicgen_model.split(" ")[0]
    model_continue = MusicGen.get_pretrained(model_name)
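    # top_p=0.0 disables nucleus sampling, so top_k alone governs sampling here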
    model_continue.set_generation_params(
        use_sampling=True,
        top_k=250,
        top_p=0.0,
        temperature=1.0,
        duration=output_duration,
        cfg_coef=3
    )

    # Load original audio as AudioSegment for easier manipulation
    original_audio = AudioSegment.from_wav(input_audio_path)
    file_paths_for_cleanup = []

    # Get the last `prompt_duration` seconds as the prompt
    num_samples = int(prompt_duration * sr)
    if song.shape[1] < num_samples:
        raise ValueError("The prompt_duration is longer than the current audio length.")

    # Extract the end portion for prompting
    start_sample = song.shape[1] - num_samples
    prompt_waveform = song[..., start_sample:]
    prompt_waveform = preprocess_audio(prompt_waveform)

    # Generate continuation
    output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
    output = output.cpu()

    if len(output.size()) > 2:
        output = output.squeeze()

    # Save the generated audio WITHOUT aggressive loudness processing
    filename_without_extension = f'continue_extension_{random.randint(1000, 9999)}'
    filename_with_extension = f'{filename_without_extension}.wav'
    audio_write(filename_without_extension, output, model_continue.sample_rate, 
               strategy="clip")  # Just prevent clipping, no loudness changes
    
    # audio_write appends ".wav" to the stem it is given; guard against a
    # double extension in case the written file ends up named that way.
    double_ext_filename = f'{filename_without_extension}.wav.wav'
    if os.path.exists(double_ext_filename):
        generated_audio_segment = AudioSegment.from_wav(double_ext_filename)
        file_paths_for_cleanup.append(double_ext_filename)
    else:
        generated_audio_segment = AudioSegment.from_wav(filename_with_extension)
        file_paths_for_cleanup.append(filename_with_extension)

    # SMART VOLUME MATCHING: Only match the prompt portion

    # 1. Remove prompt duration from original (no overlap)  
    prompt_duration_ms = int(prompt_duration * 1000)
    original_minus_prompt = original_audio[:-prompt_duration_ms]

    # 2. Extract JUST the prompt portion from generated audio for RMS analysis
    generated_prompt_portion = generated_audio_segment[:prompt_duration_ms]

    # 3. Calculate RMS of the transition points
    original_rms = original_minus_prompt.rms
    prompt_portion_rms = generated_prompt_portion.rms

    print(f"πŸ”Š Smart volume analysis:")
    print(f"   Original ending RMS: {original_rms}")
    print(f"   Generated prompt RMS: {prompt_portion_rms}")
    print(f"   Generated full RMS: {generated_audio_segment.rms}")

    # 4. Match the prompt portion to original level
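    # pydub's ratio_to_db converts an amplitude ratio to decibels
    # (20 * log10(ratio)), i.e. the exact gain shift needed to make the
    # generated prompt portion's RMS match the original ending's RMS.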
    if prompt_portion_rms > 0:
        volume_adjustment = ratio_to_db(original_rms / prompt_portion_rms)
        print(f"   Applying {volume_adjustment:.1f}dB to entire generated segment")
        
        # Apply to entire segment (preserves the buildup)
        generated_matched = generated_audio_segment + volume_adjustment
    else:
        generated_matched = generated_audio_segment

    # 5. Combine seamlessly
    combined_audio = original_minus_prompt + generated_matched
    
    # Save final result
    combined_audio_filename = f"extended_audio_{random.randint(1000, 9999)}.wav"
    combined_audio.export(combined_audio_filename, format="wav")

    # Cleanup temporary files
    for file_path in file_paths_for_cleanup:
        if os.path.exists(file_path):
            os.remove(file_path)

    return combined_audio_filename

# ========== MELODYFLOW FUNCTIONS (Via Facebook Space) ==========

def transform_with_melodyflow_api(audio_path, variation, custom_prompt="", solver="euler", flowstep=0.12):
    """Transform audio using Facebook/MelodyFlow space API"""
    if audio_path is None:
        return None, "❌ No audio file provided"
    
    # Compute solver steps up front so the status message reports them correctly.
    # Facebook's space automatically reduces steps for editing:
    # EULER: divides by 5, MIDPOINT: divides by 2
    if solver == "midpoint":
        base_steps = 128
        effective_steps = base_steps // 2  # 64 effective steps
    else:  # euler
        base_steps = 125
        effective_steps = base_steps // 5  # 25 effective steps

    try:
        # Initialize client for Facebook MelodyFlow space
        client = Client("facebook/MelodyFlow")

        # Determine the prompt to use
        if custom_prompt.strip():
            prompt_text = custom_prompt.strip()
            status_msg = f"✅ Transformed with custom prompt: '{prompt_text}' (flowstep: {flowstep}, {effective_steps} steps)"
        else:
            prompt_text = VARIATION_PROMPTS.get(variation, f"transform this audio to {variation} style")
            status_msg = f"✅ Transformed with {variation} style (flowstep: {flowstep}, {effective_steps} steps)"
        
        # Call the MelodyFlow API with the base steps (it will auto-reduce)
        result = client.predict(
            model="facebook/melodyflow-t24-30secs",
            text=prompt_text,
            solver=solver,
            steps=base_steps,  # Will be auto-reduced to effective_steps by the space
            target_flowstep=flowstep,  # This is the key parameter!
            regularize=solver == "euler",  # Regularize for euler, not for midpoint
            regularization_strength=0.2,
            duration=30,  # Max duration
            melody=handle_file(audio_path),
            api_name="/predict"
        )
        
        # Result is a tuple of 3 audio files (variations)
        # We'll use the first variation
        if result and len(result) > 0 and result[0]:
            # Save the result locally
            output_filename = f"melodyflow_{variation}_{random.randint(1000, 9999)}.wav"
            
            # Copy the result file to our local filename
            shutil.copy2(result[0], output_filename)
            
            return output_filename, status_msg
        else:
            return None, "❌ MelodyFlow API returned no results"
            
    except Exception as e:
        return None, f"❌ MelodyFlow API error: {str(e)}"

# ========== GRADIO INTERFACE ==========

# Create the interface
with gr.Blocks() as iface:
    gr.Markdown("# 🎰 The Mega Slot Machine")
    gr.Markdown("**Hybrid Multi-Model Pipeline**: MicroMusicGen β†’ MelodyFlow (via API) β†’ MusicGen Fine-tunes")
    gr.Markdown("*Demonstrating the workflow from our Ableton device in a web interface!*")

    with gr.Accordion("How This Works", open=False):
        gr.Markdown("""

        This demo shows how multiple AI models can work together:

        

        1. **Generate** initial audio with MicroMusicGen (super fast jungle drums)

        2. **Transform** it using MelodyFlow (via Facebook's space API) 

        3. **Continue** with MusicGen fine-tunes (trained on specific genres)

        4. **Repeat** the cycle to create infinite musical journeys!

        

        The models run with different PyTorch versions, so we use the Facebook MelodyFlow space via API.

        

        **Performance Note**: For audio transformation, MelodyFlow automatically uses fewer steps than generation:

        - EULER solver: 25 effective steps (fast, good quality)

        - MIDPOINT solver: 64 effective steps (slower, potentially higher quality)

        """)

    # ========== STEP 1: GENERATE ==========
    gr.Markdown("## 🎡 Step 1: Generate Initial Audio")
    
    with gr.Row():
        with gr.Column():
            generate_button = gr.Button("Generate Jungle Drums", variant="primary", size="lg")
            continue_drum_button = gr.Button("Continue Drums", size="sm")

    main_audio = gr.Audio(
        label="🎡 Current Audio (flows through pipeline)", 
        type="filepath", 
        interactive=True, 
        show_download_button=True
    )

    # ========== STEP 2: TRANSFORM ==========
    gr.Markdown("## πŸŽ›οΈ Step 2: Transform with MelodyFlow")
    
    with gr.Row():
        with gr.Column(scale=2):
            transform_variation = gr.Dropdown(
                label="Transform Style",
                choices=list(VARIATION_PROMPTS.keys()),
                value="synth_modern",
                interactive=True
            )
            
        with gr.Column(scale=3):
            transform_prompt = gr.Textbox(
                label="Custom Prompt (optional)",
                placeholder="Leave empty to use style above, or enter custom transformation prompt",
                lines=2
            )
    
    with gr.Row():
        transform_solver = gr.Dropdown(
            label="Solver",
            choices=["euler", "midpoint"],
            value="euler",
            info="EULER: faster (25 steps), MIDPOINT: slower but potentially higher quality (64 steps)"
        )
        transform_flowstep = gr.Slider(
            label="Transform Intensity (Flowstep)",
            minimum=0.0,
            maximum=0.15,
            step=0.01,
            value=0.12,
            info="Lower values = more dramatic transformation"
        )
        transform_button = gr.Button("🎛️ Transform Audio", variant="secondary", size="lg")
    
    transform_status = gr.Textbox(label="Transform Status", value="Ready to transform", interactive=False)

    # ========== STEP 3: CONTINUE ==========
    gr.Markdown("## 🎼 Step 3: Continue with MusicGen")
    
    with gr.Row():
        with gr.Column():
            prompt_duration = gr.Dropdown(
                label="Prompt Duration (seconds)", 
                choices=list(range(1, 11)), 
                value=5
            )
            output_duration = gr.Slider(
                label="Output Duration (seconds)", 
                minimum=10, 
                maximum=30, 
                step=1, 
                value=20
            )
        
        with gr.Column():
            musicgen_model = gr.Dropdown(
                label="MusicGen Model", 
                choices=[
                    "thepatch/vanya_ai_dnb_0.1 (small)",
                    "thepatch/budots_remix (small)",
                    "thepatch/PhonkV2 (small)",
                    "thepatch/bleeps-medium (medium)",
                    "thepatch/hoenn_lofi (large)",
                    "foureyednymph/musicgen-sza-sos-small (small)"
                ], 
                value="thepatch/vanya_ai_dnb_0.1 (small)"
            )

    # Two different continuation options with clear explanations
    with gr.Row():
        with gr.Column():
            gr.Markdown("### πŸ”„ Continue from Beginning")
            gr.Markdown("*Uses the **first** X seconds as prompt. Good for reimagining/reworking from a starting point.*")
            generate_music_button = gr.Button("πŸ”„ Continue from Beginning", variant="primary", size="lg")
        
        with gr.Column():
            gr.Markdown("### ➑️ Extend from End")
            gr.Markdown("*Uses the **last** X seconds as prompt. Extends your audio by adding new content to the end.*")
            continue_music_button = gr.Button("➑️ Extend from End", variant="secondary", size="lg")

    # ========== EVENT HANDLERS ==========
    
    # Step 1: Generate
    generate_button.click(generate_drum_sample, outputs=[main_audio])
    continue_drum_button.click(continue_drum_sample, inputs=[main_audio], outputs=[main_audio])
    
    # Step 2: Transform (using Facebook MelodyFlow API)
    transform_button.click(
        transform_with_melodyflow_api,
        inputs=[main_audio, transform_variation, transform_prompt, transform_solver, transform_flowstep],
        outputs=[main_audio, transform_status]
    )
    
    # Step 3: Continue (two different approaches)
    generate_music_button.click(
        generate_music, 
        inputs=[main_audio, prompt_duration, musicgen_model, output_duration], 
        outputs=[main_audio]
    )
    
    continue_music_button.click(
        continue_music,
        inputs=[main_audio, prompt_duration, musicgen_model, output_duration],
        outputs=[main_audio]
    )

if __name__ == "__main__":
    iface.launch()