thecollabagepatch committed
Commit edc7448 · Parent: aa00058

continue added back in
Files changed (1):
  1. app.py +104 -30

app.py CHANGED
@@ -9,6 +9,8 @@ import torch
 from gradio_client import Client, handle_file
 import random
 import time
+import io
+from pydub import AudioSegment
 
 # Check if CUDA is available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
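The new pydub import powers the splicing that continue_music does later in this diff. A minimal sketch of the three AudioSegment operations the commit relies on, with hypothetical file names:

from pydub import AudioSegment

a = AudioSegment.from_wav("original.wav")      # hypothetical input files
b = AudioSegment.from_wav("generated.wav")
combined = a + b                               # '+' concatenates segments end-to-end
combined.export("extended.wav", format="wav")

Nothing in the hunks shown here references io, so that import presumably serves code outside this diff.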
@@ -105,6 +107,7 @@ def continue_drum_sample(existing_audio_path):
 
 @spaces.GPU
 def generate_music(wav_filename, prompt_duration, musicgen_model, output_duration):
+    """Generate music using the BEGINNING of the audio as prompt"""
     if wav_filename is None:
         return None
 
@@ -138,6 +141,74 @@ def generate_music(wav_filename, prompt_duration, musicgen_model, output_duratio
 
     return filename_with_extension
 
+@spaces.GPU
+def continue_music(input_audio_path, prompt_duration, musicgen_model, output_duration):
+    """Continue music using the END of the audio as prompt - extends the audio"""
+    if input_audio_path is None:
+        return None
+
+    song, sr = torchaudio.load(input_audio_path)
+    song = song.to(device)
+
+    model_name = musicgen_model.split(" ")[0]
+    model_continue = MusicGen.get_pretrained(model_name)
+    model_continue.set_generation_params(
+        use_sampling=True,
+        top_k=250,
+        top_p=0.0,
+        temperature=1.0,
+        duration=output_duration,
+        cfg_coef=3
+    )
+
+    # Load original audio as AudioSegment for easier manipulation
+    original_audio = AudioSegment.from_wav(input_audio_path)
+    current_audio = original_audio
+    file_paths_for_cleanup = []
+
+    # Get the last `prompt_duration` seconds as the prompt
+    num_samples = int(prompt_duration * sr)
+    if song.shape[1] < num_samples:
+        raise ValueError("The prompt_duration is longer than the current audio length.")
+
+    # Extract the end portion for prompting
+    start_sample = song.shape[1] - num_samples
+    prompt_waveform = song[..., start_sample:]
+    prompt_waveform = preprocess_audio(prompt_waveform)
+
+    # Generate continuation
+    output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
+    output = output.cpu()
+
+    if len(output.size()) > 2:
+        output = output.squeeze()
+
+    # Save the generated audio
+    filename_without_extension = f'continue_extension_{random.randint(1000, 9999)}'
+    filename_with_extension = f'{filename_without_extension}.wav'
+    audio_write(filename_without_extension, output, model_continue.sample_rate, strategy="loudness", loudness_compressor=True)
+
+    # Handle the double .wav extension issue
+    correct_filename = f'{filename_without_extension}.wav.wav'
+    if os.path.exists(correct_filename):
+        generated_audio_segment = AudioSegment.from_wav(correct_filename)
+        file_paths_for_cleanup.append(correct_filename)
+    else:
+        generated_audio_segment = AudioSegment.from_wav(filename_with_extension)
+        file_paths_for_cleanup.append(filename_with_extension)
+
+    # Combine original + new audio
+    combined_audio = current_audio + generated_audio_segment
+    combined_audio_filename = f"extended_audio_{random.randint(1000, 9999)}.wav"
+    combined_audio.export(combined_audio_filename, format="wav")
+
+    # Cleanup temporary files
+    for file_path in file_paths_for_cleanup:
+        if os.path.exists(file_path):
+            os.remove(file_path)
+
+    return combined_audio_filename
+
 # ========== MELODYFLOW FUNCTIONS (Via Facebook Space) ==========
 
 def transform_with_melodyflow_api(audio_path, variation, custom_prompt="", solver="euler", flowstep=0.12):
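A note on the double-extension guard in continue_music: audiocraft's audio_write takes a filename stem and appends the format suffix itself, so the call above normally writes continue_extension_XXXX.wav and the '.wav.wav' branch only fires when a stem already ends in '.wav'. A sketch of that naming behavior (stem and sample rate are hypothetical):

# audio_write appends the suffix to whatever stem it is given
audio_write("take_1234", output, 32000, strategy="loudness", loudness_compressor=True)
# -> take_1234.wav
audio_write("take_1234.wav", output, 32000, strategy="loudness", loudness_compressor=True)
# -> take_1234.wav.wav, the case the guard catches

One hedged caveat: MusicGen's generate_continuation conventionally returns the prompt followed by the newly generated material; if that holds here, the pydub concatenation repeats the last prompt_duration seconds at the splice, and trimming the head of generated_audio_segment first would avoid the doubled tail.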
@@ -145,24 +216,10 @@ def transform_with_melodyflow_api(audio_path, variation, custom_prompt="", solve
     if audio_path is None:
         return None, "❌ No audio file provided"
 
-    # Initialize variables first to avoid scope issues
-    base_steps = 125
-    effective_steps = 25
-
     try:
         # Initialize client for Facebook MelodyFlow space
        client = Client("facebook/MelodyFlow")
 
-        # Set steps based on solver and the fact we're doing editing
-        # Facebook's space automatically reduces steps for editing:
-        # EULER: divides by 5, MIDPOINT: divides by 2
-        if solver == "midpoint":
-            base_steps = 128
-            effective_steps = 64  # 128 // 2
-        else:  # euler (default)
-            base_steps = 125
-            effective_steps = 25  # 125 // 5
-
         # Determine the prompt to use
         if custom_prompt.strip():
             prompt_text = custom_prompt.strip()
@@ -171,6 +228,16 @@ def transform_with_melodyflow_api(audio_path, variation, custom_prompt="", solve
             prompt_text = VARIATION_PROMPTS.get(variation, f"transform this audio to {variation} style")
             status_msg = f"✅ Transformed with {variation} style (flowstep: {flowstep}, {effective_steps} steps)"
 
+        # Set steps based on solver and the fact we're doing editing
+        # Facebook's space automatically reduces steps for editing:
+        # EULER: divides by 5, MIDPOINT: divides by 2
+        if solver == "midpoint":
+            base_steps = 128
+            effective_steps = base_steps // 2  # 64 effective steps
+        else:  # euler
+            base_steps = 125
+            effective_steps = base_steps // 5  # 25 effective steps
+
         # Call the MelodyFlow API with the base steps (it will auto-reduce)
         result = client.predict(
             model="facebook/melodyflow-t24-30secs",
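An ordering caveat in this hunk: status_msg (new line 229) interpolates effective_steps, but the solver block that assigns it now runs afterwards (new lines 231-239), and the old top-of-function defaults were deleted in the previous hunk, so the preset-variation branch will raise a NameError unless effective_steps happens to exist at module scope. A minimal reordering sketch, not part of this commit, that computes the step counts right after the client is created:

client = Client("facebook/MelodyFlow")

# compute solver steps first so the status messages can report effective_steps
if solver == "midpoint":
    base_steps = 128
    effective_steps = base_steps // 2  # 64 effective steps
else:  # euler
    base_steps = 125
    effective_steps = base_steps // 5  # 25 effective steps

# ...then build prompt_text and status_msg exactly as in the hunk above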
@@ -188,21 +255,12 @@
         # Result is a tuple of 3 audio files (variations)
         # We'll use the first variation
         if result and len(result) > 0 and result[0]:
-            # Save the result locally with loudness normalization
+            # Save the result locally
             output_filename = f"melodyflow_{variation}_{random.randint(1000, 9999)}.wav"
 
-            # Load the result and apply consistent loudness normalization
-            transformed_audio, sr = torchaudio.load(result[0])
-
-            # Re-save with same loudness strategy as your MusicGen (no headroom)
-            audio_write(
-                output_filename.replace('.wav', ''),
-                transformed_audio,
-                sr,
-                strategy="loudness",
-                loudness_compressor=True
-                # Note: no loudness_headroom_db parameter like Facebook uses
-            )
+            # Copy the result file to our local filename
+            import shutil
+            shutil.copy2(result[0], output_filename)
 
             return output_filename, status_msg
         else:
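shutil.copy2 (which also preserves file metadata, unlike shutil.copy) replaces the torchaudio reload and audio_write re-save, so the returned file keeps whatever loudness Facebook's space rendered. If level-matching against the MusicGen outputs still matters, the dropped normalization can be reapplied after the copy; a sketch reusing the calls visible in the removed lines (the audiocraft import path is an assumption, since app.py's import block isn't shown in this diff):

import torchaudio
from audiocraft.data.audio import audio_write  # assumed import; app.py already calls audio_write

transformed_audio, sr = torchaudio.load(output_filename)
audio_write(
    output_filename.replace('.wav', ''),  # audio_write re-appends the .wav suffix
    transformed_audio,
    sr,
    strategy="loudness",
    loudness_compressor=True
)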
@@ -319,8 +377,18 @@ with gr.Blocks() as iface:
                 ],
                 value="thepatch/vanya_ai_dnb_0.1 (small)"
             )
-
-            generate_music_button = gr.Button("🎼 Continue with MusicGen", variant="primary", size="lg")
+
+            # Two different continuation options with clear explanations
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("### 🔄 Continue from Beginning")
+                    gr.Markdown("*Uses the **first** X seconds as prompt. Good for reimagining/reworking from a starting point.*")
+                    generate_music_button = gr.Button("🔄 Continue from Beginning", variant="primary", size="lg")
+
+                with gr.Column():
+                    gr.Markdown("### ➡️ Extend from End")
+                    gr.Markdown("*Uses the **last** X seconds as prompt. Extends your audio by adding new content to the end.*")
+                    continue_music_button = gr.Button("➡️ Extend from End", variant="secondary", size="lg")
 
     # ========== EVENT HANDLERS ==========
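The two buttons differ only in which slice of the current audio becomes the MusicGen prompt: generate_music takes it from the head of the file (per its new docstring) and continue_music from the tail. In terms of the tensors this diff already uses:

# the slice that prompts the model in each mode
num_samples = int(prompt_duration * sr)
beginning = song[..., :num_samples]    # "Continue from Beginning" (generate_music)
ending = song[..., -num_samples:]      # "Extend from End" (continue_music)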
 
@@ -335,12 +403,18 @@ with gr.Blocks() as iface:
         outputs=[main_audio, transform_status]
     )
 
-    # Step 3: Continue
+    # Step 3: Continue (two different approaches)
     generate_music_button.click(
         generate_music,
         inputs=[main_audio, prompt_duration, musicgen_model, output_duration],
         outputs=[main_audio]
     )
+
+    continue_music_button.click(
+        continue_music,
+        inputs=[main_audio, prompt_duration, musicgen_model, output_duration],
+        outputs=[main_audio]
+    )
 
 if __name__ == "__main__":
     iface.launch()
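Both handlers write their result back into main_audio, so each generation can seed the next. A sketch of the same loop driven directly in Python, using a hypothetical seed file and the model name already listed in the UI:

# chaining "Extend from End" passes outside the UI
path = "seed_clip.wav"  # hypothetical starting clip
for _ in range(2):
    path = continue_music(
        path,
        prompt_duration=5,   # the last 5 seconds become the prompt
        musicgen_model="thepatch/vanya_ai_dnb_0.1 (small)",
        output_duration=30,
    )
# each pass exports extended_audio_XXXX.wav and returns its path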
 