Spaces: Running on Zero
import gradio as gr
import spaces
import torchaudio
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import tempfile
import os
import torch
from gradio_client import Client, handle_file
import random
import time
import io
from pydub import AudioSegment
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# MelodyFlow variation mapping - map your semantic variations to text prompts
VARIATION_PROMPTS = {
'accordion_folk': 'folk accordion melody with traditional folk instruments',
'banjo_bluegrass': 'bluegrass banjo with country folk instruments',
'piano_classical': 'classical piano with orchestral arrangement',
'celtic': 'celtic harp and flute with traditional irish instruments',
'strings_quartet': 'string quartet with violin, viola, cello arrangement',
'synth_retro': 'retro 80s synthesizer with vintage electronic sounds',
'synth_modern': 'modern synthesizer with contemporary electronic production',
'synth_edm': 'edm synthesizer with dance electronic beats',
'lofi_chill': 'lo-fi chill with relaxed jazz hip-hop elements',
'synth_bass': 'heavy bass synthesizer with sub-bass frequencies',
'rock_band': 'rock band with electric guitar, bass, and drums',
'cinematic_epic': 'cinematic epic orchestral with dramatic strings and brass',
'retro_rpg': 'retro rpg chiptune with 8-bit game music elements',
'chiptune': '8-bit chiptune with retro video game sounds',
'steel_drums': 'steel drums with caribbean tropical percussion',
'gamelan_fusion': 'gamelan fusion with indonesian percussion instruments',
'music_box': 'music box with delicate mechanical melody',
'trap_808': 'trap beats with heavy 808 drums and hi-hats',
'lo_fi_drums': 'lo-fi drums with vinyl crackle and jazz samples',
'boom_bap': 'boom bap hip-hop with classic drum breaks',
'percussion_ensemble': 'percussion ensemble with varied drum instruments',
'future_bass': 'future bass with melodic drops and vocal chops',
'synthwave_retro': 'synthwave retro with neon 80s aesthetic',
'melodic_techno': 'melodic techno with driving beats and emotional melodies',
'dubstep_wobble': 'dubstep with heavy wobble bass and electronic drops',
'glitch_hop': 'glitch hop with broken beats and digital artifacts',
'digital_disruption': 'digital disruption with glitchy electronic effects',
'circuit_bent': 'circuit bent with broken electronic hardware sounds',
'orchestral_glitch': 'orchestral glitch with classical instruments and digital errors',
'vapor_drums': 'vaporwave drums with slowed down nostalgic beats',
'industrial_textures': 'industrial textures with harsh mechanical sounds',
'jungle_breaks': 'jungle breaks with fast drum and bass rhythms'
}
def preprocess_audio(waveform):
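    """Prepare a prompt waveform for MusicGen: squeeze singleton dims, re-add a leading dim, and move to the target device."""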
waveform_np = waveform.cpu().squeeze().numpy()
return torch.from_numpy(waveform_np).unsqueeze(0).to(device)
# ========== MUSICGEN FUNCTIONS (Local ZeroGPU) ==========
@spaces.GPU(duration=10)
def generate_drum_sample():
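    """Generate a 10-second unconditional jungle drum loop with micro-musicgen-jungle."""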
model = MusicGen.get_pretrained('pharoAIsanders420/micro-musicgen-jungle')
model.set_generation_params(duration=10)
wav = model.generate_unconditional(1).squeeze(0)
    filename_without_extension = 'jungle'
filename_with_extension = f'{filename_without_extension}.wav'
audio_write(filename_without_extension, wav.cpu(), model.sample_rate, strategy="loudness", loudness_compressor=True)
return filename_with_extension
@spaces.GPU
def continue_drum_sample(existing_audio_path):
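    """Extend existing drum audio by using its last 2 seconds as a continuation prompt."""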
if existing_audio_path is None:
return None
existing_audio, sr = torchaudio.load(existing_audio_path)
existing_audio = existing_audio.to(device)
prompt_duration = 2
output_duration = 10
num_samples = int(prompt_duration * sr)
if existing_audio.shape[1] < num_samples:
raise ValueError("The existing audio is too short for the specified prompt duration.")
start_sample = existing_audio.shape[1] - num_samples
prompt_waveform = existing_audio[..., start_sample:]
model = MusicGen.get_pretrained('pharoAIsanders420/micro-musicgen-jungle')
model.set_generation_params(duration=output_duration)
output = model.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
output = output.to(device)
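    # MusicGen can return a batched (1, channels, samples) tensor; normalize it to (channels, samples) for concatenation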
if output.dim() == 3:
output = output.squeeze(0)
if output.dim() == 1:
output = output.unsqueeze(0)
combined_audio = torch.cat((existing_audio, output), dim=1)
combined_audio = combined_audio.cpu()
combined_file_path = f'./continued_jungle_{random.randint(1000, 9999)}.wav'
torchaudio.save(combined_file_path, combined_audio, sr)
return combined_file_path
@spaces.GPU(duration=30)
def generate_music(wav_filename, prompt_duration, musicgen_model, output_duration):
"""Generate music using the BEGINNING of the audio as prompt"""
if wav_filename is None:
return None
song, sr = torchaudio.load(wav_filename)
song = song.to(device)
model_name = musicgen_model.split(" ")[0]
model_continue = MusicGen.get_pretrained(model_name)
model_continue.set_generation_params(
use_sampling=True,
top_k=250,
top_p=0.0,
temperature=1.0,
duration=output_duration,
cfg_coef=3
)
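    # Use the first `prompt_duration` seconds of the input as the continuation prompt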
prompt_waveform = song[..., :int(prompt_duration * sr)]
prompt_waveform = preprocess_audio(prompt_waveform)
output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
output = output.cpu()
if len(output.size()) > 2:
output = output.squeeze()
    filename_without_extension = 'continued_music'
filename_with_extension = f'{filename_without_extension}.wav'
audio_write(filename_without_extension, output, model_continue.sample_rate, strategy="loudness", loudness_compressor=True)
return filename_with_extension
@spaces.GPU(duration=30)
def continue_music(input_audio_path, prompt_duration, musicgen_model, output_duration):
"""Continue music using the END of the audio as prompt - extends the audio"""
if input_audio_path is None:
return None
song, sr = torchaudio.load(input_audio_path)
song = song.to(device)
model_name = musicgen_model.split(" ")[0]
model_continue = MusicGen.get_pretrained(model_name)
model_continue.set_generation_params(
use_sampling=True,
top_k=250,
top_p=0.0,
temperature=1.0,
duration=output_duration,
cfg_coef=3
)
# Load original audio as AudioSegment for easier manipulation
original_audio = AudioSegment.from_wav(input_audio_path)
file_paths_for_cleanup = []
# Get the last `prompt_duration` seconds as the prompt
num_samples = int(prompt_duration * sr)
if song.shape[1] < num_samples:
raise ValueError("The prompt_duration is longer than the current audio length.")
# Extract the end portion for prompting
start_sample = song.shape[1] - num_samples
prompt_waveform = song[..., start_sample:]
prompt_waveform = preprocess_audio(prompt_waveform)
# Generate continuation
output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
output = output.cpu()
if len(output.size()) > 2:
output = output.squeeze()
# Save the generated audio WITHOUT aggressive loudness processing
filename_without_extension = f'continue_extension_{random.randint(1000, 9999)}'
filename_with_extension = f'{filename_without_extension}.wav'
audio_write(filename_without_extension, output, model_continue.sample_rate,
strategy="clip") # Just prevent clipping, no loudness changes
# Handle the double .wav extension issue
correct_filename = f'{filename_without_extension}.wav.wav'
if os.path.exists(correct_filename):
generated_audio_segment = AudioSegment.from_wav(correct_filename)
file_paths_for_cleanup.append(correct_filename)
else:
generated_audio_segment = AudioSegment.from_wav(filename_with_extension)
file_paths_for_cleanup.append(filename_with_extension)
# SMART VOLUME MATCHING: Only match the prompt portion
# 1. Remove prompt duration from original (no overlap)
prompt_duration_ms = int(prompt_duration * 1000)
original_minus_prompt = original_audio[:-prompt_duration_ms]
# 2. Extract JUST the prompt portion from generated audio for RMS analysis
generated_prompt_portion = generated_audio_segment[:prompt_duration_ms]
# 3. Calculate RMS of the transition points
original_rms = original_minus_prompt.rms
prompt_portion_rms = generated_prompt_portion.rms
print(f"π Smart volume analysis:")
print(f" Original ending RMS: {original_rms}")
print(f" Generated prompt RMS: {prompt_portion_rms}")
print(f" Generated full RMS: {generated_audio_segment.rms}")
# 4. Match the prompt portion to original level
if prompt_portion_rms > 0:
from pydub.utils import ratio_to_db
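        # ratio_to_db converts the linear RMS ratio into a dB gain that pydub can apply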
volume_adjustment = ratio_to_db(original_rms / prompt_portion_rms)
print(f" Applying {volume_adjustment:.1f}dB to entire generated segment")
# Apply to entire segment (preserves the buildup)
generated_matched = generated_audio_segment + volume_adjustment
else:
generated_matched = generated_audio_segment
# 5. Combine seamlessly
combined_audio = original_minus_prompt + generated_matched
# Save final result
combined_audio_filename = f"extended_audio_{random.randint(1000, 9999)}.wav"
combined_audio.export(combined_audio_filename, format="wav")
# Cleanup temporary files
for file_path in file_paths_for_cleanup:
if os.path.exists(file_path):
os.remove(file_path)
return combined_audio_filename
# ========== MELODYFLOW FUNCTIONS (Via Facebook Space) ==========
def transform_with_melodyflow_api(audio_path, variation, custom_prompt="", solver="euler", flowstep=0.12):
"""Transform audio using Facebook/MelodyFlow space API"""
if audio_path is None:
return None, "β No audio file provided"
    try:
        # Initialize client for Facebook MelodyFlow space
        client = Client("facebook/MelodyFlow")
        # Set steps based on solver and the fact we're doing editing.
        # Facebook's space automatically reduces steps for editing:
        # EULER: divides by 5, MIDPOINT: divides by 2
        if solver == "midpoint":
            base_steps = 128
            effective_steps = base_steps // 2  # 64 effective steps
        else:  # euler
            base_steps = 125
            effective_steps = base_steps // 5  # 25 effective steps
        # Determine the prompt and build the status message (after the step
        # count is known, so it reports the correct number of effective steps)
        if custom_prompt.strip():
            prompt_text = custom_prompt.strip()
            status_msg = f"Transformed with custom prompt: '{prompt_text}' (flowstep: {flowstep}, {effective_steps} steps)"
        else:
            prompt_text = VARIATION_PROMPTS.get(variation, f"transform this audio to {variation} style")
            status_msg = f"Transformed with {variation} style (flowstep: {flowstep}, {effective_steps} steps)"
# Call the MelodyFlow API with the base steps (it will auto-reduce)
result = client.predict(
model="facebook/melodyflow-t24-30secs",
text=prompt_text,
solver=solver,
steps=base_steps, # Will be auto-reduced to effective_steps by the space
target_flowstep=flowstep, # This is the key parameter!
regularize=solver == "euler", # Regularize for euler, not for midpoint
regularization_strength=0.2,
duration=30, # Max duration
melody=handle_file(audio_path),
api_name="/predict"
)
# Result is a tuple of 3 audio files (variations)
# We'll use the first variation
if result and len(result) > 0 and result[0]:
# Save the result locally
output_filename = f"melodyflow_{variation}_{random.randint(1000, 9999)}.wav"
# Copy the result file to our local filename
import shutil
shutil.copy2(result[0], output_filename)
return output_filename, status_msg
else:
return None, "β MelodyFlow API returned no results"
except Exception as e:
return None, f"β MelodyFlow API error: {str(e)}"
# ========== GRADIO INTERFACE ==========
# Create the interface
with gr.Blocks() as iface:
gr.Markdown("# π° The Mega Slot Machine")
gr.Markdown("**Hybrid Multi-Model Pipeline**: MicroMusicGen β MelodyFlow (via API) β MusicGen Fine-tunes")
gr.Markdown("*Demonstrating the workflow from our Ableton device in a web interface!*")
with gr.Accordion("How This Works", open=False):
gr.Markdown("""
This demo shows how multiple AI models can work together:
1. **Generate** initial audio with MicroMusicGen (super fast jungle drums)
2. **Transform** it using MelodyFlow (via Facebook's space API)
3. **Continue** with MusicGen fine-tunes (trained on specific genres)
4. **Repeat** the cycle to create infinite musical journeys!
    The models require different PyTorch versions, so MelodyFlow runs remotely via Facebook's MelodyFlow Space API instead of locally.
**Performance Note**: For audio transformation, MelodyFlow automatically uses fewer steps than generation:
- EULER solver: 25 effective steps (fast, good quality)
- MIDPOINT solver: 64 effective steps (slower, potentially higher quality)
""")
# ========== STEP 1: GENERATE ==========
gr.Markdown("## π΅ Step 1: Generate Initial Audio")
with gr.Row():
with gr.Column():
generate_button = gr.Button("Generate Jungle Drums", variant="primary", size="lg")
continue_drum_button = gr.Button("Continue Drums", size="sm")
main_audio = gr.Audio(
label="π΅ Current Audio (flows through pipeline)",
type="filepath",
interactive=True,
show_download_button=True
)
# ========== STEP 2: TRANSFORM ==========
gr.Markdown("## ποΈ Step 2: Transform with MelodyFlow")
with gr.Row():
with gr.Column(scale=2):
transform_variation = gr.Dropdown(
label="Transform Style",
choices=list(VARIATION_PROMPTS.keys()),
value="synth_modern",
interactive=True
)
with gr.Column(scale=3):
transform_prompt = gr.Textbox(
label="Custom Prompt (optional)",
placeholder="Leave empty to use style above, or enter custom transformation prompt",
lines=2
)
with gr.Row():
transform_solver = gr.Dropdown(
label="Solver",
choices=["euler", "midpoint"],
value="euler",
info="EULER: faster (25 steps), MIDPOINT: slower but potentially higher quality (64 steps)"
)
transform_flowstep = gr.Slider(
label="Transform Intensity (Flowstep)",
minimum=0.0,
maximum=0.15,
step=0.01,
value=0.12,
info="Lower values = more dramatic transformation"
)
    transform_button = gr.Button("Transform Audio", variant="secondary", size="lg")
transform_status = gr.Textbox(label="Transform Status", value="Ready to transform", interactive=False)
# ========== STEP 3: CONTINUE ==========
gr.Markdown("## πΌ Step 3: Continue with MusicGen")
with gr.Row():
with gr.Column():
prompt_duration = gr.Dropdown(
label="Prompt Duration (seconds)",
choices=list(range(1, 11)),
value=5
)
output_duration = gr.Slider(
label="Output Duration (seconds)",
minimum=10,
maximum=30,
step=1,
value=20
)
with gr.Column():
musicgen_model = gr.Dropdown(
label="MusicGen Model",
choices=[
"thepatch/vanya_ai_dnb_0.1 (small)",
"thepatch/budots_remix (small)",
"thepatch/PhonkV2 (small)",
"thepatch/bleeps-medium (medium)",
"thepatch/hoenn_lofi (large)",
"foureyednymph/musicgen-sza-sos-small (small)"
],
value="thepatch/vanya_ai_dnb_0.1 (small)"
)
# Two different continuation options with clear explanations
with gr.Row():
with gr.Column():
gr.Markdown("### π Continue from Beginning")
gr.Markdown("*Uses the **first** X seconds as prompt. Good for reimagining/reworking from a starting point.*")
generate_music_button = gr.Button("π Continue from Beginning", variant="primary", size="lg")
with gr.Column():
gr.Markdown("### β‘οΈ Extend from End")
gr.Markdown("*Uses the **last** X seconds as prompt. Extends your audio by adding new content to the end.*")
continue_music_button = gr.Button("β‘οΈ Extend from End", variant="secondary", size="lg")
# ========== EVENT HANDLERS ==========
# Step 1: Generate
generate_button.click(generate_drum_sample, outputs=[main_audio])
continue_drum_button.click(continue_drum_sample, inputs=[main_audio], outputs=[main_audio])
# Step 2: Transform (using Facebook MelodyFlow API)
transform_button.click(
transform_with_melodyflow_api,
inputs=[main_audio, transform_variation, transform_prompt, transform_solver, transform_flowstep],
outputs=[main_audio, transform_status]
)
# Step 3: Continue (two different approaches)
generate_music_button.click(
generate_music,
inputs=[main_audio, prompt_duration, musicgen_model, output_duration],
outputs=[main_audio]
)
continue_music_button.click(
continue_music,
inputs=[main_audio, prompt_duration, musicgen_model, output_duration],
outputs=[main_audio]
)
if __name__ == "__main__":
iface.launch() |