Spaces: Runtime error
Create app.py
app.py ADDED
import gradio as gr
import numpy as np
import torch
from einops import rearrange
from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond

# ---------- Load model ----------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_REPO = "stabilityai/stable-audio-open-small"  # accept the license once on the model page

# Download + load (cached after the first run)
model, model_config = get_pretrained_model(MODEL_REPO)
SAMPLE_RATE = int(model_config["sample_rate"])  # 44100
SAMPLE_SIZE = int(model_config["sample_size"])  # internal window size; duration is passed via conditioning
model = model.to(DEVICE)
model.eval()

def clamp_seconds(seconds: float) -> int:
    # Clamp to the 1-11 s range the model supports
    # (renamed from tta_seconds_to_sample_size: it returns whole seconds, not a sample size)
    seconds = max(1.0, min(float(seconds), 11.0))
    return int(seconds)

@torch.inference_mode()
def generate_sfx(prompt, seconds, steps, cfg_scale, sampler):
    if not prompt or not prompt.strip():
        return None, "Enter a descriptive prompt (e.g., 'footsteps on gravel, outdoors, distant')."
    seconds = clamp_seconds(seconds)

    # Conditioning per the stable-audio-tools API
    conditioning = [{
        "prompt": prompt.strip(),
        "seconds_total": seconds
    }]

    # Fast, CPU-friendly defaults:
    # steps=8-12 is a good range; the pingpong sampler is efficient on CPU
    output = generate_diffusion_cond(
        model=model,
        steps=int(steps),
        cfg_scale=float(cfg_scale),
        conditioning=conditioning,
        sample_size=SAMPLE_SIZE,
        sampler_type=sampler,
        device=DEVICE
    )

    # output shape: (B, C, N) with B=1 -> flatten to (C, N)
    audio = rearrange(output, "b d n -> d (b n)")
    # Peak-normalize to [-1, 1] float32
    audio = audio.to(torch.float32)
    peak = torch.max(torch.abs(audio))
    if peak > 0:
        audio = (audio / peak).clamp(-1, 1)
    audio_np = audio.cpu().numpy()

    # Gradio expects (sr, np.ndarray [N] or [N, C]); provide stereo [N, 2]
    audio_np = audio_np.T  # (N, C)
    return (SAMPLE_RATE, audio_np), "Done."
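
# A quick sanity check, as a hypothetical direct call (not used by the UI below;
# the prompt and settings here are illustrative only, run in a local session
# after the model has loaded):
#   (sr, wav), msg = generate_sfx("rain on window", seconds=4, steps=8,
#                                 cfg_scale=1.0, sampler="pingpong")
#   assert sr == SAMPLE_RATE and wav.shape[1] == 2  # stereo (N, 2) at 44.1 kHz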

EXAMPLES = [
    "Footsteps on gravel, outdoors, medium pace, natural ambience",
    "Heavy metal door slam with long metallic reverb, industrial",
    "Rain on window, occasional distant thunder, calm night",
    "Camera shutter click, mechanical, clean studio",
    "Sci-fi laser blast, short, bright, synthetic fizz"
]

with gr.Blocks(title="Professor Treviño — Text→SFX (Free)") as demo:
    gr.Markdown("### Text-to-Sound Effects — Free, no login\nEnter a descriptive prompt and generate up to ~11s stereo @ 44.1 kHz.")

    with gr.Row():
        prompt = gr.Textbox(label="Prompt", placeholder="e.g., footsteps on gravel, outdoors, distant")
    with gr.Row():
        seconds = gr.Slider(3, 11, value=6, step=1, label="Duration (seconds)")
        steps = gr.Slider(6, 16, value=8, step=1, label="Diffusion steps (higher = better/slower)")
    with gr.Row():
        cfg_scale = gr.Slider(0.5, 4.0, value=1.0, step=0.1, label="Guidance (CFG scale)")
        sampler = gr.Dropdown(choices=["pingpong", "heun", "dpmpp-2m"], value="pingpong", label="Sampler")

    btn = gr.Button("Generate")
    audio_out = gr.Audio(label="Output", type="numpy")
    status = gr.Markdown()

    btn.click(fn=generate_sfx, inputs=[prompt, seconds, steps, cfg_scale, sampler], outputs=[audio_out, status])
    gr.Examples(EXAMPLES, inputs=[prompt], cache_examples=False)

# Gradio 4 removed queue(concurrency_count=...); default_concurrency_limit is the
# replacement, and the old keyword is a likely cause of the Space's "Runtime error"
demo.queue(default_concurrency_limit=1, max_size=8).launch()
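
A Space running this app.py also needs a requirements.txt next to it. A minimal sketch inferred from the imports above; package names are assumptions based on the usual PyPI names, and versions are left unpinned:

    stable-audio-tools
    gradio
    torch
    einops
    numpy

On a CPU-only Space, stable-audio-tools pulls in most of these transitively, but listing them explicitly makes the build log easier to debug.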