Thumbwrestler committed on
Commit b6528cf · verified · 1 Parent(s): 90bc98f

Create app.py

Files changed (1)
  1. app.py +88 -0
app.py ADDED
@@ -0,0 +1,88 @@
+ import gradio as gr
+ import numpy as np
+ import torch
+ from einops import rearrange
+ from stable_audio_tools import get_pretrained_model
+ from stable_audio_tools.inference.generation import generate_diffusion_cond
+
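+ # NOTE (assumed, not part of this commit): a Space running this file also
+ # needs a requirements.txt alongside it, roughly:
+ #   gradio
+ #   torch
+ #   einops
+ #   stable-audio-tools
+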
+ # ---------- Load model ----------
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ MODEL_REPO = "stabilityai/stable-audio-open-small"  # accept the license once on the model page
+
+ # Download + load (cached after the first run)
+ model, model_config = get_pretrained_model(MODEL_REPO)
+ SAMPLE_RATE = int(model_config["sample_rate"])  # 44100
+ SAMPLE_SIZE = int(model_config["sample_size"])  # full generation window (~11 s); duration is requested via conditioning
+ model = model.to(DEVICE)
+ model.eval()
+
+ def clamp_seconds(seconds: float) -> int:
+     # Clamp the requested duration to the model's 1-11 s cap (whole seconds)
+     seconds = max(1.0, min(float(seconds), 11.0))
+     return int(seconds)
+
+ @torch.inference_mode()
+ def generate_sfx(prompt, seconds, steps, cfg_scale, sampler):
+     if not prompt or not prompt.strip():
+         return None, "Enter a descriptive prompt (e.g., 'footsteps on gravel, outdoors, distant')."
+     seconds = clamp_seconds(seconds)
+
+     # Conditioning per the stable-audio-tools API
+     conditioning = [{
+         "prompt": prompt.strip(),
+         "seconds_total": seconds
+     }]
+
+     # Fast, CPU-friendly defaults:
+     # steps=8–12 is a good range; the pingpong sampler is efficient on CPU
+     output = generate_diffusion_cond(
+         model=model,
+         steps=int(steps),
+         cfg_scale=float(cfg_scale),
+         conditioning=conditioning,
+         sample_size=SAMPLE_SIZE,
+         sampler_type=sampler,
+         device=DEVICE
+     )
+
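+     # (Assumption: recent stable-audio-tools releases also expose a seed
+     # argument on generate_diffusion_cond for reproducible renders; verify
+     # against the installed version before relying on it.)
+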
+     # output shape: (B, C, N) with B=1 -> flatten the batch to get (C, N)
+     audio = rearrange(output, "b d n -> d (b n)")
+     # The model renders the full SAMPLE_SIZE window; trim to the requested
+     # duration so short prompts don't return a padded-silence tail
+     audio = audio[:, : seconds * SAMPLE_RATE]
+     # Peak-normalize to [-1, 1] float32
+     audio = audio.to(torch.float32)
+     peak = torch.max(torch.abs(audio))
+     if peak > 0:
+         audio = (audio / peak).clamp(-1, 1)
+     audio_np = audio.cpu().numpy()
+
+     # Gradio expects (sr, np.ndarray of shape [N] or [N, C]); provide stereo [N, 2]
+     audio_np = audio_np.T  # (N, C)
+     return (SAMPLE_RATE, audio_np), "Done."
+
+ EXAMPLES = [
+     "Footsteps on gravel, outdoors, medium pace, natural ambience",
+     "Heavy metal door slam with long metallic reverb, industrial",
+     "Rain on window, occasional distant thunder, calm night",
+     "Camera shutter click, mechanical, clean studio",
+     "Sci-fi laser blast, short, bright, synthetic fizz"
+ ]
+
+ with gr.Blocks(title="Professor Treviño — Text→SFX (Free)") as demo:
+     gr.Markdown("### Text-to-Sound Effects — Free, no login\nEnter a descriptive prompt and generate up to ~11s stereo @ 44.1 kHz.")
+
+     with gr.Row():
+         prompt = gr.Textbox(label="Prompt", placeholder="e.g., footsteps on gravel, outdoors, distant")
+     with gr.Row():
+         seconds = gr.Slider(3, 11, value=6, step=1, label="Duration (seconds)")
+         steps = gr.Slider(6, 16, value=8, step=1, label="Diffusion steps (higher = better/slower)")
+     with gr.Row():
+         cfg_scale = gr.Slider(0.5, 4.0, value=1.0, step=0.1, label="Guidance (CFG scale)")
+         sampler = gr.Dropdown(choices=["pingpong", "heun", "dpmpp-2m"], value="pingpong", label="Sampler")
+
+     btn = gr.Button("Generate")
+     audio_out = gr.Audio(label="Output", type="numpy")
+     status = gr.Markdown()
+
+     btn.click(fn=generate_sfx, inputs=[prompt, seconds, steps, cfg_scale, sampler], outputs=[audio_out, status])
+     gr.Examples(EXAMPLES, inputs=[prompt], cache_examples=False)
+
+ # Gradio 4.x renamed queue(concurrency_count=...) to default_concurrency_limit
+ demo.queue(default_concurrency_limit=1, max_size=8).launch()
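+ # Run locally with `python app.py`; Gradio serves on http://127.0.0.1:7860 by default.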