Thumbwrestler committed on
Commit b6528cf · verified · 1 Parent(s): 90bc98f

Create app.py

Files changed (1)
  1. app.py +88 -0
app.py ADDED
@@ -0,0 +1,88 @@
+ import gradio as gr
+ import numpy as np
+ import torch
+ from einops import rearrange
+ from stable_audio_tools import get_pretrained_model
+ from stable_audio_tools.inference.generation import generate_diffusion_cond
+
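+ # NOTE (assumed, not part of this commit): a Space running this file also
+ # needs a requirements.txt alongside it, roughly:
+ #   gradio
+ #   torch
+ #   einops
+ #   stable-audio-tools
+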
+ # ---------- Load model ----------
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ MODEL_REPO = "stabilityai/stable-audio-open-small"  # accept the license once on the model page
+
+ # Download + load (cached after the first run)
+ model, model_config = get_pretrained_model(MODEL_REPO)
+ SAMPLE_RATE = int(model_config["sample_rate"])  # 44100
+ SAMPLE_SIZE = int(model_config["sample_size"])  # full generation window (~11 s); duration is requested via conditioning
+ model = model.to(DEVICE)
+ model.eval()
+
+ def clamp_seconds(seconds: float) -> int:
+     # Clamp the requested duration to the model's 1-11 s cap (whole seconds)
+     seconds = max(1.0, min(float(seconds), 11.0))
+     return int(seconds)
+
+ @torch.inference_mode()
+ def generate_sfx(prompt, seconds, steps, cfg_scale, sampler):
+     if not prompt or not prompt.strip():
+         return None, "Enter a descriptive prompt (e.g., 'footsteps on gravel, outdoors, distant')."
+     seconds = clamp_seconds(seconds)
+
+     # Conditioning per the stable-audio-tools API
+     conditioning = [{
+         "prompt": prompt.strip(),
+         "seconds_total": seconds
+     }]
+
+     # Fast, CPU-friendly defaults:
+     # steps=8–12 is a good range; the pingpong sampler is efficient on CPU
+     output = generate_diffusion_cond(
+         model=model,
+         steps=int(steps),
+         cfg_scale=float(cfg_scale),
+         conditioning=conditioning,
+         sample_size=SAMPLE_SIZE,
+         sampler_type=sampler,
+         device=DEVICE
+     )
+
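+     # (Assumption: recent stable-audio-tools releases also expose a seed
+     # argument on generate_diffusion_cond for reproducible renders; verify
+     # against the installed version before relying on it.)
+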
+     # output shape: (B, C, N) with B=1 -> flatten the batch to get (C, N)
+     audio = rearrange(output, "b d n -> d (b n)")
+     # The model renders the full SAMPLE_SIZE window; trim to the requested
+     # duration so short prompts don't return a padded-silence tail
+     audio = audio[:, : seconds * SAMPLE_RATE]
+     # Peak-normalize to [-1, 1] float32
+     audio = audio.to(torch.float32)
+     peak = torch.max(torch.abs(audio))
+     if peak > 0:
+         audio = (audio / peak).clamp(-1, 1)
+     audio_np = audio.cpu().numpy()
+
+     # Gradio expects (sr, np.ndarray of shape [N] or [N, C]); provide stereo [N, 2]
+     audio_np = audio_np.T  # (N, C)
+     return (SAMPLE_RATE, audio_np), "Done."
+
+ EXAMPLES = [
+     "Footsteps on gravel, outdoors, medium pace, natural ambience",
+     "Heavy metal door slam with long metallic reverb, industrial",
+     "Rain on window, occasional distant thunder, calm night",
+     "Camera shutter click, mechanical, clean studio",
+     "Sci-fi laser blast, short, bright, synthetic fizz"
+ ]
+
+ with gr.Blocks(title="Professor Treviño — Text→SFX (Free)") as demo:
+     gr.Markdown("### Text-to-Sound Effects — Free, no login\nEnter a descriptive prompt and generate up to ~11s stereo @ 44.1 kHz.")
+
+     with gr.Row():
+         prompt = gr.Textbox(label="Prompt", placeholder="e.g., footsteps on gravel, outdoors, distant")
+     with gr.Row():
+         seconds = gr.Slider(3, 11, value=6, step=1, label="Duration (seconds)")
+         steps = gr.Slider(6, 16, value=8, step=1, label="Diffusion steps (higher = better/slower)")
+     with gr.Row():
+         cfg_scale = gr.Slider(0.5, 4.0, value=1.0, step=0.1, label="Guidance (CFG scale)")
+         sampler = gr.Dropdown(choices=["pingpong", "heun", "dpmpp-2m"], value="pingpong", label="Sampler")
+
+     btn = gr.Button("Generate")
+     audio_out = gr.Audio(label="Output", type="numpy")
+     status = gr.Markdown()
+
+     btn.click(fn=generate_sfx, inputs=[prompt, seconds, steps, cfg_scale, sampler], outputs=[audio_out, status])
+     gr.Examples(EXAMPLES, inputs=[prompt], cache_examples=False)
+
+ # Gradio 4.x renamed queue(concurrency_count=...) to default_concurrency_limit
+ demo.queue(default_concurrency_limit=1, max_size=8).launch()
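+ # Run locally with `python app.py`; Gradio serves on http://127.0.0.1:7860 by default.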