Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -42,12 +42,6 @@ if torch.cuda.is_available() and torch.cuda.device_count() > 0:
|
|
| 42 |
torch_dtype=torch.float16,
|
| 43 |
device="cuda:0"
|
| 44 |
)
|
| 45 |
-
pipe.model.config.forced_decoder_ids = (
|
| 46 |
-
pipe.tokenizer.get_decoder_prompt_ids(
|
| 47 |
-
language="it",
|
| 48 |
-
task="transcribe"
|
| 49 |
-
)
|
| 50 |
-
)
|
| 51 |
else:
|
| 52 |
pipe = pipeline(model=checkpoint)
|
| 53 |
|
|
@@ -129,7 +123,7 @@ def make_frame(t):
|
|
| 129 |
return last_image
|
| 130 |
|
| 131 |
|
| 132 |
-
def predict(audio_path):
|
| 133 |
global chunks, start_chunk, last_draws, last_image
|
| 134 |
|
| 135 |
start_chunk = 0
|
|
@@ -143,6 +137,12 @@ def predict(audio_path):
|
|
| 143 |
|
| 144 |
# Run Whisper to get word-level timestamps.
|
| 145 |
audio_inputs = librosa.resample(audio_data, orig_sr=sr, target_sr=pipe.feature_extractor.sampling_rate)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
output = pipe(audio_inputs, chunk_length_s=30, stride_length_s=[4, 2], return_timestamps="word", generate_kwargs =
|
| 147 |
{
|
| 148 |
"penalty_alpha": 0.6,
|
|
@@ -189,16 +189,19 @@ article = """
|
|
| 189 |
"""
|
| 190 |
|
| 191 |
examples = [
|
| 192 |
-
"examples/steve_jobs_crazy_ones.mp3",
|
| 193 |
-
"examples/henry5.wav",
|
| 194 |
-
"examples/stupid_people.mp3",
|
| 195 |
-
"examples/beos_song.mp3",
|
| 196 |
]
|
| 197 |
|
| 198 |
gr.Interface(
|
| 199 |
fn=predict,
|
| 200 |
inputs=[
|
| 201 |
gr.Audio(label="Upload Audio", source="upload", type="filepath"),
|
|
|
|
|
|
|
|
|
|
| 202 |
],
|
| 203 |
outputs=[
|
| 204 |
gr.Video(label="Output Video"),
|
|
|
|
| 42 |
torch_dtype=torch.float16,
|
| 43 |
device="cuda:0"
|
| 44 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
else:
|
| 46 |
pipe = pipeline(model=checkpoint)
|
| 47 |
|
|
|
|
| 123 |
return last_image
|
| 124 |
|
| 125 |
|
| 126 |
+
def predict(audio_path, lang):
|
| 127 |
global chunks, start_chunk, last_draws, last_image
|
| 128 |
|
| 129 |
start_chunk = 0
|
|
|
|
| 137 |
|
| 138 |
# Run Whisper to get word-level timestamps.
|
| 139 |
audio_inputs = librosa.resample(audio_data, orig_sr=sr, target_sr=pipe.feature_extractor.sampling_rate)
|
| 140 |
+
pipe.model.config.forced_decoder_ids = (
|
| 141 |
+
pipe.tokenizer.get_decoder_prompt_ids(
|
| 142 |
+
language=lang,
|
| 143 |
+
task="transcribe"
|
| 144 |
+
)
|
| 145 |
+
)
|
| 146 |
output = pipe(audio_inputs, chunk_length_s=30, stride_length_s=[4, 2], return_timestamps="word", generate_kwargs =
|
| 147 |
{
|
| 148 |
"penalty_alpha": 0.6,
|
|
|
|
| 189 |
"""
|
| 190 |
|
| 191 |
examples = [
|
| 192 |
+
["examples/steve_jobs_crazy_ones.mp3", "en"],
|
| 193 |
+
["examples/henry5.wav", "en"],
|
| 194 |
+
["examples/stupid_people.mp3", "en"],
|
| 195 |
+
["examples/beos_song.mp3", "en"],
|
| 196 |
]
|
| 197 |
|
| 198 |
gr.Interface(
|
| 199 |
fn=predict,
|
| 200 |
inputs=[
|
| 201 |
gr.Audio(label="Upload Audio", source="upload", type="filepath"),
|
| 202 |
+
gr.Dropdown(
|
| 203 |
+
["en", "de", "it", "fr", "nl"], label="Lang", info="Select a language!", max_choices=1
|
| 204 |
+
)
|
| 205 |
],
|
| 206 |
outputs=[
|
| 207 |
gr.Video(label="Output Video"),
|