reach-vb committed on
Commit
b9ba023
·
1 Parent(s): 4bc3e7c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -11
app.py CHANGED
@@ -42,12 +42,6 @@ if torch.cuda.is_available() and torch.cuda.device_count() > 0:
42
  torch_dtype=torch.float16,
43
  device="cuda:0"
44
  )
45
- pipe.model.config.forced_decoder_ids = (
46
- pipe.tokenizer.get_decoder_prompt_ids(
47
- language="it",
48
- task="transcribe"
49
- )
50
- )
51
  else:
52
  pipe = pipeline(model=checkpoint)
53
 
@@ -129,7 +123,7 @@ def make_frame(t):
129
  return last_image
130
 
131
 
132
- def predict(audio_path):
133
  global chunks, start_chunk, last_draws, last_image
134
 
135
  start_chunk = 0
@@ -143,6 +137,12 @@ def predict(audio_path):
143
 
144
  # Run Whisper to get word-level timestamps.
145
  audio_inputs = librosa.resample(audio_data, orig_sr=sr, target_sr=pipe.feature_extractor.sampling_rate)
 
 
 
 
 
 
146
  output = pipe(audio_inputs, chunk_length_s=30, stride_length_s=[4, 2], return_timestamps="word", generate_kwargs =
147
  {
148
  "penalty_alpha": 0.6,
@@ -189,16 +189,19 @@ article = """
189
  """
190
 
191
  examples = [
192
- "examples/steve_jobs_crazy_ones.mp3",
193
- "examples/henry5.wav",
194
- "examples/stupid_people.mp3",
195
- "examples/beos_song.mp3",
196
  ]
197
 
198
  gr.Interface(
199
  fn=predict,
200
  inputs=[
201
  gr.Audio(label="Upload Audio", source="upload", type="filepath"),
 
 
 
202
  ],
203
  outputs=[
204
  gr.Video(label="Output Video"),
 
42
  torch_dtype=torch.float16,
43
  device="cuda:0"
44
  )
 
 
 
 
 
 
45
  else:
46
  pipe = pipeline(model=checkpoint)
47
 
 
123
  return last_image
124
 
125
 
126
+ def predict(audio_path, lang):
127
  global chunks, start_chunk, last_draws, last_image
128
 
129
  start_chunk = 0
 
137
 
138
  # Run Whisper to get word-level timestamps.
139
  audio_inputs = librosa.resample(audio_data, orig_sr=sr, target_sr=pipe.feature_extractor.sampling_rate)
140
+ pipe.model.config.forced_decoder_ids = (
141
+ pipe.tokenizer.get_decoder_prompt_ids(
142
+ language=lang,
143
+ task="transcribe"
144
+ )
145
+ )
146
  output = pipe(audio_inputs, chunk_length_s=30, stride_length_s=[4, 2], return_timestamps="word", generate_kwargs =
147
  {
148
  "penalty_alpha": 0.6,
 
189
  """
190
 
191
  examples = [
192
+ ["examples/steve_jobs_crazy_ones.mp3", "en"],
193
+ ["examples/henry5.wav", "en"],
194
+ ["examples/stupid_people.mp3", "en"],
195
+ ["examples/beos_song.mp3", "en"],
196
  ]
197
 
198
  gr.Interface(
199
  fn=predict,
200
  inputs=[
201
  gr.Audio(label="Upload Audio", source="upload", type="filepath"),
202
+ gr.Dropdown(
203
+ ["en", "de", "it", "fr", "nl"], label="Lang", info="Select a language!", max_choices=1
204
+ )
205
  ],
206
  outputs=[
207
  gr.Video(label="Output Video"),