reach-vb committed on
Commit
b9ba023
·
1 Parent(s): 4bc3e7c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -11
app.py CHANGED
@@ -42,12 +42,6 @@ if torch.cuda.is_available() and torch.cuda.device_count() > 0:
42
  torch_dtype=torch.float16,
43
  device="cuda:0"
44
  )
45
- pipe.model.config.forced_decoder_ids = (
46
- pipe.tokenizer.get_decoder_prompt_ids(
47
- language="it",
48
- task="transcribe"
49
- )
50
- )
51
  else:
52
  pipe = pipeline(model=checkpoint)
53
 
@@ -129,7 +123,7 @@ def make_frame(t):
129
  return last_image
130
 
131
 
132
- def predict(audio_path):
133
  global chunks, start_chunk, last_draws, last_image
134
 
135
  start_chunk = 0
@@ -143,6 +137,12 @@ def predict(audio_path):
143
 
144
  # Run Whisper to get word-level timestamps.
145
  audio_inputs = librosa.resample(audio_data, orig_sr=sr, target_sr=pipe.feature_extractor.sampling_rate)
 
 
 
 
 
 
146
  output = pipe(audio_inputs, chunk_length_s=30, stride_length_s=[4, 2], return_timestamps="word", generate_kwargs =
147
  {
148
  "penalty_alpha": 0.6,
@@ -189,16 +189,19 @@ article = """
189
  """
190
 
191
  examples = [
192
- "examples/steve_jobs_crazy_ones.mp3",
193
- "examples/henry5.wav",
194
- "examples/stupid_people.mp3",
195
- "examples/beos_song.mp3",
196
  ]
197
 
198
  gr.Interface(
199
  fn=predict,
200
  inputs=[
201
  gr.Audio(label="Upload Audio", source="upload", type="filepath"),
 
 
 
202
  ],
203
  outputs=[
204
  gr.Video(label="Output Video"),
 
42
  torch_dtype=torch.float16,
43
  device="cuda:0"
44
  )
 
 
 
 
 
 
45
  else:
46
  pipe = pipeline(model=checkpoint)
47
 
 
123
  return last_image
124
 
125
 
126
+ def predict(audio_path, lang):
127
  global chunks, start_chunk, last_draws, last_image
128
 
129
  start_chunk = 0
 
137
 
138
  # Run Whisper to get word-level timestamps.
139
  audio_inputs = librosa.resample(audio_data, orig_sr=sr, target_sr=pipe.feature_extractor.sampling_rate)
140
+ pipe.model.config.forced_decoder_ids = (
141
+ pipe.tokenizer.get_decoder_prompt_ids(
142
+ language=lang,
143
+ task="transcribe"
144
+ )
145
+ )
146
  output = pipe(audio_inputs, chunk_length_s=30, stride_length_s=[4, 2], return_timestamps="word", generate_kwargs =
147
  {
148
  "penalty_alpha": 0.6,
 
189
  """
190
 
191
  examples = [
192
+ ["examples/steve_jobs_crazy_ones.mp3", "en"],
193
+ ["examples/henry5.wav", "en"],
194
+ ["examples/stupid_people.mp3", "en"],
195
+ ["examples/beos_song.mp3", "en"],
196
  ]
197
 
198
  gr.Interface(
199
  fn=predict,
200
  inputs=[
201
  gr.Audio(label="Upload Audio", source="upload", type="filepath"),
202
+ gr.Dropdown(
203
+ ["en", "de", "it", "fr", "nl"], label="Lang", info="Select a language!", max_choices=1
204
+ )
205
  ],
206
  outputs=[
207
  gr.Video(label="Output Video"),