speech-to-speech-translation

Build error

App Files Files Community

JackismyShephard committed on Mar 3, 2024

Commit

4b94ac2

1 Parent(s): 63fa33e

add text translation module

Browse files

Files changed (1) hide show

app.py +22 -14

app.py CHANGED Viewed

@@ -4,9 +4,6 @@ import torch
 from transformers import pipeline
-checkpoint_finetuned = "JackismyShephard/speecht5_tts-finetuned-nst-da"
-revision = "5af228df418092b681cf31c31e413bdd2b5f9c8c"
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 # load speech translation checkpoint
@@ -17,14 +14,21 @@ asr_pipe = pipeline(
     chunk_length_s=30,
     use_fast=True,
 )
 # load text-to-speech checkpoint and speaker embeddings
-pipe = pipeline(
     "text-to-speech",
-    model=checkpoint_finetuned,
     use_fast=True,
     device=device,
-    revision=revision,
 )
 speaker_embedding_path = "female_23_vestjylland.npy"
@@ -38,11 +42,17 @@ max_range = np.iinfo(target_dtype).max
 def translate(audio):
     outputs = asr_pipe(
         audio,
-        max_new_tokens=256,
         batch_size=8,
-        generate_kwargs={"task": "translate", "language": "danish"},
     )
-    return outputs["text"]
 def synthesise(text):
@@ -52,7 +62,7 @@ def synthesise(text):
     text = replace_danish_letters(text)
     forward_params = {"speaker_embeddings": speaker_embedding_tensor}
-    speech = pipe(text, forward_params=forward_params)
     sr, audio = speech["sampling_rate"], speech["audio"]
@@ -95,7 +105,7 @@ replacements = [
 ]
-title = "Cascaded STST"
 description = """
 Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Danish. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and JackismyShephard's
 [speecht5_tts-finetuned-nst-da](https://huggingface.co/JackismyShephard/speecht5_tts-finetuned-nst-da) model for text-to-speech:
@@ -105,9 +115,7 @@ Demo for cascaded speech-to-speech translation (STST), mapping from source speec
 demo = gr.Interface(
     fn=speech_to_speech_translation,
-    inputs=[
-        gr.Audio(label="Input Speech", type="filepath"),
-    ],
     outputs=gr.Audio(label="Translated Speech", type="numpy"),
     title=title,
     description=description,

 from transformers import pipeline
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 # load speech translation checkpoint
     chunk_length_s=30,
     use_fast=True,
 )
+# load text translation checkpoint
+translation_pipe = pipeline(
+    "translation",
+    model="facebook/nllb-200-distilled-600M",
+    use_fast=True,
+    device=device,
+)
 # load text-to-speech checkpoint and speaker embeddings
+tts_pipe = pipeline(
     "text-to-speech",
+    model="JackismyShephard/speecht5_tts-finetuned-nst-da",
     use_fast=True,
     device=device,
+    revision="5af228df418092b681cf31c31e413bdd2b5f9c8c",
 )
 speaker_embedding_path = "female_23_vestjylland.npy"
 def translate(audio):
     outputs = asr_pipe(
         audio,
         batch_size=8,
+        generate_kwargs={
+            "task": "translate",
+        },
     )
+    translated_text = translation_pipe(
+        outputs["text"],
+        src_lang="eng_Latn",
+        tgt_lang="dan_Latn",
+    )[0]["translation_text"]
+    return translated_text
 def synthesise(text):
     text = replace_danish_letters(text)
     forward_params = {"speaker_embeddings": speaker_embedding_tensor}
+    speech = tts_pipe(text, forward_params=forward_params)
     sr, audio = speech["sampling_rate"], speech["audio"]
 ]
+title = "Speech to Danish Speech Translation"
 description = """
 Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Danish. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and JackismyShephard's
 [speecht5_tts-finetuned-nst-da](https://huggingface.co/JackismyShephard/speecht5_tts-finetuned-nst-da) model for text-to-speech:
 demo = gr.Interface(
     fn=speech_to_speech_translation,
+    inputs=gr.Audio(label="Input Speech", type="filepath"),
     outputs=gr.Audio(label="Translated Speech", type="numpy"),
     title=title,
     description=description,