Spaces:
Build error
Build error
Commit
·
4b94ac2
1
Parent(s):
63fa33e
add text translation module
Browse files
app.py
CHANGED
|
@@ -4,9 +4,6 @@ import torch
|
|
| 4 |
|
| 5 |
from transformers import pipeline
|
| 6 |
|
| 7 |
-
checkpoint_finetuned = "JackismyShephard/speecht5_tts-finetuned-nst-da"
|
| 8 |
-
|
| 9 |
-
revision = "5af228df418092b681cf31c31e413bdd2b5f9c8c"
|
| 10 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 11 |
|
| 12 |
# load speech translation checkpoint
|
|
@@ -17,14 +14,21 @@ asr_pipe = pipeline(
|
|
| 17 |
chunk_length_s=30,
|
| 18 |
use_fast=True,
|
| 19 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# load text-to-speech checkpoint and speaker embeddings
|
| 22 |
-
|
| 23 |
"text-to-speech",
|
| 24 |
-
model=
|
| 25 |
use_fast=True,
|
| 26 |
device=device,
|
| 27 |
-
revision=
|
| 28 |
)
|
| 29 |
|
| 30 |
speaker_embedding_path = "female_23_vestjylland.npy"
|
|
@@ -38,11 +42,17 @@ max_range = np.iinfo(target_dtype).max
|
|
| 38 |
def translate(audio):
|
| 39 |
outputs = asr_pipe(
|
| 40 |
audio,
|
| 41 |
-
max_new_tokens=256,
|
| 42 |
batch_size=8,
|
| 43 |
-
generate_kwargs={
|
|
|
|
|
|
|
| 44 |
)
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
|
| 48 |
def synthesise(text):
|
|
@@ -52,7 +62,7 @@ def synthesise(text):
|
|
| 52 |
text = replace_danish_letters(text)
|
| 53 |
|
| 54 |
forward_params = {"speaker_embeddings": speaker_embedding_tensor}
|
| 55 |
-
speech =
|
| 56 |
|
| 57 |
sr, audio = speech["sampling_rate"], speech["audio"]
|
| 58 |
|
|
@@ -95,7 +105,7 @@ replacements = [
|
|
| 95 |
]
|
| 96 |
|
| 97 |
|
| 98 |
-
title = "
|
| 99 |
description = """
|
| 100 |
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Danish. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and JackismyShephard's
|
| 101 |
[speecht5_tts-finetuned-nst-da](https://huggingface.co/JackismyShephard/speecht5_tts-finetuned-nst-da) model for text-to-speech:
|
|
@@ -105,9 +115,7 @@ Demo for cascaded speech-to-speech translation (STST), mapping from source speec
|
|
| 105 |
|
| 106 |
demo = gr.Interface(
|
| 107 |
fn=speech_to_speech_translation,
|
| 108 |
-
inputs=
|
| 109 |
-
gr.Audio(label="Input Speech", type="filepath"),
|
| 110 |
-
],
|
| 111 |
outputs=gr.Audio(label="Translated Speech", type="numpy"),
|
| 112 |
title=title,
|
| 113 |
description=description,
|
|
|
|
| 4 |
|
| 5 |
from transformers import pipeline
|
| 6 |
|
|
|
|
|
|
|
|
|
|
| 7 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 8 |
|
| 9 |
# load speech translation checkpoint
|
|
|
|
| 14 |
chunk_length_s=30,
|
| 15 |
use_fast=True,
|
| 16 |
)
|
| 17 |
+
# load text translation checkpoint
|
| 18 |
+
translation_pipe = pipeline(
|
| 19 |
+
"translation",
|
| 20 |
+
model="facebook/nllb-200-distilled-600M",
|
| 21 |
+
use_fast=True,
|
| 22 |
+
device=device,
|
| 23 |
+
)
|
| 24 |
|
| 25 |
# load text-to-speech checkpoint and speaker embeddings
|
| 26 |
+
tts_pipe = pipeline(
|
| 27 |
"text-to-speech",
|
| 28 |
+
model="JackismyShephard/speecht5_tts-finetuned-nst-da",
|
| 29 |
use_fast=True,
|
| 30 |
device=device,
|
| 31 |
+
revision="5af228df418092b681cf31c31e413bdd2b5f9c8c",
|
| 32 |
)
|
| 33 |
|
| 34 |
speaker_embedding_path = "female_23_vestjylland.npy"
|
|
|
|
| 42 |
def translate(audio):
|
| 43 |
outputs = asr_pipe(
|
| 44 |
audio,
|
|
|
|
| 45 |
batch_size=8,
|
| 46 |
+
generate_kwargs={
|
| 47 |
+
"task": "translate",
|
| 48 |
+
},
|
| 49 |
)
|
| 50 |
+
translated_text = translation_pipe(
|
| 51 |
+
outputs["text"],
|
| 52 |
+
src_lang="eng_Latn",
|
| 53 |
+
tgt_lang="dan_Latn",
|
| 54 |
+
)[0]["translation_text"]
|
| 55 |
+
return translated_text
|
| 56 |
|
| 57 |
|
| 58 |
def synthesise(text):
|
|
|
|
| 62 |
text = replace_danish_letters(text)
|
| 63 |
|
| 64 |
forward_params = {"speaker_embeddings": speaker_embedding_tensor}
|
| 65 |
+
speech = tts_pipe(text, forward_params=forward_params)
|
| 66 |
|
| 67 |
sr, audio = speech["sampling_rate"], speech["audio"]
|
| 68 |
|
|
|
|
| 105 |
]
|
| 106 |
|
| 107 |
|
| 108 |
+
title = "Speech to Danish Speech Translation"
|
| 109 |
description = """
|
| 110 |
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Danish. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and JackismyShephard's
|
| 111 |
[speecht5_tts-finetuned-nst-da](https://huggingface.co/JackismyShephard/speecht5_tts-finetuned-nst-da) model for text-to-speech:
|
|
|
|
| 115 |
|
| 116 |
demo = gr.Interface(
|
| 117 |
fn=speech_to_speech_translation,
|
| 118 |
+
inputs=gr.Audio(label="Input Speech", type="filepath"),
|
|
|
|
|
|
|
| 119 |
outputs=gr.Audio(label="Translated Speech", type="numpy"),
|
| 120 |
title=title,
|
| 121 |
description=description,
|