Spaces:

datnth1709
/

Realtime-Translation

Build error

App Files Community

datnth1709 committed on Sep 23, 2022

Commit

2f12a3f

1 Parent(s): e6ce204

update inference

Browse files

Files changed (2) hide show

README.md +3 -3
app.py +14 -152

README.md CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-title: FantasticFour S2T MT Demo
-emoji: 🐠
-colorFrom: red
 colorTo: gray
 sdk: gradio
 sdk_version: 3.3.1

 ---
+title: Realtime S2T MT Demo
+emoji: 🥑
+colorFrom: blue
 colorTo: gray
 sdk: gradio
 sdk_version: 3.3.1

app.py CHANGED Viewed

@@ -77,65 +77,13 @@ def speech2text_vi(audio):
     return beam_search_output
-"""English speech2text"""
-nltk.download("punkt")
-# Loading the model and the tokenizer
-model_name = "facebook/s2t-small-librispeech-asr"
-eng_tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
-eng_model = Wav2Vec2ForCTC.from_pretrained(model_name)
-def load_data(input_file):
-    """ Function for resampling to ensure that the speech input is sampled at 16KHz.
-    """
-    # read the file
-    speech, sample_rate = librosa.load(input_file)
-    # make it 1-D
-    if len(speech.shape) > 1:
-        speech = speech[:, 0] + speech[:, 1]
-    # Resampling at 16KHz since wav2vec2-base-960h is pretrained and fine-tuned on speech audio sampled at 16 KHz.
-    if sample_rate != 16000:
-        speech = librosa.resample(speech, sample_rate, 16000)
-    return speech
-def correct_casing(input_sentence):
-    """ This function is for correcting the casing of the generated transcribed text
-    """
-    sentences = nltk.sent_tokenize(input_sentence)
-    return (' '.join([s.replace(s[0], s[0].capitalize(), 1) for s in sentences]))
-def speech2text_en(input_file):
-    """This function generates transcripts for the provided audio input
-    """
-    speech = load_data(input_file)
-    # Tokenize
-    input_values = eng_tokenizer(speech, return_tensors="pt").input_values
-    # Take logits
-    logits = eng_model(input_values).logits
-    # Take argmax
-    predicted_ids = torch.argmax(logits, dim=-1)
-    # Get the words from predicted word ids
-    transcription = eng_tokenizer.decode(predicted_ids[0])
-    # Output is all upper case
-    transcription = correct_casing(transcription.lower())
-    return transcription
 """Machine translation"""
 vien_model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-vi-en_PhoMT"
-envi_model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-en-vi_PhoMT"
 vien_translator = pipeline("translation", model=vien_model_checkpoint)
-envi_translator = pipeline("translation", model=envi_model_checkpoint)
 def translate_vi2en(Vietnamese):
     return vien_translator(Vietnamese)[0]['translation_text']
-def translate_en2vi(English):
-    return envi_translator(English)[0]['translation_text']
 """ Inference"""
 def inference_vien(audio):
@@ -143,46 +91,6 @@ def inference_vien(audio):
     en_text = translate_vi2en(vi_text)
     return vi_text, en_text
-def inference_envi(audio):
-    en_text = speech2text_en(audio)
-    vi_text = translate_en2vi(en_text)
-    return en_text, vi_text
-def transcribe_vi(audio, state_vi="", state_en=""):
-    ds = speech_file_to_array_fn(audio.name)
-    # infer model
-    input_values = processor(
-          ds["speech"],
-          sampling_rate=ds["sampling_rate"],
-          return_tensors="pt"
-    ).input_values
-    # decode ctc output
-    logits = vi_model(input_values).logits[0]
-    pred_ids = torch.argmax(logits, dim=-1)
-    greedy_search_output = processor.decode(pred_ids)
-    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
-    state_vi += beam_search_output + " "
-    en_text = translate_vi2en(beam_search_output)
-    state_en += en_text + " "
-    return state_vi, state_en
-def transcribe_en(audio, state_en="", state_vi=""):
-    speech = load_data(audio)
-    # Tokenize
-    input_values = eng_tokenizer(speech, return_tensors="pt").input_values
-    # Take logits
-    logits = eng_model(input_values).logits
-    # Take argmax
-    predicted_ids = torch.argmax(logits, dim=-1)
-    # Get the words from predicted word ids
-    transcription = eng_tokenizer.decode(predicted_ids[0])
-    # Output is all upper case
-    transcription = correct_casing(transcription.lower())
-    state_en += transcription + "+"
-    vi_text = translate_en2vi(transcription)
-    state_vi += vi_text + "+"
-    return state_en, state_vi
 def transcribe_vi_1(audio, state_en=""):
     ds = speech_file_to_array_fn(audio.name)
     # infer model
@@ -200,69 +108,23 @@ def transcribe_vi_1(audio, state_en=""):
     state_en += en_text + " "
     return state_en, state_en
-def transcribe_en_1(audio, state_vi=""):
-    speech = load_data(audio)
-    # Tokenize
-    input_values = eng_tokenizer(speech, return_tensors="pt").input_values
-    # Take logits
-    logits = eng_model(input_values).logits
-    # Take argmax
-    predicted_ids = torch.argmax(logits, dim=-1)
-    # Get the words from predicted word ids
-    transcription = eng_tokenizer.decode(predicted_ids[0])
-    # Output is all upper case
-    transcription = correct_casing(transcription.lower())
-    vi_text = translate_en2vi(transcription)
-    state_vi += vi_text + "+"
-    return state_vi, state_vi
 """Gradio demo"""
 vi_example_text = ["Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",
                    "Ánh mắt ta chạm nhau. Chỉ muốn ngắm anh lâu thật lâu.",
                    "Nếu như một câu nói có thể khiến em vui."]
 vi_example_voice =[['vi_speech_01.wav'], ['vi_speech_02.wav'], ['vi_speech_03.wav']]
-en_example_text = ["According to a study by Statista, the global AI market is set to grow up to 54 percent every single year.",
-                   "As one of the world's greatest cities, Air New Zealand is proud to add the Big Apple to its list of 29 international destinations.",
-                   "And yet, earlier this month, I found myself at Halloween Horror Nights at Universal Orlando Resort, one of the most popular Halloween events in the US among hardcore horror buffs."
-                   ]
-en_example_voice =[['en_speech_01.wav'], ['en_speech_02.wav'], ['en_speech_03.wav']]
-with gr.Blocks() as demo:
-    with gr.Tabs():
-        with gr.TabItem("Vi-En Realtime Translation"):
-            gr.Interface(
-                fn=transcribe_vi_1,
-                inputs=[
-                    gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True),
-                    "state",
-                ],
-                outputs= [
-                    "text",
-                    "state",
-                ],
-                examples=vi_example_voice,
-                live=True).launch()
-    with gr.Tabs():
-        with gr.TabItem("En-Vi Realtime Translation"):
-            gr.Interface(
-                fn=transcribe_en_1,
-                inputs=[
-                    gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True),
-                    "state",
-                ],
-                outputs= [
-                    "text",
-                    "state",
-                ],
-                examples=en_example_voice,
-                live=True).launch()
-if __name__ == "__main__":
-    demo.launch()

     return beam_search_output
 """Machine translation"""
 vien_model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-vi-en_PhoMT"
 vien_translator = pipeline("translation", model=vien_model_checkpoint)
 def translate_vi2en(Vietnamese):
     return vien_translator(Vietnamese)[0]['translation_text']
 """ Inference"""
 def inference_vien(audio):
     en_text = translate_vi2en(vi_text)
     return vi_text, en_text
 def transcribe_vi_1(audio, state_en=""):
     ds = speech_file_to_array_fn(audio.name)
     # infer model
     state_en += en_text + " "
     return state_en, state_en
 """Gradio demo"""
 vi_example_text = ["Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",
                    "Ánh mắt ta chạm nhau. Chỉ muốn ngắm anh lâu thật lâu.",
                    "Nếu như một câu nói có thể khiến em vui."]
 vi_example_voice =[['vi_speech_01.wav'], ['vi_speech_02.wav'], ['vi_speech_03.wav']]
+with gr.TabItem("Vi-En Realtime Translation"):
+    gr.Interface(
+        fn=transcribe_vi_1,
+        inputs=[
+            gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True),
+            "state",
+        ],
+        outputs= [
+            "text",
+            "state",
+        ],
+        examples=vi_example_voice,
+        live=True).launch()