Spaces:

TlanextliOpenLab
/

MultilanguageTranscTransl

Sleeping

App Files Files Community

Tlanextli committed on Sep 25, 2023

Commit

248bc6b

1 Parent(s): bf409ca

Upload 3 files

Browse files

Files changed (3) hide show

app.py +106 -0
languages_dic.py +101 -0
requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,106 @@

+import os
+import gradio as gr
+import torch
+import numpy
+import librosa
+import languages_dic
+from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
+# Heading passed as `title` to both Gradio interfaces defined further down.
+title = "Multilanguage Transcription and Translation"
+# Human-readable list of languages; appended verbatim to both descriptions below.
+availableLang = "Afrikaans, Arabic, Armenian, Azerbaijani, Belarusian, Bosnian, Bulgarian, Catalan, Chinese, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, Galician, German, Greek, Hebrew, Hindi, Hungarian, Icelandic, Indonesian, Italian, Japanese, Kannada, Kazakh, Korean, Latvian, Lithuanian, Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh."
+# HTML description for the file-upload interface (app1).
+description1 = """<p style='font-size: 18px;'> Transcribe an audio file containing a speech in any of the languages listed below and translate it to English. </p>
+<p style='font-size: 16px;'> This demo uses the ASR system Whisper and runs on CPU basis hence responses might be slow. </p> \n
+""" + availableLang
+# HTML description for the microphone interface (app2).
+description2 ="""<p style='font-size: 18px;'> Transcribe a recording with your microphone of a speech in any of the languages listed below and translate it to English. </p>
+<p style='font-size: 16px;'> This demo uses the ASR system Whisper and runs on CPU basis hence responses might be slow. </p> \n
+""" + availableLang
+# Use the first CUDA GPU when torch reports one, otherwise fall back to the CPU.
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+#modelType = "openai/whisper-small"
+class LM:
+    model={}
+    processor={}
+    pipe={}
+    #LMsizes = ["tiny", "base", "small", "medium", "large"]
+    LMsizes = ["base", "small", "medium"]
+myLM = LM()
+for LMsize in myLM.LMsizes:
+    modelType = "openai/whisper-"+LMsize
+    myLM.model[LMsize] = WhisperForConditionalGeneration.from_pretrained(modelType).to(device)
+    myLM.processor[LMsize] = WhisperProcessor.from_pretrained(modelType)
+    myLM.pipe[LMsize] = pipeline(task="automatic-speech-recognition", model=modelType, device=device, chunk_length_s=29, stride_length_s=[5,0])
+def detect_language(audio_path, model, processor, asr_pipe_whisper):
+    #Is not possible to retrieve the predicted language directly or using a pipeline. Instead:
+    # Loads and resample the audio file to 16kHz, convert to mono and control the duration of the audio input to 20sec
+    speech_data, sampling_rate = librosa.load(audio_path, sr=16000, mono=True, duration=20)
+    #get the input features using the feature extractor on the raw speech data
+    input_features = processor.feature_extractor(speech_data, return_tensors="pt", sampling_rate=sampling_rate).input_features.to(device)
+    #transcribe the input tensor of features obtained from function preAudioPath
+    predicted_ids = model.generate(input_features, task="transcribe")
+    #decode the second entry from the output array which conatins the detected language
+    detected_lang = asr_pipe_whisper.tokenizer.decode(predicted_ids[0,1])
+    #looks up in the dictionary to retrieve the expanded language name. E.g. detected_lang = "<|ge|>" returns detected_lang = "german"
+    detected_lang = languages_dic.LANGUAGES.get(detected_lang.strip("<|>"))
+    return detected_lang
+# def transcribe(inputs):
+    # # predicted_ids = model.generate(inputs, language="<|es|>", task="transcribe")
+    # predicted_ids = model.generate(inputs, task="transcribe")
+    # transcription = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+    # return transcription
+def getLM(modelsize):
+    modelsize = modelsize.split(" ")
+    if len(modelsize) > 0:
+        modelsize = modelsize[0]
+    return (myLM.model[modelsize], myLM.processor[modelsize], myLM.pipe[modelsize])
+def processAudio(audio_path, modelsize):
+    model, processor, asr_pipe_whisper = getLM(modelsize)
+    translation = asr_pipe_whisper(audio_path, max_new_tokens=256, generate_kwargs={"task":"translate"})
+    transcription = asr_pipe_whisper(audio_path, generate_kwargs={"task":"transcribe"})
+    #transcription = transcribe(preprocessAudioPath(audio_path))
+    inputLang = detect_language(audio_path, model, processor, asr_pipe_whisper)
+    return (inputLang, transcription["text"], translation["text"])
+modelsizeInfo = "Try out the performance for different model sizes. Larger models are more robust and deliver better results but are also slower."
+app1 = gr.Interface(
+    fn=processAudio,
+#    inputs=[gr.Audio(source="upload", type="filepath",label="Audio Input"),
+#                    gr.Radio(["tiny - 39M", "base - 74M", "small -244M", "medium - 769M", "large - 1550M"],
+#                             label="Select the model size", info=modelsizeInfo, value="small -244M")],
+    inputs=[gr.Audio(source="upload", type="filepath",label="Audio Input"),
+                    gr.Radio(["base - 74M", "small -244M", "medium - 769M"], label="Select the model size", info=modelsizeInfo, value="small -244M")],
+    outputs=[gr.Textbox(label="Detected input language"), gr.Textbox(label="Transcription"), gr.Textbox(label="Translation to english")],
+    title=title,
+    description=description1
+)
+app2 = gr.Interface(
+    fn=processAudio,
+    inputs=[gr.Audio(source="microphone", type="filepath",label="Audio Input"),
+                    gr.Radio(["base - 74M", "small -244M", "medium - 769M"], label="Select the model size", info=modelsizeInfo, value="small -244M")],
+    outputs=[gr.Textbox(label="Detected input language"), gr.Textbox(label="Transcription"), gr.Textbox(label="Translation to english")],
+    title=title,
+    description=description2
+)
+demo = gr.TabbedInterface([app1, app2], ["Audio File", "Microphone"])
+if __name__ == "__main__":
+    demo.launch()

languages_dic.py ADDED Viewed

	@@ -0,0 +1,101 @@

+# Maps short language codes to lowercase English language names.
+# app.py looks up the code left after stripping the "<|" / "|>" markers from
+# the language token decoded from the Whisper output.
+LANGUAGES = {
+    "en": "english",
+    "zh": "chinese",
+    "de": "german",
+    "es": "spanish",
+    "ru": "russian",
+    "ko": "korean",
+    "fr": "french",
+    "ja": "japanese",
+    "pt": "portuguese",
+    "tr": "turkish",
+    "pl": "polish",
+    "ca": "catalan",
+    "nl": "dutch",
+    "ar": "arabic",
+    "sv": "swedish",
+    "it": "italian",
+    "id": "indonesian",
+    "hi": "hindi",
+    "fi": "finnish",
+    "vi": "vietnamese",
+    "he": "hebrew",
+    "uk": "ukrainian",
+    "el": "greek",
+    "ms": "malay",
+    "cs": "czech",
+    "ro": "romanian",
+    "da": "danish",
+    "hu": "hungarian",
+    "ta": "tamil",
+    "no": "norwegian",
+    "th": "thai",
+    "ur": "urdu",
+    "hr": "croatian",
+    "bg": "bulgarian",
+    "lt": "lithuanian",
+    "la": "latin",
+    "mi": "maori",
+    "ml": "malayalam",
+    "cy": "welsh",
+    "sk": "slovak",
+    "te": "telugu",
+    "fa": "persian",
+    "lv": "latvian",
+    "bn": "bengali",
+    "sr": "serbian",
+    "az": "azerbaijani",
+    "sl": "slovenian",
+    "kn": "kannada",
+    "et": "estonian",
+    "mk": "macedonian",
+    "br": "breton",
+    "eu": "basque",
+    "is": "icelandic",
+    "hy": "armenian",
+    "ne": "nepali",
+    "mn": "mongolian",
+    "bs": "bosnian",
+    "kk": "kazakh",
+    "sq": "albanian",
+    "sw": "swahili",
+    "gl": "galician",
+    "mr": "marathi",
+    "pa": "punjabi",
+    "si": "sinhala",
+    "km": "khmer",
+    "sn": "shona",
+    "yo": "yoruba",
+    "so": "somali",
+    "af": "afrikaans",
+    "oc": "occitan",
+    "ka": "georgian",
+    "be": "belarusian",
+    "tg": "tajik",
+    "sd": "sindhi",
+    "gu": "gujarati",
+    "am": "amharic",
+    "yi": "yiddish",
+    "lo": "lao",
+    "uz": "uzbek",
+    "fo": "faroese",
+    "ht": "haitian creole",
+    "ps": "pashto",
+    "tk": "turkmen",
+    "nn": "nynorsk",
+    "mt": "maltese",
+    "sa": "sanskrit",
+    "lb": "luxembourgish",
+    "my": "myanmar",
+    "bo": "tibetan",
+    "tl": "tagalog",
+    "mg": "malagasy",
+    "as": "assamese",
+    "tt": "tatar",
+    "haw": "hawaiian",
+    "ln": "lingala",
+    "ha": "hausa",
+    "ba": "bashkir",
+    "jw": "javanese",
+    "su": "sundanese",
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+transformers
+torch
+kenlm
+pyctcdecode
+numpy
+audio2numpy
+librosa