Tlanextli committed on
Commit
248bc6b
·
1 Parent(s): bf409ca

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +106 -0
  2. languages_dic.py +101 -0
  3. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import torch
4
+ import numpy
5
+ import librosa
6
+ import languages_dic
7
+ from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
8
+
9
+
10
+ title = "Multilanguage Transcription and Translation"
11
+
12
+ availableLang = "Afrikaans, Arabic, Armenian, Azerbaijani, Belarusian, Bosnian, Bulgarian, Catalan, Chinese, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, Galician, German, Greek, Hebrew, Hindi, Hungarian, Icelandic, Indonesian, Italian, Japanese, Kannada, Kazakh, Korean, Latvian, Lithuanian, Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh."
13
+
14
+ description1 = """<p style='font-size: 18px;'> Transcribe an audio file containing a speech in any of the languages listed below and translate it to English. </p>
15
+ <p style='font-size: 16px;'> This demo uses the ASR system Whisper and runs on CPU basis hence responses might be slow. </p> \n
16
+ """ + availableLang
17
+
18
+ description2 ="""<p style='font-size: 18px;'> Transcribe a recording with your microphone of a speech in any of the languages listed below and translate it to English. </p>
19
+ <p style='font-size: 16px;'> This demo uses the ASR system Whisper and runs on CPU basis hence responses might be slow. </p> \n
20
+ """ + availableLang
21
+
22
+
23
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
24
+ #modelType = "openai/whisper-small"
25
+
26
class LM:
    """Registry of the Whisper checkpoints offered by the demo.

    Each registry maps a size name (an entry of ``LMsizes``) to the object
    loaded for that checkpoint:

    * ``model``     -- the generation model
    * ``processor`` -- the matching processor
    * ``pipe``      -- the speech-recognition pipeline
    """

    # Checkpoint sizes that are loaded at start-up.
    LMsizes = ["base", "small", "medium"]

    def __init__(self):
        # Per-instance dictionaries: class-level dict literals would be one
        # shared object mutated by every instance of the class.
        self.model = {}
        self.processor = {}
        self.pipe = {}
32
+
33
myLM = LM()

# Load every offered checkpoint once at import time so requests never wait
# on a model load.
for LMsize in myLM.LMsizes:
    modelType = "openai/whisper-" + LMsize
    whisper_model = WhisperForConditionalGeneration.from_pretrained(modelType).to(device)
    whisper_processor = WhisperProcessor.from_pretrained(modelType)
    myLM.model[LMsize] = whisper_model
    myLM.processor[LMsize] = whisper_processor
    # Hand the already-loaded model and processor parts to the pipeline.
    # Passing the checkpoint name instead would load a second copy of the
    # same weights for every size.
    myLM.pipe[LMsize] = pipeline(
        task="automatic-speech-recognition",
        model=whisper_model,
        tokenizer=whisper_processor.tokenizer,
        feature_extractor=whisper_processor.feature_extractor,
        device=device,
        chunk_length_s=29,
        stride_length_s=[5, 0],
    )
40
+
41
+
42
+
43
def detect_language(audio_path, model, processor, asr_pipe_whisper):
    """Return the name of the language spoken in an audio file.

    The ASR pipeline does not expose the predicted language, so the audio is
    run through ``model.generate`` directly and the language is read from the
    second token of the generated sequence.

    Args:
        audio_path: Path of the audio file to analyse.
        model: Whisper generation model used for the prediction.
        processor: Processor whose ``feature_extractor`` prepares the audio.
        asr_pipe_whisper: ASR pipeline; only its ``tokenizer`` is used, to
            decode the language token.

    Returns:
        The expanded language name from ``languages_dic.LANGUAGES`` (e.g.
        ``"german"`` for the token ``<|de|>``). If the code is not in that
        table, the bare code is returned instead of ``None`` so the caller
        still has something to display.
    """
    # Load at most the first 20 seconds, resampled to 16 kHz and mixed down
    # to mono.
    speech_data, sampling_rate = librosa.load(audio_path, sr=16000, mono=True, duration=20)
    # Turn the raw waveform into the model's input features on the target
    # device.
    input_features = processor.feature_extractor(
        speech_data, return_tensors="pt", sampling_rate=sampling_rate
    ).input_features.to(device)
    predicted_ids = model.generate(input_features, task="transcribe")
    # The second generated token contains the detected language, e.g. "<|de|>".
    language_token = asr_pipe_whisper.tokenizer.decode(predicted_ids[0, 1])
    language_code = language_token.strip("<|>")
    return languages_dic.LANGUAGES.get(language_code, language_code)
56
+
57
+
58
+ # def transcribe(inputs):
59
+ # # predicted_ids = model.generate(inputs, language="<|es|>", task="transcribe")
60
+ # predicted_ids = model.generate(inputs, task="transcribe")
61
+ # transcription = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
62
+ # return transcription
63
+
64
def getLM(modelsize):
    """Look up the loaded objects for a model-size selection.

    Args:
        modelsize: Selection label such as ``"small -244M"``. Only the text
            before the first space is used as the lookup key.

    Returns:
        A ``(model, processor, pipeline)`` tuple taken from ``myLM``.

    Raises:
        KeyError: If no checkpoint is registered under the selected size.
    """
    # str.split(" ") always yields at least one element, so the first item
    # can be taken without a length check.
    size_key = modelsize.split(" ")[0]
    try:
        return (myLM.model[size_key], myLM.processor[size_key], myLM.pipe[size_key])
    except KeyError:
        raise KeyError(
            f"Unknown model size {size_key!r}; available sizes: {list(myLM.LMsizes)}"
        ) from None
69
+
70
+
71
def processAudio(audio_path, modelsize):
    """Detect the language of an audio clip, transcribe it and translate it.

    Args:
        audio_path: Path of the uploaded or recorded audio file.
        modelsize: Model-size selection label, resolved through ``getLM``.

    Returns:
        A ``(detected_language, transcription, translation)`` tuple matching
        the three output text boxes of the interface.

    Raises:
        gr.Error: If no audio was provided.
    """
    # Fail with a readable message instead of passing an empty path into
    # the pipeline.
    if not audio_path:
        raise gr.Error("No audio input received. Please upload or record an audio clip first.")

    model, processor, asr_pipe_whisper = getLM(modelsize)
    translation = asr_pipe_whisper(audio_path, max_new_tokens=256, generate_kwargs={"task": "translate"})
    transcription = asr_pipe_whisper(audio_path, generate_kwargs={"task": "transcribe"})
    inputLang = detect_language(audio_path, model, processor, asr_pipe_whisper)
    return (inputLang, transcription["text"], translation["text"])
78
+
79
+
80
modelsizeInfo = "Try out the performance for different model sizes. Larger models are more robust and deliver better results but are also slower."


def _build_interface(audio_source, description):
    """Build one tab of the demo.

    Both tabs share the same processing function, model-size selector and
    output boxes; they differ only in where the audio comes from and in the
    description text.

    Args:
        audio_source: Value for the ``source`` argument of ``gr.Audio``
            (``"upload"`` or ``"microphone"``).
        description: HTML description shown above the inputs.

    Returns:
        The configured ``gr.Interface``. New components are created on every
        call, so the two tabs do not share component instances.
    """
    return gr.Interface(
        fn=processAudio,
        inputs=[
            gr.Audio(source=audio_source, type="filepath", label="Audio Input"),
            # The text before the first space of each label is the lookup key
            # used by getLM, so it must name a loaded model size.
            gr.Radio(
                ["base - 74M", "small -244M", "medium - 769M"],
                label="Select the model size",
                info=modelsizeInfo,
                value="small -244M",
            ),
        ],
        outputs=[
            gr.Textbox(label="Detected input language"),
            gr.Textbox(label="Transcription"),
            gr.Textbox(label="Translation to english"),
        ],
        title=title,
        description=description,
    )


app1 = _build_interface("upload", description1)
app2 = _build_interface("microphone", description2)

demo = gr.TabbedInterface([app1, app2], ["Audio File", "Microphone"])

if __name__ == "__main__":
    demo.launch()
languages_dic.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LANGUAGES = {
2
+ "en": "english",
3
+ "zh": "chinese",
4
+ "de": "german",
5
+ "es": "spanish",
6
+ "ru": "russian",
7
+ "ko": "korean",
8
+ "fr": "french",
9
+ "ja": "japanese",
10
+ "pt": "portuguese",
11
+ "tr": "turkish",
12
+ "pl": "polish",
13
+ "ca": "catalan",
14
+ "nl": "dutch",
15
+ "ar": "arabic",
16
+ "sv": "swedish",
17
+ "it": "italian",
18
+ "id": "indonesian",
19
+ "hi": "hindi",
20
+ "fi": "finnish",
21
+ "vi": "vietnamese",
22
+ "he": "hebrew",
23
+ "uk": "ukrainian",
24
+ "el": "greek",
25
+ "ms": "malay",
26
+ "cs": "czech",
27
+ "ro": "romanian",
28
+ "da": "danish",
29
+ "hu": "hungarian",
30
+ "ta": "tamil",
31
+ "no": "norwegian",
32
+ "th": "thai",
33
+ "ur": "urdu",
34
+ "hr": "croatian",
35
+ "bg": "bulgarian",
36
+ "lt": "lithuanian",
37
+ "la": "latin",
38
+ "mi": "maori",
39
+ "ml": "malayalam",
40
+ "cy": "welsh",
41
+ "sk": "slovak",
42
+ "te": "telugu",
43
+ "fa": "persian",
44
+ "lv": "latvian",
45
+ "bn": "bengali",
46
+ "sr": "serbian",
47
+ "az": "azerbaijani",
48
+ "sl": "slovenian",
49
+ "kn": "kannada",
50
+ "et": "estonian",
51
+ "mk": "macedonian",
52
+ "br": "breton",
53
+ "eu": "basque",
54
+ "is": "icelandic",
55
+ "hy": "armenian",
56
+ "ne": "nepali",
57
+ "mn": "mongolian",
58
+ "bs": "bosnian",
59
+ "kk": "kazakh",
60
+ "sq": "albanian",
61
+ "sw": "swahili",
62
+ "gl": "galician",
63
+ "mr": "marathi",
64
+ "pa": "punjabi",
65
+ "si": "sinhala",
66
+ "km": "khmer",
67
+ "sn": "shona",
68
+ "yo": "yoruba",
69
+ "so": "somali",
70
+ "af": "afrikaans",
71
+ "oc": "occitan",
72
+ "ka": "georgian",
73
+ "be": "belarusian",
74
+ "tg": "tajik",
75
+ "sd": "sindhi",
76
+ "gu": "gujarati",
77
+ "am": "amharic",
78
+ "yi": "yiddish",
79
+ "lo": "lao",
80
+ "uz": "uzbek",
81
+ "fo": "faroese",
82
+ "ht": "haitian creole",
83
+ "ps": "pashto",
84
+ "tk": "turkmen",
85
+ "nn": "nynorsk",
86
+ "mt": "maltese",
87
+ "sa": "sanskrit",
88
+ "lb": "luxembourgish",
89
+ "my": "myanmar",
90
+ "bo": "tibetan",
91
+ "tl": "tagalog",
92
+ "mg": "malagasy",
93
+ "as": "assamese",
94
+ "tt": "tatar",
95
+ "haw": "hawaiian",
96
+ "ln": "lingala",
97
+ "ha": "hausa",
98
+ "ba": "bashkir",
99
+ "jw": "javanese",
100
+ "su": "sundanese",
101
+ }
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ kenlm
4
+ pyctcdecode
5
+ numpy
6
+ audio2numpy
7
+ librosa