# Spaces: Runtime error
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| import os | |
| import re | |
| import tempfile | |
| from transformers import VitsModel, VitsTokenizer | |
# Hub checkpoint for each language offered by the demo. The model and the
# tokenizer of a language are both loaded from the same checkpoint name.
_CHECKPOINTS = {
    "English": "Matthijs/mms-tts-eng",
    "German": "Matthijs/mms-tts-deu",
    "Korean": "Matthijs/mms-tts-kor",
}

# Language name -> loaded VITS model.
models = {
    language: VitsModel.from_pretrained(checkpoint)
    for language, checkpoint in _CHECKPOINTS.items()
}

# Language name -> tokenizer loaded from the same checkpoint as the model.
tokenizers = {
    language: VitsTokenizer.from_pretrained(checkpoint)
    for language, checkpoint in _CHECKPOINTS.items()
}
# For certain checkpoints, the text needs to be romanized.
# MMS-TTS uses the uroman.pl script from https://github.com/isi-nlp/uroman for this.
# The uroman repository needs to be installed in the folder "uroman".
def uromanize(text, uroman_pl, iso="xxx"):
    """Romanize ``text`` by piping it through the uroman Perl script.

    Args:
        text: The text to romanize.
        uroman_pl: Path to the uroman Perl script; it is executed with ``perl``.
        iso: Value passed to the script's ``-l`` option. Defaults to "xxx",
            the value that was previously hard-coded here.

    Returns:
        The script's standard output with every run of whitespace (line
        breaks included) collapsed to a single space and both ends stripped.

    Raises:
        FileNotFoundError: If no ``perl`` executable can be found.
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
    """
    import subprocess  # local import so the block does not rely on a new file-level import

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact, and feeding the text over stdin/stdout avoids
    # temporary files that would have to be re-opened by name. The explicit
    # UTF-8 encoding keeps non-ASCII text independent of the process locale.
    completed = subprocess.run(
        ["perl", uroman_pl, "-l", iso],
        input=text,
        stdout=subprocess.PIPE,
        encoding="utf-8",
        check=True,
    )

    # Normalizing the whole output (rather than returning only its first
    # line) keeps the text of multi-line input instead of dropping it.
    return re.sub(r"\s+", " ", completed.stdout).strip()
def predict(text, language=None):
    """Synthesize speech for ``text`` using the model of ``language``.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only text
            yields an empty waveform.
        language: Key into the module-level ``models`` and ``tokenizers``
            dicts. ``None`` selects "English".

    Returns:
        A pair ``((sample_rate, samples), processed_text)``. ``samples`` is a
        NumPy int16 array and ``sample_rate`` is 16000. ``processed_text`` is
        the romanized text for Korean, the tokenizer's decoded input for the
        other languages, and "" when there was nothing to speak.

    Raises:
        KeyError: If ``language`` is not a key of ``models`` / ``tokenizers``.
    """
    sampling_rate = 16000

    if text is None or not text.strip():
        # Return the same two-value shape as the normal path; a lone
        # (rate, samples) tuple does not match the (audio, text) result
        # returned for non-empty input.
        return (sampling_rate, np.zeros(0, dtype=np.int16)), ""

    if language is None:
        # The declared default would otherwise fail the dict lookups below.
        language = "English"

    if language == "Korean":
        # The Korean text is romanized before it is tokenized.
        uroman_pl = os.path.join("uroman", "bin", "uroman.pl")
        text = uromanize(text, uroman_pl)

    tokenizer = tokenizers[language]
    input_ids = tokenizer(text, return_tensors="pt")["input_ids"]

    if language != "Korean":
        # Report the text as decoded back from the token ids.
        text = tokenizer.batch_decode(input_ids)[0]

    with torch.no_grad():
        outputs = models[language](input_ids)

    waveform = outputs.audio[0].numpy()
    # Clip before scaling: a value outside [-1, 1] times 32767 does not fit
    # in int16. NOTE(review): assumes the model emits floats nominally in
    # [-1, 1] - confirm against the checkpoint.
    samples = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)
    return (sampling_rate, samples), text
# Heading passed to the Gradio interface as `title`.
title = "MMS-TTS speech synthesis"

# Markdown text passed to the Gradio interface as `description`.
description = """
Facebook's [Massively Multilingual Speech](https://arxiv.org/abs/2305.13516) project aims to provide
speech technology across a diverse range of languages. The MMS-TTS project contains a collection of
over 1000 text-to-speech (TTS) models.
This demo shows how to use MMS-TTS using 🤗 Transformers. Since MMS-TTS is based on the VITS
model, this code can also be used to run VITS checkpoints.
For a full list of checkpoints, [click here](https://huggingface.co/models?filter=vits).
As the model performs random sampling, the generated speech is slightly different each time.
The voice may also vary between runs, or sometimes even in the same sentence.
(Note that 🤗 Transformers also supports multispeaker VITS checkpoints but the MMS-TTS checkpoints
are not conditioned on a speaker ID.)
"""

# HTML passed to the Gradio interface as `article`: reference links followed
# by the BibTeX entry of the MMS paper.
article = """
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2305.13516">MMS paper</a> |
<a href="https://ai.facebook.com/blog/multilingual-model-speech-recognition/">blog post</a> |
<a href="https://huggingface.co/facebook/mms-tts">original weights</a> |
<a href="https://huggingface.co/spaces/mms-meta/MMS">original MMS space</a>
</p>
<pre>
@article{pratap2023mms,
title={Scaling Speech Technology to 1,000+ Languages},
author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
journal={arXiv},
year={2023}
}
</pre>
</div>
"""
# Example rows for the interface, one [input text, language] pair per row.
# The sentences are grouped by language and flattened in that order.
examples = [
    [sentence, language]
    for language, sentences in {
        "English": [
            "It is not in the stars to hold our destiny but in ourselves.",
            "The octopus and Oliver went to the opera in October.",
            "She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.",
            "Brisk brave brigadiers brandished broad bright blades, blunderbusses, and bludgeons—balancing them badly.",
            "A synonym for cinnamon is a cinnamon synonym.",
            "How much wood would a woodchuck chuck if a woodchuck could chuck wood?",
        ],
        "German": [
            "Eins, zwei, Polizei. Drei, vier, Grenadier. Fünf, sechs, alte Keks. Sieben, acht, gute Nacht.",
            "Alle meine Entchen, schwimmen auf dem See. Köpfchen in das Wasser, Schwänzchen in die Höh.",
        ],
        "Korean": [
            "안녕 세상, 날씨는 아름다워",  # Hello world, the weather is beautiful (Google Translate)
        ],
    }.items()
    for sentence in sentences
]
# Input widgets, in the order of predict's parameters (text, language).
input_components = [
    gr.Text(label="Input Text"),
    gr.Radio(
        label="Language",
        choices=["English", "German", "Korean"],
        value="English",
    ),
]

# Output widgets, in the order of predict's return values (audio, text).
output_components = [
    gr.Audio(label="Generated Speech", type="numpy"),
    gr.Text(label="Processed text"),
]

# Build the interface around predict and start serving it.
demo = gr.Interface(
    fn=predict,
    inputs=input_components,
    outputs=output_components,
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()