Spaces:
Runtime error
Runtime error
| import spaces | |
| import torch | |
| from threading import Thread | |
| from transformers import AutoProcessor | |
| from transformers import set_seed | |
| from vocos_bark import BarkModel | |
| import numpy as np | |
| import gradio as gr | |
| from vocos import Vocos | |
# Seed the random number generators once at startup via transformers' set_seed.
set_seed(0)
| def _grab_best_device(use_gpu=True): | |
| if torch.cuda.device_count() > 0 and use_gpu: | |
| device = "cuda" | |
| else: | |
| device = "cpu" | |
| return device | |
# Hub checkpoint id used when loading the Bark processor below.
HUB_PATH = "suno/bark"
# Sample rate (Hz) of the audio produced by this app.
SAMPLE_RATE = 24_000

device = _grab_best_device()

processor = AutoProcessor.from_pretrained(HUB_PATH)
# Only the processor's presets whose key contains "speaker" are kept,
# in sorted order.
speaker_embeddings = sorted(
    preset
    for preset in processor.speaker_embeddings.keys()
    if "speaker" in preset
)

# Vocos vocoder checkpoint, moved to the selected device.
vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz").to(device)
# Markdown heading for the demo page. The emoji was previously mis-encoded
# and the string ended with a closing </div> that had no opening tag.
title = "# 🐶 Bark with <a href='https://github.com/charactr-platform/vocos'>Vocos</a>"

# Markdown/HTML blurb: two badges followed by a short introduction.
# The trailing backslashes are line continuations inside the string literal,
# so the prose lines are joined into a single paragraph.
description = """
<div>
<a style="display:inline-block" href='https://github.com/charactr-platform/vocos'><img src='https://img.shields.io/github/stars/charactr-platform/vocos?style=social' /></a>
<a style="display:inline-block; margin-left: 1em" href="https://huggingface.co/spaces/hf-audio/vocos-bark?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space%20to%20skip%20the%20queue-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a>
</div>
Bark is a universal text-to-audio model created by Suno. \
Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. \
In this demo, we leverage charactr's Vocos model to create high quality audio from Bark. \
"""
# Load the Bark checkpoint, move it to the selected device and convert it
# with to_bettertransformer() in one chained expression.
bark = BarkModel.from_pretrained(HUB_PATH).to(device).to_bettertransformer()
# Inference on Zero GPU
# NOTE(review): the decorator was absent although `spaces` is imported and
# this function is labelled as the Zero GPU entry point; ZeroGPU Spaces expect
# the GPU-using function to be decorated. Confirm against the deployment.
@spaces.GPU
def generate_audio(text, voice_preset=None, lag=0):
    """Synthesise audio for ``text`` with Bark and decode it with Vocos.

    Args:
        text: Prompt to synthesise.
        voice_preset: A key from ``speaker_embeddings``. Any other value
            (including ``None``) falls back to generation without a preset.
        lag: Unused; kept so callers that still pass it keep working.

    Returns:
        A ``(SAMPLE_RATE, waveform)`` tuple where ``waveform`` is a NumPy
        array obtained from the Vocos output after ``squeeze()``.
    """
    # Unknown presets are dropped rather than forwarded to the processor.
    if voice_preset not in speaker_embeddings:
        voice_preset = None

    inputs = processor([text], voice_preset=voice_preset).to(device)

    # Generation runs synchronously in the calling thread.
    fine_output = bark.generate(
        **inputs, coarse_temperature=0.8, temperature=0.5, do_sample=True
    )
    print("Fine tokens generated")

    with torch.no_grad():
        # Presumably swaps the batch and codebook axes into the layout Vocos
        # expects -- TODO confirm against vocos_bark's generate() output.
        features = vocos.codes_to_features(fine_output.transpose(0, 1))
        # bandwidth_id is an index into Vocos' supported bandwidths
        # (index 2 is presumably 6 kbps -- confirm in the Vocos docs).
        vocos_waveform = vocos.decode(
            features, bandwidth_id=torch.tensor([2], device=device)
        )

    return (SAMPLE_RATE, vocos_waveform.cpu().squeeze().numpy())
# Gradio blocks demo: text and voice preset on the left, generated audio on
# the right.
with gr.Blocks() as demo_blocks:
    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            inp_text = gr.Textbox(
                label="Input Text",
                info="What would you like bark to synthesise?",
            )
            spk = gr.Dropdown(
                speaker_embeddings,
                value=None,
                label="Acoustic Prompt",
                info="Default: Unconditional Generation",
            )
            btn = gr.Button("Generate Audio!")

        with gr.Column():
            out_audio_vocos = gr.Audio(
                type="numpy",
                autoplay=False,
                label="Generated Audio",
                show_label=True,
            )

    # Clicking the button runs generate_audio on the text and preset and
    # sends the result to the audio player.
    btn.click(
        fn=generate_audio,
        inputs=[inp_text, spk],
        outputs=[out_audio_vocos],
    )

# Enable the request queue, then start the server in debug mode.
demo_blocks.queue()
demo_blocks.launch(debug=True)