from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from utils.modelutils import check_model_paths
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder.wavernn import inference as rnn_vocoder
from vocoder.hifigan import inference as gan_vocoder
from pathlib import Path
import numpy as np
import soundfile as sf
import librosa
import argparse
import torch
import sys
import os
import re
import cn2an
import glob
from audioread.exceptions import NoBackendError

# Use the HiFi-GAN vocoder; assign rnn_vocoder here to use WaveRNN instead
vocoder = gan_vocoder


def gen_one_wav(synthesizer, in_fpath, embed, texts, file_name, seq):
    embeds = [embed] * len(texts)
    # If you know what the attention layer alignments are, you can retrieve them
    # here by passing return_alignments=True
    specs = synthesizer.synthesize_spectrograms(texts, embeds, style_idx=-1, min_stop_token=4, steps=400)
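    # For instance (a sketch; assumes return_alignments is accepted alongside
    # the arguments above, as the comment suggests):
    # specs, alignments = synthesizer.synthesize_spectrograms(
    #     texts, embeds, style_idx=-1, min_stop_token=4, steps=400,
    #     return_alignments=True)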
    breaks = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)

    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    generated_wav, output_sample_rate = vocoder.infer_waveform(spec)

    # Add breaks: re-split the waveform at the spectrogram boundaries, then insert
    # 0.15 s of silence between consecutive utterances
    b_ends = np.cumsum(np.array(breaks) * synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [generated_wav[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * synthesizer.sample_rate))] * len(breaks)
    generated_wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
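    # E.g. with breaks = [200, 300], b_ends = [200*hop_size, 500*hop_size]: the
    # first utterance spans samples [0, 200*hop_size), the second the remainder,
    # and a 0.15 s silence is appended after each.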
    ## Post-generation
    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    generated_wav = encoder.preprocess_wav(generated_wav)
    # Normalize the peak amplitude to 0.97 to avoid clipping
    generated_wav = generated_wav / np.abs(generated_wav).max() * 0.97

    # Save it on the disk
    model = os.path.basename(in_fpath)
    filename = "%s_%d_%s.wav" % (file_name, seq, model)
    sf.write(filename, generated_wav, synthesizer.sample_rate)
    print("\nSaved output as %s\n\n" % filename)


def generate_wav(enc_model_fpath, syn_model_fpath, voc_model_fpath, in_fpath, input_txt, file_name):
    ## Print some environment information (for debugging purposes)
    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGb total memory.\n" %
              (torch.cuda.device_count(),
               device_id,
               gpu_properties.name,
               gpu_properties.major,
               gpu_properties.minor,
               gpu_properties.total_memory / 1e9))
    else:
        print("Using CPU for inference.\n")

    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(enc_model_fpath)
    synthesizer = Synthesizer(syn_model_fpath)
    vocoder.load_model(voc_model_fpath)

    # Compute the speaker embedding from the reference utterance
    encoder_wav = synthesizer.load_preprocess_wav(in_fpath)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
    texts = input_txt.split("\n")
    seq = 0
    each_num = 1500  # flush a wav file once roughly this many characters accumulate
    punctuation = '!,。、,'  # marks used to split the text into short utterances
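    # E.g. "你好,世界。今天好" is cut at these marks into
    # ["你好", "世界", "今天好"] before being fed to the synthesizer.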
    processed_texts = []
    cur_num = 0
    for text in texts:
        for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
            if processed_text:
                processed_texts.append(processed_text.strip())
                cur_num += len(processed_text.strip())
        # Once the buffered text grows past each_num characters, synthesize it
        # into its own numbered wav file and start a new buffer
        if cur_num > each_num:
            seq = seq + 1
            gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)
            processed_texts = []
            cur_num = 0

    # Synthesize whatever text remains in the buffer
    if len(processed_texts) > 0:
        seq = seq + 1
        gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)


if len(sys.argv) >= 3:
    my_txt = ""
    print("reading from:", sys.argv[1])
    with open(sys.argv[1], "r") as f:
        for line in f.readlines():
            my_txt += line

    txt_file_name = sys.argv[1]
    wav_file_name = sys.argv[2]

    # Convert Arabic numerals in the text to Chinese numerals so they are read aloud
    output = cn2an.transform(my_txt, "an2cn")
    print(output)

    generate_wav(
        Path("encoder/saved_models/pretrained.pt"),
        Path("synthesizer/saved_models/mandarin.pt"),
        Path("vocoder/saved_models/pretrained/g_hifigan.pt"),
        wav_file_name, output, txt_file_name
    )
else:
    print("Usage: %s <text_file> <reference_wav>" % sys.argv[0])
    exit(1)
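
# Example invocation (file names here are illustrative):
#   python gen_voice.py input.txt reference.wav
# reads input.txt, converts its numerals to Chinese, clones the voice in
# reference.wav, and writes numbered outputs such as
# "input.txt_1_reference.wav.wav" to the current directory, one per chunk.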