update

Files changed:
- .gitignore +1 -0
- .models/clip.pth +0 -3
- api.py +11 -22
- data/mel_norms.pth +0 -0
- do_tts.py +5 -1
- models/arch_util.py +1 -2
- models/clvp.py +1 -1
- models/xtransformers.py +0 -47
- read.py +10 -12
- requirements.txt +1 -2
.gitignore
CHANGED
@@ -130,3 +130,4 @@ dmypy.json
 .pyre/

 .idea/*
+.models/*
.models/clip.pth
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8ab5a7751b6098b7e57528b5d812ea2ffbaa16f1b36c02e143c501c74900140d
-size 271601435
api.py
CHANGED
@@ -23,9 +23,11 @@ from utils.tokenizer import VoiceBpeTokenizer, lev_distance
 pbar = None
 def download_models():
     MODELS = {
-        '
-        '
-        '
+        'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth',
+        'clvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/clip.pth',
+        'cvvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/cvvp.pth',
+        'diffusion_decoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/diffusion_decoder.pth',
+        'vocoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth',
     }
     os.makedirs('.models', exist_ok=True)
     def show_progress(block_num, block_size, total_size):
@@ -162,25 +164,12 @@ class TextToSpeech:
                                           train_solo_embeddings=False,
                                           average_conditioning_embeddings=True).cpu().eval()
         self.autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))
-        '''
-        self.autoregressive = UnifiedVoice(max_mel_tokens=2048, max_text_tokens=1024, max_conditioning_inputs=1, layers=42,
-                                           model_dim=1152, heads=18, number_text_tokens=256, train_solo_embeddings=False,
-                                           average_conditioning_embeddings=True, types=2).cpu().eval()
-        self.autoregressive.load_state_dict(torch.load('X:\\dlas\\experiments\\train_gpt_tts_xl\\models\\15250_gpt_ema.pth'))
-        '''
-
-        self.autoregressive_for_diffusion = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
-                                                         model_dim=1024,
-                                                         heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
-                                                         train_solo_embeddings=False,
-                                                         average_conditioning_embeddings=True).cpu().eval()
-        self.autoregressive_for_diffusion.load_state_dict(torch.load('.models/autoregressive.pth'))
 
         self.clvp = CLVP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
                          text_seq_len=350, text_heads=8,
                          num_speech_tokens=8192, speech_enc_depth=12, speech_heads=8, speech_seq_len=430,
                          use_xformers=True).cpu().eval()
-        self.clvp.load_state_dict(torch.load('.models/
+        self.clvp.load_state_dict(torch.load('.models/clvp.pth'))
 
         self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
                          speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
@@ -213,7 +202,7 @@ class TextToSpeech:
             'ultra_fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 16, 'cond_free': False},
             'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 32},
             'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 128},
-            'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations':
+            'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations': 1024},
         }
         kwargs.update(presets[preset])
         return self.tts(text, voice_samples, **kwargs)
@@ -281,11 +270,11 @@ class TextToSpeech:
         # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
         # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
         # results, but will increase memory usage.
-        self.
-        best_latents = self.
-        torch.tensor([best_results.shape[-1]*self.
+        self.autoregressive = self.autoregressive.cuda()
+        best_latents = self.autoregressive(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
+                                           torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=conds.device),
                                            return_latent=True, clip_inputs=False)
-        self.
+        self.autoregressive = self.autoregressive.cpu()
 
         print("Performing vocoding..")
         wav_candidates = []
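The body of download_models() is not shown in this hunk — only the new MODELS mapping, the os.makedirs('.models', ...) call, and the show_progress(block_num, block_size, total_size) signature. A minimal sketch of how such a mapping is typically consumed, assuming Python's standard urllib.request.urlretrieve with a reporthook; the helper name and the skip-if-present check are illustrative, not part of the commit:

# Sketch only -- not part of this commit. Assumes urllib.request.urlretrieve;
# the real download loop inside download_models() is not visible in this diff.
import os
import urllib.request

MODELS = {
    'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth',
    # ... remaining entries exactly as listed in the hunk above ...
}

def show_progress(block_num, block_size, total_size):
    # urlretrieve reporthook: called after each downloaded block.
    print(f'\r{block_num * block_size}/{total_size} bytes', end='')

def download_models_sketch():
    os.makedirs('.models', exist_ok=True)
    for name, url in MODELS.items():
        target = os.path.join('.models', name)
        if os.path.exists(target):
            continue  # weight file already present
        urllib.request.urlretrieve(url, target, reporthook=show_progress)
        print(f'\ndownloaded {name}')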
data/mel_norms.pth
CHANGED
Binary files a/data/mel_norms.pth and b/data/mel_norms.pth differ
do_tts.py
CHANGED
@@ -11,6 +11,10 @@ if __name__ == '__main__':
     parser.add_argument('--text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                                                   'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='patrick_stewart')
+    parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
+    parser.add_argument('--voice_diversity_intelligibility_slider', type=float,
+                        help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intellibility',
+                        default=.5)
     parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
     args = parser.parse_args()
     os.makedirs(args.output_path, exist_ok=True)
@@ -25,6 +29,6 @@ if __name__ == '__main__':
         for cond_path in cond_paths:
             c = load_audio(cond_path, 22050)
             conds.append(c)
-        gen = tts.tts_with_preset(args.text, conds, preset=
+        gen = tts.tts_with_preset(args.text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
         torchaudio.save(os.path.join(args.output_path, f'{voice}.wav'), gen.squeeze(0).cpu(), 24000)
 
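The two new flags are simply forwarded into tts.tts_with_preset. A rough programmatic equivalent, assuming import paths (api.TextToSpeech, utils.audio.load_audio) and a conditioning clip path that are not shown in this diff:

# Rough programmatic equivalent of the new CLI flags; import paths and the
# conditioning clip path are assumptions, not confirmed by this diff.
import torchaudio
from api import TextToSpeech
from utils.audio import load_audio

tts = TextToSpeech()
conds = [load_audio('voices/patrick_stewart/1.wav', 22050)]   # hypothetical clip path
gen = tts.tts_with_preset('I am a language model that has learned to speak.',
                          conds,
                          preset='standard',    # --preset
                          clvp_cvvp_slider=.5)  # --voice_diversity_intelligibility_slider
torchaudio.save('results/patrick_stewart.wav', gen.squeeze(0).cpu(), 24000)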
models/arch_util.py
CHANGED
@@ -5,8 +5,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torchaudio
-from
-from x_transformers.x_transformers import RelativePositionBias
+from models.xtransformers import ContinuousTransformerWrapper, RelativePositionBias


 def zero_module(module):
models/clvp.py
CHANGED
@@ -2,10 +2,10 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import einsum
-from x_transformers import Encoder

 from models.arch_util import CheckpointedXTransformerEncoder
 from models.transformer import Transformer
+from models.xtransformers import Encoder


 def exists(val):
models/xtransformers.py
CHANGED
@@ -1253,50 +1253,3 @@ class ContinuousTransformerWrapper(nn.Module):
             return tuple(res)
         return res[0]

-
-class XTransformer(nn.Module):
-    def __init__(
-            self,
-            *,
-            dim,
-            tie_token_emb=False,
-            **kwargs
-    ):
-        super().__init__()
-        enc_kwargs, kwargs = groupby_prefix_and_trim('enc_', kwargs)
-        dec_kwargs, kwargs = groupby_prefix_and_trim('dec_', kwargs)
-
-        assert 'dim' not in enc_kwargs and 'dim' not in dec_kwargs, 'dimension of either encoder or decoder must be set with `dim` keyword'
-        enc_transformer_kwargs = pick_and_pop(['num_tokens', 'max_seq_len'], enc_kwargs)
-        enc_transformer_kwargs['emb_dropout'] = enc_kwargs.pop('emb_dropout', 0)
-        enc_transformer_kwargs['num_memory_tokens'] = enc_kwargs.pop('num_memory_tokens', None)
-        enc_transformer_kwargs['use_pos_emb'] = enc_kwargs.pop('use_pos_emb', True)
-
-        dec_transformer_kwargs = pick_and_pop(['num_tokens', 'max_seq_len'], dec_kwargs)
-        dec_transformer_kwargs['emb_dropout'] = dec_kwargs.pop('emb_dropout', 0)
-        dec_transformer_kwargs['use_pos_emb'] = dec_kwargs.pop('use_pos_emb', True)
-
-        self.encoder = TransformerWrapper(
-            **enc_transformer_kwargs,
-            attn_layers=Encoder(dim=dim, **enc_kwargs)
-        )
-
-        self.decoder = TransformerWrapper(
-            **dec_transformer_kwargs,
-            attn_layers=Decoder(dim=dim, cross_attend=True, **dec_kwargs)
-        )
-
-        if tie_token_emb:
-            self.decoder.token_emb = self.encoder.token_emb
-
-        self.decoder = AutoregressiveWrapper(self.decoder)
-
-    @torch.no_grad()
-    def generate(self, seq_in, seq_out_start, seq_len, src_mask=None, src_attn_mask=None, **kwargs):
-        encodings = self.encoder(seq_in, mask=src_mask, attn_mask=src_attn_mask, return_embeddings=True)
-        return self.decoder.generate(seq_out_start, seq_len, context=encodings, context_mask=src_mask, **kwargs)
-
-    def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_attn_mask=None):
-        enc = self.encoder(src, mask=src_mask, attn_mask=src_attn_mask, return_embeddings=True)
-        out = self.decoder(tgt, context=enc, mask=tgt_mask, context_mask=src_mask)
-        return out
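Taken together with the import changes in models/arch_util.py and models/clvp.py above and the removal of x-transformers from requirements.txt below, this commit replaces the x-transformers pip dependency with the vendored copy in models/xtransformers.py, and prunes the (apparently unused here) XTransformer encoder-decoder wrapper from that copy. Callers now import from the in-repo module, for example:

# Before this commit (pip package):
#   from x_transformers import Encoder
#   from x_transformers.x_transformers import RelativePositionBias
# After this commit (vendored module in this repository):
from models.xtransformers import Encoder
from models.xtransformers import ContinuousTransformerWrapper, RelativePositionBias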
read.py
CHANGED
@@ -28,11 +28,14 @@ def split_and_recombine_text(texts, desired_length=200, max_len=300):

 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="data/
+    parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="data/riding_hood.txt")
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                                                   'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='patrick_stewart')
     parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/')
-    parser.add_argument('--
+    parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
+    parser.add_argument('--voice_diversity_intelligibility_slider', type=float,
+                        help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intellibility',
+                        default=.5)
     args = parser.parse_args()

     outpath = args.output_path
@@ -60,16 +63,11 @@ if __name__ == '__main__':
     if not cond_paths:
         print('Error: no valid voices specified. Try again.')

-
+    conds = []
+    for cond_path in cond_paths:
+        c = load_audio(cond_path, 22050)
+        conds.append(c)
     for j, text in enumerate(texts):
-        conds =
-        for cond_path in cond_paths:
-            c = load_audio(cond_path, 22050)
-            conds.append(c)
-        gen = tts.tts_with_preset(text, conds, preset=args.generation_preset)
+        gen = tts.tts_with_preset(text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
         torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), gen.squeeze(0).cpu(), 24000)

-        priors.append(torchaudio.functional.resample(gen, 24000, 22050).squeeze(0))
-        while len(priors) > 2:
-            priors.pop(0)
-
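Behaviorally, read.py now loads the conditioning clips once before the chunk loop and no longer feeds previously generated audio back in as extra conditioning (the priors list is gone), so every chunk is synthesized from the same voice conditioning. A compact sketch of the resulting loop, assuming the same import paths as above and a pre-split list of chunks from split_and_recombine_text():

# Sketch of the post-commit long-form loop; import paths are assumptions and
# `texts` is the chunk list produced by split_and_recombine_text() in read.py.
import os
import torchaudio
from api import TextToSpeech
from utils.audio import load_audio

def read_chunks_sketch(texts, cond_paths, voice_outpath, preset='standard', slider=.5):
    conds = [load_audio(p, 22050) for p in cond_paths]   # conditioning loaded once, reused for every chunk
    os.makedirs(voice_outpath, exist_ok=True)
    tts = TextToSpeech()
    for j, text in enumerate(texts):
        gen = tts.tts_with_preset(text, conds, preset=preset, clvp_cvvp_slider=slider)
        torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), gen.squeeze(0).cpu(), 24000)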
requirements.txt
CHANGED
@@ -6,5 +6,4 @@ tokenizers
 inflect
 progressbar
 einops
-unidecode
-x-transformers
+unidecode