NeoPy committed · Commit 0a0615c (verified) · 1 Parent(s): 8d16f65
infer/lib/predictors/FCPE/FCPE.py ADDED
@@ -0,0 +1,581 @@
import os
import sys
import torch

import numpy as np
import torch.nn as nn
import onnxruntime as ort
import torch.nn.functional as F

from einops import rearrange
from torch.nn.utils.parametrizations import weight_norm

sys.path.append(os.getcwd())
os.environ["LRU_CACHE_CAPACITY"] = "3"

from main.library.predictors.FCPE.wav2mel import Wav2Mel
from main.library.predictors.FCPE.encoder import EncoderLayer, ConformerNaiveEncoder
from main.library.predictors.FCPE.utils import batch_interp_with_replacement_detach, decrypt_model, DotDict

@torch.no_grad()
def cent_to_f0(cent):
    return 10 * 2 ** (cent / 1200)

@torch.no_grad()
def f0_to_cent(f0):
    return 1200 * (f0 / 10).log2()

@torch.no_grad()
def latent2cents_decoder(cent_table, y, threshold = 0.05, mask = True):
    if str(y.device).startswith("privateuseone"):
        cent_table = cent_table.cpu()
        y = y.cpu()

    B, N, _ = y.size()
    ci = cent_table[None, None, :].expand(B, N, -1)
    rtn = (ci * y).sum(dim=-1, keepdim=True) / y.sum(dim=-1, keepdim=True)

    if mask:
        confident = y.max(dim=-1, keepdim=True)[0]
        confident_mask = torch.ones_like(confident)
        confident_mask[confident <= threshold] = float("-INF")
        rtn = rtn * confident_mask

    return rtn

@torch.no_grad()
def latent2cents_local_decoder(cent_table, out_dims, y, threshold = 0.05, mask = True):
    if str(y.device).startswith("privateuseone"):
        cent_table = cent_table.cpu()
        y = y.cpu()

    B, N, _ = y.size()
    ci = cent_table[None, None, :].expand(B, N, -1)
    confident, max_index = y.max(dim=-1, keepdim=True)

    local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4)
    local_argmax_index[local_argmax_index < 0] = 0
    local_argmax_index[local_argmax_index >= out_dims] = out_dims - 1

    y_l = y.gather(-1, local_argmax_index)
    rtn = (ci.gather(-1, local_argmax_index) * y_l).sum(dim=-1, keepdim=True) / y_l.sum(dim=-1, keepdim=True)

    if mask:
        confident_mask = torch.ones_like(confident)
        confident_mask[confident <= threshold] = float("-INF")
        rtn = rtn * confident_mask

    return rtn

def cents_decoder(cent_table, y, confidence, threshold = 0.05, mask=True):
    if str(y.device).startswith("privateuseone"):
        cent_table = cent_table.cpu()
        y = y.cpu()

    B, N, _ = y.size()
    rtn = (cent_table[None, None, :].expand(B, N, -1) * y).sum(dim=-1, keepdim=True) / y.sum(dim=-1, keepdim=True)

    if mask:
        confident = y.max(dim=-1, keepdim=True)[0]
        confident_mask = torch.ones_like(confident)
        confident_mask[confident <= threshold] = float("-INF")
        rtn = rtn * confident_mask

    return (rtn, confident) if confidence else rtn

def cents_local_decoder(cent_table, y, n_out, confidence, threshold = 0.05, mask=True):
    if str(y.device).startswith("privateuseone"):
        cent_table = cent_table.cpu()
        y = y.cpu()

    B, N, _ = y.size()
    confident, max_index = y.max(dim=-1, keepdim=True)
    local_argmax_index = (torch.arange(0, 9).to(max_index.device) + (max_index - 4)).clamp(0, n_out - 1)
    y_l = y.gather(-1, local_argmax_index)
    rtn = (cent_table[None, None, :].expand(B, N, -1).gather(-1, local_argmax_index) * y_l).sum(dim=-1, keepdim=True) / y_l.sum(dim=-1, keepdim=True)

    if mask:
        confident_mask = torch.ones_like(confident)
        confident_mask[confident <= threshold] = float("-INF")
        rtn = rtn * confident_mask

    return (rtn, confident) if confidence else rtn

class PCmer(nn.Module):
    def __init__(self, num_layers, num_heads, dim_model, dim_keys, dim_values, residual_dropout, attention_dropout):
        super().__init__()
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dim_model = dim_model
        self.dim_values = dim_values
        self.dim_keys = dim_keys
        self.residual_dropout = residual_dropout
        self.attention_dropout = attention_dropout
        self._layers = nn.ModuleList([EncoderLayer(self) for _ in range(num_layers)])

    def forward(self, phone, mask=None):
        for layer in self._layers:
            phone = layer(phone, mask)

        return phone

class CFNaiveMelPE(nn.Module):
    def __init__(self, input_channels, out_dims, hidden_dims = 512, n_layers = 6, n_heads = 8, f0_max = 1975.5, f0_min = 32.70, use_fa_norm = False, conv_only = False, conv_dropout = 0, atten_dropout = 0, use_harmonic_emb = False):
        super().__init__()
        self.input_channels = input_channels
        self.out_dims = out_dims
        self.hidden_dims = hidden_dims
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.f0_max = f0_max
        self.f0_min = f0_min
        self.use_fa_norm = use_fa_norm

        self.harmonic_emb = nn.Embedding(9, hidden_dims) if use_harmonic_emb else None
        self.input_stack = nn.Sequential(nn.Conv1d(input_channels, hidden_dims, 3, 1, 1), nn.GroupNorm(4, hidden_dims), nn.LeakyReLU(), nn.Conv1d(hidden_dims, hidden_dims, 3, 1, 1))
        self.net = ConformerNaiveEncoder(num_layers=n_layers, num_heads=n_heads, dim_model=hidden_dims, use_norm=use_fa_norm, conv_only=conv_only, conv_dropout=conv_dropout, atten_dropout=atten_dropout)
        self.norm = nn.LayerNorm(hidden_dims)
        self.output_proj = weight_norm(nn.Linear(hidden_dims, out_dims))

        self.cent_table_b = torch.linspace(f0_to_cent(torch.Tensor([f0_min]))[0], f0_to_cent(torch.Tensor([f0_max]))[0], out_dims).detach()
        self.gaussian_blurred_cent_mask_b = (1200 * torch.Tensor([self.f0_max / 10.]).log2())[0].detach()

        self.register_buffer("cent_table", self.cent_table_b)
        self.register_buffer("gaussian_blurred_cent_mask", self.gaussian_blurred_cent_mask_b)

    def forward(self, x, _h_emb=None):
        x = self.input_stack(x.transpose(-1, -2)).transpose(-1, -2)

        if self.harmonic_emb is not None:
            if _h_emb is None: x += self.harmonic_emb(torch.LongTensor([0]).to(x.device))
            else: x += self.harmonic_emb(torch.LongTensor([int(_h_emb)]).to(x.device))

        return self.output_proj(self.norm(self.net(x))).sigmoid()

    @torch.no_grad()
    def infer(self, mel, decoder = "local_argmax", threshold = 0.05):
        latent = self.forward(mel)
        return cent_to_f0(latent2cents_decoder(self.cent_table, latent, threshold=threshold) if decoder == "argmax" else latent2cents_local_decoder(self.cent_table, self.out_dims, latent, threshold=threshold))

class FCPE_LEGACY(nn.Module):
    def __init__(self, input_channel=128, out_dims=360, n_layers=12, n_chans=512, f0_max=1975.5, f0_min=32.70, confidence=False, threshold=0.05, use_input_conv=True):
        super().__init__()
        self.n_out = out_dims
        self.f0_max = f0_max
        self.f0_min = f0_min
        self.confidence = confidence
        self.threshold = threshold
        self.use_input_conv = use_input_conv

        self.cent_table_b = torch.Tensor(np.linspace(f0_to_cent(torch.Tensor([f0_min]))[0], f0_to_cent(torch.Tensor([f0_max]))[0], out_dims))
        self.register_buffer("cent_table", self.cent_table_b)

        self.stack = nn.Sequential(nn.Conv1d(input_channel, n_chans, 3, 1, 1), nn.GroupNorm(4, n_chans), nn.LeakyReLU(), nn.Conv1d(n_chans, n_chans, 3, 1, 1))
        self.decoder = PCmer(num_layers=n_layers, num_heads=8, dim_model=n_chans, dim_keys=n_chans, dim_values=n_chans, residual_dropout=0.1, attention_dropout=0.1)
        self.norm = nn.LayerNorm(n_chans)
        self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out))

    def forward(self, mel, return_hz_f0=False, cdecoder="local_argmax", output_interp_target_length=None):
        x = self.decoder(self.stack(mel.transpose(1, 2)).transpose(1, 2) if self.use_input_conv else mel)
        x = self.dense_out(self.norm(x)).sigmoid()

        x = cent_to_f0(cents_decoder(self.cent_table, x, self.confidence, threshold=self.threshold, mask=True) if cdecoder == "argmax" else cents_local_decoder(self.cent_table, x, self.n_out, self.confidence, threshold=self.threshold, mask=True))
        x = (1 + x / 700).log() if not return_hz_f0 else x

        if output_interp_target_length is not None:
            x = F.interpolate(torch.where(x == 0, float("nan"), x).transpose(1, 2), size=int(output_interp_target_length), mode="linear").transpose(1, 2)
            x = torch.where(x.isnan(), float(0.0), x)

        return x

    def gaussian_blurred_cent(self, cents):
        B, N, _ = cents.size()
        # combine the valid-range conditions into one boolean mask before multiplying
        # (applying `&` directly to float tensors, as in the un-parenthesized form, would fail)
        valid = ((cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0)))).float()
        return (-(self.cent_table[None, None, :].expand(B, N, -1) - cents).square() / 1250).exp() * valid

class InferCFNaiveMelPE(torch.nn.Module):
    def __init__(self, args, state_dict):
        super().__init__()
        self.model = CFNaiveMelPE(input_channels=args.mel.num_mels, out_dims=args.model.out_dims, hidden_dims=args.model.hidden_dims, n_layers=args.model.n_layers, n_heads=args.model.n_heads, f0_max=args.model.f0_max, f0_min=args.model.f0_min, use_fa_norm=args.model.use_fa_norm, conv_only=args.model.conv_only, conv_dropout=args.model.conv_dropout, atten_dropout=args.model.atten_dropout, use_harmonic_emb=False)
        self.model.load_state_dict(state_dict)
        self.model.eval()
        self.register_buffer("tensor_device_marker", torch.tensor(1.0).float(), persistent=False)

    def forward(self, mel, decoder_mode = "local_argmax", threshold = 0.006):
        with torch.no_grad():
            mels = rearrange(torch.stack([mel], -1), "B T C K -> (B K) T C")
            f0s = rearrange(self.model.infer(mels, decoder=decoder_mode, threshold=threshold), "(B K) T 1 -> B T (K 1)", K=1)

        return f0s

    def infer(self, mel, decoder_mode = "local_argmax", threshold = 0.006, f0_min = None, f0_max = None, interp_uv = False, output_interp_target_length = None, return_uv = False):
        f0 = self.__call__(mel, decoder_mode, threshold)
        f0_for_uv = f0

        uv = (f0_for_uv < f0_min).type(f0_for_uv.dtype)
        f0 = f0 * (1 - uv)

        if interp_uv: f0 = batch_interp_with_replacement_detach(uv.squeeze(-1).bool(), f0.squeeze(-1)).unsqueeze(-1)
        if f0_max is not None: f0[f0 > f0_max] = f0_max

        if output_interp_target_length is not None:
            f0 = F.interpolate(torch.where(f0 == 0, float("nan"), f0).transpose(1, 2), size=int(output_interp_target_length), mode="linear").transpose(1, 2)
            f0 = torch.where(f0.isnan(), float(0.0), f0)

        if return_uv: return f0, F.interpolate(uv.transpose(1, 2), size=int(output_interp_target_length), mode="nearest").transpose(1, 2)
        else: return f0

class FCPEInfer_LEGACY:
    def __init__(self, configs, model_path, device=None, dtype=torch.float32, providers=None, onnx=False, f0_min=50, f0_max=1100):
        if device is None: device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.dtype = dtype
        self.onnx = onnx
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.wav2mel = Wav2Mel(device=self.device, dtype=self.dtype)

        if self.onnx:
            sess_options = ort.SessionOptions()
            sess_options.log_severity_level = 3
            self.model = ort.InferenceSession(decrypt_model(configs, model_path), sess_options=sess_options, providers=providers)
        else:
            ckpt = torch.load(model_path, map_location="cpu", weights_only=True)
            self.args = DotDict(ckpt["config"])
            model = FCPE_LEGACY(input_channel=self.args.model.input_channel, out_dims=self.args.model.out_dims, n_layers=self.args.model.n_layers, n_chans=self.args.model.n_chans, f0_max=self.f0_max, f0_min=self.f0_min, confidence=self.args.model.confidence)
            model.to(self.device).to(self.dtype)
            model.load_state_dict(ckpt["model"])
            model.eval()
            self.model = model

    @torch.no_grad()
    def __call__(self, audio, sr, threshold=0.05, p_len=None):
        if not self.onnx: self.model.threshold = threshold
        if not hasattr(self, "numpy_threshold") and self.onnx: self.numpy_threshold = np.array(threshold, dtype=np.float32)

        mel = self.wav2mel(audio=audio[None, :], sample_rate=sr).to(self.dtype)

        if self.onnx:
            return torch.as_tensor(self.model.run([self.model.get_outputs()[0].name], {self.model.get_inputs()[0].name: mel.detach().cpu().numpy(), self.model.get_inputs()[1].name: self.numpy_threshold})[0], dtype=self.dtype, device=self.device)
        else:
            return self.model(mel=mel, return_hz_f0=True, output_interp_target_length=p_len)

class FCPEInfer:
    def __init__(self, configs, model_path, device=None, dtype=torch.float32, providers=None, onnx=False, f0_min=50, f0_max=1100):
        if device is None: device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.dtype = dtype
        self.onnx = onnx
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.wav2mel = Wav2Mel(device=self.device, dtype=self.dtype)

        if self.onnx:
            sess_options = ort.SessionOptions()
            sess_options.log_severity_level = 3
            self.model = ort.InferenceSession(decrypt_model(configs, model_path), sess_options=sess_options, providers=providers)
        else:
            ckpt = torch.load(model_path, map_location="cpu", weights_only=True)
            ckpt["config_dict"]["model"]["conv_dropout"] = ckpt["config_dict"]["model"]["atten_dropout"] = 0.0
            self.args = DotDict(ckpt["config_dict"])
            model = InferCFNaiveMelPE(self.args, ckpt["model"])
            self.model = model.to(device).to(self.dtype).eval()

    @torch.no_grad()
    def __call__(self, audio, sr, threshold=0.05, p_len=None):
        if not hasattr(self, "numpy_threshold") and self.onnx: self.numpy_threshold = np.array(threshold, dtype=np.float32)
        mel = self.wav2mel(audio=audio[None, :], sample_rate=sr).to(self.dtype)

        if self.onnx:
            return torch.as_tensor(self.model.run([self.model.get_outputs()[0].name], {self.model.get_inputs()[0].name: mel.detach().cpu().numpy(), self.model.get_inputs()[1].name: self.numpy_threshold})[0], dtype=self.dtype, device=self.device)
        else:
            return self.model.infer(mel, threshold=threshold, f0_min=self.f0_min, f0_max=self.f0_max, output_interp_target_length=p_len)

class FCPE:
    def __init__(self, configs, model_path, hop_length=512, f0_min=50, f0_max=1100, dtype=torch.float32, device=None, sample_rate=16000, threshold=0.05, providers=None, onnx=False, legacy=False):
        self.model = FCPEInfer_LEGACY if legacy else FCPEInfer
        self.fcpe = self.model(configs, model_path, device=device, dtype=dtype, providers=providers, onnx=onnx, f0_min=f0_min, f0_max=f0_max)
        self.hop_length = hop_length
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.threshold = threshold
        self.sample_rate = sample_rate
        self.dtype = dtype
        self.legacy = legacy

    def compute_f0(self, wav, p_len=None):
        x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
        p_len = (x.shape[0] // self.hop_length) if p_len is None else p_len

        f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold, p_len=p_len)
        f0 = f0[:] if f0.dim() == 1 else f0[0, :, 0]

        if torch.all(f0 == 0): return f0.cpu().numpy() if p_len is None else np.zeros(p_len)
        return f0.cpu().numpy()
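
For orientation, a minimal usage sketch of the `FCPE` wrapper above. The config dict, checkpoint path, hop length, and audio below are placeholders, and the `main.library` package layout referenced by the file's imports is assumed to be importable; a real FCPE checkpoint must exist at the chosen path.

import numpy as np

# Hypothetical values: configs and model_path depend on the host project;
# for onnx=False the checkpoint must contain a "config_dict" entry.
fcpe = FCPE(configs={}, model_path="assets/models/predictors/fcpe.pt", hop_length=160, sample_rate=16000, device="cpu")

wav = np.zeros(16000, dtype=np.float32)   # one second of 16 kHz audio (silence here)
f0 = fcpe.compute_f0(wav)                 # per-frame f0 in Hz as a numpy array
print(f0.shape)                           # roughly len(wav) // hop_length frames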
infer/lib/predictors/FCPE/attentions.py ADDED
@@ -0,0 +1,448 @@
import math
import torch

import torch.nn.functional as F

from torch import nn, einsum
from functools import partial
from einops import rearrange, repeat, pack, unpack

def exists(val):
    return val is not None

def default(value, d):
    return value if exists(value) else d

def empty(tensor):
    return tensor.numel() == 0

def pad_to_multiple(tensor, multiple, dim=-1, value=0):
    seqlen = tensor.shape[dim]
    m = seqlen / multiple

    if m.is_integer(): return False, tensor
    return True, F.pad(tensor, (*((0,) * (-1 - dim) * 2), 0, (math.ceil(m) * multiple - seqlen)), value = value)

def look_around(x, backward = 1, forward = 0, pad_value = -1, dim = 2):
    t = x.shape[1]
    dims = (len(x.shape) - dim) * (0, 0)
    padded_x = F.pad(x, (*dims, backward, forward), value = pad_value)

    return torch.cat([padded_x[:, ind:(ind + t), ...] for ind in range(forward + backward + 1)], dim = dim)

def rotate_half(x):
    x1, x2 = rearrange(x, 'b ... (r d) -> b ... r d', r = 2).unbind(dim = -2)
    return torch.cat((-x2, x1), dim = -1)

def apply_rotary_pos_emb(q, k, freqs, scale = 1):
    q_len = q.shape[-2]
    q_freqs = freqs[..., -q_len:, :]

    inv_scale = scale ** -1
    if scale.ndim == 2: scale = scale[-q_len:, :]

    q = (q * q_freqs.cos() * scale) + (rotate_half(q) * q_freqs.sin() * scale)
    k = (k * freqs.cos() * inv_scale) + (rotate_half(k) * freqs.sin() * inv_scale)

    return q, k

def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None):
    unstructured_block = torch.randn((cols, cols), device=device)

    q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced")
    q, r = map(lambda t: t.to(device), (q, r))

    if qr_uniform_q:
        d = r.diag(0)
        q *= d.sign()

    return q.t()

def gaussian_orthogonal_random_matrix(nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None):
    nb_full_blocks = int(nb_rows / nb_columns)
    block_list = []

    for _ in range(nb_full_blocks):
        block_list.append(orthogonal_matrix_chunk(nb_columns, qr_uniform_q=qr_uniform_q, device=device))

    remaining_rows = nb_rows - nb_full_blocks * nb_columns
    if remaining_rows > 0: block_list.append(orthogonal_matrix_chunk(nb_columns, qr_uniform_q=qr_uniform_q, device=device)[:remaining_rows])

    if scaling == 0: multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1)
    elif scaling == 1: multiplier = math.sqrt(float(nb_columns)) * torch.ones((nb_rows,), device=device)
    else: raise ValueError(f"{scaling} != 0, 1")

    return multiplier.diag() @ torch.cat(block_list)

def linear_attention(q, k, v):
    if v is None: return einsum("...ed,...nd->...ne", k, q)
    return einsum("...de,...nd,...n->...ne", einsum("...nd,...ne->...de", k, v), q, 1.0 / (einsum("...nd,...d->...n", q, k.sum(dim=-2).type_as(q)) + 1e-8))

def softmax_kernel(data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None):
    b, h, *_ = data.shape

    data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0
    ratio = projection_matrix.shape[0] ** -0.5

    data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), repeat(projection_matrix, "j d -> b h j d", b=b, h=h).type_as(data))
    diag_data = (((data**2).sum(dim=-1) / 2.0) * (data_normalizer**2)).unsqueeze(dim=-1)

    return (ratio * ((data_dash - diag_data - data_dash.max(dim=-1, keepdim=True).values).exp() + eps) if is_query else ratio * ((data_dash - diag_data + eps).exp())).type_as(data)

class SinusoidalEmbeddings(nn.Module):
    def __init__(self, dim, scale_base = None, use_xpos = False, theta = 10000):
        super().__init__()
        inv_freq = 1. / (theta ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', inv_freq)
        self.use_xpos = use_xpos
        self.scale_base = scale_base
        assert not (use_xpos and not exists(scale_base))
        scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
        self.register_buffer('scale', scale, persistent = False)

    def forward(self, x):
        seq_len, device = x.shape[-2], x.device
        t = torch.arange(seq_len, device = x.device).type_as(self.inv_freq)

        freqs = torch.einsum('i , j -> i j', t, self.inv_freq)
        freqs = torch.cat((freqs, freqs), dim = -1)

        if not self.use_xpos: return freqs, torch.ones(1, device = device)

        power = (t - (seq_len // 2)) / self.scale_base
        scale = self.scale ** rearrange(power, 'n -> n 1')

        return freqs, torch.cat((scale, scale), dim = -1)

class LocalAttention(nn.Module):
    def __init__(self, window_size, causal = False, look_backward = 1, look_forward = None, dropout = 0., shared_qk = False, rel_pos_emb_config = None, dim = None, autopad = False, exact_windowsize = False, scale = None, use_rotary_pos_emb = True, use_xpos = False, xpos_scale_base = None):
        super().__init__()
        look_forward = default(look_forward, 0 if causal else 1)
        assert not (causal and look_forward > 0)
        self.scale = scale
        self.window_size = window_size
        self.autopad = autopad
        self.exact_windowsize = exact_windowsize
        self.causal = causal
        self.look_backward = look_backward
        self.look_forward = look_forward
        self.dropout = nn.Dropout(dropout)
        self.shared_qk = shared_qk
        self.rel_pos = None
        self.use_xpos = use_xpos
        if use_rotary_pos_emb and (exists(rel_pos_emb_config) or exists(dim)):
            if exists(rel_pos_emb_config): dim = rel_pos_emb_config[0]
            self.rel_pos = SinusoidalEmbeddings(dim, use_xpos = use_xpos, scale_base = default(xpos_scale_base, window_size // 2))

    def forward(self, q, k, v, mask = None, input_mask = None, attn_bias = None, window_size = None):
        mask = default(mask, input_mask)
        assert not (exists(window_size) and not self.use_xpos)

        _, autopad, pad_value, window_size, causal, look_backward, look_forward, shared_qk = q.shape, self.autopad, -1, default(window_size, self.window_size), self.causal, self.look_backward, self.look_forward, self.shared_qk

        (q, packed_shape), (k, _), (v, _) = map(lambda t: pack([t], '* n d'), (q, k, v))

        if autopad:
            orig_seq_len = q.shape[1]
            (_, q), (_, k), (_, v) = map(lambda t: pad_to_multiple(t, self.window_size, dim = -2), (q, k, v))

        b, n, dim_head, device, dtype = *q.shape, q.device, q.dtype
        scale = default(self.scale, dim_head ** -0.5)

        assert (n % window_size) == 0
        windows = n // window_size

        if shared_qk: k = F.normalize(k, dim = -1).type(k.dtype)

        seq = torch.arange(n, device = device)
        b_t = rearrange(seq, '(w n) -> 1 w n', w = windows, n = window_size)
        bq, bk, bv = map(lambda t: rearrange(t, 'b (w n) d -> b w n d', w = windows), (q, k, v))

        bq = bq * scale
        look_around_kwargs = dict(backward = look_backward, forward = look_forward, pad_value = pad_value)

        bk = look_around(bk, **look_around_kwargs)
        bv = look_around(bv, **look_around_kwargs)

        if exists(self.rel_pos):
            pos_emb, xpos_scale = self.rel_pos(bk)
            bq, bk = apply_rotary_pos_emb(bq, bk, pos_emb, scale = xpos_scale)

        bq_t = b_t
        bq_k = look_around(b_t, **look_around_kwargs)
        bq_t = rearrange(bq_t, '... i -> ... i 1')
        bq_k = rearrange(bq_k, '... j -> ... 1 j')

        pad_mask = bq_k == pad_value
        sim = einsum('b h i e, b h j e -> b h i j', bq, bk)

        if exists(attn_bias):
            heads = attn_bias.shape[0]
            assert (b % heads) == 0

            attn_bias = repeat(attn_bias, 'h i j -> (b h) 1 i j', b = b // heads)
            sim = sim + attn_bias

        mask_value = -torch.finfo(sim.dtype).max
        if shared_qk:
            self_mask = bq_t == bq_k
            sim = sim.masked_fill(self_mask, -5e4)
            del self_mask

        if causal:
            causal_mask = bq_t < bq_k
            if self.exact_windowsize: causal_mask = causal_mask | (bq_t > (bq_k + (self.window_size * self.look_backward)))
            sim = sim.masked_fill(causal_mask, mask_value)
            del causal_mask

        sim = sim.masked_fill(((bq_k - (self.window_size * self.look_forward)) > bq_t) | (bq_t > (bq_k + (self.window_size * self.look_backward))) | pad_mask, mask_value) if not causal and self.exact_windowsize else sim.masked_fill(pad_mask, mask_value)

        if exists(mask):
            batch = mask.shape[0]
            assert (b % batch) == 0

            h = b // mask.shape[0]
            if autopad: _, mask = pad_to_multiple(mask, window_size, dim = -1, value = False)

            mask = repeat(rearrange(look_around(rearrange(mask, '... (w n) -> (...) w n', w = windows, n = window_size), **{**look_around_kwargs, 'pad_value': False}), '... j -> ... 1 j'), 'b ... -> (b h) ...', h = h)
            sim = sim.masked_fill(~mask, mask_value)
            del mask

        out = rearrange(einsum('b h i j, b h j e -> b h i e', self.dropout(sim.softmax(dim = -1)), bv), 'b w n d -> b (w n) d')

        if autopad: out = out[:, :orig_seq_len, :]
        out, *_ = unpack(out, packed_shape, '* n d')

        return out

class FastAttention(nn.Module):
    def __init__(self, dim_heads, nb_features=None, ortho_scaling=0, causal=False, generalized_attention=False, kernel_fn=nn.ReLU(), qr_uniform_q=False, no_projection=False):
        super().__init__()
        nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))
        self.dim_heads = dim_heads
        self.nb_features = nb_features
        self.ortho_scaling = ortho_scaling
        self.create_projection = partial(gaussian_orthogonal_random_matrix, nb_rows=self.nb_features, nb_columns=dim_heads, scaling=ortho_scaling, qr_uniform_q=qr_uniform_q)
        projection_matrix = self.create_projection()
        self.register_buffer("projection_matrix", projection_matrix)
        self.generalized_attention = generalized_attention
        self.kernel_fn = kernel_fn
        self.no_projection = no_projection
        self.causal = causal

    @torch.no_grad()
    def redraw_projection_matrix(self):
        projections = self.create_projection()
        self.projection_matrix.copy_(projections)
        del projections

    def forward(self, q, k, v):
        if self.no_projection: q, k = q.softmax(dim=-1), (k.exp() if self.causal else k.softmax(dim=-2))
        else:
            create_kernel = partial(softmax_kernel, projection_matrix=self.projection_matrix, device=q.device)
            q, k = create_kernel(q, is_query=True), create_kernel(k, is_query=False)

        # note: the causal branch refers to self.causal_linear_fn, which is never defined in this file;
        # this module is only instantiated with causal=False in this package
        attn_fn = linear_attention if not self.causal else self.causal_linear_fn
        return attn_fn(q, k, None) if v is None else attn_fn(q, k, v)

class SelfAttention(nn.Module):
    def __init__(self, dim, causal=False, heads=8, dim_head=64, local_heads=0, local_window_size=256, nb_features=None, feature_redraw_interval=1000, generalized_attention=False, kernel_fn=nn.ReLU(), qr_uniform_q=False, dropout=0.0, no_projection=False):
        super().__init__()
        assert dim % heads == 0
        dim_head = default(dim_head, dim // heads)
        inner_dim = dim_head * heads
        self.fast_attention = FastAttention(dim_head, nb_features, causal=causal, generalized_attention=generalized_attention, kernel_fn=kernel_fn, qr_uniform_q=qr_uniform_q, no_projection=no_projection)
        self.heads = heads
        self.global_heads = heads - local_heads
        self.local_attn = LocalAttention(window_size=local_window_size, causal=causal, autopad=True, dropout=dropout, look_forward=int(not causal), rel_pos_emb_config=(dim_head, local_heads)) if local_heads > 0 else None
        self.to_q = nn.Linear(dim, inner_dim)
        self.to_k = nn.Linear(dim, inner_dim)
        self.to_v = nn.Linear(dim, inner_dim)
        self.to_out = nn.Linear(inner_dim, dim)
        self.dropout = nn.Dropout(dropout)

    @torch.no_grad()
    def redraw_projection_matrix(self):
        self.fast_attention.redraw_projection_matrix()

    def forward(self, x, context=None, mask=None, context_mask=None, name=None, inference=False, **kwargs):
        _, _, _, h, gh = *x.shape, self.heads, self.global_heads
        cross_attend = exists(context)
        context = default(context, x)
        context_mask = default(context_mask, mask) if not cross_attend else context_mask

        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (self.to_q(x), self.to_k(context), self.to_v(context)))
        (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v))

        attn_outs = []

        if not empty(q):
            if exists(context_mask): v.masked_fill_(~context_mask[:, None, :, None], 0.0)
            if cross_attend: pass  # cross-attention is not implemented for the global heads
            else: out = self.fast_attention(q, k, v)

            attn_outs.append(out)

        if not empty(lq):
            assert (not cross_attend), "not cross_attend"

            out = self.local_attn(lq, lk, lv, input_mask=mask)
            attn_outs.append(out)

        return self.dropout(self.to_out(rearrange(torch.cat(attn_outs, dim=1), "b h n d -> b n (h d)")))
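
A small shape check for the `SelfAttention` module above (Performer-style linear attention for the global heads, with optional local windowed heads). The sizes below are arbitrary and chosen only for illustration.

import torch

attn = SelfAttention(dim=512, heads=8, dim_head=64)   # all heads global by default
x = torch.randn(2, 128, 512)                          # (batch, time, dim)
out = attn(x)                                         # linear-attention output, same shape as the input
assert out.shape == (2, 128, 512)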
infer/lib/predictors/FCPE/encoder.py ADDED
@@ -0,0 +1,183 @@
import os
import sys

import torch.nn as nn
import torch.nn.functional as F

sys.path.append(os.getcwd())

from main.library.predictors.FCPE.attentions import SelfAttention
from main.library.predictors.FCPE.utils import calc_same_padding, Transpose, GLU, Swish

class ConformerConvModule_LEGACY(nn.Module):
    def __init__(self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0):
        super().__init__()
        inner_dim = dim * expansion_factor
        self.net = nn.Sequential(nn.LayerNorm(dim), Transpose((1, 2)), nn.Conv1d(dim, inner_dim * 2, 1), GLU(dim=1), DepthWiseConv1d_LEGACY(inner_dim, inner_dim, kernel_size=kernel_size, padding=(calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0))), Swish(), nn.Conv1d(inner_dim, dim, 1), Transpose((1, 2)), nn.Dropout(dropout))

    def forward(self, x):
        return self.net(x)

class ConformerConvModule(nn.Module):
    def __init__(self, dim, expansion_factor=2, kernel_size=31, dropout=0):
        super().__init__()
        inner_dim = dim * expansion_factor
        self.net = nn.Sequential(nn.LayerNorm(dim), Transpose((1, 2)), nn.Conv1d(dim, inner_dim * 2, 1), nn.GLU(dim=1), DepthWiseConv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=calc_same_padding(kernel_size)[0], groups=inner_dim), nn.SiLU(), nn.Conv1d(inner_dim, dim, 1), Transpose((1, 2)), nn.Dropout(dropout))

    def forward(self, x):
        return self.net(x)

class DepthWiseConv1d_LEGACY(nn.Module):
    def __init__(self, chan_in, chan_out, kernel_size, padding):
        super().__init__()
        self.padding = padding
        self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)

    def forward(self, x):
        return self.conv(F.pad(x, self.padding))

class DepthWiseConv1d(nn.Module):
    def __init__(self, chan_in, chan_out, kernel_size, padding, groups):
        super().__init__()
        self.conv = nn.Conv1d(chan_in, chan_out, kernel_size=kernel_size, padding=padding, groups=groups)

    def forward(self, x):
        return self.conv(x)

class EncoderLayer(nn.Module):
    def __init__(self, parent):
        super().__init__()
        self.conformer = ConformerConvModule_LEGACY(parent.dim_model)
        self.norm = nn.LayerNorm(parent.dim_model)
        self.dropout = nn.Dropout(parent.residual_dropout)
        self.attn = SelfAttention(dim=parent.dim_model, heads=parent.num_heads, causal=False)

    def forward(self, phone, mask=None):
        phone = phone + (self.attn(self.norm(phone), mask=mask))
        return phone + (self.conformer(phone))

class ConformerNaiveEncoder(nn.Module):
    def __init__(self, num_layers, num_heads, dim_model, use_norm = False, conv_only = False, conv_dropout = 0, atten_dropout = 0):
        super().__init__()
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dim_model = dim_model
        self.use_norm = use_norm
        self.residual_dropout = 0.1
        self.attention_dropout = 0.1
        self.encoder_layers = nn.ModuleList([CFNEncoderLayer(dim_model, num_heads, use_norm, conv_only, conv_dropout, atten_dropout) for _ in range(num_layers)])

    def forward(self, x, mask=None):
        for (_, layer) in enumerate(self.encoder_layers):
            x = layer(x, mask)

        return x

class CFNEncoderLayer(nn.Module):
    def __init__(self, dim_model, num_heads = 8, use_norm = False, conv_only = False, conv_dropout = 0, atten_dropout = 0):
        super().__init__()
        self.conformer = nn.Sequential(ConformerConvModule(dim_model), nn.Dropout(conv_dropout)) if conv_dropout > 0 else ConformerConvModule(dim_model)
        self.norm = nn.LayerNorm(dim_model)
        self.dropout = nn.Dropout(0.1)
        # note: SelfAttention (attentions.py) does not define a use_norm argument, so this branch
        # only works when conv_only=True (attn stays None); with conv_only=False it would raise a TypeError
        self.attn = SelfAttention(dim=dim_model, heads=num_heads, causal=False, use_norm=use_norm, dropout=atten_dropout) if not conv_only else None

    def forward(self, x, mask=None):
        if self.attn is not None: x = x + (self.attn(self.norm(x), mask=mask))
        return x + (self.conformer(x))
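
A minimal shape check for `ConformerNaiveEncoder` above; conv_only=True is used here because, as noted in the code, the attention branch passes a use_norm keyword that SelfAttention does not accept. Sizes are arbitrary.

import torch

encoder = ConformerNaiveEncoder(num_layers=2, num_heads=8, dim_model=512, conv_only=True)
x = torch.randn(1, 50, 512)    # (batch, frames, dim_model)
print(encoder(x).shape)        # torch.Size([1, 50, 512])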
infer/lib/predictors/FCPE/stft.py ADDED
@@ -0,0 +1,102 @@
import os
import sys
import torch

import numpy as np
import torch.nn.functional as F

from librosa.filters import mel

sys.path.append(os.getcwd())

class STFT:
    def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5):
        self.target_sr = sr
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.win_size = win_size
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.clip_val = clip_val
        self.mel_basis = {}
        self.hann_window = {}

    def get_mel(self, y, keyshift=0, speed=1, center=False, train=False):
        n_fft = self.n_fft
        win_size = self.win_size
        hop_length = self.hop_length

        fmax = self.fmax
        factor = 2 ** (keyshift / 12)

        win_size_new = int(np.round(win_size * factor))
        hop_length_new = int(np.round(hop_length * speed))

        mel_basis = self.mel_basis if not train else {}
        hann_window = self.hann_window if not train else {}
        mel_basis_key = str(fmax) + "_" + str(y.device)

        if mel_basis_key not in mel_basis: mel_basis[mel_basis_key] = torch.from_numpy(mel(sr=self.target_sr, n_fft=n_fft, n_mels=self.n_mels, fmin=self.fmin, fmax=fmax)).float().to(y.device)

        keyshift_key = str(keyshift) + "_" + str(y.device)
        if keyshift_key not in hann_window: hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)

        pad_left = (win_size_new - hop_length_new) // 2
        pad_right = max((win_size_new - hop_length_new + 1) // 2, win_size_new - y.size(-1) - pad_left)

        pad = F.pad(y.unsqueeze(1), (pad_left, pad_right), mode="reflect" if pad_right < y.size(-1) else "constant").squeeze(1)
        n_fft = int(np.round(n_fft * factor))

        if str(y.device).startswith(("ocl", "privateuseone")):
            if not hasattr(self, "stft"):
                from main.library.backends.utils import STFT as _STFT
                self.stft = _STFT(filter_length=n_fft, hop_length=hop_length_new, win_length=win_size_new).to(y.device)

            spec = self.stft.transform(pad, 1e-9)
        else:
            spec = torch.stft(pad, n_fft, hop_length=hop_length_new, win_length=win_size_new, window=hann_window[keyshift_key], center=center, pad_mode="reflect", normalized=False, onesided=True, return_complex=True)

        spec = (spec.real.pow(2) + spec.imag.pow(2) + 1e-9).sqrt()

        if keyshift != 0:
            size = n_fft // 2 + 1
            resize = spec.size(1)
            spec = (F.pad(spec, (0, 0, 0, size - resize)) if resize < size else spec[:, :size, :]) * win_size / win_size_new

        return (mel_basis[mel_basis_key] @ spec).clamp(min=self.clip_val).log()
infer/lib/predictors/FCPE/utils.py ADDED
@@ -0,0 +1,92 @@
import os
import torch

from torch import nn
from io import BytesIO
from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad

def decrypt_model(configs, input_path):
    with open(input_path, "rb") as f:
        data = f.read()

    with open(os.path.join(configs["binary_path"], "decrypt.bin"), "rb") as f:
        key = f.read()

    return BytesIO(unpad(AES.new(key, AES.MODE_CBC, data[:16]).decrypt(data[16:]), AES.block_size)).read()

def calc_same_padding(kernel_size):
    pad = kernel_size // 2
    return (pad, pad - (kernel_size + 1) % 2)

def torch_interp(x, xp, fp):
    sort_idx = xp.argsort()
    xp = xp[sort_idx]
    fp = fp[sort_idx]

    right_idxs = torch.searchsorted(xp, x).clamp(max=len(xp) - 1)
    left_idxs = (right_idxs - 1).clamp(min=0)
    x_left = xp[left_idxs]
    y_left = fp[left_idxs]

    interp_vals = y_left + ((x - x_left) * (fp[right_idxs] - y_left) / (xp[right_idxs] - x_left))
    interp_vals[x < xp[0]] = fp[0]
    interp_vals[x > xp[-1]] = fp[-1]

    return interp_vals

def batch_interp_with_replacement_detach(uv, f0):
    result = f0.clone()

    for i in range(uv.shape[0]):
        interp_vals = torch_interp(torch.where(uv[i])[-1], torch.where(~uv[i])[-1], f0[i][~uv[i]]).detach()
        result[i][uv[i]] = interp_vals

    return result

class DotDict(dict):
    def __getattr__(*args):
        val = dict.get(*args)
        return DotDict(val) if type(val) is dict else val

    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

class Swish(nn.Module):
    def forward(self, x):
        return x * x.sigmoid()

class Transpose(nn.Module):
    def __init__(self, dims):
        super().__init__()
        assert len(dims) == 2, "dims == 2"
        self.dims = dims

    def forward(self, x):
        return x.transpose(*self.dims)

class GLU(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        out, gate = x.chunk(2, dim=self.dim)
        return out * gate.sigmoid()
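
`torch_interp` above mirrors `numpy.interp` for 1-D tensors, clamping values outside the sample range to the endpoint values; a quick illustration with made-up numbers:

import torch

xp = torch.tensor([0.0, 1.0, 2.0])
fp = torch.tensor([0.0, 10.0, 20.0])
x = torch.tensor([0.5, 1.5, 3.0])
print(torch_interp(x, xp, fp))   # tensor([ 5., 15., 20.])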
infer/lib/predictors/FCPE/wav2mel.py ADDED
@@ -0,0 +1,78 @@
import os
import sys
import torch

from torchaudio.transforms import Resample

sys.path.append(os.getcwd())

from main.library.predictors.FCPE.stft import STFT

class Wav2Mel:
    def __init__(self, device=None, dtype=torch.float32):
        self.sample_rate = 16000
        self.hop_size = 160

        if device is None: device = "cuda" if torch.cuda.is_available() else "cpu"

        self.device = device
        self.dtype = dtype
        self.stft = STFT(16000, 128, 1024, 1024, 160, 0, 8000)
        self.resample_kernel = {}

    def extract_nvstft(self, audio, keyshift=0, train=False):
        return self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2)

    def extract_mel(self, audio, sample_rate, keyshift=0, train=False):
        audio = audio.to(self.dtype).to(self.device)

        if sample_rate == self.sample_rate: audio_res = audio
        else:
            key_str = str(sample_rate)
            if key_str not in self.resample_kernel: self.resample_kernel[key_str] = Resample(sample_rate, self.sample_rate, lowpass_filter_width=128)

            self.resample_kernel[key_str] = self.resample_kernel[key_str].to(self.dtype).to(self.device)
            audio_res = self.resample_kernel[key_str](audio)

        mel = self.extract_nvstft(audio_res, keyshift=keyshift, train=train)
        n_frames = int(audio.shape[1] // self.hop_size) + 1
        mel = (torch.cat((mel, mel[:, -1:, :]), 1) if n_frames > int(mel.shape[1]) else mel)

        return mel[:, :n_frames, :] if n_frames < int(mel.shape[1]) else mel

    def __call__(self, audio, sample_rate, keyshift=0, train=False):
        return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train)
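
A shape sketch for `Wav2Mel` above, assuming librosa and torchaudio are installed and the `main.library` package layout used by the imports is importable. The output is a log-mel spectrogram with 128 bins and roughly one frame per hop (160 samples) at 16 kHz.

import torch

wav2mel = Wav2Mel(device="cpu")
audio = torch.zeros(1, 16000)               # (batch, samples) at 16 kHz
mel = wav2mel(audio, sample_rate=16000)     # (batch, frames, n_mels)
print(mel.shape)                            # about len(audio) // 160 + 1 frames, 128 mel bins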