update
- examples/silero_vad_by_webrtcvad/step_1_prepare_data.py +1 -1
- examples/silero_vad_by_webrtcvad/step_2_make_vad_segments.py +1 -1
- toolbox/torchaudio/models/vad/fsmn_vad/configuration_fsmn_vad.py +66 -0
- toolbox/torchaudio/models/vad/fsmn_vad/fsmn_encoder.py +4 -3
- toolbox/torchaudio/models/vad/fsmn_vad/modeling_fsmn_vad.py +42 -0
- toolbox/torchaudio/modules/freq_bands/__init__.py +6 -0
- toolbox/torchaudio/modules/freq_bands/erb_bands.py +176 -0
- toolbox/torchaudio/modules/freq_bands/mel_bands.py +6 -0
- toolbox/webrtcvad/vad.py +7 -5
examples/silero_vad_by_webrtcvad/step_1_prepare_data.py
CHANGED
@@ -43,7 +43,7 @@ def get_args():
     return args


-def target_second_signal_generator(data_dir: str, duration: int =
+def target_second_signal_generator(data_dir: str, duration: int = 6, sample_rate: int = 8000, max_epoch: int = 20000):
     data_dir = Path(data_dir)
     for epoch_idx in range(max_epoch):
         for filename in data_dir.glob("**/*.wav"):
examples/silero_vad_by_webrtcvad/step_2_make_vad_segments.py
CHANGED
@@ -30,7 +30,7 @@ def get_args():
     parser.add_argument("--agg", default=3, type=int)
     parser.add_argument("--frame_duration_ms", default=30, type=int)
     parser.add_argument("--padding_duration_ms", default=30, type=int)
-    parser.add_argument("--silence_duration_threshold", default=0.
+    parser.add_argument("--silence_duration_threshold", default=0.0, type=float)

    args = parser.parse_args()
    return args
toolbox/torchaudio/models/vad/fsmn_vad/configuration_fsmn_vad.py
ADDED
@@ -0,0 +1,66 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+from typing import Tuple
+
+from toolbox.torchaudio.configuration_utils import PretrainedConfig
+
+
+class FSMNVadConfig(PretrainedConfig):
+    def __init__(self,
+                 sample_rate: int = 8000,
+                 nfft: int = 512,
+                 win_size: int = 240,
+                 hop_size: int = 80,
+                 win_type: str = "hann",
+
+                 in_channels: int = 64,
+                 hidden_size: int = 128,
+
+                 lr: float = 0.001,
+                 lr_scheduler: str = "CosineAnnealingLR",
+                 lr_scheduler_kwargs: dict = None,
+
+                 max_epochs: int = 100,
+                 clip_grad_norm: float = 10.,
+                 seed: int = 1234,
+
+                 num_workers: int = 4,
+                 batch_size: int = 4,
+                 eval_steps: int = 25000,
+
+                 **kwargs
+                 ):
+        super(FSMNVadConfig, self).__init__(**kwargs)
+        # transform
+        self.sample_rate = sample_rate
+        self.nfft = nfft
+        self.win_size = win_size
+        self.hop_size = hop_size
+        self.win_type = win_type
+
+        # encoder
+        self.in_channels = in_channels
+        self.hidden_size = hidden_size
+
+        # train
+        self.lr = lr
+        self.lr_scheduler = lr_scheduler
+        self.lr_scheduler_kwargs = lr_scheduler_kwargs or dict()
+
+        self.max_epochs = max_epochs
+        self.clip_grad_norm = clip_grad_norm
+        self.seed = seed
+
+        self.num_workers = num_workers
+        self.batch_size = batch_size
+        self.eval_steps = eval_steps
+
+
+def main():
+    config = FSMNVadConfig()
+    config.to_yaml_file("config.yaml")
+    return
+
+
+if __name__ == "__main__":
+    main()
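For orientation (not part of the commit): the STFT defaults above imply a 30 ms analysis window with a 10 ms hop at 8 kHz, and nfft = 512 gives 257 frequency bins. A minimal arithmetic check, assuming these values feed a standard STFT front-end:

# Sketch only: arithmetic implied by the FSMNVadConfig defaults above.
sample_rate = 8000                       # Hz
nfft, win_size, hop_size = 512, 240, 80  # samples

print(nfft // 2 + 1)                     # 257 frequency bins per frame
print(1000 * win_size / sample_rate)     # 30.0 ms analysis window
print(1000 * hop_size / sample_rate)     # 10.0 ms hop, i.e. 100 frames per second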
toolbox/torchaudio/models/vad/fsmn_vad/fsmn_encoder.py
CHANGED
@@ -1,10 +1,11 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
+"""
+https://github.com/modelscope/FunASR/blob/main/funasr/models/fsmn_vad_streaming/encoder.py
+
+"""
 from typing import Tuple, Dict, List
-import copy
-import os

-import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
toolbox/torchaudio/models/vad/fsmn_vad/modeling_fsmn_vad.py
CHANGED
@@ -10,8 +10,50 @@ https://github.com/lovemefan/fsmn-vad
 https://github.com/modelscope/FunASR/blob/main/funasr/models/fsmn_vad_streaming/encoder.py

 """
+import os
+from typing import Optional, Union

+import torch
+import torch.nn as nn

+from toolbox.torchaudio.configuration_utils import CONFIG_FILE
+from toolbox.torchaudio.models.vad.fsmn_vad.configuration_fsmn_vad import FSMNVadConfig
+from toolbox.torchaudio.modules.conv_stft import ConvSTFT
+from toolbox.torchaudio.models.vad.fsmn_vad.fsmn_encoder import FSMN
+
+
+MODEL_FILE = "model.pt"
+
+
+class FSMNVadModel(nn.Module):
+    def __init__(self, config: FSMNVadConfig):
+        super(FSMNVadModel, self).__init__()
+        self.config = config
+        self.eps = 1e-12
+
+        self.stft = ConvSTFT(
+            nfft=config.nfft,
+            win_size=config.win_size,
+            hop_size=config.hop_size,
+            win_type=config.win_type,
+            power=1,
+            requires_grad=False
+        )
+
+        self.fsmn_encoder = FSMN(
+            input_size=400,
+            input_affine_size=140,
+            hidden_size=250,
+            basic_block_layers=4,
+            basic_block_hidden_size=128,
+            basic_block_lorder=20,
+            basic_block_rorder=0,
+            basic_block_lstride=1,
+            basic_block_rstride=0,
+            output_affine_size=140,
+            output_size=248,
+            use_softmax=True,
+        )


 if __name__ == "__main__":
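A minimal smoke test for the new class, assuming the toolbox modules above import cleanly in the current environment (this snippet is not part of the commit). Note that the FSMN sizes (input_size=400, output_size=248, and so on) are hard-coded rather than derived from config.in_channels / config.hidden_size.

# Sketch only: instantiate the model from the default config and count parameters.
from toolbox.torchaudio.models.vad.fsmn_vad.configuration_fsmn_vad import FSMNVadConfig
from toolbox.torchaudio.models.vad.fsmn_vad.modeling_fsmn_vad import FSMNVadModel

config = FSMNVadConfig()
model = FSMNVadModel(config=config)

# the ConvSTFT is constructed with requires_grad=False in the diff,
# so the trainable parameters should come from the FSMN encoder
num_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(num_trainable)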
toolbox/torchaudio/modules/freq_bands/__init__.py
ADDED
@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == "__main__":
+    pass
toolbox/torchaudio/modules/freq_bands/erb_bands.py
ADDED
@@ -0,0 +1,176 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+
+class ErbBandsNumpy(object):
+
+    @staticmethod
+    def freq2erb(freq_hz: float) -> float:
+        """
+        https://www.cnblogs.com/LXP-Never/p/16011229.html
+        1 / (24.7 * 9.265) = 0.00436976
+        """
+        return 9.265 * math.log(freq_hz / (24.7 * 9.265) + 1)
+
+    @staticmethod
+    def erb2freq(n_erb: float) -> float:
+        return 24.7 * 9.265 * (math.exp(n_erb / 9.265) - 1)
+
+    @classmethod
+    def get_erb_widths(cls, sample_rate: int, nfft: int, erb_bins: int, min_freq_bins_for_erb: int) -> np.ndarray:
+        """
+        https://github.com/Rikorose/DeepFilterNet/blob/main/libDF/src/lib.rs
+        :param sample_rate:
+        :param nfft:
+        :param erb_bins: number of ERB (Equivalent Rectangular Bandwidth) bands.
+        :param min_freq_bins_for_erb: minimum number of frequency bins per ERB band.
+        :return:
+        """
+        nyq_freq = sample_rate / 2.
+        freq_width: float = sample_rate / nfft
+
+        min_erb: float = cls.freq2erb(0.)
+        max_erb: float = cls.freq2erb(nyq_freq)
+
+        erb = [0] * erb_bins
+        step = (max_erb - min_erb) / erb_bins
+
+        prev_freq_bin = 0
+        freq_over = 0
+        for i in range(1, erb_bins + 1):
+            f = cls.erb2freq(min_erb + i * step)
+            freq_bin = int(round(f / freq_width))
+            freq_bins = freq_bin - prev_freq_bin - freq_over
+
+            if freq_bins < min_freq_bins_for_erb:
+                freq_over = min_freq_bins_for_erb - freq_bins
+                freq_bins = min_freq_bins_for_erb
+            else:
+                freq_over = 0
+            erb[i - 1] = freq_bins
+            prev_freq_bin = freq_bin
+
+        erb[erb_bins - 1] += 1
+        too_large = sum(erb) - (nfft / 2 + 1)
+        if too_large > 0:
+            erb[erb_bins - 1] -= too_large
+        return np.array(erb, dtype=np.uint64)
+
+    @staticmethod
+    def get_erb_filter_bank(erb_widths: np.ndarray,
+                            normalized: bool = True,
+                            inverse: bool = False,
+                            ):
+        num_freq_bins = int(np.sum(erb_widths))
+        num_erb_bins = len(erb_widths)
+
+        fb: np.ndarray = np.zeros(shape=(num_freq_bins, num_erb_bins))
+
+        points = np.cumsum([0] + erb_widths.tolist()).astype(int)[:-1]
+        for i, (b, w) in enumerate(zip(points.tolist(), erb_widths.tolist())):
+            fb[b: b + w, i] = 1
+
+        if inverse:
+            fb = fb.T
+            if not normalized:
+                fb /= np.sum(fb, axis=1, keepdims=True)
+        else:
+            if normalized:
+                fb /= np.sum(fb, axis=0)
+        return fb
+
+    @staticmethod
+    def spec2erb(spec: np.ndarray, erb_fb: np.ndarray, db: bool = True):
+        """
+        ERB filterbank and transform to decibel scale.
+
+        :param spec: Spectrum of shape [B, C, T, F].
+        :param erb_fb: ERB filterbank matrix of shape [F, B],
+                       where B is the number of ERB bins.
+        :param db: Whether to transform the output into decibel scale. Defaults to `True`.
+        :return:
+        """
+        # complex spec to power spec. (real * real + imag * imag)
+        spec_ = np.abs(spec) ** 2
+
+        # spec to erb feature.
+        erb_feat = np.matmul(spec_, erb_fb)
+
+        if db:
+            erb_feat = 10 * np.log10(erb_feat + 1e-10)
+
+        erb_feat = np.array(erb_feat, dtype=np.float32)
+        return erb_feat
+
+
+class ErbBands(nn.Module):
+    def __init__(self,
+                 sample_rate: int = 8000,
+                 nfft: int = 512,
+                 erb_bins: int = 32,
+                 min_freq_bins_for_erb: int = 2,
+                 ):
+        super().__init__()
+        self.sample_rate = sample_rate
+        self.nfft = nfft
+        self.erb_bins = erb_bins
+        self.min_freq_bins_for_erb = min_freq_bins_for_erb
+
+        erb_fb, erb_fb_inv = self.init_erb_fb()
+        erb_fb = torch.tensor(erb_fb, dtype=torch.float32, requires_grad=False)
+        erb_fb_inv = torch.tensor(erb_fb_inv, dtype=torch.float32, requires_grad=False)
+        self.erb_fb = nn.Parameter(erb_fb, requires_grad=False)
+        self.erb_fb_inv = nn.Parameter(erb_fb_inv, requires_grad=False)
+
+    def init_erb_fb(self):
+        erb_widths = ErbBandsNumpy.get_erb_widths(
+            sample_rate=self.sample_rate,
+            nfft=self.nfft,
+            erb_bins=self.erb_bins,
+            min_freq_bins_for_erb=self.min_freq_bins_for_erb,
+        )
+        erb_fb = ErbBandsNumpy.get_erb_filter_bank(
+            erb_widths=erb_widths,
+            normalized=True,
+            inverse=False,
+        )
+        erb_fb_inv = ErbBandsNumpy.get_erb_filter_bank(
+            erb_widths=erb_widths,
+            normalized=True,
+            inverse=True,
+        )
+        return erb_fb, erb_fb_inv
+
+    def erb_scale(self, spec: torch.Tensor, db: bool = True):
+        # spec shape: (b, t, f)
+        spec_erb = torch.matmul(spec, self.erb_fb)
+        if db:
+            spec_erb = 10 * torch.log10(spec_erb + 1e-10)
+        return spec_erb
+
+    def erb_scale_inv(self, spec_erb: torch.Tensor):
+        spec = torch.matmul(spec_erb, self.erb_fb_inv)
+        return spec
+
+
+def main():
+
+    erb_bands = ErbBands()
+
+    spec = torch.randn(size=(2, 199, 257), dtype=torch.float32)
+    spec_erb = erb_bands.erb_scale(spec)
+    print(spec_erb.shape)
+
+    spec = erb_bands.erb_scale_inv(spec_erb)
+    print(spec.shape)
+
+    return
+
+
+if __name__ == "__main__":
+    main()
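For reference, the two ERB conversions used above are exact inverses of each other; a standalone check of just that math (plain Python, no repo code):

import math

def freq2erb(freq_hz: float) -> float:
    return 9.265 * math.log(freq_hz / (24.7 * 9.265) + 1)

def erb2freq(n_erb: float) -> float:
    return 24.7 * 9.265 * (math.exp(n_erb / 9.265) - 1)

# the round trip is exact up to floating-point error
for f in (0.0, 100.0, 1000.0, 4000.0):
    assert abs(erb2freq(freq2erb(f)) - f) < 1e-6

Note that erb_scale followed by erb_scale_inv is a lossy round trip: the default 32-band filterbank projects 257 frequency bins down to 32, and erb_scale additionally applies a dB transform, so the main() above only checks shapes, not values.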
toolbox/torchaudio/modules/freq_bands/mel_bands.py
ADDED
@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == "__main__":
+    pass
toolbox/webrtcvad/vad.py
CHANGED
@@ -24,13 +24,15 @@ class WebRTCVad(object):
                 frame_duration_ms: int = 30,
                 padding_duration_ms: int = 300,
                 silence_duration_threshold: float = 0.3,
-                sample_rate: int = 8000
+                sample_rate: int = 8000,
+                ring_buffer_activity_threshold: float = 0.9,
                 ):
        self.agg = agg
        self.frame_duration_ms = frame_duration_ms
        self.padding_duration_ms = padding_duration_ms
        self.silence_duration_threshold = silence_duration_threshold
        self.sample_rate = sample_rate
+       self.ring_buffer_activity_threshold = ring_buffer_activity_threshold

        self._vad = webrtcvad.Vad(mode=agg)

@@ -110,7 +112,7 @@ class WebRTCVad(object):
            self.ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in self.ring_buffer if speech])

-           if num_voiced >
+           if num_voiced > self.ring_buffer_activity_threshold * self.ring_buffer.maxlen:
                self.triggered = True

                for f, _ in self.ring_buffer:

@@ -120,7 +122,7 @@ class WebRTCVad(object):
            self.voiced_frames.append(frame)
            self.ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in self.ring_buffer if not speech])
-           if num_unvoiced >
+           if num_unvoiced > self.ring_buffer_activity_threshold * self.ring_buffer.maxlen:
                self.triggered = False
                segment = [
                    np.concatenate([f.signal for f in self.voiced_frames]),

@@ -204,12 +206,12 @@ def get_args():
    )
    parser.add_argument(
        "--padding_duration_ms",
-       default=
+       default=30,
        type=int,
    )
    parser.add_argument(
        "--silence_duration_threshold",
-       default=0.
+       default=0.0,
        type=float,
        help="minimum silence duration, in seconds."
    )
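The two threshold checks replace what was previously a hard-coded ratio with the new ring_buffer_activity_threshold parameter: the collector starts a segment once more than that fraction of frames in the padding ring buffer are voiced, and ends it once more than that fraction are unvoiced. A self-contained sketch of that trigger logic with made-up per-frame flags (hypothetical values, not repo code):

import collections

padding_duration_ms = 300
frame_duration_ms = 30
ring_buffer_activity_threshold = 0.9

num_padding_frames = padding_duration_ms // frame_duration_ms  # 10 frames
ring_buffer = collections.deque(maxlen=num_padding_frames)

triggered = False
for is_speech in [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]:  # toy per-frame VAD flags
    ring_buffer.append(is_speech)
    num_voiced = sum(ring_buffer)
    if not triggered and num_voiced > ring_buffer_activity_threshold * ring_buffer.maxlen:
        triggered = True  # more than 90% of the padding window is voiced -> start a segment
print(triggered)  # True once ten consecutive voiced frames fill the buffer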