update
examples/silero_vad_by_webrtcvad/run.sh
CHANGED
@@ -4,7 +4,7 @@
 
 bash run.sh --stage 3 --stop_stage 5 --system_version centos \
 --file_folder_name silero-vad-by-webrtcvad-nx2-dns3 \
---final_model_name silero-vad-by-webrtcvad-nx2-dns3-
+--final_model_name silero-vad-by-webrtcvad-nx2-dns3-20251120 \
 --noise_patterns "/data/tianxing/HuggingDatasets/nx_noise/data/noise/**/*.wav" \
 --speech_patterns "/data/tianxing/HuggingDatasets/nx_noise/data/speech/dns3-speech/**/*.wav \
 /data/tianxing/HuggingDatasets/nx_noise/data/speech/nx-speech2/**/*.wav" \
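For reference, the quoted --noise_patterns / --speech_patterns arguments hold one or more recursive glob patterns separated by spaces; a minimal sketch of how such a string could be expanded into a file list (the expand_patterns helper is hypothetical, not part of the repository's scripts):

import glob

def expand_patterns(patterns: str) -> list:
    # Hypothetical helper: split the space-separated pattern string and expand
    # each pattern recursively, so "**/*.wav" also matches files in nested folders.
    files = []
    for pattern in patterns.split():
        files.extend(glob.glob(pattern, recursive=True))
    return files

speech_files = expand_patterns(
    "/data/tianxing/HuggingDatasets/nx_noise/data/speech/dns3-speech/**/*.wav "
    "/data/tianxing/HuggingDatasets/nx_noise/data/speech/nx-speech2/**/*.wav"
)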
examples/silero_vad_by_webrtcvad/step_5_export_model.py
CHANGED
@@ -81,18 +81,23 @@ def main():
             "new_encoder_hidden_cache_list",
             "new_lstm_hidden_state"
         ],
-        dynamic_axes={
-            "inputs": {0: "batch_size", 2: "num_samples"},
-            "encoder_in_cache": {1: "batch_size"},
-            "encoder_hidden_cache_list": {1: "batch_size"},
-            "lstm_hidden_state": {2: "batch_size"},
-            "logits": {0: "batch_size"},
-            "probs": {0: "batch_size"},
-            "lsnr": {0: "batch_size"},
-            "new_encoder_in_cache": {1: "batch_size"},
-            "new_encoder_hidden_cache_list": {1: "batch_size"},
-            "new_lstm_hidden_state": {2: "batch_size"},
-        }
+        dynamic_axes={"inputs": {2: "num_samples"}},
+        # UserWarning: Exporting a model to ONNX with a batch_size other than 1,
+        # with a variable length with LSTM can cause an error when running the ONNX model with a different batch size.
+        # Make sure to save the model with a batch size of 1, or define the initial states (h0/c0) as inputs of the model.
+        # dynamic_axes={
+        #     "inputs": {0: "batch_size", 2: "num_samples"},
+        #     "encoder_in_cache": {1: "batch_size"},
+        #     "encoder_hidden_cache_list": {1: "batch_size"},
+        #     "lstm_hidden_state": {2: "batch_size"},
+        #     "logits": {0: "batch_size"},
+        #     "probs": {0: "batch_size"},
+        #     "lsnr": {0: "batch_size"},
+        #     "new_encoder_in_cache": {1: "batch_size"},
+        #     "new_encoder_hidden_cache_list": {1: "batch_size"},
+        #     "new_lstm_hidden_state": {2: "batch_size"},
+        # },
+    )
 
     ort_session = ort.InferenceSession(
         output_file.as_posix()
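The replacement keeps only the sample axis dynamic and exports at a fixed batch size of 1, which is the workaround suggested by the PyTorch UserWarning quoted in the comments. A minimal sketch of the same pattern on a toy LSTM model (the TinyLstm module, file name, and shapes are illustrative assumptions, not the repository's actual export code):

import torch
import onnxruntime as ort


class TinyLstm(torch.nn.Module):
    # Illustrative stand-in for the VAD model: consumes (batch, channel, num_samples).
    def __init__(self):
        super().__init__()
        self.lstm = torch.nn.LSTM(input_size=1, hidden_size=8, batch_first=True)
        self.linear = torch.nn.Linear(8, 1)

    def forward(self, inputs: torch.Tensor):
        # (batch, 1, num_samples) -> (batch, num_samples, 1)
        x = inputs.transpose(1, 2)
        x, _ = self.lstm(x)
        return torch.sigmoid(self.linear(x))


model = TinyLstm().eval()
dummy = torch.randn(1, 1, 512)   # export with a batch size of 1

torch.onnx.export(
    model, (dummy,), "tiny_lstm.onnx",
    input_names=["inputs"], output_names=["probs"],
    # only the sample axis is declared dynamic; batch stays fixed at 1
    dynamic_axes={"inputs": {2: "num_samples"}},
)

session = ort.InferenceSession("tiny_lstm.onnx")
# a different num_samples still works because axis 2 was declared dynamic
probs = session.run(None, {"inputs": torch.randn(1, 1, 1024).numpy()})

Because only axis 2 ("num_samples") is dynamic, the exported graph accepts variable-length audio but must always be fed batches of size 1.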
examples/silero_vad_by_webrtcvad/yaml/config-256-0-20.yaml
CHANGED
@@ -19,12 +19,12 @@ decoder_num_layers: 2
 
 # lsnr
 n_frame: 3
-min_local_snr_db: -
+min_local_snr_db: -15
 max_local_snr_db: 30
 norm_tau: 1.
 
 # data
-min_snr_db:
+min_snr_db: -10
 max_snr_db: 20
 
 # train
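For context, min_snr_db / max_snr_db of this kind usually bound the random signal-to-noise ratio at which noise is mixed into speech when training examples are generated, while min_local_snr_db / max_local_snr_db presumably clamp the frame-level SNR target of the lsnr head. A minimal sketch of such mixing under that assumption (the mix_at_random_snr helper is hypothetical, not the repository's code):

import numpy as np

def mix_at_random_snr(speech: np.ndarray, noise: np.ndarray,
                      min_snr_db: float = -10.0, max_snr_db: float = 20.0,
                      eps: float = 1e-8) -> np.ndarray:
    # Draw a target SNR uniformly from [min_snr_db, max_snr_db].
    snr_db = np.random.uniform(min_snr_db, max_snr_db)

    speech_power = np.mean(speech ** 2) + eps
    noise_power = np.mean(noise ** 2) + eps

    # Scale the noise so that 10 * log10(speech_power / scaled_noise_power) == snr_db.
    scale = np.sqrt(speech_power / (noise_power * 10 ** (snr_db / 10.0)))
    return speech + scale * noise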