Shikhar Bharadwaj committed on
Commit
3dd6452
·
1 Parent(s): 2e5a8d0

Update model

Browse files
Files changed (19) hide show
  1. README.md +293 -0
  2. meta.yaml +8 -0
  3. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/data/esc50_f5/token_list +52 -0
  4. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/744epoch.pth +3 -0
  5. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/RESULTS.md +15 -0
  6. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/config.yaml +241 -0
  7. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/acc.png +0 -0
  8. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/backward_time.png +0 -0
  9. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/clip.png +0 -0
  10. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/forward_time.png +0 -0
  11. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/gpu_max_cached_mem_GB.png +0 -0
  12. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/grad_norm.png +0 -0
  13. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/iter_time.png +0 -0
  14. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/loss.png +0 -0
  15. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/loss_scale.png +0 -0
  16. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/macro_precision.png +0 -0
  17. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/optim0_lr0.png +0 -0
  18. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/optim_step_time.png +0 -0
  19. work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/train_time.png +0 -0
README.md ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - classification
6
+ datasets:
7
+ - esc50
8
+ license: cc-by-4.0
9
+ ---
10
+
11
+ ## ESPnet2 CLS model
12
+
13
+ ### `shikhar7ssu/OpenBEATS-Large-i1-esc50f5`
14
+
15
+ This model was trained by Shikhar Bharadwaj using the esc50 recipe in [espnet](https://github.com/espnet/espnet/).
16
+
17
+ ## CLS config
18
+
19
+ <details><summary>expand</summary>
20
+
21
+ ```
22
+ config: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/earlarge1/conf/ear_large/esc50_f5.yaml
23
+ print_config: false
24
+ log_level: INFO
25
+ drop_last_iter: false
26
+ dry_run: false
27
+ iterator_type: sequence
28
+ valid_iterator_type: null
29
+ output_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1
30
+ ngpu: 1
31
+ seed: 0
32
+ num_workers: 2
33
+ num_att_plot: 0
34
+ dist_backend: nccl
35
+ dist_init_method: env://
36
+ dist_world_size: null
37
+ dist_rank: null
38
+ local_rank: 0
39
+ dist_master_addr: null
40
+ dist_master_port: null
41
+ dist_launcher: null
42
+ multiprocessing_distributed: false
43
+ unused_parameters: true
44
+ sharded_ddp: false
45
+ use_deepspeed: false
46
+ deepspeed_config: null
47
+ gradient_as_bucket_view: true
48
+ ddp_comm_hook: null
49
+ cudnn_enabled: true
50
+ cudnn_benchmark: false
51
+ cudnn_deterministic: true
52
+ use_tf32: false
53
+ collect_stats: false
54
+ write_collected_feats: false
55
+ max_epoch: 1000
56
+ patience: null
57
+ val_scheduler_criterion:
58
+ - valid
59
+ - loss
60
+ early_stopping_criterion:
61
+ - valid
62
+ - loss
63
+ - min
64
+ best_model_criterion:
65
+ - - valid
66
+ - acc
67
+ - max
68
+ keep_nbest_models: 1
69
+ nbest_averaging_interval: 0
70
+ grad_clip: 1
71
+ grad_clip_type: 2.0
72
+ grad_noise: false
73
+ accum_grad: 1
74
+ no_forward_run: false
75
+ resume: true
76
+ train_dtype: float32
77
+ use_amp: false
78
+ log_interval: null
79
+ use_matplotlib: true
80
+ use_tensorboard: true
81
+ create_graph_in_tensorboard: false
82
+ use_wandb: false
83
+ wandb_project: null
84
+ wandb_id: null
85
+ wandb_entity: null
86
+ wandb_name: null
87
+ wandb_model_log_interval: -1
88
+ detect_anomaly: false
89
+ use_adapter: false
90
+ adapter: lora
91
+ save_strategy: all
92
+ adapter_conf: {}
93
+ pretrain_path: null
94
+ init_param: []
95
+ ignore_init_mismatch: false
96
+ freeze_param: []
97
+ num_iters_per_epoch: null
98
+ batch_size: 512
99
+ valid_batch_size: null
100
+ batch_bins: 1000000
101
+ valid_batch_bins: null
102
+ category_sample_size: 10
103
+ train_shape_file:
104
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_stats_16k/train/speech_shape
105
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_stats_16k/train/label_shape
106
+ valid_shape_file:
107
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_stats_16k/valid/speech_shape
108
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_stats_16k/valid/label_shape
109
+ batch_type: folded
110
+ valid_batch_type: null
111
+ fold_length:
112
+ - 100000
113
+ - 1
114
+ sort_in_batch: descending
115
+ shuffle_within_batch: false
116
+ sort_batch: descending
117
+ multiple_iterator: false
118
+ chunk_length: 500
119
+ chunk_shift_ratio: 0.5
120
+ num_cache_chunks: 1024
121
+ chunk_excluded_key_prefixes: []
122
+ chunk_default_fs: null
123
+ chunk_max_abs_length: null
124
+ chunk_discard_short_samples: true
125
+ train_data_path_and_name_and_type:
126
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/esc50_f5/train5/wav.scp
127
+ - speech
128
+ - sound
129
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/esc50_f5/train5/text
130
+ - label
131
+ - text
132
+ valid_data_path_and_name_and_type:
133
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/esc50_f5/val5/wav.scp
134
+ - speech
135
+ - sound
136
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/esc50_f5/val5/text
137
+ - label
138
+ - text
139
+ multi_task_dataset: false
140
+ allow_variable_data_keys: false
141
+ max_cache_size: 0.0
142
+ max_cache_fd: 32
143
+ allow_multi_rates: false
144
+ valid_max_cache_size: null
145
+ exclude_weight_decay: false
146
+ exclude_weight_decay_conf: {}
147
+ optim: adamw
148
+ optim_conf:
149
+ lr: 0.0001
150
+ weight_decay: 0.01
151
+ betas:
152
+ - 0.9
153
+ - 0.98
154
+ scheduler: cosineannealingwarmuprestarts
155
+ scheduler_conf:
156
+ first_cycle_steps: 7000
157
+ warmup_steps: 350
158
+ max_lr: 0.0001
159
+ min_lr: 5.0e-06
160
+ token_list:
161
+ - audio_class:0
162
+ - audio_class:14
163
+ - audio_class:36
164
+ - audio_class:19
165
+ - audio_class:30
166
+ - audio_class:34
167
+ - audio_class:9
168
+ - audio_class:22
169
+ - audio_class:48
170
+ - audio_class:41
171
+ - audio_class:47
172
+ - audio_class:31
173
+ - audio_class:17
174
+ - audio_class:45
175
+ - audio_class:8
176
+ - audio_class:15
177
+ - audio_class:46
178
+ - audio_class:37
179
+ - audio_class:32
180
+ - audio_class:16
181
+ - audio_class:25
182
+ - audio_class:4
183
+ - audio_class:3
184
+ - audio_class:27
185
+ - audio_class:43
186
+ - audio_class:12
187
+ - audio_class:40
188
+ - audio_class:29
189
+ - audio_class:10
190
+ - audio_class:7
191
+ - audio_class:26
192
+ - audio_class:6
193
+ - audio_class:44
194
+ - audio_class:23
195
+ - audio_class:20
196
+ - audio_class:49
197
+ - audio_class:24
198
+ - audio_class:39
199
+ - audio_class:28
200
+ - audio_class:18
201
+ - audio_class:2
202
+ - audio_class:35
203
+ - audio_class:38
204
+ - audio_class:21
205
+ - audio_class:1
206
+ - audio_class:11
207
+ - audio_class:42
208
+ - audio_class:5
209
+ - audio_class:33
210
+ - audio_class:13
211
+ - <blank>
212
+ - <unk>
213
+ text_token_list: null
214
+ text_bpemodel: null
215
+ init: xavier_normal
216
+ input_size: 1
217
+ use_preprocessor: true
218
+ frontend: null
219
+ frontend_conf: {}
220
+ specaug: null
221
+ specaug_conf: {}
222
+ normalize: null
223
+ normalize_conf: {}
224
+ preencoder: null
225
+ preencoder_conf: {}
226
+ encoder: beats
227
+ encoder_conf:
228
+ beats_ckpt_path: /work/nvme/bbjs/sbharadwaj/model_checkpoints/ear_large/beats_iter0_large.tune_lr1.0e-4_warmup40000_bins1600000_totalsteps400000/epoch59.pt
229
+ fbank_mean: 11.72215
230
+ fbank_std: 10.60431
231
+ beats_config:
232
+ layer_wise_gradient_decay_ratio: 0.2
233
+ encoder_layerdrop: 0.1
234
+ dropout: 0.0
235
+ specaug_config:
236
+ apply_time_warp: true
237
+ apply_freq_mask: false
238
+ apply_time_mask: true
239
+ time_mask_width_ratio_range:
240
+ - 0
241
+ - 0.06
242
+ num_time_mask: 1
243
+ roll_augment: true
244
+ roll_interval: 16000
245
+ use_weighted_representation: false
246
+ text_encoder: null
247
+ text_encoder_conf: {}
248
+ embedding_fusion: null
249
+ embedding_fusion_conf: {}
250
+ decoder: linear
251
+ decoder_conf:
252
+ pooling: mean
253
+ dropout: 0.1
254
+ model: espnet
255
+ model_conf:
256
+ classification_type: multi-class
257
+ lsm_weight: 0.1
258
+ required:
259
+ - output_dir
260
+ - token_list
261
+ version: '202412'
262
+ distributed: false
263
+ ```
264
+
265
+ </details>
266
+
267
+ ### Citations
268
+
269
+ ```bibtex
270
+
271
+ @article{bharadwaj2025openbeats,
272
+ title={OpenBEATs: A Fully Open-Source General-Purpose Audio Encoder},
273
+ author={Bharadwaj, Shikhar and Cornell, Samuele and Choi, Kwanghee and Fukayama, Satoru and Shim, Hye-jin and Deshmukh, Soham and Watanabe, Shinji},
274
+ journal={arXiv preprint arXiv:2507.14129},
275
+ year={2025}
276
+ }
277
+
278
+ @inproceedings{watanabe2018espnet,
279
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
280
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
281
+ year={2018},
282
+ booktitle={Proceedings of Interspeech},
283
+ pages={2207--2211},
284
+ doi={10.21437/Interspeech.2018-1456},
285
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
286
+ }
287
+
288
+
289
+
290
+
291
+
292
+
293
+ ```
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202503'
2
+ files:
3
+ classification_model_file: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/744epoch.pth
4
+ python: "3.9.18 | packaged by conda-forge | (main, Dec 23 2023, 17:20:25) \n[GCC 12.3.0]"
5
+ timestamp: 1763321396.696586
6
+ torch: 2.1.2
7
+ yaml_files:
8
+ classification_train_config: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/config.yaml
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/data/esc50_f5/token_list ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio_class:0
2
+ audio_class:14
3
+ audio_class:36
4
+ audio_class:19
5
+ audio_class:30
6
+ audio_class:34
7
+ audio_class:9
8
+ audio_class:22
9
+ audio_class:48
10
+ audio_class:41
11
+ audio_class:47
12
+ audio_class:31
13
+ audio_class:17
14
+ audio_class:45
15
+ audio_class:8
16
+ audio_class:15
17
+ audio_class:46
18
+ audio_class:37
19
+ audio_class:32
20
+ audio_class:16
21
+ audio_class:25
22
+ audio_class:4
23
+ audio_class:3
24
+ audio_class:27
25
+ audio_class:43
26
+ audio_class:12
27
+ audio_class:40
28
+ audio_class:29
29
+ audio_class:10
30
+ audio_class:7
31
+ audio_class:26
32
+ audio_class:6
33
+ audio_class:44
34
+ audio_class:23
35
+ audio_class:20
36
+ audio_class:49
37
+ audio_class:24
38
+ audio_class:39
39
+ audio_class:28
40
+ audio_class:18
41
+ audio_class:2
42
+ audio_class:35
43
+ audio_class:38
44
+ audio_class:21
45
+ audio_class:1
46
+ audio_class:11
47
+ audio_class:42
48
+ audio_class:5
49
+ audio_class:33
50
+ audio_class:13
51
+ <blank>
52
+ <unk>
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/744epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5716e7a1c66624f16c7e66fd33d62a072fd6218e694b45c51ee23abf7f196ff2
3
+ size 1245900959
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/RESULTS.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_cls_result.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Wed Mar 12 18:37:04 CDT 2025`
5
+ - python version: `3.9.18 | packaged by conda-forge | (main, Dec 23 2023, 17:20:25) [GCC 12.3.0]`
6
+ - espnet version: `espnet 202412`
7
+ - pytorch version: `pytorch 2.6.0.dev20241210+cu124`
8
+ - Git hash: `6a4d1394930044a7d083ffe4647dc0e709726ba2`
9
+ - Commit date: `Wed Mar 12 13:24:03 2025 -0500`
10
+
11
+ ## cls_earlarge1
12
+ |Split|mean_acc|mAP|mean_auc|n_labels|n_instances|
13
+ |---|---|---|---|---|---|
14
+ |cls_val5|91.75|96.84|99.90|50.00|400.00|
15
+
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/config.yaml ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/earlarge1/conf/ear_large/esc50_f5.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1
9
+ ngpu: 1
10
+ seed: 0
11
+ num_workers: 2
12
+ num_att_plot: 0
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ use_deepspeed: false
25
+ deepspeed_config: null
26
+ gradient_as_bucket_view: true
27
+ ddp_comm_hook: null
28
+ cudnn_enabled: true
29
+ cudnn_benchmark: false
30
+ cudnn_deterministic: true
31
+ use_tf32: false
32
+ collect_stats: false
33
+ write_collected_feats: false
34
+ max_epoch: 1000
35
+ patience: null
36
+ val_scheduler_criterion:
37
+ - valid
38
+ - loss
39
+ early_stopping_criterion:
40
+ - valid
41
+ - loss
42
+ - min
43
+ best_model_criterion:
44
+ - - valid
45
+ - acc
46
+ - max
47
+ keep_nbest_models: 1
48
+ nbest_averaging_interval: 0
49
+ grad_clip: 1
50
+ grad_clip_type: 2.0
51
+ grad_noise: false
52
+ accum_grad: 1
53
+ no_forward_run: false
54
+ resume: true
55
+ train_dtype: float32
56
+ use_amp: false
57
+ log_interval: null
58
+ use_matplotlib: true
59
+ use_tensorboard: true
60
+ create_graph_in_tensorboard: false
61
+ use_wandb: false
62
+ wandb_project: null
63
+ wandb_id: null
64
+ wandb_entity: null
65
+ wandb_name: null
66
+ wandb_model_log_interval: -1
67
+ detect_anomaly: false
68
+ use_adapter: false
69
+ adapter: lora
70
+ save_strategy: all
71
+ adapter_conf: {}
72
+ pretrain_path: null
73
+ init_param: []
74
+ ignore_init_mismatch: false
75
+ freeze_param: []
76
+ num_iters_per_epoch: null
77
+ batch_size: 512
78
+ valid_batch_size: null
79
+ batch_bins: 1000000
80
+ valid_batch_bins: null
81
+ category_sample_size: 10
82
+ train_shape_file:
83
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_stats_16k/train/speech_shape
84
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_stats_16k/train/label_shape
85
+ valid_shape_file:
86
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_stats_16k/valid/speech_shape
87
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_stats_16k/valid/label_shape
88
+ batch_type: folded
89
+ valid_batch_type: null
90
+ fold_length:
91
+ - 100000
92
+ - 1
93
+ sort_in_batch: descending
94
+ shuffle_within_batch: false
95
+ sort_batch: descending
96
+ multiple_iterator: false
97
+ chunk_length: 500
98
+ chunk_shift_ratio: 0.5
99
+ num_cache_chunks: 1024
100
+ chunk_excluded_key_prefixes: []
101
+ chunk_default_fs: null
102
+ chunk_max_abs_length: null
103
+ chunk_discard_short_samples: true
104
+ train_data_path_and_name_and_type:
105
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/esc50_f5/train5/wav.scp
106
+ - speech
107
+ - sound
108
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/esc50_f5/train5/text
109
+ - label
110
+ - text
111
+ valid_data_path_and_name_and_type:
112
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/esc50_f5/val5/wav.scp
113
+ - speech
114
+ - sound
115
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/esc50_f5/val5/text
116
+ - label
117
+ - text
118
+ multi_task_dataset: false
119
+ allow_variable_data_keys: false
120
+ max_cache_size: 0.0
121
+ max_cache_fd: 32
122
+ allow_multi_rates: false
123
+ valid_max_cache_size: null
124
+ exclude_weight_decay: false
125
+ exclude_weight_decay_conf: {}
126
+ optim: adamw
127
+ optim_conf:
128
+ lr: 0.0001
129
+ weight_decay: 0.01
130
+ betas:
131
+ - 0.9
132
+ - 0.98
133
+ scheduler: cosineannealingwarmuprestarts
134
+ scheduler_conf:
135
+ first_cycle_steps: 7000
136
+ warmup_steps: 350
137
+ max_lr: 0.0001
138
+ min_lr: 5.0e-06
139
+ token_list:
140
+ - audio_class:0
141
+ - audio_class:14
142
+ - audio_class:36
143
+ - audio_class:19
144
+ - audio_class:30
145
+ - audio_class:34
146
+ - audio_class:9
147
+ - audio_class:22
148
+ - audio_class:48
149
+ - audio_class:41
150
+ - audio_class:47
151
+ - audio_class:31
152
+ - audio_class:17
153
+ - audio_class:45
154
+ - audio_class:8
155
+ - audio_class:15
156
+ - audio_class:46
157
+ - audio_class:37
158
+ - audio_class:32
159
+ - audio_class:16
160
+ - audio_class:25
161
+ - audio_class:4
162
+ - audio_class:3
163
+ - audio_class:27
164
+ - audio_class:43
165
+ - audio_class:12
166
+ - audio_class:40
167
+ - audio_class:29
168
+ - audio_class:10
169
+ - audio_class:7
170
+ - audio_class:26
171
+ - audio_class:6
172
+ - audio_class:44
173
+ - audio_class:23
174
+ - audio_class:20
175
+ - audio_class:49
176
+ - audio_class:24
177
+ - audio_class:39
178
+ - audio_class:28
179
+ - audio_class:18
180
+ - audio_class:2
181
+ - audio_class:35
182
+ - audio_class:38
183
+ - audio_class:21
184
+ - audio_class:1
185
+ - audio_class:11
186
+ - audio_class:42
187
+ - audio_class:5
188
+ - audio_class:33
189
+ - audio_class:13
190
+ - <blank>
191
+ - <unk>
192
+ text_token_list: null
193
+ text_bpemodel: null
194
+ init: xavier_normal
195
+ input_size: 1
196
+ use_preprocessor: true
197
+ frontend: null
198
+ frontend_conf: {}
199
+ specaug: null
200
+ specaug_conf: {}
201
+ normalize: null
202
+ normalize_conf: {}
203
+ preencoder: null
204
+ preencoder_conf: {}
205
+ encoder: beats
206
+ encoder_conf:
207
+ beats_ckpt_path: /work/nvme/bbjs/sbharadwaj/model_checkpoints/ear_large/beats_iter0_large.tune_lr1.0e-4_warmup40000_bins1600000_totalsteps400000/epoch59.pt
208
+ fbank_mean: 11.72215
209
+ fbank_std: 10.60431
210
+ beats_config:
211
+ layer_wise_gradient_decay_ratio: 0.2
212
+ encoder_layerdrop: 0.1
213
+ dropout: 0.0
214
+ specaug_config:
215
+ apply_time_warp: true
216
+ apply_freq_mask: false
217
+ apply_time_mask: true
218
+ time_mask_width_ratio_range:
219
+ - 0
220
+ - 0.06
221
+ num_time_mask: 1
222
+ roll_augment: true
223
+ roll_interval: 16000
224
+ use_weighted_representation: false
225
+ text_encoder: null
226
+ text_encoder_conf: {}
227
+ embedding_fusion: null
228
+ embedding_fusion_conf: {}
229
+ decoder: linear
230
+ decoder_conf:
231
+ pooling: mean
232
+ dropout: 0.1
233
+ model: espnet
234
+ model_conf:
235
+ classification_type: multi-class
236
+ lsm_weight: 0.1
237
+ required:
238
+ - output_dir
239
+ - token_list
240
+ version: '202412'
241
+ distributed: false
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/acc.png ADDED
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/backward_time.png ADDED
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/clip.png ADDED
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/forward_time.png ADDED
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/gpu_max_cached_mem_GB.png ADDED
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/grad_norm.png ADDED
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/iter_time.png ADDED
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/loss.png ADDED
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/loss_scale.png ADDED
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/macro_precision.png ADDED
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/optim0_lr0.png ADDED
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/optim_step_time.png ADDED
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/esc50_f5/cls_earlarge1/images/train_time.png ADDED