# NOTE: this alias shadows the builtin ``dict`` for the rest of the module, so
# every ``dict(...)`` call in this file constructs an EasyDict.
from easydict import EasyDict as dict
# Shared widths referenced by the sub-configs in this file.
D_MODEL = 768  # passed as d_model / out_features
HIDDEN_SIZE = 512  # passed as in_features and as the per-entry num_channels value
# Context-encoder settings, grouped into two sub-configs:
#   feature_projection - in/out widths plus dropout
#   encoder            - stack-level options, with nested `pos_embedding`
#                        and per-`layer` settings
# NOTE(review): how each key is interpreted is decided by the model code,
# which is not visible from this file.
context_encoder = dict(
    feature_projection=dict(
        in_features=HIDDEN_SIZE,
        out_features=D_MODEL,
        dropout=0.1,
    ),
    encoder=dict(
        d_model=D_MODEL,
        num_layers=12,
        layer_drop=0.05,
        pos_embedding=dict(
            d_model=D_MODEL,
            kernel_size=3,
            groups=2,  # D_MODEL (768) is divisible by 2
            dropout=0.1,
        ),
        layer=dict(
            d_model=D_MODEL,
            num_heads=8,  # D_MODEL (768) is divisible by 8 (96 each)
            layer_norm_first=False,
            feed_forward_dim=2048,
            dropout=0.1,
        ),
    ),
)
# Feature-extractor settings. The three tuples each evaluate to 7 entries,
# one per position; presumably they are consumed in lockstep as per-layer
# settings - TODO confirm against the model code.
feature_extractor = dict(
    num_channels=7 * (HIDDEN_SIZE,),  # HIDDEN_SIZE repeated 7 times
    kernel_sizes=(10,) + 4 * (3,) + 2 * (2,),  # (10, 3, 3, 3, 3, 2, 2)
    strides=(5,) + 6 * (2,),  # (5, 2, 2, 2, 2, 2, 2); product is 320
)
# Quantizer settings: input width, codebook count, codewords per codebook
# count, and the d_model width shared with the context encoder config.
# NOTE(review): whether `num_codewords` is per codebook or a total is not
# visible from this file - confirm against the quantizer implementation.
quantizer = dict(
    in_features=HIDDEN_SIZE,
    num_codebooks=2,
    num_codewords=320,
    d_model=D_MODEL,
)
# Top-level pretraining config: bundles the three sub-configs defined in
# this file with the masking and loss hyper-parameters.
wav2vec2_pretraining = dict(
    # Sub-configs (the module-level objects are passed in directly).
    context_encoder=context_encoder,
    feature_extractor=feature_extractor,
    quantizer=quantizer,
    # Masking hyper-parameters.
    mask_prob=0.65,
    mask_length=10,
    min_masks=2,
    # Loss hyper-parameters.
    num_negatives=100,
    contrastive_logits_temperature=0.1,
    diversity_loss_weight=0.2,
)