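# Pretraining configuration: model architecture, mel frontend, optimizer, and dataset settings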
model_name: pretrain

model:
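  # transformer backbone: 24 layers, 16 heads over dim 1024 (64 per head, assuming an even split)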
  dim: 1024
  depth: 24
  heads: 16
  ff_mult: 4
  text_dim: 512
  # number of ConvNeXt blocks in the text embedding; 0 disables them
  conv_layers: 0
  # phoneme vocab size
  text_num_embeds: 200
  mel_dim: 100
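  # dims of the precomputed T5 and CLAP embeddings (cf. t5_folder_name / clap_emb_dir under data)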
  t5_dim: 1024
  clap_dim: 512
  # activation (gradient) checkpointing; disable it on A100 GPUs
  use_checkpoint: false
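  # qk_norm normalizes attention queries/keys; skip presumably adds long (U-Net-style) skip connections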
  qk_norm: true
  skip: true

mel:
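  # 24 kHz audio with hop 256: 24000 / 256 = 93.75 mel frames per second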
  target_sample_rate: 24000
  n_mel_channels: 100
  hop_length: 256

opt:
  learning_rate: 2.0e-04
  beta1: 0.9
  beta2: 0.999
  weight_decay: 0.01
  adam_epsilon: 1.0e-08
  grad_clip: 1.0
  batch_size: 64
  accumulation_steps: 1
  # mask_range: [0.7, 1.0]
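  # condition dropout probabilities, presumably for classifier-free guidance training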
  drop_spk: 0.1
  drop_text: 0.5

  lr_scheduler:
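    # warmup then decay; end_factor presumably scales the peak LR (2.0e-04 * 1.0e-02 = 2.0e-06)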
    warmup_steps: 5000
    decay_steps: 150000
    end_factor: 1.0e-02

data:
  trainset:
    dataset_dir: "" # path to your processed dataset
    clap_emb_dir: "./data/clap_embs/"
    t5_folder_name: "t5"
    phn_folder_name: "g2p"
    manifest_name: "manifest"
    json_name: "jsons"
    dynamic_batching: true
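    # pad values used when collating variable-length text/audio into batches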
    text_pad_token: -1
    audio_pad_token: 0.0
    split: "train_PT"
    sr: 24000
    norm_audio: false

  valset:
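    # mirrors trainset; only the split name differs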
    dataset_dir: "" # path to your processed dataset
    clap_emb_dir: "./data/clap_embs/"
    t5_folder_name: "t5"
    phn_folder_name: "g2p"
    manifest_name: "manifest"
    json_name: "jsons"
    dynamic_batching: true
    text_pad_token: -1
    audio_pad_token: 0.0
    split: "validation_PT"
    sr: 24000
    norm_audio: false