ANLGBOY committed on
Commit
3181388
·
1 Parent(s): cb3a414
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ window.json
2
+ filter_bank.json
3
+ style_extractor.onnx
4
+ *.npy
onnx/duration_predictor.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e30b9d2dc2da1b12af5ff4a9b6e6ac00ca0bc900a2648ec0bf6e134e386c8133
3
+ size 1590703
onnx/text_encoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c857c504f84855a78e3506e882ddd4b414c7e6cc5c96a87d4e1fc741917aa4b
3
+ size 27978387
onnx/tts.json ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tts_version": "v1.5.0",
3
+ "split": "opensource-en",
4
+ "ttl_ckpt_path": "unknown.pt",
5
+ "dp_ckpt_path": "unknown.pt",
6
+ "ae_ckpt_path": "unknown.pt",
7
+ "ttl_train": "unknown",
8
+ "dp_train": "unknown",
9
+ "ae_train": "unknown",
10
+ "ttl": {
11
+ "latent_dim": 24,
12
+ "chunk_compress_factor": 6,
13
+ "batch_expander": {
14
+ "n_batch_expand": 6
15
+ },
16
+ "normalizer": {
17
+ "scale": 0.25
18
+ },
19
+ "text_encoder": {
20
+ "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
21
+ "text_embedder": {
22
+ "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
23
+ "char_emb_dim": 256
24
+ },
25
+ "convnext": {
26
+ "idim": 256,
27
+ "ksz": 5,
28
+ "intermediate_dim": 1024,
29
+ "num_layers": 6,
30
+ "dilation_lst": [
31
+ 1,
32
+ 1,
33
+ 1,
34
+ 1,
35
+ 1,
36
+ 1
37
+ ]
38
+ },
39
+ "attn_encoder": {
40
+ "hidden_channels": 256,
41
+ "filter_channels": 1024,
42
+ "n_heads": 4,
43
+ "n_layers": 4,
44
+ "p_dropout": 0.0
45
+ },
46
+ "proj_out": {
47
+ "idim": 256,
48
+ "odim": 256
49
+ }
50
+ },
51
+ "flow_matching": {
52
+ "sig_min": 0
53
+ },
54
+ "style_encoder": {
55
+ "proj_in": {
56
+ "ldim": 24,
57
+ "chunk_compress_factor": 6,
58
+ "odim": 256
59
+ },
60
+ "convnext": {
61
+ "idim": 256,
62
+ "ksz": 5,
63
+ "intermediate_dim": 1024,
64
+ "num_layers": 6,
65
+ "dilation_lst": [
66
+ 1,
67
+ 1,
68
+ 1,
69
+ 1,
70
+ 1,
71
+ 1
72
+ ]
73
+ },
74
+ "style_token_layer": {
75
+ "input_dim": 256,
76
+ "n_style": 50,
77
+ "style_key_dim": 256,
78
+ "style_value_dim": 256,
79
+ "prototype_dim": 256,
80
+ "n_units": 256,
81
+ "n_heads": 2
82
+ }
83
+ },
84
+ "speech_prompted_text_encoder": {
85
+ "text_dim": 256,
86
+ "style_dim": 256,
87
+ "n_units": 256,
88
+ "n_heads": 2
89
+ },
90
+ "uncond_masker": {
91
+ "prob_both_uncond": 0.04,
92
+ "prob_text_uncond": 0.01,
93
+ "std": 0.1,
94
+ "text_dim": 256,
95
+ "n_style": 50,
96
+ "style_key_dim": 256,
97
+ "style_value_dim": 256
98
+ },
99
+ "vector_field": {
100
+ "proj_in": {
101
+ "ldim": 24,
102
+ "chunk_compress_factor": 6,
103
+ "odim": 512
104
+ },
105
+ "time_encoder": {
106
+ "time_dim": 64,
107
+ "hdim": 256
108
+ },
109
+ "main_blocks": {
110
+ "n_blocks": 4,
111
+ "time_cond_layer": {
112
+ "idim": 512,
113
+ "time_dim": 64
114
+ },
115
+ "style_cond_layer": {
116
+ "idim": 512,
117
+ "style_dim": 256
118
+ },
119
+ "text_cond_layer": {
120
+ "idim": 512,
121
+ "text_dim": 256,
122
+ "n_heads": 4,
123
+ "use_residual": true,
124
+ "rotary_base": 10000,
125
+ "rotary_scale": 10
126
+ },
127
+ "convnext_0": {
128
+ "idim": 512,
129
+ "ksz": 5,
130
+ "intermediate_dim": 1024,
131
+ "num_layers": 4,
132
+ "dilation_lst": [
133
+ 1,
134
+ 2,
135
+ 4,
136
+ 8
137
+ ]
138
+ },
139
+ "convnext_1": {
140
+ "idim": 512,
141
+ "ksz": 5,
142
+ "intermediate_dim": 1024,
143
+ "num_layers": 1,
144
+ "dilation_lst": [
145
+ 1
146
+ ]
147
+ },
148
+ "convnext_2": {
149
+ "idim": 512,
150
+ "ksz": 5,
151
+ "intermediate_dim": 1024,
152
+ "num_layers": 1,
153
+ "dilation_lst": [
154
+ 1
155
+ ]
156
+ }
157
+ },
158
+ "last_convnext": {
159
+ "idim": 512,
160
+ "ksz": 5,
161
+ "intermediate_dim": 1024,
162
+ "num_layers": 4,
163
+ "dilation_lst": [
164
+ 1,
165
+ 1,
166
+ 1,
167
+ 1
168
+ ]
169
+ },
170
+ "proj_out": {
171
+ "idim": 512,
172
+ "chunk_compress_factor": 6,
173
+ "ldim": 24
174
+ }
175
+ }
176
+ },
177
+ "ae": {
178
+ "sample_rate": 44100,
179
+ "n_delay": 0,
180
+ "base_chunk_size": 512,
181
+ "chunk_compress_factor": 1,
182
+ "ldim": 24,
183
+ "encoder": {
184
+ "spec_processor": {
185
+ "n_fft": 2048,
186
+ "win_length": 2048,
187
+ "hop_length": 512,
188
+ "n_mels": 228,
189
+ "sample_rate": 44100,
190
+ "eps": 1e-05,
191
+ "norm_mean": 0.0,
192
+ "norm_std": 1.0
193
+ },
194
+ "ksz_init": 7,
195
+ "ksz": 7,
196
+ "num_layers": 10,
197
+ "dilation_lst": [
198
+ 1,
199
+ 1,
200
+ 1,
201
+ 1,
202
+ 1,
203
+ 1,
204
+ 1,
205
+ 1,
206
+ 1,
207
+ 1
208
+ ],
209
+ "intermediate_dim": 2048,
210
+ "idim": 1253,
211
+ "hdim": 512,
212
+ "odim": 24
213
+ },
214
+ "decoder": {
215
+ "ksz_init": 7,
216
+ "ksz": 7,
217
+ "num_layers": 10,
218
+ "dilation_lst": [
219
+ 1,
220
+ 2,
221
+ 4,
222
+ 1,
223
+ 2,
224
+ 4,
225
+ 1,
226
+ 1,
227
+ 1,
228
+ 1
229
+ ],
230
+ "intermediate_dim": 2048,
231
+ "idim": 24,
232
+ "hdim": 512,
233
+ "head": {
234
+ "idim": 512,
235
+ "hdim": 2048,
236
+ "odim": 512,
237
+ "ksz": 3
238
+ }
239
+ }
240
+ },
241
+ "dp": {
242
+ "latent_dim": 24,
243
+ "chunk_compress_factor": 6,
244
+ "normalizer": {
245
+ "scale": 1.0
246
+ },
247
+ "sentence_encoder": {
248
+ "char_emb_dim": 64,
249
+ "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
250
+ "text_embedder": {
251
+ "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
252
+ "char_emb_dim": 64
253
+ },
254
+ "convnext": {
255
+ "idim": 64,
256
+ "ksz": 5,
257
+ "intermediate_dim": 256,
258
+ "num_layers": 6,
259
+ "dilation_lst": [
260
+ 1,
261
+ 1,
262
+ 1,
263
+ 1,
264
+ 1,
265
+ 1
266
+ ]
267
+ },
268
+ "attn_encoder": {
269
+ "hidden_channels": 64,
270
+ "filter_channels": 256,
271
+ "n_heads": 2,
272
+ "n_layers": 2,
273
+ "p_dropout": 0.0
274
+ },
275
+ "proj_out": {
276
+ "idim": 64,
277
+ "odim": 64
278
+ }
279
+ },
280
+ "style_encoder": {
281
+ "proj_in": {
282
+ "ldim": 24,
283
+ "chunk_compress_factor": 6,
284
+ "odim": 64
285
+ },
286
+ "convnext": {
287
+ "idim": 64,
288
+ "ksz": 5,
289
+ "intermediate_dim": 256,
290
+ "num_layers": 4,
291
+ "dilation_lst": [
292
+ 1,
293
+ 1,
294
+ 1,
295
+ 1
296
+ ]
297
+ },
298
+ "style_token_layer": {
299
+ "input_dim": 64,
300
+ "n_style": 8,
301
+ "style_key_dim": 0,
302
+ "style_value_dim": 16,
303
+ "prototype_dim": 64,
304
+ "n_units": 64,
305
+ "n_heads": 2
306
+ }
307
+ },
308
+ "predictor": {
309
+ "sentence_dim": 64,
310
+ "n_style": 8,
311
+ "style_dim": 16,
312
+ "hdim": 128,
313
+ "n_layer": 2
314
+ }
315
+ }
316
+ }
onnx/tts.yml ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tts_version: "v1.5.0"
2
+
3
+ split: "opensource-en"
4
+
5
+ ttl_ckpt_path: "unknown.pt"
6
+
7
+ dp_ckpt_path: "unknown.pt"
8
+
9
+ ae_ckpt_path: "unknown.pt"
10
+
11
+ ttl_train: "unknown"
12
+
13
+ dp_train: "unknown"
14
+
15
+ ae_train: "unknown"
16
+
17
+ ttl:
18
+ latent_dim: 24
19
+ chunk_compress_factor: 6
20
+ batch_expander:
21
+ n_batch_expand: 6
22
+ normalizer:
23
+ scale: 0.25
24
+ text_encoder:
25
+ char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
26
+ text_embedder:
27
+ char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
28
+ char_emb_dim: 256
29
+ convnext:
30
+ idim: 256
31
+ ksz: 5
32
+ intermediate_dim: 1024
33
+ num_layers: 6
34
+ dilation_lst: [1, 1, 1, 1, 1, 1]
35
+ attn_encoder:
36
+ hidden_channels: 256
37
+ filter_channels: 1024
38
+ n_heads: 4
39
+ n_layers: 4
40
+ p_dropout: 0.0
41
+ proj_out:
42
+ idim: 256
43
+ odim: 256
44
+ flow_matching:
45
+ sig_min: 0
46
+ style_encoder:
47
+ proj_in:
48
+ ldim: 24
49
+ chunk_compress_factor: 6
50
+ odim: 256
51
+ convnext:
52
+ idim: 256
53
+ ksz: 5
54
+ intermediate_dim: 1024
55
+ num_layers: 6
56
+ dilation_lst: [1, 1, 1, 1, 1, 1]
57
+ style_token_layer:
58
+ input_dim: 256
59
+ n_style: 50
60
+ style_key_dim: 256
61
+ style_value_dim: 256
62
+ prototype_dim: 256
63
+ n_units: 256
64
+ n_heads: 2
65
+ speech_prompted_text_encoder:
66
+ text_dim: 256
67
+ style_dim: 256
68
+ n_units: 256
69
+ n_heads: 2
70
+ uncond_masker:
71
+ prob_both_uncond: 0.04
72
+ prob_text_uncond: 0.01
73
+ std: 0.1
74
+ text_dim: 256
75
+ n_style: 50
76
+ style_key_dim: 256
77
+ style_value_dim: 256
78
+ vector_field:
79
+ proj_in:
80
+ ldim: 24
81
+ chunk_compress_factor: 6
82
+ odim: 512
83
+ time_encoder:
84
+ time_dim: 64
85
+ hdim: 256
86
+ main_blocks:
87
+ n_blocks: 4
88
+ time_cond_layer:
89
+ idim: 512
90
+ time_dim: 64
91
+ style_cond_layer:
92
+ idim: 512
93
+ style_dim: 256
94
+ text_cond_layer:
95
+ idim: 512
96
+ text_dim: 256
97
+ n_heads: 4
98
+ use_residual: True
99
+ rotary_base: 10000
100
+ rotary_scale: 10
101
+ convnext_0:
102
+ idim: 512
103
+ ksz: 5
104
+ intermediate_dim: 1024
105
+ num_layers: 4
106
+ dilation_lst: [1, 2, 4, 8]
107
+ convnext_1:
108
+ idim: 512
109
+ ksz: 5
110
+ intermediate_dim: 1024
111
+ num_layers: 1
112
+ dilation_lst: [1]
113
+ convnext_2:
114
+ idim: 512
115
+ ksz: 5
116
+ intermediate_dim: 1024
117
+ num_layers: 1
118
+ dilation_lst: [1]
119
+ last_convnext:
120
+ idim: 512
121
+ ksz: 5
122
+ intermediate_dim: 1024
123
+ num_layers: 4
124
+ dilation_lst: [1, 1, 1, 1]
125
+ proj_out:
126
+ idim: 512
127
+ chunk_compress_factor: 6
128
+ ldim: 24
129
+
130
+ ae:
131
+ sample_rate: 44100
132
+ n_delay: 0
133
+ base_chunk_size: 512
134
+ chunk_compress_factor: 1
135
+ ldim: 24
136
+ encoder:
137
+ spec_processor:
138
+ n_fft: 2048
139
+ win_length: 2048
140
+ hop_length: 512
141
+ n_mels: 228
142
+ sample_rate: 44100
143
+ eps: 1e-05
144
+ norm_mean: 0.0
145
+ norm_std: 1.0
146
+ ksz_init: 7
147
+ ksz: 7
148
+ num_layers: 10
149
+ dilation_lst: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
150
+ intermediate_dim: 2048
151
+ idim: 1253
152
+ hdim: 512
153
+ odim: 24
154
+ decoder:
155
+ ksz_init: 7
156
+ ksz: 7
157
+ num_layers: 10
158
+ dilation_lst: [1, 2, 4, 1, 2, 4, 1, 1, 1, 1]
159
+ intermediate_dim: 2048
160
+ idim: 24
161
+ hdim: 512
162
+ head:
163
+ idim: 512
164
+ hdim: 2048
165
+ odim: 512
166
+ ksz: 3
167
+
168
+ dp:
169
+ latent_dim: 24
170
+ chunk_compress_factor: 6
171
+ normalizer:
172
+ scale: 1.0
173
+ sentence_encoder:
174
+ char_emb_dim: 64
175
+ char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
176
+ text_embedder:
177
+ char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
178
+ char_emb_dim: 64
179
+ convnext:
180
+ idim: 64
181
+ ksz: 5
182
+ intermediate_dim: 256
183
+ num_layers: 6
184
+ dilation_lst: [1, 1, 1, 1, 1, 1]
185
+ attn_encoder:
186
+ hidden_channels: 64
187
+ filter_channels: 256
188
+ n_heads: 2
189
+ n_layers: 2
190
+ p_dropout: 0.0
191
+ proj_out:
192
+ idim: 64
193
+ odim: 64
194
+ style_encoder:
195
+ proj_in:
196
+ ldim: 24
197
+ chunk_compress_factor: 6
198
+ odim: 64
199
+ convnext:
200
+ idim: 64
201
+ ksz: 5
202
+ intermediate_dim: 256
203
+ num_layers: 4
204
+ dilation_lst: [1, 1, 1, 1]
205
+ style_token_layer:
206
+ input_dim: 64
207
+ n_style: 8
208
+ style_key_dim: 0
209
+ style_value_dim: 16
210
+ prototype_dim: 64
211
+ n_units: 64
212
+ n_heads: 2
213
+ predictor:
214
+ sentence_dim: 64
215
+ n_style: 8
216
+ style_dim: 16
217
+ hdim: 128
218
+ n_layer: 2
219
+
220
+ unicode_indexer_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.npy"
221
+ unicode_indexer_json_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.json"
222
+ window_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/window.json"
223
+ filter_bank_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/filter_bank.json"
onnx/unicode_indexer.json ADDED
The diff for this file is too large to render. See raw diff
 
onnx/vector_estimator.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3eb36e8cc102e0db3171229a6ae87be3ff244d949997010c0edf0fd6b643483d
3
+ size 132517477
onnx/vocoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6886622edc640d74d28c22e8282f0cf8d8eb5dc33d0ced67ed652ef6ea68d0c3
3
+ size 101424195
voice_styles/F1.json ADDED
The diff for this file is too large to render. See raw diff
 
voice_styles/F2.json ADDED
The diff for this file is too large to render. See raw diff
 
voice_styles/M1.json ADDED
The diff for this file is too large to render. See raw diff
 
voice_styles/M2.json ADDED
The diff for this file is too large to render. See raw diff