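# Repository page residue, kept for reference (not part of the configuration):
# apparently commit message "init", revision 3181388, 18 days ago, 5.82 kB.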

	tts_version: "v1.5.0"

	split: "opensource-en"

	ttl_ckpt_path: "unknown.pt"

	dp_ckpt_path: "unknown.pt"

	ae_ckpt_path: "unknown.pt"

	ttl_train: "unknown"

	dp_train: "unknown"

	ae_train: "unknown"

	ttl:
	latent_dim: 24
	chunk_compress_factor: 6
	batch_expander:
	n_batch_expand: 6
	normalizer:
	scale: 0.25
	text_encoder:
	char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
	text_embedder:
	char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
	char_emb_dim: 256
	convnext:
	idim: 256
	ksz: 5
	intermediate_dim: 1024
	num_layers: 6
	dilation_lst: [1, 1, 1, 1, 1, 1]
	attn_encoder:
	hidden_channels: 256
	filter_channels: 1024
	n_heads: 4
	n_layers: 4
	p_dropout: 0.0
	proj_out:
	idim: 256
	odim: 256
	flow_matching:
	sig_min: 0
	style_encoder:
	proj_in:
	ldim: 24
	chunk_compress_factor: 6
	odim: 256
	convnext:
	idim: 256
	ksz: 5
	intermediate_dim: 1024
	num_layers: 6
	dilation_lst: [1, 1, 1, 1, 1, 1]
	style_token_layer:
	input_dim: 256
	n_style: 50
	style_key_dim: 256
	style_value_dim: 256
	prototype_dim: 256
	n_units: 256
	n_heads: 2
	speech_prompted_text_encoder:
	text_dim: 256
	style_dim: 256
	n_units: 256
	n_heads: 2
	uncond_masker:
	prob_both_uncond: 0.04
	prob_text_uncond: 0.01
	std: 0.1
	text_dim: 256
	n_style: 50
	style_key_dim: 256
	style_value_dim: 256
	vector_field:
	proj_in:
	ldim: 24
	chunk_compress_factor: 6
	odim: 512
	time_encoder:
	time_dim: 64
	hdim: 256
	main_blocks:
	n_blocks: 4
	time_cond_layer:
	idim: 512
	time_dim: 64
	style_cond_layer:
	idim: 512
	style_dim: 256
	text_cond_layer:
	idim: 512
	text_dim: 256
	n_heads: 4
	use_residual: True
	rotary_base: 10000
	rotary_scale: 10
	convnext_0:
	idim: 512
	ksz: 5
	intermediate_dim: 1024
	num_layers: 4
	dilation_lst: [1, 2, 4, 8]
	convnext_1:
	idim: 512
	ksz: 5
	intermediate_dim: 1024
	num_layers: 1
	dilation_lst: [1]
	convnext_2:
	idim: 512
	ksz: 5
	intermediate_dim: 1024
	num_layers: 1
	dilation_lst: [1]
	last_convnext:
	idim: 512
	ksz: 5
	intermediate_dim: 1024
	num_layers: 4
	dilation_lst: [1, 1, 1, 1]
	proj_out:
	idim: 512
	chunk_compress_factor: 6
	ldim: 24

	ae:
	sample_rate: 44100
	n_delay: 0
	base_chunk_size: 512
	chunk_compress_factor: 1
	ldim: 24
	encoder:
	spec_processor:
	n_fft: 2048
	win_length: 2048
	hop_length: 512
	n_mels: 228
	sample_rate: 44100
	eps: 1e-05
	norm_mean: 0.0
	norm_std: 1.0
	ksz_init: 7
	ksz: 7
	num_layers: 10
	dilation_lst: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
	intermediate_dim: 2048
	idim: 1253
	hdim: 512
	odim: 24
	decoder:
	ksz_init: 7
	ksz: 7
	num_layers: 10
	dilation_lst: [1, 2, 4, 1, 2, 4, 1, 1, 1, 1]
	intermediate_dim: 2048
	idim: 24
	hdim: 512
	head:
	idim: 512
	hdim: 2048
	odim: 512
	ksz: 3

	dp:
	latent_dim: 24
	chunk_compress_factor: 6
	normalizer:
	scale: 1.0
	sentence_encoder:
	char_emb_dim: 64
	char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
	text_embedder:
	char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
	char_emb_dim: 64
	convnext:
	idim: 64
	ksz: 5
	intermediate_dim: 256
	num_layers: 6
	dilation_lst: [1, 1, 1, 1, 1, 1]
	attn_encoder:
	hidden_channels: 64
	filter_channels: 256
	n_heads: 2
	n_layers: 2
	p_dropout: 0.0
	proj_out:
	idim: 64
	odim: 64
	style_encoder:
	proj_in:
	ldim: 24
	chunk_compress_factor: 6
	odim: 64
	convnext:
	idim: 64
	ksz: 5
	intermediate_dim: 256
	num_layers: 4
	dilation_lst: [1, 1, 1, 1]
	style_token_layer:
	input_dim: 64
	n_style: 8
	style_key_dim: 0
	style_value_dim: 16
	prototype_dim: 64
	n_units: 64
	n_heads: 2
	predictor:
	sentence_dim: 64
	n_style: 8
	style_dim: 16
	hdim: 128
	n_layer: 2

	unicode_indexer_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.npy"
	unicode_indexer_json_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.json"
	window_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/window.json"
	filter_bank_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/filter_bank.json"