# Source: InteractiveOmni-8B / configuration_hifigan.py
# Uploaded by tongww ("upload initial model", commit 4cffcdc, verified).
# --------------------------------------------------------
# SenseTime
# Copyright (c) 2025 SenseTime
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import copy
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
# Module-level logger, created through the logging helper imported from transformers.utils.
logger = logging.get_logger(__name__)
class HiFiGanConfig(PretrainedConfig):
    """Configuration object holding the hyper-parameters of the HiFiGan component.

    Every constructor argument is stored unchanged as an instance attribute of
    the same name; remaining keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.

    NOTE(review): the exact meaning of each hyper-parameter is defined by the
    model code that consumes this config, which is not visible from this file.
    The descriptions below are limited to names and default values.

    Args:
        in_channels: Defaults to ``80``.
        base_channels: Defaults to ``512``.
        nb_harmonics: Defaults to ``8``.
        sampling_rate: Defaults to ``24000``.
        nsf_alpha: Defaults to ``0.1``.
        nsf_sigma: Defaults to ``0.003``.
        nsf_voiced_threshold: Defaults to ``10``.
        upsample_rates: Defaults to ``[8, 5, 3]`` when ``None``.
        upsample_kernel_sizes: Defaults to ``[16, 11, 7]`` when ``None``.
        istft_params: Defaults to ``{'n_fft': 16, 'hop_len': 4}`` when ``None``.
        resblock_kernel_sizes: Defaults to ``[3, 7, 11]`` when ``None``.
        resblock_dilation_sizes: Defaults to
            ``[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`` when ``None``.
        source_resblock_kernel_sizes: Defaults to ``[7, 7, 11]`` when ``None``.
        source_resblock_dilation_sizes: Defaults to
            ``[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`` when ``None``.
        lrelu_slope: Defaults to ``0.1``.
        audio_limit: Defaults to ``0.99``.
        f0_predictor_config: Defaults to
            ``{'num_class': 1, 'in_channels': 80, 'cond_channels': 512}``
            when ``None``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    def __init__(
        self,
        in_channels=80,
        base_channels=512,
        nb_harmonics=8,
        sampling_rate=24000,
        nsf_alpha=0.1,
        nsf_sigma=0.003,
        nsf_voiced_threshold=10,
        upsample_rates=None,
        upsample_kernel_sizes=None,
        istft_params=None,
        resblock_kernel_sizes=None,
        resblock_dilation_sizes=None,
        source_resblock_kernel_sizes=None,
        source_resblock_dilation_sizes=None,
        lrelu_slope=0.1,
        audio_limit=0.99,
        f0_predictor_config=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Mutable defaults are built per call. Using list/dict literals in the
        # signature would make every default-constructed config share (and be
        # able to corrupt) the same objects.
        if upsample_rates is None:
            upsample_rates = [8, 5, 3]
        if upsample_kernel_sizes is None:
            upsample_kernel_sizes = [16, 11, 7]
        if istft_params is None:
            istft_params = {'n_fft': 16, 'hop_len': 4}
        if resblock_kernel_sizes is None:
            resblock_kernel_sizes = [3, 7, 11]
        if resblock_dilation_sizes is None:
            resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
        if source_resblock_kernel_sizes is None:
            source_resblock_kernel_sizes = [7, 7, 11]
        if source_resblock_dilation_sizes is None:
            source_resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
        if f0_predictor_config is None:
            f0_predictor_config = {
                'num_class': 1,
                'in_channels': 80,
                'cond_channels': 512,
            }

        self.in_channels = in_channels
        self.base_channels = base_channels
        self.nb_harmonics = nb_harmonics
        self.sampling_rate = sampling_rate

        self.nsf_alpha = nsf_alpha
        self.nsf_sigma = nsf_sigma
        self.nsf_voiced_threshold = nsf_voiced_threshold

        self.upsample_rates = upsample_rates
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.istft_params = istft_params

        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.source_resblock_kernel_sizes = source_resblock_kernel_sizes
        self.source_resblock_dilation_sizes = source_resblock_dilation_sizes

        self.lrelu_slope = lrelu_slope
        self.audio_limit = audio_limit
        self.f0_predictor_config = f0_predictor_config

    def to_dict(self):
        """Serialize this instance to a Python dictionary.

        Overrides the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, Any]`: A deep copy of the instance attribute dictionary.
            Mutating the returned value, including nested lists and dicts,
            does not affect this configuration instance.
        """
        # All hyper-parameters set in __init__ already live in __dict__, so the
        # deep copy contains every one of them. Re-assigning the live attribute
        # objects on top of the copy would hand callers references into this
        # config's own state.
        return copy.deepcopy(self.__dict__)