File size: 3,543 Bytes

4cffcdc

# --------------------------------------------------------
# SenseTime
# Copyright (c) 2025 SenseTime
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import copy

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

class HiFiGanConfig(PretrainedConfig):
    """Hyper-parameter container for the HiFiGan module.

    The constructor stores every argument as an attribute of the same name and performs
    no validation. The meaning of each value is defined by the model code that consumes
    this config (not visible here); the notes below are inferred from the parameter
    names and should be confirmed against that code.

    Args:
        in_channels (`int`, *optional*, defaults to 80):
            Presumably the number of input feature channels (e.g. mel bins).
        base_channels (`int`, *optional*, defaults to 512):
            Presumably the channel width of the first generator layer.
        nb_harmonics (`int`, *optional*, defaults to 8):
            Presumably the number of harmonics used by the source module.
        sampling_rate (`int`, *optional*, defaults to 24000):
            Presumably the audio sampling rate in Hz.
        nsf_alpha (`float`, *optional*, defaults to 0.1):
            Stored as-is; semantics defined by the consuming model.
        nsf_sigma (`float`, *optional*, defaults to 0.003):
            Stored as-is; semantics defined by the consuming model.
        nsf_voiced_threshold (`float`, *optional*, defaults to 10):
            Stored as-is; semantics defined by the consuming model.
        upsample_rates (`List[int]`, *optional*, defaults to `[8, 5, 3]`):
            Stored as-is. A fresh list is created when omitted.
        upsample_kernel_sizes (`List[int]`, *optional*, defaults to `[16, 11, 7]`):
            Stored as-is. A fresh list is created when omitted.
        istft_params (`Dict[str, int]`, *optional*, defaults to `{'n_fft': 16, 'hop_len': 4}`):
            Stored as-is. A fresh dict is created when omitted.
        resblock_kernel_sizes (`List[int]`, *optional*, defaults to `[3, 7, 11]`):
            Stored as-is. A fresh list is created when omitted.
        resblock_dilation_sizes (`List[List[int]]`, *optional*, defaults to three `[1, 3, 5]` rows):
            Stored as-is. A fresh nested list is created when omitted.
        source_resblock_kernel_sizes (`List[int]`, *optional*, defaults to `[7, 7, 11]`):
            Stored as-is. A fresh list is created when omitted.
        source_resblock_dilation_sizes (`List[List[int]]`, *optional*, defaults to three `[1, 3, 5]` rows):
            Stored as-is. A fresh nested list is created when omitted.
        lrelu_slope (`float`, *optional*, defaults to 0.1):
            Presumably the negative slope of the leaky ReLU activations.
        audio_limit (`float`, *optional*, defaults to 0.99):
            Stored as-is; semantics defined by the consuming model.
        f0_predictor_config (`Dict[str, int]`, *optional*, defaults to
            `{'num_class': 1, 'in_channels': 80, 'cond_channels': 512}`):
            Stored as-is. A fresh dict is created when omitted.
        **kwargs:
            Forwarded unchanged to `PretrainedConfig.__init__`.
    """

    def __init__(
            self,
            in_channels=80,
            base_channels=512,
            nb_harmonics=8,
            sampling_rate=24000,
            nsf_alpha=0.1,
            nsf_sigma=0.003,
            nsf_voiced_threshold=10,
            upsample_rates=None,
            upsample_kernel_sizes=None,
            istft_params=None,
            resblock_kernel_sizes=None,
            resblock_dilation_sizes=None,
            source_resblock_kernel_sizes=None,
            source_resblock_dilation_sizes=None,
            lrelu_slope=0.1,
            audio_limit=0.99,
            f0_predictor_config=None,
            **kwargs):
        super().__init__(**kwargs)

        # Mutable defaults are built per call. Using list/dict literals directly in the
        # signature would make every default-constructed config share (and be able to
        # corrupt) the same objects.
        if upsample_rates is None:
            upsample_rates = [8, 5, 3]
        if upsample_kernel_sizes is None:
            upsample_kernel_sizes = [16, 11, 7]
        if istft_params is None:
            istft_params = {'n_fft': 16, 'hop_len': 4}
        if resblock_kernel_sizes is None:
            resblock_kernel_sizes = [3, 7, 11]
        if resblock_dilation_sizes is None:
            resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
        if source_resblock_kernel_sizes is None:
            source_resblock_kernel_sizes = [7, 7, 11]
        if source_resblock_dilation_sizes is None:
            source_resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
        if f0_predictor_config is None:
            f0_predictor_config = {
                'num_class': 1,
                'in_channels': 80,
                'cond_channels': 512,
            }

        self.in_channels = in_channels
        self.base_channels = base_channels
        self.nb_harmonics = nb_harmonics
        self.sampling_rate = sampling_rate
        self.nsf_alpha = nsf_alpha
        self.nsf_sigma = nsf_sigma
        self.nsf_voiced_threshold = nsf_voiced_threshold
        self.upsample_rates = upsample_rates
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.istft_params = istft_params
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.source_resblock_kernel_sizes = source_resblock_kernel_sizes
        self.source_resblock_dilation_sizes = source_resblock_dilation_sizes
        self.lrelu_slope = lrelu_slope
        self.audio_limit = audio_limit
        self.f0_predictor_config = f0_predictor_config

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        Every value is deep-copied, so mutating the returned dictionary (or the lists and
        dicts nested inside it) never changes this configuration instance.

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
        """
        # All hyper-parameters set in `__init__` live in `self.__dict__`, so a deep copy
        # already contains each of them (including `source_resblock_kernel_sizes`).
        # Re-assigning the live attributes afterwards would re-introduce aliasing.
        return copy.deepcopy(self.__dict__)