Spaces:

2pift
/

Speaker_Verification_Demo

Sleeping

App Files Files Community

2pift committed on Sep 11

Commit

2792f07

1 Parent(s): f7bd486

Update dependencies

Browse files

Files changed (4) hide show

src/custom_layers.py +67 -0
src/custom_losses.py +200 -0
src/custom_models.py +88 -0
src/streamlit_app.py +5 -7

src/custom_layers.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import tensorflow as tf
+import keras
+@keras.saving.register_keras_serializable()
+class L2Normalization(tf.keras.layers.Layer):
+    """
+    Applies L2 normalization to the last axis of the input tensor.
+    This is used as a top layer in speaker embedding models before
+    cosine similarity computation.
+    """
+    def call(self, inputs):
+        return tf.math.l2_normalize(inputs, axis=1)
+    def compute_output_shape(self, input_shape):
+        return input_shape
+@keras.saving.register_keras_serializable()
+class CosineLayer(tf.keras.layers.Layer):
+    """
+    Dense layer with L2-normalized weights, for cosine similarity-based classification.
+    Args:
+        out_features (int): Number of output features/classes.
+        use_bias (bool): Whether to use bias term.
+        name (str, optional): Layer name.
+    """
+    def __init__(self, out_features, use_bias=False, name=None, **kwargs):
+        super().__init__(name=name, **kwargs)
+        self.out_features = out_features
+        self.use_bias = use_bias
+    def build(self, input_shape):
+        self.w = self.add_weight(
+            shape=(int(input_shape[-1]), self.out_features),
+            initializer='glorot_uniform',
+            trainable=True,
+            name='weights'
+        )
+        if self.use_bias:
+            self.b = self.add_weight(
+                shape=(self.out_features,),
+                initializer='zeros',
+                trainable=True,
+                name='bias'
+            )
+        else:
+            self.b = None
+        super().build(input_shape)
+    def call(self, inputs):
+        w_normalized = tf.math.l2_normalize(self.w, axis=0)
+        logits = tf.linalg.matmul(inputs, w_normalized)
+        if self.use_bias:
+            logits = logits + self.b
+        return logits
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0], self.out_features)
+    def get_config(self):
+        base_config = super().get_config()
+        return {
+            **base_config,
+            'out_features': self.out_features,
+            'use_bias': self.use_bias
+        }

src/custom_losses.py ADDED Viewed

	@@ -0,0 +1,200 @@

+import tensorflow as tf
+import keras
+import numpy as np
+@keras.saving.register_keras_serializable()
+class AdaCosLoss(tf.keras.losses.Loss):
+    """
+    Adaptive Cosine Loss (AdaCos).
+    Implements the AdaCos loss function as described in:
+    "AdaCos: Adaptively Scaling Cosine Logits for Effectively Learning Deep Face Representations"
+    (Zhang et al., 2019).
+    Args:
+        num_classes (int): Number of classes in the classification problem.
+        name (str, optional): Name for the loss instance.
+    """
+    def __init__(self, num_classes=None, name="AdaCos", **kwargs):
+        super().__init__(name=name, **kwargs)
+        self.num_classes = num_classes
+        self.scale = tf.Variable(
+            np.sqrt(2) * np.log(num_classes - 1),
+            dtype=tf.float32, trainable=False
+        )
+    def call(self, y_true, y_pred):
+        """
+        Args:
+            y_true: (batch_size,) integer labels [0, num_classes-1].
+            y_pred: (batch_size, num_classes) classification cosine similarities.
+        Returns:
+            Tensor scalar: Mean AdaCos loss over the batch.
+        """
+        y_true = tf.cast(y_true, tf.int32)
+        y_pred = tf.clip_by_value(
+            y_pred,
+            -1.0 + tf.keras.backend.epsilon(),
+            1.0 - tf.keras.backend.epsilon()
+        )
+        # correct class mask
+        mask = tf.one_hot(y_true, depth=self.num_classes) # shape (batch_size, n_classes)
+        # get theta angles for corresponding class
+        theta_true = tf.math.acos(tf.boolean_mask(y_pred, mask)) # shape (batch_size,)
+        # compute median of 'correct' angles
+        theta_med = tf.keras.ops.median(theta_true)
+        # get non-corresponding cosine values (cos(theta) j is not yi)
+        neg_mask = tf.logical_not(mask > 0) # shape (batch_size, n_classes)
+        cos_theta_neg = tf.boolean_mask(y_pred, neg_mask) # shape (batch_size*(n_classes-1),)
+        neg_y_pred = tf.reshape(cos_theta_neg, [-1, self.num_classes - 1]) # shape (batch_size, n_classes-1)
+        B_avg = tf.reduce_mean(tf.reduce_sum(tf.math.exp(self.scale * neg_y_pred), axis=-1))
+        #B_avg = tf.cast(B_avg, tf.float32)
+        #with tf.control_dependencies([theta_med, B_avg]):
+        new_scale = (
+            tf.math.log(B_avg) /
+            tf.math.cos(tf.minimum(tf.constant(np.pi / 4), theta_med))
+        )
+        # keep current scale if new_scale is invalid
+        safe_scale = tf.cond(
+            tf.math.is_finite(new_scale) & (new_scale > 0),
+            lambda: new_scale,
+            lambda: self.scale
+        )
+        self.scale.assign(safe_scale)
+        logits = self.scale * y_pred
+        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=logits)
+        return tf.reduce_mean(loss)
+    def get_config(self):
+        base_config = super().get_config()
+        return {**base_config, 'num_classes': self.num_classes}
+    def __repr__(self):
+        return (f"{self.__class__.__name__}(num_classes={self.num_classes}, "
+                f"name='{self.name}')")
+    def __str__(self):
+        return self.__repr__()
+    @property
+    def num_classes(self):
+        return self._num_classes
+    @num_classes.setter
+    def num_classes(self, value):
+        if not isinstance(value, int):
+            raise TypeError(f"`num_classes` must be an int, got {type(value).__name__}")
+        if value < 2:
+            raise ValueError(f"`num_classes` must be >= 2, got {value}")
+        self._num_classes = value
+@keras.saving.register_keras_serializable()
+class AdaCosLossMargin(tf.keras.losses.Loss):
+    """
+    Adaptive Cosine Loss with Margin (AdaCosMargin).
+    Extends AdaCos by introducing a fixed margin penalty for the target class logits,
+    encouraging greater separation between classes in angular (cosine) space.
+    Reference:
+    - AdaCos: Adaptively Scaling Cosine Logits for Effectively Learning Deep Face Representations (Zhang et al., 2019)
+    - Large Margin Cosine Loss (CosFace): https://arxiv.org/abs/1801.09414
+    Args:
+        margin (float): Margin to subtract from the target class cosine similarity (0.0–1.0).
+        num_classes (int): Number of classes.
+        name (str, optional): Name for the loss.
+    """
+    def __init__(self, margin=0.1, num_classes=None, name="AdaCosLossMargin", **kwargs):
+        super().__init__(name=name, **kwargs)
+        self.margin = margin
+        self.num_classes = num_classes
+        self.scale = tf.Variable(
+            np.sqrt(2) * np.log(num_classes - 1),
+            dtype=tf.float32, trainable=False
+        )
+    def call(self, y_true, y_pred):
+        """
+        Args:
+            y_true: (batch_size,) integer labels [0, num_classes-1].
+            y_pred: (batch_size, num_classes) cosine similarities.
+        Returns:
+            Tensor scalar: Mean AdaCosMargin loss over the batch.
+        """
+        batch_size = tf.shape(y_pred)[0]
+        y_true = tf.cast(y_true, tf.int32)
+        y_pred = tf.clip_by_value(
+            y_pred,
+            -1.0 + tf.keras.backend.epsilon(),
+            1.0 - tf.keras.backend.epsilon()
+        )
+        mask = tf.one_hot(y_true, depth=self.num_classes)
+        theta_true = tf.math.acos(tf.boolean_mask(y_pred, mask))
+        theta_med = tf.keras.ops.median(theta_true)
+        neg_mask = tf.cast(tf.logical_not(mask > 0), dtype=tf.float32)
+        cos_theta_neg = tf.boolean_mask(y_pred, neg_mask)
+        neg_y_pred = tf.reshape(cos_theta_neg, [batch_size, self.num_classes - 1])
+        B_avg = tf.reduce_mean(tf.reduce_sum(tf.math.exp(self.scale * neg_y_pred), axis=-1))
+        B_avg = tf.cast(B_avg, tf.float32)
+        with tf.control_dependencies([theta_med, B_avg]):
+            new_scale = (
+                tf.math.log(B_avg) /
+                tf.math.cos(tf.minimum(tf.constant(np.pi / 4), theta_med))
+            )
+            safe_scale = tf.cond(
+                tf.math.is_finite(new_scale) & (new_scale > 0),
+                lambda: new_scale,
+                lambda: self.scale
+            )
+            self.scale.assign(safe_scale)
+            logits = self.scale * (y_pred - self.margin * mask)
+            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=logits)
+        return tf.reduce_mean(loss)
+    def get_config(self):
+        base_config = super().get_config()
+        return {
+            **base_config,
+            'num_classes': self.num_classes,
+            'margin': self.margin
+        }
+    def __repr__(self):
+        return (f"{self.__class__.__name__}(margin={self.margin}, num_classes={self.num_classes}, "
+                f"name='{self.name}')")
+    def __str__(self):
+        return self.__repr__()
+    @property
+    def num_classes(self):
+        return self._num_classes
+    @num_classes.setter
+    def num_classes(self, value):
+        if not isinstance(value, int):
+            raise TypeError(f"`num_classes` must be an int, got {type(value).__name__}")
+        if value < 2:
+            raise ValueError(f"`num_classes` must be >= 2, got {value}")
+        self._num_classes = value
+    @property
+    def margin(self):
+        return self._margin
+    @margin.setter
+    def margin(self, value):
+        if not isinstance(value, (float, int)):
+            raise TypeError(f"`margin` must be a float or int, got {type(value).__name__}")
+        value = float(value)
+        if not (0.0 <= value <= 1.0):
+            raise ValueError(f"`margin` must be between 0.0 and 1.0, got {value}")
+        self._margin = value

src/custom_models.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import keras
+import tensorflow as tf
+from custom_layers import L2Normalization, CosineLayer
+@keras.saving.register_keras_serializable()
+class VerificationModel(tf.keras.Model):
+    """
+    Modular Speaker Verification Model.
+    Combines a backbone (feature extractor), an embedding projection, optional L2 normalization,
+    and a cosine classification head (CosineLayer).
+    Args:
+        base_model (tf.keras.Model): Backbone model (e.g., ResNet18).
+        number_of_classes (int): Number of speaker classes for classification.
+        embedding_dim (int, optional): Size of embedding vector. Default: 512.
+        return_embedding (bool, optional): If True, returns only embeddings (for verification);
+            if False, returns logits for classification. Default: False.
+        base_training (bool, optional): If set, overrides 'training' flag for base model (controls BatchNorm, Dropout).
+    """
+    def __init__(
+        self,
+        base_model,
+        number_of_classes,
+        normalization_layer,
+        cosine_layer,
+        embedding_dim: int = 512,
+        return_embedding: bool = False,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.base_model = base_model
+        self.embedding_dim = embedding_dim
+        self.number_of_classes = number_of_classes
+        self.return_embedding = return_embedding
+        self.embedding_layer = tf.keras.layers.Dense(
+            embedding_dim,
+            activation='tanh',
+            use_bias=False,
+            name='embedding_dense'
+        )
+        self.bn_neck = tf.keras.layers.BatchNormalization(name="bn_neck")
+        self.normalization_layer = normalization_layer
+        self.cosine_layer = cosine_layer
+    def call(self, inputs, training=None):
+        """
+        Forward pass.
+        Args:
+            inputs: Input tensor (e.g., spectrograms).
+            training (bool, optional): Training mode (Keras convention).
+        Returns:
+            Embeddings (if return_embedding=True) or logits for classification.
+        """
+        x = self.base_model(inputs, training=training)
+        x = self.embedding_layer(x)
+        x = self.bn_neck(x, training=training)
+        x = self.normalization_layer(x)
+        if self.return_embedding:
+            return x
+        return self.cosine_layer(x)
+    def get_config(self):
+        base_config = super().get_config()
+        return {
+            **base_config,
+            "base_model": keras.saving.serialize_keras_object(self.base_model),
+            "normalization_layer": keras.saving.serialize_keras_object(
+                self.normalization_layer
+            ),
+            "cosine_layer": keras.saving.serialize_keras_object(self.cosine_layer),
+            "number_of_classes": self.number_of_classes,
+            "embedding_dim": self.embedding_dim,
+            "return_embedding": self.return_embedding
+        }
+    @classmethod
+    def from_config(cls, config):
+        base_model = keras.saving.deserialize_keras_object(config.pop("base_model"))
+        normalization_layer = keras.saving.deserialize_keras_object(config.pop("normalization_layer"))
+        cosine_layer = keras.saving.deserialize_keras_object(config.pop("cosine_layer"))
+        return cls(base_model=base_model,
+                   normalization_layer=normalization_layer,
+                   cosine_layer=cosine_layer,
+                   **config)

src/streamlit_app.py CHANGED Viewed

@@ -14,8 +14,6 @@ st.title("Speaker Verification - Demo")
 # ========= Session state =========
 if "load_model_button" not in st.session_state:
     st.session_state.load_model_button = False
-# if "verify_speaker_button" not in st.session_state:
-#     st.session_state.verify_speaker_button = False
 if "audio_left" not in st.session_state:
     st.session_state.audio_left = None
 if "audio_right" not in st.session_state:
@@ -60,7 +58,7 @@ def bytes_to_pcm16k_mono(data: bytes, in_format: str | None) -> np.ndarray:
     out, err = ffmpeg.run(stream, capture_stdout=True, capture_stderr=True, input=data)
     audio = np.frombuffer(out, dtype="<i2").astype(np.float32) / 32768.0
     if audio.size < WT:
-        # padding do WT
         audio = np.pad(audio, (int((WT - audio.size) / 2) + 1, int((WT - audio.size) / 2) + 1), mode="constant")
     return audio
@@ -85,13 +83,13 @@ def load_model_from_hub(repo_id: str, filename: str, revision: str):
         repo_type="model",
         revision=revision,
     )
-    # Import modułu z customami, żeby rejestratory Keras się wykonały
-    import custom_models, custom_losses  # noqa: F401
     model = keras.models.load_model(model_path)
     if hasattr(model, "return_embedding"):
         model.return_embedding = True
     with open(model_path, "rb") as f:
-        model_bytes = f.read()  # do download_button (bez trzymania otwartego pliku)
     return model, model_path, model_bytes
 def handle_record(label: str) -> np.ndarray | None:
@@ -190,7 +188,7 @@ if st.session_state.load_model_button:
     except Exception as e:
         st.error(f"Error loading model: {e}")
-    # ========= Two columns (symetryczne) =========
     left_column, right_column = st.columns(2)
     with left_column:

 # ========= Session state =========
 if "load_model_button" not in st.session_state:
     st.session_state.load_model_button = False
 if "audio_left" not in st.session_state:
     st.session_state.audio_left = None
 if "audio_right" not in st.session_state:
     out, err = ffmpeg.run(stream, capture_stdout=True, capture_stderr=True, input=data)
     audio = np.frombuffer(out, dtype="<i2").astype(np.float32) / 32768.0
     if audio.size < WT:
+        # Padding (centered)
         audio = np.pad(audio, (int((WT - audio.size) / 2) + 1, int((WT - audio.size) / 2) + 1), mode="constant")
     return audio
         repo_type="model",
         revision=revision,
     )
+    # Import custom modules
+    import custom_models, custom_losses
     model = keras.models.load_model(model_path)
     if hasattr(model, "return_embedding"):
         model.return_embedding = True
     with open(model_path, "rb") as f:
+        model_bytes = f.read()
     return model, model_path, model_bytes
 def handle_record(label: str) -> np.ndarray | None:
     except Exception as e:
         st.error(f"Error loading model: {e}")
+    # ========= Two columns =========
     left_column, right_column = st.columns(2)
     with left_column: