2pift committed on
Commit
2792f07
·
1 Parent(s): f7bd486

Update dependencies

Browse files
src/custom_layers.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+ import keras
3
+
4
@keras.saving.register_keras_serializable()
class L2Normalization(tf.keras.layers.Layer):
    """
    Applies L2 normalization to the last axis of the input tensor.

    This is used as a top layer in speaker embedding models before
    cosine similarity computation.
    """
    def call(self, inputs):
        """
        Rescale `inputs` to unit L2 norm along the last axis.

        Args:
            inputs: Input tensor.

        Returns:
            Tensor with the same shape as `inputs`.
        """
        # Normalize along the last (feature) axis, as documented above.
        # For a rank-2 input this is the same axis as `axis=1`.
        return tf.math.l2_normalize(inputs, axis=-1)

    def compute_output_shape(self, input_shape):
        # Normalization only rescales values; the shape is unchanged.
        return input_shape
17
+
18
@keras.saving.register_keras_serializable()
class CosineLayer(tf.keras.layers.Layer):
    """
    Dense layer with L2-normalized weights, for cosine similarity-based classification.

    The kernel has shape (input_dim, out_features). On every call each kernel
    column is rescaled to unit L2 norm before the matrix product, so the
    stored (trainable) weights themselves are left un-normalized.

    Args:
        out_features (int): Number of output features/classes.
        use_bias (bool): Whether to use bias term.
        name (str, optional): Layer name.
    """
    def __init__(self, out_features, use_bias=False, name=None, **kwargs):
        super().__init__(name=name, **kwargs)
        self.out_features = out_features
        self.use_bias = use_bias

    def build(self, input_shape):
        """
        Create the kernel and, if `use_bias` is set, the bias.

        Args:
            input_shape: Shape of the input; its last entry is the input
                feature dimension.
        """
        self.w = self.add_weight(
            shape=(int(input_shape[-1]), self.out_features),
            initializer='glorot_uniform',
            trainable=True,
            name='weights'
        )
        if self.use_bias:
            self.b = self.add_weight(
                shape=(self.out_features,),
                initializer='zeros',
                trainable=True,
                name='bias'
            )
        else:
            self.b = None
        super().build(input_shape)

    def call(self, inputs):
        """
        Multiply `inputs` by the column-normalized kernel.

        Args:
            inputs: Tensor whose last axis has the size seen in `build`.

        Returns:
            Tensor with `out_features` entries on the last axis (plus the
            bias when `use_bias` is set).
        """
        # axis=0 normalizes each column, i.e. one unit vector per output feature.
        w_normalized = tf.math.l2_normalize(self.w, axis=0)
        logits = tf.linalg.matmul(inputs, w_normalized)
        if self.use_bias:
            logits = logits + self.b
        return logits

    def compute_output_shape(self, input_shape):
        # Only the last axis changes; keep every leading dimension so the
        # result is also correct for inputs of rank greater than 2.
        return (*input_shape[:-1], self.out_features)

    def get_config(self):
        """Return the constructor arguments needed to re-create the layer."""
        base_config = super().get_config()
        return {
            **base_config,
            'out_features': self.out_features,
            'use_bias': self.use_bias
        }
src/custom_losses.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+ import keras
3
+ import numpy as np
4
+
5
@keras.saving.register_keras_serializable()
class AdaCosLoss(tf.keras.losses.Loss):
    """
    Adaptive Cosine Loss (AdaCos).

    Implements the AdaCos loss function as described in:
    "AdaCos: Adaptively Scaling Cosine Logits for Effectively Learning Deep Face Representations"
    (Zhang et al., 2019).

    The loss keeps a non-trainable `scale` variable that is re-estimated from
    the current batch and overwritten on every call.

    Args:
        num_classes (int): Number of classes in the classification problem.
        name (str, optional): Name for the loss instance.

    Raises:
        TypeError: If `num_classes` is not an int (including the default None).
        ValueError: If `num_classes` is smaller than 2.
    """
    def __init__(self, num_classes=None, name="AdaCos", **kwargs):
        super().__init__(name=name, **kwargs)
        # The property setter validates the value before it is used below.
        self.num_classes = num_classes
        # Initial scale: sqrt(2) * log(num_classes - 1).
        # NOTE(review): for num_classes == 2 this evaluates to 0, and the update
        # in `call` only accepts strictly positive scales - confirm whether
        # two-class problems need to be supported.
        self.scale = tf.Variable(
            np.sqrt(2) * np.log(num_classes - 1),
            dtype=tf.float32, trainable=False
        )

    def call(self, y_true, y_pred):
        """
        Args:
            y_true: (batch_size,) or (batch_size, 1) integer labels [0, num_classes-1].
            y_pred: (batch_size, num_classes) classification cosine similarities.

        Returns:
            Tensor scalar: Mean AdaCos loss over the batch.
        """
        # Flatten the labels so that (batch_size, 1) targets do not add an
        # extra axis to the one-hot mask.
        y_true = tf.reshape(tf.cast(y_true, tf.int32), [-1])
        # Keep the cosines strictly inside (-1, 1) before acos is applied.
        y_pred = tf.clip_by_value(
            y_pred,
            -1.0 + tf.keras.backend.epsilon(),
            1.0 - tf.keras.backend.epsilon()
        )
        # correct class mask
        mask = tf.one_hot(y_true, depth=self.num_classes)  # shape (batch_size, n_classes)
        is_target = mask > 0
        # get theta angles for corresponding class
        theta_true = tf.math.acos(tf.boolean_mask(y_pred, is_target))  # shape (batch_size,)
        # compute median of 'correct' angles
        theta_med = tf.keras.ops.median(theta_true)
        # get non-corresponding cosine values (cos(theta) j is not yi)
        cos_theta_neg = tf.boolean_mask(y_pred, tf.logical_not(is_target))  # shape (batch_size*(n_classes-1),)
        neg_y_pred = tf.reshape(cos_theta_neg, [-1, self.num_classes - 1])  # shape (batch_size, n_classes-1)

        # Batch mean of the summed exponentiated non-target logits, using the
        # scale stored by the previous call.
        B_avg = tf.reduce_mean(tf.reduce_sum(tf.math.exp(self.scale * neg_y_pred), axis=-1))

        new_scale = (
            tf.math.log(B_avg) /
            tf.math.cos(tf.minimum(tf.constant(np.pi / 4), theta_med))
        )
        # keep current scale if new_scale is invalid
        safe_scale = tf.cond(
            tf.math.is_finite(new_scale) & (new_scale > 0),
            lambda: new_scale,
            lambda: self.scale
        )
        self.scale.assign(safe_scale)
        logits = self.scale * y_pred
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=logits)

        return tf.reduce_mean(loss)

    def get_config(self):
        """Return the constructor arguments; the current `scale` value is not included."""
        base_config = super().get_config()
        return {**base_config, 'num_classes': self.num_classes}

    def __repr__(self):
        return (f"{self.__class__.__name__}(num_classes={self.num_classes}, "
                f"name='{self.name}')")

    def __str__(self):
        return self.__repr__()

    @property
    def num_classes(self):
        return self._num_classes

    @num_classes.setter
    def num_classes(self, value):
        if not isinstance(value, int):
            raise TypeError(f"`num_classes` must be an int, got {type(value).__name__}")
        if value < 2:
            raise ValueError(f"`num_classes` must be >= 2, got {value}")
        self._num_classes = value
95
+
96
@keras.saving.register_keras_serializable()
class AdaCosLossMargin(tf.keras.losses.Loss):
    """
    Adaptive Cosine Loss with Margin (AdaCosMargin).

    Extends AdaCos by introducing a fixed margin penalty for the target class logits,
    encouraging greater separation between classes in angular (cosine) space.

    The loss keeps a non-trainable `scale` variable that is re-estimated from
    the current batch and overwritten on every call.

    Reference:
        - AdaCos: Adaptively Scaling Cosine Logits for Effectively Learning Deep Face Representations (Zhang et al., 2019)
        - Large Margin Cosine Loss (CosFace): https://arxiv.org/abs/1801.09414

    Args:
        margin (float): Margin to subtract from the target class cosine similarity (0.0–1.0).
        num_classes (int): Number of classes.
        name (str, optional): Name for the loss.

    Raises:
        TypeError: If `margin` is not a float/int or `num_classes` is not an int
            (including the default None).
        ValueError: If `margin` is outside [0.0, 1.0] or `num_classes` is smaller than 2.
    """
    def __init__(self, margin=0.1, num_classes=None, name="AdaCosLossMargin", **kwargs):
        super().__init__(name=name, **kwargs)
        # Both property setters validate their value before it is used below.
        self.margin = margin
        self.num_classes = num_classes
        # Initial scale: sqrt(2) * log(num_classes - 1).
        # NOTE(review): for num_classes == 2 this evaluates to 0, and the update
        # in `call` only accepts strictly positive scales - confirm whether
        # two-class problems need to be supported.
        self.scale = tf.Variable(
            np.sqrt(2) * np.log(num_classes - 1),
            dtype=tf.float32, trainable=False
        )

    def call(self, y_true, y_pred):
        """
        Args:
            y_true: (batch_size,) or (batch_size, 1) integer labels [0, num_classes-1].
            y_pred: (batch_size, num_classes) cosine similarities.

        Returns:
            Tensor scalar: Mean AdaCosMargin loss over the batch.
        """
        # Flatten the labels so that (batch_size, 1) targets do not add an
        # extra axis to the one-hot mask.
        y_true = tf.reshape(tf.cast(y_true, tf.int32), [-1])
        # Keep the cosines strictly inside (-1, 1) before acos is applied.
        y_pred = tf.clip_by_value(
            y_pred,
            -1.0 + tf.keras.backend.epsilon(),
            1.0 - tf.keras.backend.epsilon()
        )
        # One-hot target mask, kept as float because it also scales the margin below.
        mask = tf.one_hot(y_true, depth=self.num_classes)  # shape (batch_size, n_classes)
        is_target = mask > 0
        # Angles of the target-class cosines and their batch median.
        theta_true = tf.math.acos(tf.boolean_mask(y_pred, is_target))  # shape (batch_size,)
        theta_med = tf.keras.ops.median(theta_true)
        # Cosines of every non-target class, one row per sample.
        cos_theta_neg = tf.boolean_mask(y_pred, tf.logical_not(is_target))
        neg_y_pred = tf.reshape(cos_theta_neg, [-1, self.num_classes - 1])  # shape (batch_size, n_classes-1)
        # Batch mean of the summed exponentiated non-target logits, using the
        # scale stored by the previous call.
        B_avg = tf.reduce_mean(tf.reduce_sum(tf.math.exp(self.scale * neg_y_pred), axis=-1))

        new_scale = (
            tf.math.log(B_avg) /
            tf.math.cos(tf.minimum(tf.constant(np.pi / 4), theta_med))
        )
        # keep current scale if new_scale is invalid
        safe_scale = tf.cond(
            tf.math.is_finite(new_scale) & (new_scale > 0),
            lambda: new_scale,
            lambda: self.scale
        )
        self.scale.assign(safe_scale)
        # The margin is subtracted from the target-class cosine only.
        logits = self.scale * (y_pred - self.margin * mask)
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=logits)
        return tf.reduce_mean(loss)

    def get_config(self):
        """Return the constructor arguments; the current `scale` value is not included."""
        base_config = super().get_config()
        return {
            **base_config,
            'num_classes': self.num_classes,
            'margin': self.margin
        }

    def __repr__(self):
        return (f"{self.__class__.__name__}(margin={self.margin}, num_classes={self.num_classes}, "
                f"name='{self.name}')")

    def __str__(self):
        return self.__repr__()

    @property
    def num_classes(self):
        return self._num_classes

    @num_classes.setter
    def num_classes(self, value):
        if not isinstance(value, int):
            raise TypeError(f"`num_classes` must be an int, got {type(value).__name__}")
        if value < 2:
            raise ValueError(f"`num_classes` must be >= 2, got {value}")
        self._num_classes = value

    @property
    def margin(self):
        return self._margin

    @margin.setter
    def margin(self, value):
        if not isinstance(value, (float, int)):
            raise TypeError(f"`margin` must be a float or int, got {type(value).__name__}")
        value = float(value)
        if not (0.0 <= value <= 1.0):
            raise ValueError(f"`margin` must be between 0.0 and 1.0, got {value}")
        self._margin = value
src/custom_models.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import keras
2
+ import tensorflow as tf
3
+ from custom_layers import L2Normalization, CosineLayer
4
+
5
@keras.saving.register_keras_serializable()
class VerificationModel(tf.keras.Model):
    """
    Modular Speaker Verification Model.

    Chains a backbone (feature extractor), a tanh dense embedding projection,
    batch normalization, the supplied normalization layer and, unless
    `return_embedding` is set, the supplied classification head.

    Args:
        base_model (tf.keras.Model): Backbone model (e.g., ResNet18).
        number_of_classes (int): Number of speaker classes for classification.
        normalization_layer (tf.keras.layers.Layer): Layer applied to the
            batch-normalized embedding (e.g., L2Normalization).
        cosine_layer (tf.keras.layers.Layer): Classification head applied to the
            normalized embedding (e.g., CosineLayer).
        embedding_dim (int, optional): Size of embedding vector. Default: 512.
        return_embedding (bool, optional): If True, returns only embeddings (for verification);
            if False, returns logits for classification. Default: False.
    """
    def __init__(
        self,
        base_model,
        number_of_classes,
        normalization_layer,
        cosine_layer,
        embedding_dim: int = 512,
        return_embedding: bool = False,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.base_model = base_model
        self.embedding_dim = embedding_dim
        self.number_of_classes = number_of_classes
        self.return_embedding = return_embedding

        self.embedding_layer = tf.keras.layers.Dense(
            embedding_dim,
            activation='tanh',
            use_bias=False,
            name='embedding_dense'
        )
        self.bn_neck = tf.keras.layers.BatchNormalization(name="bn_neck")
        self.normalization_layer = normalization_layer
        self.cosine_layer = cosine_layer

    def call(self, inputs, training=None):
        """
        Forward pass.

        Args:
            inputs: Input tensor (e.g., spectrograms).
            training (bool, optional): Training mode (Keras convention); forwarded
                to the backbone and to the batch-normalization layer.
        Returns:
            Embeddings (if return_embedding=True) or logits for classification.
        """
        x = self.base_model(inputs, training=training)
        x = self.embedding_layer(x)
        x = self.bn_neck(x, training=training)
        x = self.normalization_layer(x)
        if self.return_embedding:
            return x
        return self.cosine_layer(x)

    def get_config(self):
        """Return the model config with the three sub-models/layers serialized."""
        base_config = super().get_config()
        return {
            **base_config,
            "base_model": keras.saving.serialize_keras_object(self.base_model),
            "normalization_layer": keras.saving.serialize_keras_object(
                self.normalization_layer
            ),
            "cosine_layer": keras.saving.serialize_keras_object(self.cosine_layer),
            "number_of_classes": self.number_of_classes,
            "embedding_dim": self.embedding_dim,
            "return_embedding": self.return_embedding
        }

    @classmethod
    def from_config(cls, config):
        """
        Re-create a model from the dict produced by `get_config`.

        Args:
            config (dict): Model configuration; it is not modified.

        Returns:
            VerificationModel: New model instance.
        """
        # Work on a shallow copy so the caller's dict is not emptied by pop().
        config = dict(config)
        base_model = keras.saving.deserialize_keras_object(config.pop("base_model"))
        normalization_layer = keras.saving.deserialize_keras_object(config.pop("normalization_layer"))
        cosine_layer = keras.saving.deserialize_keras_object(config.pop("cosine_layer"))
        return cls(base_model=base_model,
                   normalization_layer=normalization_layer,
                   cosine_layer=cosine_layer,
                   **config)
src/streamlit_app.py CHANGED
@@ -14,8 +14,6 @@ st.title("Speaker Verification - Demo")
14
  # ========= Session state =========
15
  if "load_model_button" not in st.session_state:
16
  st.session_state.load_model_button = False
17
- # if "verify_speaker_button" not in st.session_state:
18
- # st.session_state.verify_speaker_button = False
19
  if "audio_left" not in st.session_state:
20
  st.session_state.audio_left = None
21
  if "audio_right" not in st.session_state:
@@ -60,7 +58,7 @@ def bytes_to_pcm16k_mono(data: bytes, in_format: str | None) -> np.ndarray:
60
  out, err = ffmpeg.run(stream, capture_stdout=True, capture_stderr=True, input=data)
61
  audio = np.frombuffer(out, dtype="<i2").astype(np.float32) / 32768.0
62
  if audio.size < WT:
63
- # padding do WT
64
  audio = np.pad(audio, (int((WT - audio.size) / 2) + 1, int((WT - audio.size) / 2) + 1), mode="constant")
65
  return audio
66
 
@@ -85,13 +83,13 @@ def load_model_from_hub(repo_id: str, filename: str, revision: str):
85
  repo_type="model",
86
  revision=revision,
87
  )
88
- # Import modułu z customami, żeby rejestratory Keras się wykonały
89
- import custom_models, custom_losses # noqa: F401
90
  model = keras.models.load_model(model_path)
91
  if hasattr(model, "return_embedding"):
92
  model.return_embedding = True
93
  with open(model_path, "rb") as f:
94
- model_bytes = f.read() # do download_button (bez trzymania otwartego pliku)
95
  return model, model_path, model_bytes
96
 
97
  def handle_record(label: str) -> np.ndarray | None:
@@ -190,7 +188,7 @@ if st.session_state.load_model_button:
190
  except Exception as e:
191
  st.error(f"Error loading model: {e}")
192
 
193
- # ========= Two columns (symetryczne) =========
194
  left_column, right_column = st.columns(2)
195
 
196
  with left_column:
 
14
  # ========= Session state =========
15
  if "load_model_button" not in st.session_state:
16
  st.session_state.load_model_button = False
 
 
17
  if "audio_left" not in st.session_state:
18
  st.session_state.audio_left = None
19
  if "audio_right" not in st.session_state:
 
58
  out, err = ffmpeg.run(stream, capture_stdout=True, capture_stderr=True, input=data)
59
  audio = np.frombuffer(out, dtype="<i2").astype(np.float32) / 32768.0
60
  if audio.size < WT:
61
+ # Padding (centered)
62
  audio = np.pad(audio, (int((WT - audio.size) / 2) + 1, int((WT - audio.size) / 2) + 1), mode="constant")
63
  return audio
64
 
 
83
  repo_type="model",
84
  revision=revision,
85
  )
86
+ # Import custom modules
87
+ import custom_models, custom_losses
88
  model = keras.models.load_model(model_path)
89
  if hasattr(model, "return_embedding"):
90
  model.return_embedding = True
91
  with open(model_path, "rb") as f:
92
+ model_bytes = f.read()
93
  return model, model_path, model_bytes
94
 
95
  def handle_record(label: str) -> np.ndarray | None:
 
188
  except Exception as e:
189
  st.error(f"Error loading model: {e}")
190
 
191
+ # ========= Two columns =========
192
  left_column, right_column = st.columns(2)
193
 
194
  with left_column: