2pift committed
Commit bceb0a7 · 1 Parent(s): b3ef023

Update Dockerfile and application file

Dockerfile CHANGED
@@ -13,6 +13,7 @@ RUN mkdir -p /app/.streamlit \
  
  RUN apt-get update && apt-get install -y \
      build-essential \
+     ffmpeg \
      curl \
      git \
      && rm -rf /var/lib/apt/lists/*
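
The only Dockerfile change is adding the ffmpeg system package, which the updated src/streamlit_app.py drives through the ffmpeg-python bindings to decode uploads and recordings. As a rough sanity check, the conversion the app relies on corresponds to this minimal sketch (assuming the ffmpeg-python package and a hypothetical sample.wav, neither of which is part of this commit):

# Sketch only: decode audio bytes to 16 kHz mono PCM, mirroring bytes_to_pcm16k_mono below.
# "sample.wav" is a hypothetical local file used purely for illustration.
import ffmpeg
import numpy as np

data = open("sample.wav", "rb").read()
stream = (
    ffmpeg
    .input("pipe:0", format="wav")
    .output("pipe:1", format="s16le", acodec="pcm_s16le", ar="16000", ac=1)
    .global_args("-hide_banner")
)
out, _ = ffmpeg.run(stream, capture_stdout=True, capture_stderr=True, input=data)
audio = np.frombuffer(out, dtype="<i2").astype(np.float32) / 32768.0  # float32 in [-1, 1]
print(audio.shape)
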
src/streamlit_app.py CHANGED
@@ -1,40 +1,229 @@
- import altair as alt
+ from pathlib import Path
+ import ffmpeg
+ import keras
+ import matplotlib.pyplot as plt
  import numpy as np
  import pandas as pd
  import streamlit as st
+ import tensorflow as tf
+ from huggingface_hub import hf_hub_download
  
- """
- # Welcome to Streamlit!
-
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).
-
- In the meantime, below is an example of what you can do with just a few lines of code:
- """
-
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
- indices = np.linspace(0, 1, num_points)
- theta = 2 * np.pi * num_turns * indices
- radius = indices
-
- x = radius * np.cos(theta)
- y = radius * np.sin(theta)
-
- df = pd.DataFrame({
-     "x": x,
-     "y": y,
-     "idx": indices,
-     "rand": np.random.randn(num_points),
- })
-
- st.altair_chart(alt.Chart(df, height=700, width=700)
-     .mark_point(filled=True)
-     .encode(
-         x=alt.X("x", axis=None),
-         y=alt.Y("y", axis=None),
-         color=alt.Color("idx", legend=None, scale=alt.Scale()),
-         size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-     ))
+ # ========= App title =========
+ st.title("Speaker Verification - Demo")
+
+ # ========= Session state =========
+ if "load_model_button" not in st.session_state:
+     st.session_state.load_model_button = False
+ # if "verify_speaker_button" not in st.session_state:
+ #     st.session_state.verify_speaker_button = False
+ if "audio_left" not in st.session_state:
+     st.session_state.audio_left = None
+ if "audio_right" not in st.session_state:
+     st.session_state.audio_right = None
+
+ # ========= UI: choose model =========
+ model_df = pd.DataFrame({"first column": ["verification_model_resnet34_512dim"]})
+ option = st.selectbox("Choose model to test out:", model_df["first column"])
+ st.button("Load the model", on_click=lambda: st.session_state.update(load_model_button=True))
+
+ # ========= Helpers =========
+ FS = 16000  # target sample rate
+ WT = 48560  # window length in samples
+
+ EXT2FMT = {
+     "wav": "wav",
+     "mp3": "mp3",
+     "ogg": "ogg",
+     "aac": "aac",
+     "m4a": "mp4"
+ }
+
+ def infer_input_format(name: str) -> str | None:
+     if name and "." in name:
+         ext = name.rsplit(".", 1)[-1].lower()
+         return EXT2FMT.get(ext)
+     return None
+
+ @st.cache_data(show_spinner=False)
+ def bytes_to_pcm16k_mono(data: bytes, in_format: str | None) -> np.ndarray:
+     """
+     Converts the input audio (any supported container) to raw 16 kHz mono 16-bit LE PCM
+     and returns it as float32 in the range [-1, 1].
+     Cached by (bytes, format).
+     """
+     stream = (
+         ffmpeg
+         .input("pipe:0", **({"format": in_format} if in_format else {}))
+         .output("pipe:1", format="s16le", acodec="pcm_s16le", ar=str(FS), ac=1)
+         .global_args("-hide_banner")
+     )
+     out, err = ffmpeg.run(stream, capture_stdout=True, capture_stderr=True, input=data)
+     audio = np.frombuffer(out, dtype="<i2").astype(np.float32) / 32768.0
+     if audio.size < WT:
+         # pad up to WT
+         audio = np.pad(audio, (int((WT - audio.size) / 2) + 1, int((WT - audio.size) / 2) + 1), mode="constant")
+     return audio
+
+ def plot_waveform(audio_np: np.ndarray, fs: int = FS, title: str = "Waveform"):
+     t = np.arange(audio_np.size) / fs if audio_np.size else np.array([0, 1e-6])
+     fig, ax = plt.subplots()
+     ax.plot(t, audio_np)
+     ax.set_title(title)
+     ax.set_xlabel("Time [s]")
+     ax.set_ylabel("Amplitude")
+     ax.margins(x=0, y=0)
+     if audio_np.size:
+         ax.set_xlim(t[0], t[-1])
+     return fig
+
+ @st.cache_resource(show_spinner=True)
+ def load_model_from_hub(repo_id: str, filename: str, revision: str):
+     """Downloads and loads the Keras model (cache_resource, kept in memory)."""
+     model_path = hf_hub_download(
+         repo_id=repo_id,
+         filename=filename,
+         repo_type="model",
+         revision=revision,
+     )
+     # Import the modules with custom objects so the Keras registrations run
+     import custom_models, custom_losses  # noqa: F401
+     model = keras.models.load_model(model_path)
+     if hasattr(model, "return_embedding"):
+         model.return_embedding = True
+     with open(model_path, "rb") as f:
+         model_bytes = f.read()  # for download_button (without keeping the file open)
+     return model, model_path, model_bytes
+
+ def handle_record(label: str) -> np.ndarray | None:
+     rec = st.audio_input(label)
+     if not rec:
+         return None
+     try:
+         audio_np = bytes_to_pcm16k_mono(rec.getvalue(), in_format="wav")
+         return audio_np
+     except ffmpeg.Error as e:
+         st.error("FFmpeg failed while processing recording.")
+         st.code(e.stderr.decode("utf-8", "ignore"))
+         return None
+
+ def handle_upload(label: str, key: str) -> np.ndarray | None:
+     file = st.file_uploader(
+         label,
+         type=["wav", "m4a", "aac", "mp3", "ogg", "webm", "flac"],
+         key=key,
+     )
+     if not file:
+         return None
+     in_fmt = infer_input_format(file.name)
+     try:
+         audio_np = bytes_to_pcm16k_mono(file.getvalue(), in_fmt)
+         return audio_np
+     except ffmpeg.Error as e:
+         st.error("FFmpeg failed while converting uploaded file.")
+         st.code(e.stderr.decode("utf-8", "ignore"))
+         return None
+
+ def delta(x):
+     """Computes first-order difference along time axis."""
+     return x[:, 1:] - x[:, :-1]
+
+ def array_to_spectrogram(audio_np: np.ndarray,
+                          audio_in_samples: int = 48560,
+                          window_length: int = 400,
+                          step_length: int = 160,
+                          fft_length: int = 1023
+                          ) -> tf.Tensor:
+
+     audio = tf.convert_to_tensor(audio_np, dtype=tf.float32)
+     audio_length = audio_np.size
+
+     random_int = tf.random.uniform(shape=(), minval=0, maxval=(audio_length - audio_in_samples), dtype=tf.int32)
+     stft = tf.signal.stft(audio[random_int:(random_int + audio_in_samples)],
+                           frame_length=window_length,
+                           frame_step=step_length,
+                           fft_length=fft_length)
+
+     spectrogram = tf.abs(stft)
+     spectrogram = tf.transpose(spectrogram)  # shape: (freq, time)
+     spectrogram = tf.math.log1p(spectrogram)
+
+     spectrogram_delta = delta(spectrogram)
+     spectrogram_delta2 = delta(spectrogram_delta)
+
+     return tf.stack([spectrogram[:, :-2],
+                      spectrogram_delta[:, :-1],
+                      spectrogram_delta2],
+                     axis=-1)  # shape: (freq, time, 3)
+
+ @st.cache_data(show_spinner=True)
+ def verify_speakers(model, audio_left, audio_right, margin):
+
+     spec_left = array_to_spectrogram(audio_left)[tf.newaxis, ...]
+     spec_right = array_to_spectrogram(audio_right)[tf.newaxis, ...]
+
+     emb_left = model.predict(spec_left, verbose=0)
+     emb_right = model.predict(spec_right, verbose=0)
+
+     cosine_similarity = tf.linalg.matmul(emb_left, emb_right, transpose_b=True)
+     cosine_similarity = float(cosine_similarity.numpy().squeeze())
+
+     if cosine_similarity >= margin:
+         st.success("Both voice recordings belong to the same person.")
+     else:
+         st.warning("The voice recordings belong to different people.")
+     st.caption(f"Cosine similarity: {cosine_similarity:.4f}, margin: {margin:.4f}")
+
+ # ========= Load model =========
+ if st.session_state.load_model_button:
+     try:
+         model, model_path, model_bytes = load_model_from_hub(
+             repo_id="2pift/sv-resnet34-keras",
+             filename="best_model.keras",
+             revision="v1.0.0",
+         )
+         st.success("Model loaded. You can now upload/record audio files.")
+         st.download_button(
+             "Download the model",
+             data=model_bytes,
+             file_name="verification_model_resnet34_512dim.keras",
+         )
+     except Exception as e:
+         st.error(f"Error loading model: {e}")
+
+ # ========= Two columns (symmetric) =========
+ left_column, right_column = st.columns(2)
+
+ with left_column:
+     st.subheader("Left input")
+     record_left = st.checkbox("Record left input")
+     if record_left:
+         audio_left = handle_record("Record (left)")
+     else:
+         audio_left = handle_upload("Upload left audio", key="file_left")
+     if audio_left is not None:
+         st.session_state.audio_left = audio_left
+         fig = plot_waveform(audio_left, FS, "Left audio waveform")
+         st.pyplot(fig, use_container_width=True)
+         st.caption(f"Samples: {audio_left.size} • Duration: {audio_left.size/FS:.2f}s")
+
+ with right_column:
+     st.subheader("Right input")
+     record_right = st.checkbox("Record right input")
+     if record_right:
+         audio_right = handle_record("Record (right)")
+     else:
+         audio_right = handle_upload("Upload right audio", key="file_right")
+     if audio_right is not None:
+         st.session_state.audio_right = audio_right
+         fig = plot_waveform(audio_right, FS, "Right audio waveform")
+         st.pyplot(fig, use_container_width=True)
+         st.caption(f"Samples: {audio_right.size} • Duration: {audio_right.size/FS:.2f}s")
+
+ if audio_left is not None and audio_right is not None:
+     margin = st.slider('Selected margin:', -1.0, 1.0, 0.26, 0.01)
+     verify_button = st.button("Verify speaker!")
+     if verify_button:
+         try:
+             verify_speakers(model, audio_left, audio_right, margin)
+         except Exception as e:
+             st.error(f"Error during verification: {e}")
src/streamlit_app_old ADDED
@@ -0,0 +1,40 @@
+ import altair as alt
+ import numpy as np
+ import pandas as pd
+ import streamlit as st
+
+ """
+ # Welcome to Streamlit!
+
+ Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
+ If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
+ forums](https://discuss.streamlit.io).
+
+ In the meantime, below is an example of what you can do with just a few lines of code:
+ """
+
+ num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
+ num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
+
+ indices = np.linspace(0, 1, num_points)
+ theta = 2 * np.pi * num_turns * indices
+ radius = indices
+
+ x = radius * np.cos(theta)
+ y = radius * np.sin(theta)
+
+ df = pd.DataFrame({
+     "x": x,
+     "y": y,
+     "idx": indices,
+     "rand": np.random.randn(num_points),
+ })
+
+ st.altair_chart(alt.Chart(df, height=700, width=700)
+     .mark_point(filled=True)
+     .encode(
+         x=alt.X("x", axis=None),
+         y=alt.Y("y", axis=None),
+         color=alt.Color("idx", legend=None, scale=alt.Scale()),
+         size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
+     ))
src/streamlit_app_old_old DELETED
@@ -1,229 +0,0 @@