Ar4ikov committed
Commit 08c9a0c · 1 Parent(s): 0cf4344

Add Gradio demo application for GigaAM-v3 speech recognition models

- Implemented the main application logic in app.py for audio transcription across the available model variants.
- Updated README.md to document the new demo features and usage instructions.
- Added requirements.txt listing the necessary dependencies.
- Added runtime.txt pinning the Python version.

Files changed (4)
  1. README.md +43 -3
  2. app.py +254 -0
  3. requirements.txt +14 -0
  4. runtime.txt +2 -0
README.md CHANGED
@@ -4,11 +4,51 @@ emoji: 🔥
 colorFrom: red
 colorTo: green
 sdk: gradio
-sdk_version: 6.0.0
+sdk_version: 4.44.0
 app_file: app.py
 pinned: false
 license: mit
-short_description: A test Gradio space for showcase the capabilitie of GigaAMv3
+short_description: Interactive Gradio Space demonstrating ai-sage/GigaAM-v3 ASR
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# GigaAM-v3 Gradio demo
+
+This Space demonstrates the [`ai-sage/GigaAM-v3`](https://huggingface.co/ai-sage/GigaAM-v3) Russian ASR models, built on a Conformer encoder with a HuBERT-CTC objective. The demo lets you:
+
+- upload or record audio (WAV/MP3/FLAC) directly in the browser,
+- choose between the `ctc`, `rnnt`, `e2e_ctc`, and `e2e_rnnt` checkpoints,
+- switch between a fast single-pass mode and a segmented long-form mode that returns timestamps.
+
+The end-to-end variants (`e2e_*`) produce punctuated, normalized text, while the classic CTC/RNN-T checkpoints return raw transcriptions with lower latency. Long-form mode uses `model.transcribe_longform` and requires a Hugging Face token with access to [`pyannote/segmentation-3.0`](https://huggingface.co/pyannote/segmentation-3.0).
+
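+For programmatic use outside the UI, a minimal sketch mirroring what `app.py` does below (`sample.wav` is a placeholder path):
+
+```python
+from transformers import AutoModel
+
+# revision selects the checkpoint variant: ctc, rnnt, e2e_ctc, or e2e_rnnt
+model = AutoModel.from_pretrained(
+    "ai-sage/GigaAM-v3",
+    revision="e2e_rnnt",
+    trust_remote_code=True,
+)
+print(model.transcribe("sample.wav"))
+```
+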
+## Requirements
+
+- Python 3.10
+- PyTorch / torchaudio 2.8.0
+- `transformers==4.57.1`
+- `gradio==4.44.0` (see `requirements.txt` for the full list)
+- Optional: set `HF_TOKEN` (or `HUGGINGFACEHUB_API_TOKEN`) if you want to use the segmented mode or access private weights.
+
+## Running locally
+
+```bash
+python -m venv .venv
+source .venv/bin/activate  # or .venv\Scripts\activate on Windows
+pip install -r requirements.txt
+
+# optional – needed for long-form segmentation
+export HF_TOKEN=<your_hf_token>
+
+python app.py
+```
+
+Open the printed URL (default `http://127.0.0.1:7860`) and start transcribing.
+
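+You can also call the demo programmatically: the click handler is registered with `api_name="transcribe"`, so `gradio_client` can reach it. A sketch against a local run (`sample.wav` is a placeholder; the mode label must match the UI string exactly):
+
+```python
+from gradio_client import Client, handle_file
+
+client = Client("http://127.0.0.1:7860/")
+transcript, segments, metadata = client.predict(
+    handle_file("sample.wav"),   # audio input
+    "e2e_rnnt",                  # model variant
+    "Short clip (<=150 s)",      # transcription mode
+    api_name="/transcribe",
+)
+print(transcript)
+```
+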
+## Deploying to Hugging Face Spaces
+
+- Keep the YAML front matter above so Spaces can infer the runtime.
+- Upload `app.py`, `requirements.txt`, and `runtime.txt`.
+- Configure an `HF_TOKEN` secret in **Settings → Variables** if you want segmented mode to work for everyone.
+- Assign `CPU Upgrade` or GPU hardware for heavy, long-form workloads.
+
+For more options (custom hardware, scaling, telemetry), review the [Spaces configuration reference](https://huggingface.co/docs/hub/spaces-config-reference).
app.py ADDED
@@ -0,0 +1,254 @@
+"""
+Gradio demo application for the GigaAM-v3 speech recognition models.
+"""
+from __future__ import annotations
+
+import os
+import threading
+import time
+from typing import Dict, List, Optional
+
+import gradio as gr
+import soundfile as sf
+import torch
+from transformers import AutoModel
+
+REPO_ID = "ai-sage/GigaAM-v3"
+
+MODEL_VARIANTS: Dict[str, str] = {
+    "e2e_rnnt": "End-to-end RNN-T • punctuation + normalization (best quality)",
+    "e2e_ctc": "End-to-end CTC • punctuation + normalization (faster)",
+    "rnnt": "RNN-T decoder • raw text without normalization",
+    "ctc": "CTC decoder • fastest baseline",
+}
+DEFAULT_VARIANT = "e2e_rnnt"
+
+MAX_SHORT_SECONDS = float(os.getenv("MAX_AUDIO_DURATION_SECONDS", 150))
+MAX_LONG_SECONDS = float(os.getenv("MAX_LONGFORM_DURATION_SECONDS", 600))
+
+OUTPUT_MODES = {
+    "Short clip (<=150 s)": {
+        "id": "short",
+        "longform": False,
+        "max_duration": MAX_SHORT_SECONDS,
+        "limit_msg": "The recording is longer than 150 seconds. Switch to 'Segmented long-form' for longer files.",
+        "description": "Single call to `model.transcribe`; best latency for concise utterances.",
+        "requires_token": False,
+    },
+    "Segmented long-form (<=10 min)": {
+        "id": "longform",
+        "longform": True,
+        "max_duration": MAX_LONG_SECONDS,
+        "limit_msg": "The audio is longer than 10 minutes. Shorten the recording for segmented mode.",
+        "description": "Calls `model.transcribe_longform` to obtain timestamped segments.",
+        "requires_token": True,
+    },
+}
+DEFAULT_MODE_LABEL = next(iter(OUTPUT_MODES))
+
+HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Models are cached per variant; per-variant locks keep concurrent requests
+# from loading the same checkpoint twice on a cold cache.
+MODEL_CACHE: Dict[str, AutoModel] = {}
+MODEL_LOCKS = {variant: threading.Lock() for variant in MODEL_VARIANTS}
+
+
+def _format_seconds(value: float) -> str:
+    return f"{value:.2f}s"
+
+
+def _read_audio_stats(audio_path: str) -> tuple[float, int]:
+    """Return duration (seconds) and sample rate."""
+    data, sample_rate = sf.read(audio_path)
+    duration = len(data) / float(sample_rate)
+    return duration, int(sample_rate)
+
+
+def _normalize_text(text: object) -> str:
+    """Extract plain text from the string or dict payloads the models return."""
+    if text is None:
+        return ""
+    if isinstance(text, str):
+        return text.strip()
+    if isinstance(text, dict):
+        for key in ("transcription", "text"):
+            if key in text and isinstance(text[key], str):
+                return text[key].strip()
+    return str(text)
+
+
+def load_model(variant: str) -> AutoModel:
+    if variant not in MODEL_VARIANTS:
+        raise gr.Error(f"Model variant '{variant}' is not supported.")
+
+    if variant in MODEL_CACHE:
+        return MODEL_CACHE[variant]
+
+    lock = MODEL_LOCKS[variant]
+    with lock:
+        # Double-checked: another thread may have loaded the model while we
+        # were waiting for the lock.
+        if variant in MODEL_CACHE:
+            return MODEL_CACHE[variant]
+
+        load_kwargs = dict(revision=variant, trust_remote_code=True)
+        if HF_TOKEN:
+            load_kwargs["token"] = HF_TOKEN
+
+        model = AutoModel.from_pretrained(REPO_ID, **load_kwargs)
+
+        try:
+            model.to(DEVICE)
+        except Exception:
+            # Some remote implementations manage their own device placement.
+            pass
+
+        MODEL_CACHE[variant] = model
+        return model
+
+
+def transcribe_audio(
+    audio_path: Optional[str],
+    variant: str,
+    mode_label: str,
+    # Gradio injects a tracked instance when a gr.Progress default is present.
+    progress: gr.Progress = gr.Progress(track_tqdm=False),
+) -> tuple[str, List[List[float | str]], str]:
+    if not audio_path or not os.path.exists(audio_path):
+        raise gr.Error("Upload or record an audio file to start transcription.")
+
+    if mode_label not in OUTPUT_MODES:
+        raise gr.Error("Select a transcription mode.")
+    mode_cfg = OUTPUT_MODES[mode_label]
+
+    duration, sample_rate = _read_audio_stats(audio_path)
+    if duration < 0.3:
+        raise gr.Error("The recording is too short (<300 ms).")
+
+    if duration > mode_cfg["max_duration"]:
+        raise gr.Error(mode_cfg["limit_msg"])
+
+    if mode_cfg["requires_token"] and not HF_TOKEN:
+        raise gr.Error(
+            "Segmented mode requires the HF_TOKEN environment variable "
+            "with access to the 'pyannote/segmentation-3.0' model."
+        )
+
+    progress(0.1, desc="Loading model")
+    model = load_model(variant)
+
+    start_ts = time.perf_counter()
+    progress(0.55, desc="Transcribing speech")
+
+    if mode_cfg["longform"]:
+        utterances = model.transcribe_longform(audio_path)
+        segments: List[List[float | str]] = []
+        assembled_text_parts: List[str] = []
+        for utt in utterances:
+            text = _normalize_text(utt)
+            if isinstance(utt, dict):
+                boundaries = utt.get("boundaries") or utt.get("timestamps")
+            else:
+                boundaries = None
+            if not boundaries:
+                boundaries = (0.0, 0.0)
+            start, end = boundaries
+            segments.append([round(float(start), 2), round(float(end), 2), text])
+            assembled_text_parts.append(text)
+        transcription_text = "\n".join(assembled_text_parts).strip()
+    else:
+        result = model.transcribe(audio_path)
+        transcription_text = _normalize_text(result)
+        segments = []
+
+    latency = time.perf_counter() - start_ts
+    progress(1.0, desc="Done")
+
+    metadata_lines = [
+        f"- **Model variant:** {MODEL_VARIANTS[variant]}",
+        f"- **Transcription mode:** {mode_cfg['description']}",
+        f"- **Audio duration:** {_format_seconds(duration)} @ {sample_rate} Hz",
+        f"- **Latency:** {_format_seconds(latency)} on `{DEVICE}`",
+        f"- **HF token configured:** {'yes' if HF_TOKEN else 'no'}",
+    ]
+
+    return transcription_text, segments, "\n".join(metadata_lines)
+
+
+DESCRIPTION_MD = """
+# GigaAM-v3 · Russian ASR demo
+
+This Space showcases the [`ai-sage/GigaAM-v3`](https://huggingface.co/ai-sage/GigaAM-v3) Conformer-based models.
+
+- Upload or record Russian audio (WAV/MP3/FLAC, mono preferred).
+- Pick the model variant and transcription mode that match your latency/quality needs.
+- Long-form mode returns timestamped segments and requires an `HF_TOKEN` with access to `pyannote/segmentation-3.0`.
+"""
+
+FOOTER_MD = """
+**Tips**
+
+- Short clips (<150 s) work best with the E2E variants (they include punctuation and normalization).
+- Long recordings can take several minutes on CPU-only Spaces; switch to GPU hardware if available.
+- Source: [salute-developers/GigaAM](https://github.com/salute-developers/GigaAM)
+"""
+
+
+def build_interface() -> gr.Blocks:
+    with gr.Blocks(title="GigaAM-v3 ASR demo") as demo:
+        gr.Markdown(DESCRIPTION_MD)
+
+        with gr.Row(equal_height=True):
+            audio_input = gr.Audio(
+                sources=["microphone", "upload"],
+                type="filepath",
+                label="Russian audio",
+                waveform_options=gr.WaveformOptions(
+                    show_controls=True,
+                    waveform_color="#f97316",
+                    skip_length=2,
+                ),
+            )
+
+            with gr.Column():
+                variant_dropdown = gr.Dropdown(
+                    choices=list(MODEL_VARIANTS.keys()),
+                    value=DEFAULT_VARIANT,
+                    label="Model variant",
+                    info="End-to-end variants add punctuation; base CTC/RNNT are lighter but raw.",
+                )
+                mode_radio = gr.Radio(
+                    choices=list(OUTPUT_MODES.keys()),
+                    value=DEFAULT_MODE_LABEL,
+                    label="Transcription mode",
+                    info="Select segmented mode for >150 second clips (requires HF token).",
+                )
+                transcribe_btn = gr.Button("Transcribe", variant="primary")
+
+        transcript_output = gr.Textbox(
+            label="Transcript",
+            placeholder="Model output will appear here…",
+            lines=8,
+        )
+
+        segments_output = gr.Dataframe(
+            headers=["Start (s)", "End (s)", "Utterance"],
+            datatype=["number", "number", "str"],
+            label="Segments (long-form mode)",
+            interactive=False,
+        )
+
+        metadata_output = gr.Markdown()
+        gr.Markdown(FOOTER_MD)
+
+        transcribe_btn.click(
+            fn=transcribe_audio,
+            inputs=[audio_input, variant_dropdown, mode_radio],
+            outputs=[transcript_output, segments_output, metadata_output],
+            api_name="transcribe",
+        )
+
+    return demo
+
+
+demo = build_interface()
+
+if __name__ == "__main__":
+    demo.launch()
+
requirements.txt ADDED
@@ -0,0 +1,14 @@
+torch==2.8.0
+torchaudio==2.8.0
+transformers==4.57.1
+gradio==4.44.0
+soundfile>=0.12.1
+numpy>=1.26.4
+hydra-core>=1.3.2
+omegaconf>=2.3.0
+sentencepiece>=0.1.99
+pyannote.audio==4.0.0
+torchcodec==0.7.0
+accelerate>=0.34.2
+huggingface_hub>=0.25.2
+
runtime.txt ADDED
@@ -0,0 +1,2 @@
+python-3.10
+