whisper-tg

Paused

App Files Files Community

muhtasham committed on Mar 23

Commit

a1d6c0c

1 Parent(s): dbe4a4a

WIP

Browse files

Files changed (2) hide show

app.py +128 -19
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -1,19 +1,30 @@
-import spaces
 import torch
 import gradio as gr
-from transformers import pipeline
 import subprocess
-from loguru import logger
 import datetime
 import tempfile
-import os
-import json
-from pathlib import Path
 MODEL_NAME = "muhtasham/whisper-tg"
 def format_time(seconds):
-    """Convert seconds to SRT time format (HH:MM:SS,mmm)"""
     td = datetime.timedelta(seconds=float(seconds))
     hours = td.seconds // 3600
     minutes = (td.seconds % 3600) // 60
@@ -22,7 +33,35 @@ def format_time(seconds):
     return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
 def generate_srt(chunks):
-    """Generate SRT format subtitles from chunks"""
     srt_content = []
     for i, chunk in enumerate(chunks, 1):
         start_time = format_time(chunk["timestamp"][0])
@@ -32,7 +71,20 @@ def generate_srt(chunks):
     return "".join(srt_content)
 def save_srt_to_file(srt_content):
-    """Save SRT content to a temporary file and return the file path"""
     if not srt_content:
         return None
@@ -54,33 +106,81 @@ def check_ffmpeg():
 # Initialize ffmpeg check
 check_ffmpeg()
-device = 0 if torch.cuda.is_available() else "cpu"
 logger.info(f"Using device: {device}")
-def create_pipeline(chunk_length_s):
-    """Create a new pipeline with specified chunk length"""
     return pipeline(
         task="automatic-speech-recognition",
         model=MODEL_NAME,
-        chunk_length_s=chunk_length_s,
         device=device,
     )
-# Initialize default pipeline
-pipe = create_pipeline(30)
 logger.info(f"Pipeline initialized: {pipe}")
-@spaces.GPU
 def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_length_s):
     if inputs is None:
         logger.warning("No audio file submitted")
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     try:
         logger.info(f"Processing audio file: {inputs}")
-        # Create new pipeline with specified chunk length
-        current_pipe = create_pipeline(chunk_length_s)
-        result = current_pipe(inputs, batch_size=batch_size, return_timestamps=return_timestamps)
         logger.debug(f"Pipeline result: {result}")
         # Format response as JSON
@@ -121,8 +221,17 @@ def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_lengt
             srt_file = save_srt_to_file(srt_content)
             logger.info("SRT subtitles generated successfully")
         return formatted_result, srt_file, ""  # Return empty string for correction textbox
     except Exception as e:
         logger.exception(f"Error during transcription: {str(e)}")
         raise gr.Error(f"Failed to transcribe audio: {str(e)}")

 import torch
 import gradio as gr
 import subprocess
 import datetime
 import tempfile
+from transformers import pipeline
+from loguru import logger
 MODEL_NAME = "muhtasham/whisper-tg"
 def format_time(seconds):
+    """Convert seconds to SRT time format (HH:MM:SS,mmm).
+    Args:
+        seconds (float): Time in seconds to convert.
+    Returns:
+        str: Time formatted as HH:MM:SS,mmm where:
+            - HH: Hours (00-23; whole days are dropped because only td.seconds is used)
+            - MM: Minutes (00-59)
+            - SS: Seconds (00-59)
+            - mmm: Milliseconds (000-999)
+    Example:
+        >>> format_time(3661.5)
+        '01:01:01,500'
+    """
     td = datetime.timedelta(seconds=float(seconds))
     hours = td.seconds // 3600
     minutes = (td.seconds % 3600) // 60
     return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
 def generate_srt(chunks):
+    """Generate SRT format subtitles from transcription chunks.
+    Args:
+        chunks (list): List of dictionaries containing transcription chunks.
+            Each chunk must have:
+            - "timestamp": List of [start_time, end_time] in seconds
+            - "text": The transcribed text for that time segment
+    Returns:
+        str: SRT formatted subtitles string with format:
+            ```
+            1
+            HH:MM:SS,mmm --> HH:MM:SS,mmm
+            Text content
+            2
+            HH:MM:SS,mmm --> HH:MM:SS,mmm
+            Text content
+            ...
+            ```
+    Example:
+        >>> chunks = [
+        ...     {"timestamp": [0.0, 1.5], "text": "Hello"},
+        ...     {"timestamp": [1.5, 3.0], "text": "World"}
+        ... ]
+        >>> generate_srt(chunks)
+        '1\\n00:00:00,000 --> 00:00:01,500\\nHello\\n\\n2\\n00:00:01,500 --> 00:00:03,000\\nWorld\\n\\n'
+    """
     srt_content = []
     for i, chunk in enumerate(chunks, 1):
         start_time = format_time(chunk["timestamp"][0])
     return "".join(srt_content)
 def save_srt_to_file(srt_content):
+    """Save SRT content to a temporary file.
+    Args:
+        srt_content (str): The SRT formatted subtitles content to save.
+    Returns:
+        str or None: Path to the temporary file if content was saved,
+                    None if srt_content was empty.
+    Note:
+        The temporary file is created with delete=False to allow it to be
+        used after the function returns. The file should be deleted by the
+        caller when no longer needed.
+    """
     if not srt_content:
         return None
 # Initialize ffmpeg check
 check_ffmpeg()
+# Use T4 GPU if available, otherwise fallback to CPU
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
 logger.info(f"Using device: {device}")
+def create_pipeline():
+    """Create a new pipeline with optimized settings for T4 GPU.
+    Returns:
+        transformers.Pipeline: Configured speech recognition pipeline.
+    """
     return pipeline(
         task="automatic-speech-recognition",
         model=MODEL_NAME,
         device=device,
+        torch_dtype=torch.float16,  # Use float16 for better performance on T4
+        framework="pt",  # Explicitly use PyTorch
+        return_timestamps=True,  # Always return timestamps for better control
+        generate_kwargs={
+            "task": "transcribe",  # Explicitly set transcription task
+            "language": "tg",  # Default to Tajik
+            "condition_on_previous_text": True,  # Use context from previous chunks
+            "compression_ratio_threshold": 1.2,  # Filter out low-quality transcriptions
+            "temperature": 0.0,  # Use greedy decoding for faster inference
+            "no_speech_threshold": 0.6,  # Threshold for detecting speech
+            "logprob_threshold": -1.0,  # Threshold for log probability
+            "best_of": 1,  # Use single best path for faster inference
+        }
     )
+# Initialize pipeline once
+pipe = create_pipeline()
 logger.info(f"Pipeline initialized: {pipe}")
 def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_length_s):
+    """Transcribe audio input using Whisper model.
+    Args:
+        inputs (str): Path to audio file to transcribe.
+        return_timestamps (bool): Whether to include timestamps in output.
+        generate_subs (bool): Whether to generate SRT subtitles.
+        batch_size (int): Number of chunks to process in parallel.
+        chunk_length_s (int): Length of audio chunks in seconds.
+    Returns:
+        tuple: (formatted_result, srt_file, correction_text)
+            - formatted_result (dict): Transcription results
+            - srt_file (str): Path to SRT file if generated, None otherwise
+            - correction_text (str): Empty string for corrections
+    Raises:
+        gr.Error: If no audio file is provided or transcription fails.
+    """
     if inputs is None:
         logger.warning("No audio file submitted")
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     try:
         logger.info(f"Processing audio file: {inputs}")
+        # Calculate optimal chunk and stride lengths based on input
+        stride_length_s = chunk_length_s / 6  # Default stride for better context
+        # Clear CUDA cache before processing
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            logger.debug("Cleared CUDA cache before processing")
+        # Process audio with dynamic chunking
+        result = pipe(
+            inputs,
+            batch_size=batch_size,
+            chunk_length_s=chunk_length_s,
+            stride_length_s=stride_length_s,
+            return_timestamps=return_timestamps
+        )
         logger.debug(f"Pipeline result: {result}")
         # Format response as JSON
             srt_file = save_srt_to_file(srt_content)
             logger.info("SRT subtitles generated successfully")
+        # Clear CUDA cache after processing
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            logger.debug("Cleared CUDA cache after processing")
         return formatted_result, srt_file, ""  # Return empty string for correction textbox
     except Exception as e:
+        # Ensure CUDA cache is cleared even if there's an error
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            logger.debug("Cleared CUDA cache after error")
         logger.exception(f"Error during transcription: {str(e)}")
         raise gr.Error(f"Failed to transcribe audio: {str(e)}")

requirements.txt CHANGED Viewed

	@@ -1,2 +1,2 @@
1	transformers
2	- loguru


1	transformers
2	+ loguru