Files changed:
- app.py (+24 -59)
- requirements.txt (+0 -2)
app.py
CHANGED
@@ -1,12 +1,16 @@
-import torch
 import gradio as gr
 import subprocess
 import datetime
 import tempfile
-
+import requests
 from loguru import logger
+from os import getenv
 
-
+API_URL = getenv("API_URL")
+headers = {
+    "Accept": "application/json",
+    "Content-Type": "audio/flac"
+}
 
 def format_time(seconds):
     """Convert seconds to SRT time format (HH:MM:SS,mmm).
@@ -66,7 +70,7 @@ def generate_srt(chunks):
     for i, chunk in enumerate(chunks, 1):
         start_time = format_time(chunk["timestamp"][0])
         end_time = format_time(chunk["timestamp"][1])
-        text = chunk
+        text = chunk.get("text", "").strip()
         srt_content.append(f"{i}\n{start_time} --> {end_time}\n{text}\n\n")
     return "".join(srt_content)
 
@@ -106,35 +110,13 @@ def check_ffmpeg():
 # Initialize ffmpeg check
 check_ffmpeg()
 
-
-
-logger.info(f"Using device: {device}")
-
-def create_pipeline():
-    """Create a new pipeline with optimized settings for T4 GPU.
-
-    Returns:
-        transformers.Pipeline: Configured speech recognition pipeline.
-    """
-    return pipeline(
-        task="automatic-speech-recognition",
-        model=MODEL_NAME,
-        device=device,
-    )
-
-# Initialize pipeline once
-pipe = create_pipeline()
-logger.info(f"Pipeline initialized: {pipe}")
-
-def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_length_s):
-    """Transcribe audio input using Whisper model.
+def transcribe(inputs, return_timestamps, generate_subs):
+    """Transcribe audio input using Whisper model via Hugging Face Inference API.
 
     Args:
         inputs (str): Path to audio file to transcribe.
         return_timestamps (bool): Whether to include timestamps in output.
         generate_subs (bool): Whether to generate SRT subtitles.
-        batch_size (int): Number of chunks to process in parallel.
-        chunk_length_s (int): Length of audio chunks in seconds.
 
     Returns:
         tuple: (formatted_result, srt_file, correction_text)
@@ -152,27 +134,20 @@ def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_lengt
     try:
         logger.info(f"Processing audio file: {inputs}")
 
-        #
-
+        # Read the audio file
+        with open(inputs, "rb") as f:
+            data = f.read()
 
-        #
-
-
-
+        # Send request to API
+        response = requests.post(API_URL, headers=headers, data=data)
+        response.raise_for_status()  # Raise an exception for bad status codes
+        result = response.json()
 
-
-        result = pipe(
-            inputs,
-            batch_size=batch_size,
-            chunk_length_s=chunk_length_s,
-            stride_length_s=stride_length_s,
-            return_timestamps="word" if return_timestamps else False
-        )
-        logger.debug(f"Pipeline result: {result}")
+        logger.debug(f"API response: {result}")
 
         # Format response as JSON
         formatted_result = {
-            "text": result
+            "text": result.get("text", "")
         }
 
         chunks = []
@@ -208,17 +183,11 @@ def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_lengt
             srt_file = save_srt_to_file(srt_content)
             logger.info("SRT subtitles generated successfully")
 
-        # Clear CUDA cache after processing
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            logger.debug("Cleared CUDA cache after processing")
-
         return formatted_result, srt_file, ""  # Return empty string for correction textbox
+    except requests.exceptions.RequestException as e:
+        logger.exception(f"API request failed: {str(e)}")
+        raise gr.Error(f"Failed to transcribe audio: API request failed - {str(e)}")
     except Exception as e:
-        # Ensure CUDA cache is cleared even if there's an error
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            logger.debug("Cleared CUDA cache after error")
         logger.exception(f"Error during transcription: {str(e)}")
         raise gr.Error(f"Failed to transcribe audio: {str(e)}")
 
@@ -232,8 +201,6 @@ mf_transcribe = gr.Interface(
         gr.Audio(sources="microphone", type="filepath"),
         gr.Checkbox(label="Include timestamps", value=True),
         gr.Checkbox(label="Generate subtitles", value=True),
-        gr.Slider(minimum=1, maximum=64, value=8, step=1, label="Batch Size"),
-        gr.Slider(minimum=5, maximum=30, value=30, step=5, label="Chunk Length (seconds)"),
     ],
     outputs=[
         gr.JSON(label="Transcription", open=True),
@@ -242,7 +209,7 @@ mf_transcribe = gr.Interface(
     title="Whisper Large V3 Turbo: Transcribe Audio",
     description=(
         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
-        f" checkpoint [{
+        f" checkpoint [{API_URL}](https://huggingface.co/{API_URL}) and 🤗 Transformers to transcribe audio files"
         " of arbitrary length."
     )
 )
@@ -253,8 +220,6 @@ file_transcribe = gr.Interface(
         gr.Audio(sources="upload", type="filepath", label="Audio file"),
         gr.Checkbox(label="Include timestamps", value=True),
         gr.Checkbox(label="Generate subtitles", value=True),
-        gr.Slider(minimum=1, maximum=32, value=8, step=1, label="Batch Size"),
-        gr.Slider(minimum=5, maximum=30, value=15, step=5, label="Chunk Length (seconds)"),
     ],
    outputs=[
         gr.JSON(label="Transcription", open=True),
@@ -263,7 +228,7 @@ file_transcribe = gr.Interface(
     title="Whisper Large V3: Transcribe Audio",
     description=(
         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
-        f" checkpoint [{
+        f" checkpoint [{API_URL}](https://huggingface.co/{API_URL}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
     )
 )
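For reference, the refactored transcribe path is now a single HTTP round trip against the endpoint configured in API_URL. The sketch below exercises that path outside Gradio; the sample file name and the optional Authorization header are assumptions (the diff only sets Accept and Content-Type), and only the "text" field of the response is relied upon by the code above.

# Minimal sketch of the new inference path (not part of the Space itself).
# Assumes API_URL points at a Whisper endpoint that accepts raw FLAC bytes,
# e.g. a Hugging Face Inference API / Inference Endpoint URL.
from os import getenv

import requests

API_URL = getenv("API_URL")
headers = {
    "Accept": "application/json",
    "Content-Type": "audio/flac",
    # Depending on the endpoint, a token may also be required (assumption):
    # "Authorization": f"Bearer {getenv('HF_TOKEN')}",
}

with open("sample.flac", "rb") as f:  # hypothetical input file
    data = f.read()

response = requests.post(API_URL, headers=headers, data=data)
response.raise_for_status()
result = response.json()
print(result.get("text", ""))  # the app only relies on the "text" field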
requirements.txt
CHANGED
@@ -1,4 +1,2 @@
-transformers
 loguru
-torch
 gradio
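With torch and transformers dropped, the Space's declared dependencies shrink to loguru and gradio. Note that the new app.py also imports requests, which is assumed to be available in the runtime image or pulled in transitively; a quick check like the sketch below can confirm the trimmed environment still satisfies app.py's imports.

# Sanity-check sketch (not part of the Space): verify the packages app.py
# imports are importable after the dependency cut. `requests` is assumed to be
# present even though it is not listed in requirements.txt.
import importlib

for name in ("gradio", "loguru", "requests"):
    try:
        importlib.import_module(name)
        print(f"{name}: OK")
    except ImportError:
        print(f"{name}: missing - consider adding it to requirements.txt")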