Spaces:

Gapeleon
/

snac_test

Running

App Files Community

Gapeleon committed on Apr 9

Commit

dd3ef50

verified ·

1 Parent(s): d54f19d

Update app.py

Browse files

Files changed (1) hide show

app.py +136 -160

app.py CHANGED Viewed

@@ -1,178 +1,154 @@
-import gradio as gr
 import torch
 import torchaudio
-import torchaudio.transforms as T
-import numpy as np
-import traceback
-import io
 import time
-# Attempt to import SNAC (should work if requirements.txt is correct)
-try:
-    from snac import SNAC
-    print("SNAC module imported successfully.")
-except ImportError as e:
-    print(f"Error importing SNAC: {e}")
-    # Raise a more informative error if SNAC isn't installed
-    raise ImportError("Could not import SNAC. Make sure 'snac' is listed in requirements.txt and installed correctly.") from e
-# --- Configuration ---
-TARGET_SR = 24000 # SNAC operates at 24kHz
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Using device: {DEVICE}")
-# --- Load Model (Load once globally) ---
-snac_model = None
-try:
-    print("Loading SNAC model...")
     start_time = time.time()
-    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
-    snac_model = snac_model.to(DEVICE)
-    snac_model.eval() # Set model to evaluation mode
-    end_time = time.time()
-    print(f"SNAC model loaded successfully to {DEVICE}. Time taken: {end_time - start_time:.2f} seconds.")
-except Exception as e:
-    print(f"FATAL: Error loading SNAC model: {e}")
-    print(traceback.format_exc())
-    # If the model fails to load, the app can't function.
-    # Gradio will likely show an error, but we print specifics here.
-# --- Main Processing Function ---
-def process_audio(audio_filepath):
-    """
-    Loads, resamples, encodes, decodes audio using SNAC, and returns results.
-    """
-    if snac_model is None:
-        return None, None, None, "Error: SNAC model could not be loaded. Cannot process audio."
-    if audio_filepath is None:
-        return None, None, None, "Please upload an audio file."
-    logs = ["--- Starting Audio Processing ---"]
     try:
-        # 1. Load Audio
-        logs.append(f"Loading audio file: {audio_filepath}")
-        load_start = time.time()
-        original_waveform, original_sr = torchaudio.load(audio_filepath)
-        load_end = time.time()
-        logs.append(f"Audio loaded. Original SR: {original_sr} Hz, Shape: {original_waveform.shape}, Time: {load_end - load_start:.2f}s")
-        # Ensure float32
-        original_waveform = original_waveform.to(dtype=torch.float32)
-        # Handle multi-channel audio: Use the first channel
-        if original_waveform.shape[0] > 1:
-            logs.append(f"Warning: Input audio has {original_waveform.shape[0]} channels. Using only the first channel.")
-            original_waveform = original_waveform[0:1, :] # Keep channel dim for consistency initially
-        # --- Prepare Original for Playback ---
-        # Gradio Audio component expects (sample_rate, numpy_array)
-        # Ensure numpy array is 1D or 2D [channels, samples]
-        original_audio_playback = (original_sr, original_waveform.squeeze().numpy()) # Squeeze removes channel dim if 1
-        logs.append("Prepared original audio for playback.")
-        # 2. Resample if necessary
-        resample_start = time.time()
-        if original_sr != TARGET_SR:
-            logs.append(f"Resampling waveform from {original_sr} Hz to {TARGET_SR} Hz...")
-            resampler = T.Resample(orig_freq=original_sr, new_freq=TARGET_SR).to(original_waveform.device) # Resampler on same device
-            waveform_to_encode = resampler(original_waveform)
-            logs.append(f"Resampling complete. New Shape: {waveform_to_encode.shape}")
         else:
-            logs.append("Waveform is already at the target sample rate (24kHz).")
-            waveform_to_encode = original_waveform
-        resample_end = time.time()
-        logs.append(f"Resampling time: {resample_end - resample_start:.2f}s")
-        # --- Prepare Resampled for Playback ---
-        resampled_audio_playback = (TARGET_SR, waveform_to_encode.squeeze().numpy())
-        logs.append("Prepared resampled audio for playback.")
-        # 3. Prepare for SNAC Encoding (add batch dim, move to device)
-        # Input should be [Batch, Channel, Time] = [1, 1, Time]
-        # waveform_to_encode should currently be [1, Time] after channel selection/resampling
-        waveform_batch = waveform_to_encode.unsqueeze(0).to(DEVICE) # Add batch dimension -> [1, 1, Time]
-        logs.append(f"Waveform prepared for encoding. Shape: {waveform_batch.shape}, Device: {DEVICE}")
-        # 4. Encode Audio using SNAC
-        logs.append("Encoding audio with snac_model.encode()...")
-        encode_start = time.time()
-        with torch.inference_mode():
-            codes = snac_model.encode(waveform_batch)
-        encode_end = time.time()
-        if not codes or not all(isinstance(c, torch.Tensor) for c in codes):
-             log_msg = f"Encoding failed: Expected list of Tensors, but got: {type(codes)}"
-             if isinstance(codes, list):
-                 log_msg += f" with types {[type(c) for c in codes]}"
-             logs.append(log_msg)
-             raise ValueError(log_msg)
-        logs.append(f"Encoding complete. Received {len(codes)} code layers. Time: {encode_end - encode_start:.2f}s")
-        for i, layer_codes in enumerate(codes):
-             logs.append(f"  Layer {i+1} codes shape: {layer_codes.shape}, Device: {layer_codes.device}")
-        # 5. Decode the Codes using SNAC
-        logs.append("Decoding the generated codes with snac_model.decode()...")
-        decode_start = time.time()
-        with torch.inference_mode():
-            reconstructed_waveform = snac_model.decode(codes) # codes are already on DEVICE
-        decode_end = time.time()
-        logs.append(f"Decoding complete. Reconstructed waveform shape: {reconstructed_waveform.shape}, Device: {reconstructed_waveform.device}. Time: {decode_end - decode_start:.2f}s")
-        # 6. Prepare Reconstructed Audio for Playback
-        # Output is [Batch, 1, Time]. Move to CPU, remove Batch/Channel, convert to NumPy.
-        reconstructed_audio_np = reconstructed_waveform.cpu().squeeze().numpy() # Squeeze removes Batch and Channel dims
-        logs.append(f"Reconstructed audio prepared for playback. Shape: {reconstructed_audio_np.shape}")
-        reconstructed_audio_playback = (TARGET_SR, reconstructed_audio_np)
-        logs.append("\n--- Audio Processing Completed Successfully ---")
-        return original_audio_playback, resampled_audio_playback, reconstructed_audio_playback, "\n".join(logs)
     except Exception as e:
-        logs.append("\n--- An Error Occurred ---")
-        logs.append(f"Error Type: {type(e).__name__}")
-        logs.append(f"Error Details: {e}")
-        logs.append("\n--- Traceback ---")
-        logs.append(traceback.format_exc())
-        # Return None for audio components on error, and the detailed log
-        return None, None, None, "\n".join(logs)
-# --- Gradio Interface ---
-DESCRIPTION = """
-This Space demonstrates the **SNAC (Scalable Neural Audio Codec)** model (`hubertsiuzdak/snac_24khz`).
-1. Upload an audio file (wav, mp3, flac, etc.).
-2. The audio will be automatically resampled to 24kHz if needed.
-3. The 24kHz audio is encoded into discrete codes by SNAC.
-4. These codes are then decoded back into audio by SNAC.
-5. You can listen to the original, the 24kHz version (if resampled), and the final reconstructed audio.
-**Note:** Processing happens on the server. Larger files will take longer. If the input is stereo, only the first channel is processed.
 """
 iface = gr.Interface(
-    fn=process_audio,
-    inputs=gr.Audio(type="filepath", label="Upload Audio File"),
     outputs=[
-        gr.Audio(label="Original Audio"),
-        gr.Audio(label="Resampled Audio (24kHz Input to SNAC)"),
-        gr.Audio(label="Reconstructed Audio (Output from SNAC)"),
-        gr.Textbox(label="Log Output", lines=15)
-    ],
-    title="SNAC Audio Codec Demo (24kHz)",
-    description=DESCRIPTION,
-    examples=[
-        # Add paths to example audio files if you upload some to your Space repo
-        # ["examples/example1.wav"],
-        # ["examples/example2.mp3"],
     ],
-    cache_examples=False # Disable caching if examples change or have issues
 )
 if __name__ == "__main__":
-    if snac_model is None:
-        print("Cannot launch Gradio interface because SNAC model failed to load.")
-    else:
-        print("Launching Gradio Interface...")
-        iface.launch()

 import torch
 import torchaudio
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, BitsAndBytesConfig
+import gradio as gr
+import os
 import time
+import numpy as np
+# Load model and processor (runs once on startup)
+model_name = "ibm-granite/granite-speech-3.2-8b"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
+print("Loading processor...")
+speech_granite_processor = AutoProcessor.from_pretrained(
+    model_name, trust_remote_code=True)
+tokenizer = speech_granite_processor.tokenizer
+print("Loading model with 4-bit quantization...")
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True
+)
+speech_granite = AutoModelForSpeechSeq2Seq.from_pretrained(
+    model_name,
+    quantization_config=quantization_config,
+    device_map="auto",
+    trust_remote_code=True
+)
+print("Model loaded successfully")
+def transcribe_audio(audio_input):
+    """Process audio input and return transcription"""
     start_time = time.time()
+    logs = [f"Audio input received: {type(audio_input)}"]
+    if audio_input is None:
+        return "Error: No audio provided.", 0.0
     try:
+        # Handle different audio input formats
+        if isinstance(audio_input, tuple) and len(audio_input) == 2:
+            # Microphone input: (sample_rate, numpy_array)
+            logs.append("Processing microphone input")
+            sr, wav_np = audio_input
+            wav = torch.from_numpy(wav_np).float().unsqueeze(0)
         else:
+            # File input: filepath string
+            logs.append(f"Processing file input: {audio_input}")
+            wav, sr = torchaudio.load(audio_input)
+            logs.append(f"Loaded audio file with sample rate {sr}Hz and shape {wav.shape}")
+        # Convert to mono if stereo
+        if wav.shape[0] > 1:
+            wav = torch.mean(wav, dim=0, keepdim=True)
+            logs.append("Converted stereo to mono")
+        # Resample to 16kHz if needed
+        if sr != 16000:
+            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
+            wav = resampler(wav)
+            sr = 16000
+            logs.append(f"Resampled to {sr}Hz")
+        logs.append(f"Final audio: sample rate {sr}Hz, shape {wav.shape}, min: {wav.min().item()}, max: {wav.max().item()}")
+        # Create text prompt
+        chat = [
+            {
+                "role": "system",
+                "content": "Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant",
+            },
+            {
+                "role": "user",
+                "content": "<|audio|>can you transcribe the speech into a written format?",
+            }
+        ]
+        text = tokenizer.apply_chat_template(
+            chat, tokenize=False, add_generation_prompt=True
+        )
+        # Compute audio embeddings
+        logs.append("Preparing model inputs")
+        model_inputs = speech_granite_processor(
+            text=text,
+            audio=wav.numpy().squeeze(),  # Convert to numpy and squeeze
+            sampling_rate=sr,
+            return_tensors="pt",
+        ).to(device)
+        # Generate transcription
+        logs.append("Generating transcription")
+        model_outputs = speech_granite.generate(
+            **model_inputs,
+            max_new_tokens=1000,
+            num_beams=4,
+            do_sample=False,
+            min_length=1,
+            top_p=1.0,
+            repetition_penalty=3.0,
+            length_penalty=1.0,
+            temperature=1.0,
+            bos_token_id=tokenizer.bos_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+            pad_token_id=tokenizer.pad_token_id,
+        )
+        # Extract the generated text (skipping input tokens)
+        logs.append("Processing output")
+        num_input_tokens = model_inputs["input_ids"].shape[-1]
+        new_tokens = torch.unsqueeze(model_outputs[0, num_input_tokens:], dim=0)
+        output_text = tokenizer.batch_decode(
+            new_tokens, add_special_tokens=False, skip_special_tokens=True
+        )
+        transcription = output_text[0].strip().upper()
+        logs.append(f"Transcription complete: {transcription[:50]}...")
     except Exception as e:
+        import traceback
+        error_trace = traceback.format_exc()
+        print(error_trace)
+        print("\n".join(logs))
+        return f"Error: {str(e)}\n\nLogs:\n" + "\n".join(logs), 0.0
+    processing_time = round(time.time() - start_time, 2)
+    return transcription, processing_time
+# Create Gradio interface
+title = "IBM Granite Speech-to-Text (8B Quantized)"
+description = """
+Transcribe speech using IBM's Granite Speech 3.2 8B model (loaded in 4-bit).
+Upload an audio file or use your microphone to record speech.
 """
 iface = gr.Interface(
+    fn=transcribe_audio,
+    inputs=gr.Audio(sources=["upload", "microphone"], type="filepath"),
     outputs=[
+        gr.Textbox(label="Transcription", lines=5),
+        gr.Number(label="Processing Time (seconds)")
     ],
+    title=title,
+    description=description,
 )
 if __name__ == "__main__":
+    iface.launch()