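"""Live speech transcription demo.

Gradio app that streams microphone audio into an AudioProcessor for
transcription and forwards the transcript to an LLM (gen_llm_response),
showing the live transcription and the model's response side by side.
"""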
import gradio as gr
from components.transcriber import AudioProcessor
from components.gpt import gen_llm_response
from components.streaming import StreamingManager, create_streaming_interface
from config import config

# Create processor instance with configuration-based device settings
processor = AudioProcessor(model_size="base.en")

# Adjust some settings for better quality
processor.min_process_length = 1 * processor.sample_rate  # Require at least 1 second of audio before processing
processor.process_interval = 1  # Process at most once per second

# Create streaming manager
streaming_manager = StreamingManager(processor)

def process_mic_audio(audio):
    """Process audio from Gradio microphone and update transcription"""
    if audio is None:
        return gr.update(), gr.update(), gr.update()

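    # Gradio streams microphone audio as (sample_rate, samples) tuples, one per chunk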
    sr, y = audio

    # Add to processor and possibly trigger transcription
    buffer_size = processor.add_audio(y, sr)

    # Wait (up to 1 second) for any pending processing to complete before reading the transcription
    processor.wait_for_processing_complete(1.0)

    # Get current transcription
    transcription = processor.get_transcription()

    # Send transcription to LLM and get response
    llm_response = ""
    if transcription and len(transcription) > 0:
        llm_response = gen_llm_response(transcription)

    # Return status update, original transcription, and LLM response
    buffer_seconds = buffer_size / processor.sample_rate
    return (
        f"Buffer: {buffer_seconds:.1f}s | Processed: {processor.processed_length/processor.sample_rate:.1f}s",
        transcription,
        llm_response
    )

def clear_audio_buffer():
    """Clear the audio buffer"""
    return processor.clear_buffer(), gr.update(), "", ""

def get_current_buffer():
    """Get the current buffer for playback"""
    return processor.get_playback_audio()

def force_transcribe():
    """Force transcription of current buffer"""
    # Force complete processing of all remaining audio
    transcription = processor.force_complete_processing()

    # Send to LLM and get response
    llm_response = ""
    if transcription and len(transcription) > 0:
        llm_response = gen_llm_response(transcription)

    return transcription, llm_response

# Create Gradio interface
with gr.Blocks(title="Live Speech Transcription") as demo:
    device_info = config.get_device_info()
    device_status = f"🖥️ **Device:** {device_info['device'].upper()}"
    if device_info['cuda_available'] and device_info['device'] == 'cuda':
        device_status += f" | **GPU:** {device_info.get('cuda_device_name', 'Unknown')}"

    gr.Markdown(f"# Live Speech Recognition with LLM Response\n{device_status}")

    with gr.Row():
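        # streaming=True makes Gradio deliver audio chunks continuously to the stream() handler wired up below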
        audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Microphone Input")

    with gr.Row():
        status_output = gr.Textbox(label="Buffer Status", interactive=False)
        buffer_audio = gr.Audio(label="Current Buffer (Click to Play)", interactive=False)

    with gr.Row():
        clear_btn = gr.Button("Clear Buffer")
        play_btn = gr.Button("Get Buffer for Playback")
        force_btn = gr.Button("Force Transcribe")

    with gr.Row():
        with gr.Column():
            transcription_display = gr.Textbox(label="Live Transcription", lines=5, interactive=False)
        with gr.Column():
            llm_response_display = gr.Textbox(label="LLM Response", lines=5, interactive=False)

    # Create streaming interface
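    # create_streaming_interface is expected to return a dict with 'transcription_output' and 'llm_output' components (used below)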
    streaming_components = create_streaming_interface(streaming_manager)

    # Connect main interface components
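    # stream() fires for each incoming audio chunk; its outputs update the streaming components created above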
    audio_input.stream(
        process_mic_audio,
        audio_input,
        [status_output, streaming_components['transcription_output'], streaming_components['llm_output']]
    )

    clear_btn.click(
        clear_audio_buffer,
        None,
        [status_output, buffer_audio, streaming_components['transcription_output'], streaming_components['llm_output']]
    )
    play_btn.click(get_current_buffer, None, buffer_audio)
    force_btn.click(
        force_transcribe,
        None,
        [streaming_components['transcription_output'], streaming_components['llm_output']]
    )

if __name__ == "__main__":
    print("🎤 Live Speech Transcription App with LLM")
    print("=" * 40)

    # Display device configuration
    device_info = config.get_device_info()
    print("🔧 Configuration:")
    print(f"   Device: {device_info['device'].upper()}")
    print(f"   Compute type: {device_info['compute_type']}")
    print(f"   CUDA available: {device_info['cuda_available']}")
    if device_info['cuda_available'] and device_info['device'] == 'cuda':
        print(f"   GPU: {device_info.get('cuda_device_name', 'Unknown')}")
        memory_gb = device_info.get('cuda_memory_total', 0) / (1024**3)
        print(f"   GPU Memory: {memory_gb:.1f} GB")

    print("\nFeatures:")
    print("• Real-time microphone transcription")
    print("• Audio buffer playback")
    print("• LLM responses displayed in UI")
    print("• RoBERTa+ hybrid question detection")

    # Launch the interface
    demo.launch()