# Testing / app.py
# Author: Sidak Singh
# Commit 7b7db64 — "question boundary works"
import gradio as gr
import numpy as np
import threading
import time
from components.transcriber import AudioProcessor
from components.gpt import gen_llm_response
from components.streaming import StreamingManager, create_streaming_interface
from config import config
# Create processor instance; device/compute-type selection happens inside
# AudioProcessor (driven by the project config).
processor = AudioProcessor(model_size="base.en")
# Tune buffering for responsiveness (min_process_length is in samples).
processor.min_process_length = 1 * processor.sample_rate # Need at least 1 second before processing
processor.process_interval = 1 # Process at most every 1 second
# Create streaming manager that wraps the processor for the streaming UI.
streaming_manager = StreamingManager(processor)
def process_mic_audio(audio):
    """Handle one streamed microphone chunk from Gradio.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray] | None
        ``(sample_rate, samples)`` pair delivered by the streaming
        ``gr.Audio`` input, or ``None`` when no audio is available.

    Returns
    -------
    tuple
        ``(buffer_status, transcription, llm_response)``; a triple of
        ``gr.update()`` no-ops when there is nothing to process.
    """
    if audio is None:
        # Nothing arrived this tick — leave all three outputs unchanged.
        return gr.update(), gr.update(), gr.update()
    sr, y = audio
    # Append the chunk; the processor decides internally whether enough
    # audio has accumulated to trigger a transcription pass.
    buffer_size = processor.add_audio(y, sr)
    # Wait (up to 1s) for any in-flight transcription so the text read
    # below reflects the audio just added.
    processor.wait_for_processing_complete(1.0)
    transcription = processor.get_transcription()
    # Only invoke the LLM when there is transcribed text to respond to
    # (plain truthiness replaces the redundant len() > 0 check).
    llm_response = gen_llm_response(transcription) if transcription else ""
    buffer_seconds = buffer_size / processor.sample_rate
    return (
        f"Buffer: {buffer_seconds:.1f}s | Processed: {processor.processed_length/processor.sample_rate:.1f}s",
        transcription,
        llm_response,
    )
def clear_audio_buffer():
    """Reset the processor's audio buffer and blank the text outputs.

    Returns the processor's clear-buffer result (shown as buffer status),
    an unchanged buffer-audio widget, and two empty strings for the
    transcription and LLM displays.
    """
    cleared_status = processor.clear_buffer()
    return cleared_status, gr.update(), "", ""
def get_current_buffer():
    """Fetch the currently buffered audio from the processor for playback."""
    playback = processor.get_playback_audio()
    return playback
def force_transcribe():
    """Flush all buffered audio through the transcriber and query the LLM.

    Returns
    -------
    tuple
        ``(transcription, llm_response)``; ``llm_response`` is ``""`` when
        no speech was transcribed.
    """
    # Force the processor to transcribe everything remaining in its buffer.
    transcription = processor.force_complete_processing()
    # Only call the LLM when transcription produced text (plain truthiness
    # replaces the redundant len() > 0 check, consistent with the stream
    # handler).
    llm_response = gen_llm_response(transcription) if transcription else ""
    return transcription, llm_response
# ---------------------------------------------------------------------------
# Gradio UI layout and event wiring.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Live Speech Transcription") as demo:
    # Header showing the resolved compute device (and GPU name when on CUDA).
    device_info = config.get_device_info()
    device_status = f"🖥️ **Device:** {device_info['device'].upper()}"
    if device_info['cuda_available'] and device_info['device'] == 'cuda':
        device_status += f" | **GPU:** {device_info.get('cuda_device_name', 'Unknown')}"
    gr.Markdown(f"# Live Speech Recognition with LLM Response\n{device_status}")
    with gr.Row():
        # Streaming microphone input: chunks are pushed to process_mic_audio.
        audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Microphone Input")
    with gr.Row():
        status_output = gr.Textbox(label="Buffer Status", interactive=False)
        buffer_audio = gr.Audio(label="Current Buffer (Click to Play)", interactive=False)
    with gr.Row():
        clear_btn = gr.Button("Clear Buffer")
        play_btn = gr.Button("Get Buffer for Playback")
        force_btn = gr.Button("Force Transcribe")
    with gr.Row():
        with gr.Column():
            # NOTE(review): transcription_display and llm_response_display are
            # created here but never used as event outputs below — all
            # callbacks write to streaming_components' outputs instead.
            # Confirm whether these two textboxes are dead UI or should be the
            # real targets of the wiring.
            transcription_display = gr.Textbox(label="Live Transcription", lines=5, interactive=False)
        with gr.Column():
            llm_response_display = gr.Textbox(label="LLM Response", lines=5, interactive=False)
    # Extra streaming controls/outputs supplied by the streaming module;
    # exposes 'transcription_output' and 'llm_output' components.
    streaming_components = create_streaming_interface(streaming_manager)
    # Microphone stream → buffer status + transcription + LLM response.
    audio_input.stream(
        process_mic_audio,
        audio_input,
        [status_output, streaming_components['transcription_output'], streaming_components['llm_output']]
    )
    # Clear resets the buffer and blanks both text outputs.
    clear_btn.click(
        clear_audio_buffer,
        None,
        [status_output, buffer_audio, streaming_components['transcription_output'], streaming_components['llm_output']]
    )
    play_btn.click(get_current_buffer, None, buffer_audio)
    # Force-transcribe flushes whatever audio remains in the buffer.
    force_btn.click(
        force_transcribe,
        None,
        [streaming_components['transcription_output'], streaming_components['llm_output']]
    )
if __name__ == "__main__":
    # Startup banner.
    print("🎤 Live Speech Transcription App with LLM")
    print("=" * 40)

    # Report the resolved device configuration before launching the UI.
    device_info = config.get_device_info()
    print("🔧 Configuration:")
    print(f" Device: {device_info['device'].upper()}")
    print(f" Compute type: {device_info['compute_type']}")
    print(f" CUDA available: {device_info['cuda_available']}")
    is_cuda = device_info['cuda_available'] and device_info['device'] == 'cuda'
    if is_cuda:
        print(f" GPU: {device_info.get('cuda_device_name', 'Unknown')}")
        memory_gb = device_info.get('cuda_memory_total', 0) / (1024**3)
        print(f" GPU Memory: {memory_gb:.1f} GB")

    # Feature summary.
    features = [
        "• Real-time microphone transcription",
        "• Audio buffer playback",
        "• LLM responses displayed in UI",
        "• RoBERTa+ hybrid question detection",
    ]
    print("\nFeatures:")
    for feature in features:
        print(feature)

    # Start the Gradio server.
    demo.launch()