"""Live speech transcription Gradio app.

Streams microphone audio into an AudioProcessor (Whisper-based), displays the
rolling transcription, and forwards it to an LLM for a response shown in the UI.
"""

import gradio as gr
import numpy as np
import threading
import time

from components.transcriber import AudioProcessor
from components.gpt import gen_llm_response
from components.streaming import StreamingManager, create_streaming_interface
from config import config

# Create processor instance with configuration-based device settings
processor = AudioProcessor(model_size="base.en")

# Adjust some settings for better quality
processor.min_process_length = 1 * processor.sample_rate  # need at least 1 second of audio before processing
processor.process_interval = 1  # process at most every 1 second

# Create streaming manager
streaming_manager = StreamingManager(processor)


def process_mic_audio(audio):
    """Process one streamed microphone chunk and refresh the UI outputs.

    Args:
        audio: Gradio streaming payload, a ``(sample_rate, samples)`` tuple,
            or ``None`` when no audio was captured.

    Returns:
        Tuple of (buffer-status string, transcription text, LLM response text),
        or three ``gr.update()`` no-ops when ``audio`` is ``None``.
    """
    if audio is None:
        return gr.update(), gr.update(), gr.update()

    sr, y = audio

    # Add to processor and possibly trigger transcription
    buffer_size = processor.add_audio(y, sr)

    # Wait (up to 1s) for pending processing to finish so we don't read a
    # stale partial transcription.
    processor.wait_for_processing_complete(1.0)

    # Get current transcription
    transcription = processor.get_transcription()

    # Only query the LLM when there is actual text to respond to.
    llm_response = gen_llm_response(transcription) if transcription else ""

    # Return status update, original transcription, and LLM response
    buffer_seconds = buffer_size / processor.sample_rate
    return (
        f"Buffer: {buffer_seconds:.1f}s | Processed: {processor.processed_length/processor.sample_rate:.1f}s",
        transcription,
        llm_response,
    )


def clear_audio_buffer():
    """Clear the audio buffer and blank out the UI outputs."""
    return processor.clear_buffer(), gr.update(), "", ""


def get_current_buffer():
    """Return the current audio buffer for playback in the UI."""
    return processor.get_playback_audio()


def force_transcribe():
    """Force transcription of all audio remaining in the buffer.

    Returns:
        Tuple of (transcription text, LLM response text).
    """
    # Force complete processing of all remaining audio
    transcription = processor.force_complete_processing()

    # Send to LLM only when there is text to respond to.
    llm_response = gen_llm_response(transcription) if transcription else ""
    return transcription, llm_response


# Create Gradio interface
with gr.Blocks(title="Live Speech Transcription") as demo:
    device_info = config.get_device_info()
    device_status = f"🖥️ **Device:** {device_info['device'].upper()}"
    if device_info['cuda_available'] and device_info['device'] == 'cuda':
        device_status += f" | **GPU:** {device_info.get('cuda_device_name', 'Unknown')}"

    gr.Markdown(f"# Live Speech Recognition with LLM Response\n{device_status}")

    with gr.Row():
        audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Microphone Input")

    with gr.Row():
        status_output = gr.Textbox(label="Buffer Status", interactive=False)
        buffer_audio = gr.Audio(label="Current Buffer (Click to Play)", interactive=False)

    with gr.Row():
        clear_btn = gr.Button("Clear Buffer")
        play_btn = gr.Button("Get Buffer for Playback")
        force_btn = gr.Button("Force Transcribe")

    # NOTE(review): these two textboxes are never wired to any event handler —
    # all outputs below target streaming_components instead. Confirm whether
    # they are intentionally unused or should receive the handler outputs.
    with gr.Row():
        with gr.Column():
            transcription_display = gr.Textbox(label="Live Transcription", lines=5, interactive=False)
        with gr.Column():
            llm_response_display = gr.Textbox(label="LLM Response", lines=5, interactive=False)

    # Create streaming interface
    streaming_components = create_streaming_interface(streaming_manager)

    # Connect main interface components
    audio_input.stream(
        process_mic_audio,
        audio_input,
        [status_output, streaming_components['transcription_output'], streaming_components['llm_output']],
    )
    clear_btn.click(
        clear_audio_buffer,
        None,
        [status_output, buffer_audio, streaming_components['transcription_output'], streaming_components['llm_output']],
    )
    play_btn.click(get_current_buffer, None, buffer_audio)
    force_btn.click(
        force_transcribe,
        None,
        [streaming_components['transcription_output'], streaming_components['llm_output']],
    )


if __name__ == "__main__":
    print("🎤 Live Speech Transcription App with LLM")
    print("=" * 40)

    # Display device configuration
    device_info = config.get_device_info()
    print("🔧 Configuration:")
    print(f" Device: {device_info['device'].upper()}")
    print(f" Compute type: {device_info['compute_type']}")
    print(f" CUDA available: {device_info['cuda_available']}")
    if device_info['cuda_available'] and device_info['device'] == 'cuda':
        print(f" GPU: {device_info.get('cuda_device_name', 'Unknown')}")
        memory_gb = device_info.get('cuda_memory_total', 0) / (1024**3)
        print(f" GPU Memory: {memory_gb:.1f} GB")

    print("\nFeatures:")
    print("• Real-time microphone transcription")
    print("• Audio buffer playback")
    print("• LLM responses displayed in UI")
    print("• RoBERTa+ hybrid question detection")

    # Launch the interface
    demo.launch()