# Hugging Face Spaces header (scrape artifact, not code) — Space status: Sleeping.
import gradio as gr
import numpy as np
import threading
import time
from components.transcriber import AudioProcessor
from components.gpt import gen_llm_response
from components.streaming import StreamingManager, create_streaming_interface
from config import config

# Shared transcription engine; device/compute settings are resolved from
# `config` inside AudioProcessor ("base.en" = English-only Whisper base model).
processor = AudioProcessor(model_size="base.en")

# Tune buffering for responsiveness: require at least 1 second of audio
# before a transcription pass, and run a pass at most once per second.
# NOTE(review): earlier comments claimed "2 seconds" / "1.5 seconds", which
# did not match these values — the code is taken as authoritative here.
processor.min_process_length = 1 * processor.sample_rate
processor.process_interval = 1

# Wraps the processor for the streaming UI section created further below.
streaming_manager = StreamingManager(processor)
def process_mic_audio(audio):
    """Feed one microphone chunk to the processor and refresh the UI.

    Returns a (status, transcription, llm_response) triple for the three
    bound output components. When no audio arrived, every component is
    left untouched via gr.update().
    """
    if audio is None:
        return gr.update(), gr.update(), gr.update()

    sample_rate, samples = audio
    buffered = processor.add_audio(samples, sample_rate)

    # Give any in-flight transcription up to one second to finish so the
    # text read back below is as fresh as possible.
    processor.wait_for_processing_complete(1.0)
    text = processor.get_transcription()

    # Only consult the LLM when there is actual transcribed text.
    reply = gen_llm_response(text) if text and len(text) > 0 else ""

    sr = processor.sample_rate
    status = (
        f"Buffer: {buffered / sr:.1f}s | "
        f"Processed: {processor.processed_length / sr:.1f}s"
    )
    return status, text, reply
def clear_audio_buffer():
    """Reset the audio buffer; blank both text panes, leave playback as-is."""
    cleared = processor.clear_buffer()
    return cleared, gr.update(), "", ""
def get_current_buffer():
    """Fetch the buffered audio in a form the gr.Audio widget can play."""
    playback = processor.get_playback_audio()
    return playback
def force_transcribe():
    """Flush and transcribe everything still sitting in the buffer.

    Returns (transcription, llm_response); the LLM is consulted only when
    the forced pass produced non-empty text.
    """
    text = processor.force_complete_processing()
    if not text:
        return text, ""
    return text, gen_llm_response(text)
# Build the Gradio interface. Component creation order determines layout;
# event wiring happens after all components exist.
with gr.Blocks(title="Live Speech Transcription") as demo:
    # Header shows which device the transcriber runs on (plus GPU name on CUDA).
    device_info = config.get_device_info()
    device_status = f"🖥️ **Device:** {device_info['device'].upper()}"
    if device_info['cuda_available'] and device_info['device'] == 'cuda':
        device_status += f" | **GPU:** {device_info.get('cuda_device_name', 'Unknown')}"
    gr.Markdown(f"# Live Speech Recognition with LLM Response\n{device_status}")

    with gr.Row():
        # Streaming mic input: chunks are delivered to process_mic_audio below.
        audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Microphone Input")
    with gr.Row():
        status_output = gr.Textbox(label="Buffer Status", interactive=False)
        buffer_audio = gr.Audio(label="Current Buffer (Click to Play)", interactive=False)
    with gr.Row():
        clear_btn = gr.Button("Clear Buffer")
        play_btn = gr.Button("Get Buffer for Playback")
        force_btn = gr.Button("Force Transcribe")
    with gr.Row():
        with gr.Column():
            transcription_display = gr.Textbox(label="Live Transcription", lines=5, interactive=False)
        with gr.Column():
            llm_response_display = gr.Textbox(label="LLM Response", lines=5, interactive=False)

    # Create streaming interface; it supplies its own transcription/LLM
    # output components via the returned dict.
    streaming_components = create_streaming_interface(streaming_manager)

    # NOTE(review): every callback below writes to the streaming components'
    # outputs, not to transcription_display / llm_response_display above —
    # those two textboxes are never wired to any event. Confirm intentional.

    # Connect main interface components
    audio_input.stream(
        process_mic_audio,
        audio_input,
        [status_output, streaming_components['transcription_output'], streaming_components['llm_output']]
    )
    clear_btn.click(
        clear_audio_buffer,
        None,
        [status_output, buffer_audio, streaming_components['transcription_output'], streaming_components['llm_output']]
    )
    play_btn.click(get_current_buffer, None, buffer_audio)
    force_btn.click(
        force_transcribe,
        None,
        [streaming_components['transcription_output'], streaming_components['llm_output']]
    )
| if __name__ == "__main__": | |
| print("🎤 Live Speech Transcription App with LLM") | |
| print("=" * 40) | |
| # Display device configuration | |
| device_info = config.get_device_info() | |
| print("🔧 Configuration:") | |
| print(f" Device: {device_info['device'].upper()}") | |
| print(f" Compute type: {device_info['compute_type']}") | |
| print(f" CUDA available: {device_info['cuda_available']}") | |
| if device_info['cuda_available'] and device_info['device'] == 'cuda': | |
| print(f" GPU: {device_info.get('cuda_device_name', 'Unknown')}") | |
| memory_gb = device_info.get('cuda_memory_total', 0) / (1024**3) | |
| print(f" GPU Memory: {memory_gb:.1f} GB") | |
| print("\nFeatures:") | |
| print("• Real-time microphone transcription") | |
| print("• Audio buffer playback") | |
| print("• LLM responses displayed in UI") | |
| print("• RoBERTa+ hybrid question detection") | |
| # Launch the interface | |
| demo.launch() | |