Spaces:
Sleeping
Sleeping
File size: 5,154 Bytes
cf0dcc0 acbd561 7b7db64 97fbbc1 7b7db64 a748eff 8b3bbb3 7b7db64 97fbbc1 bc075a6 7b7db64 7b7174c bc075a6 7b7174c bc075a6 7b7174c 7b7db64 bc075a6 66a7fab 7b7db64 7b7174c 7b7db64 bc075a6 8b3bbb3 7b7db64 bc075a6 f6b199b bc075a6 7b7db64 f6b199b bc075a6 f6b199b 8b3bbb3 7b7db64 8b3bbb3 bc075a6 8b3bbb3 7b7db64 7b7174c bc075a6 7b7174c bc075a6 7b7174c bc075a6 8b3bbb3 7b7174c bc075a6 7b7db64 7b7174c 7b7db64 bc075a6 7b7174c 7b7db64 acbd561 7b7174c 7b7db64 bc075a6 7b7db64 a748eff 8b3bbb3 7b7db64 8b3bbb3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import gradio as gr
import numpy as np
import threading
import time
from components.transcriber import AudioProcessor
from components.gpt import gen_llm_response
from components.streaming import StreamingManager, create_streaming_interface
from config import config
# Create processor instance with configuration-based device settings
processor = AudioProcessor(model_size="base.en")
# Adjust some settings for better quality
processor.min_process_length = 1 * processor.sample_rate  # Need at least 1 second of audio before processing
processor.process_interval = 1  # Process at most once every 1 second
# Create streaming manager that drives the live-update interface components
streaming_manager = StreamingManager(processor)
def process_mic_audio(audio):
    """Handle one streaming microphone chunk: buffer, transcribe, ask the LLM.

    Parameters
    ----------
    audio : tuple | None
        Gradio streaming payload ``(sample_rate, samples)``; ``None`` when
        no audio arrived this tick.

    Returns
    -------
    tuple
        ``(buffer-status string, transcription text, LLM response text)``,
        or three ``gr.update()`` placeholders when there is nothing to do.
    """
    # Guard clause: no new audio means leave every output untouched.
    if audio is None:
        return gr.update(), gr.update(), gr.update()

    sample_rate, samples = audio
    buffered_samples = processor.add_audio(samples, sample_rate)

    # Let any in-flight transcription finish so we read the freshest text.
    processor.wait_for_processing_complete(1.0)
    text = processor.get_transcription()

    # Only invoke the LLM when there is transcribed text to respond to.
    reply = gen_llm_response(text) if text and len(text) > 0 else ""

    status = (
        f"Buffer: {buffered_samples / processor.sample_rate:.1f}s | "
        f"Processed: {processor.processed_length / processor.sample_rate:.1f}s"
    )
    return status, text, reply
def clear_audio_buffer():
    """Reset the audio buffer and blank both text displays."""
    cleared_status = processor.clear_buffer()
    return cleared_status, gr.update(), "", ""
def get_current_buffer():
    """Fetch the buffered audio in a form the playback widget accepts."""
    playback = processor.get_playback_audio()
    return playback
def force_transcribe():
    """Flush all remaining buffered audio through the transcriber.

    Returns
    -------
    tuple
        ``(transcription text, LLM response text)``; the LLM reply is empty
        when nothing was transcribed.
    """
    # Drain and transcribe whatever is left in the buffer.
    text = processor.force_complete_processing()
    reply = ""
    if text:
        reply = gen_llm_response(text)
    return text, reply
# Create Gradio interface: layout and event wiring for the live-transcription app.
with gr.Blocks(title="Live Speech Transcription") as demo:
    # Header line showing which compute device the transcriber will run on.
    device_info = config.get_device_info()
    device_status = f"🖥️ **Device:** {device_info['device'].upper()}"
    if device_info['cuda_available'] and device_info['device'] == 'cuda':
        device_status += f" | **GPU:** {device_info.get('cuda_device_name', 'Unknown')}"
    gr.Markdown(f"# Live Speech Recognition with LLM Response\n{device_status}")

    # Row 1: streaming microphone capture.
    with gr.Row():
        audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Microphone Input")

    # Row 2: buffer status readout and buffered-audio playback widget.
    with gr.Row():
        status_output = gr.Textbox(label="Buffer Status", interactive=False)
        buffer_audio = gr.Audio(label="Current Buffer (Click to Play)", interactive=False)

    # Row 3: manual controls for the audio buffer.
    with gr.Row():
        clear_btn = gr.Button("Clear Buffer")
        play_btn = gr.Button("Get Buffer for Playback")
        force_btn = gr.Button("Force Transcribe")

    # Row 4: side-by-side transcription and LLM-response displays.
    # NOTE(review): these two textboxes are never wired to any event below —
    # the handlers target streaming_components' outputs instead; confirm intended.
    with gr.Row():
        with gr.Column():
            transcription_display = gr.Textbox(label="Live Transcription", lines=5, interactive=False)
        with gr.Column():
            llm_response_display = gr.Textbox(label="LLM Response", lines=5, interactive=False)

    # Create streaming interface (provides the output components used below).
    streaming_components = create_streaming_interface(streaming_manager)

    # Connect main interface components.
    # Microphone chunks stream into process_mic_audio; its 3 return values map
    # to [status, transcription, llm response] in order.
    audio_input.stream(
        process_mic_audio,
        audio_input,
        [status_output, streaming_components['transcription_output'], streaming_components['llm_output']]
    )
    # clear_audio_buffer returns 4 values matching these 4 outputs in order.
    clear_btn.click(
        clear_audio_buffer,
        None,
        [status_output, buffer_audio, streaming_components['transcription_output'], streaming_components['llm_output']]
    )
    play_btn.click(get_current_buffer, None, buffer_audio)
    # force_transcribe returns (transcription, llm response).
    force_btn.click(
        force_transcribe,
        None,
        [streaming_components['transcription_output'], streaming_components['llm_output']]
    )
if __name__ == "__main__":
    # Startup banner.
    print("🎤 Live Speech Transcription App with LLM")
    print("=" * 40)

    # Report the device configuration the transcriber will use.
    info = config.get_device_info()
    print("🔧 Configuration:")
    print(f" Device: {info['device'].upper()}")
    print(f" Compute type: {info['compute_type']}")
    print(f" CUDA available: {info['cuda_available']}")

    # Extra GPU details only when CUDA is both available and selected.
    if info['cuda_available'] and info['device'] == 'cuda':
        print(f" GPU: {info.get('cuda_device_name', 'Unknown')}")
        total_gb = info.get('cuda_memory_total', 0) / (1024**3)
        print(f" GPU Memory: {total_gb:.1f} GB")

    print("\nFeatures:")
    for feature in (
        "• Real-time microphone transcription",
        "• Audio buffer playback",
        "• LLM responses displayed in UI",
        "• RoBERTa+ hybrid question detection",
    ):
        print(feature)

    # Launch the interface.
    demo.launch()
|