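"""Live speech transcription demo.

Gradio app that streams microphone audio into an AudioProcessor for
transcription and forwards the transcript to an LLM (gen_llm_response),
showing the live transcription and the model's response side by side.
"""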
import gradio as gr
from components.transcriber import AudioProcessor
from components.gpt import gen_llm_response
from components.streaming import StreamingManager, create_streaming_interface
from config import config

# Create processor instance with configuration-based device settings
processor = AudioProcessor(model_size="base.en")

# Adjust some settings for better quality
processor.min_process_length = 1 * processor.sample_rate  # Require at least 1 second of audio before processing
processor.process_interval = 1  # Process at most once per second

# Create streaming manager
streaming_manager = StreamingManager(processor)

def process_mic_audio(audio):
    """Process audio from Gradio microphone and update transcription"""
    if audio is None:
        return gr.update(), gr.update(), gr.update()

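    # Gradio streams microphone audio as (sample_rate, samples) tuples, one per chunk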
    sr, y = audio

    # Add to processor and possibly trigger transcription
    buffer_size = processor.add_audio(y, sr)

    # Wait (up to 1 second) for any pending processing to complete before reading the transcription
    processor.wait_for_processing_complete(1.0)

    # Get current transcription
    transcription = processor.get_transcription()

    # Send transcription to LLM and get response
    llm_response = ""
    if transcription and len(transcription) > 0:
        llm_response = gen_llm_response(transcription)

    # Return status update, original transcription, and LLM response
    buffer_seconds = buffer_size / processor.sample_rate
    return (
        f"Buffer: {buffer_seconds:.1f}s | Processed: {processor.processed_length/processor.sample_rate:.1f}s",
        transcription,
        llm_response
    )

def clear_audio_buffer():
    """Clear the audio buffer"""
    return processor.clear_buffer(), gr.update(), "", ""

def get_current_buffer():
    """Get the current buffer for playback"""
    return processor.get_playback_audio()

def force_transcribe():
    """Force transcription of current buffer"""
    # Force complete processing of all remaining audio
    transcription = processor.force_complete_processing()

    # Send to LLM and get response
    llm_response = ""
    if transcription and len(transcription) > 0:
        llm_response = gen_llm_response(transcription)

    return transcription, llm_response

# Create Gradio interface
with gr.Blocks(title="Live Speech Transcription") as demo:
    device_info = config.get_device_info()
    device_status = f"🖥️ **Device:** {device_info['device'].upper()}"
    if device_info['cuda_available'] and device_info['device'] == 'cuda':
        device_status += f" | **GPU:** {device_info.get('cuda_device_name', 'Unknown')}"

    gr.Markdown(f"# Live Speech Recognition with LLM Response\n{device_status}")

    with gr.Row():
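        # streaming=True makes Gradio deliver audio chunks continuously to the stream() handler wired up below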
        audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Microphone Input")

    with gr.Row():
        status_output = gr.Textbox(label="Buffer Status", interactive=False)
        buffer_audio = gr.Audio(label="Current Buffer (Click to Play)", interactive=False)

    with gr.Row():
        clear_btn = gr.Button("Clear Buffer")
        play_btn = gr.Button("Get Buffer for Playback")
        force_btn = gr.Button("Force Transcribe")

    with gr.Row():
        with gr.Column():
            transcription_display = gr.Textbox(label="Live Transcription", lines=5, interactive=False)
        with gr.Column():
            llm_response_display = gr.Textbox(label="LLM Response", lines=5, interactive=False)

    # Create streaming interface
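    # create_streaming_interface is expected to return a dict with 'transcription_output' and 'llm_output' components (used below)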
    streaming_components = create_streaming_interface(streaming_manager)

    # Connect main interface components
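    # stream() fires for each incoming audio chunk; its outputs update the streaming components created above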
    audio_input.stream(
        process_mic_audio,
        audio_input,
        [status_output, streaming_components['transcription_output'], streaming_components['llm_output']]
    )

    clear_btn.click(
        clear_audio_buffer,
        None,
        [status_output, buffer_audio, streaming_components['transcription_output'], streaming_components['llm_output']]
    )
    play_btn.click(get_current_buffer, None, buffer_audio)
    force_btn.click(
        force_transcribe,
        None,
        [streaming_components['transcription_output'], streaming_components['llm_output']]
    )

if __name__ == "__main__":
    print("🎤 Live Speech Transcription App with LLM")
    print("=" * 40)

    # Display device configuration
    device_info = config.get_device_info()
    print("🔧 Configuration:")
    print(f"   Device: {device_info['device'].upper()}")
    print(f"   Compute type: {device_info['compute_type']}")
    print(f"   CUDA available: {device_info['cuda_available']}")
    if device_info['cuda_available'] and device_info['device'] == 'cuda':
        print(f"   GPU: {device_info.get('cuda_device_name', 'Unknown')}")
        memory_gb = device_info.get('cuda_memory_total', 0) / (1024**3)
        print(f"   GPU Memory: {memory_gb:.1f} GB")

    print("\nFeatures:")
    print("• Real-time microphone transcription")
    print("• Audio buffer playback")
    print("• LLM responses displayed in UI")
    print("• RoBERTa+ hybrid question detection")

    # Launch the interface
    demo.launch()