Scrapyard committed
Commit bc075a6 · 1 Parent(s): a748eff

it works idk how but it does

Files changed (1)
  1. app.py +163 -52
app.py CHANGED
@@ -1,68 +1,179 @@
-
 import gradio as gr
 import numpy as np
 from faster_whisper import WhisperModel
-from faster_whisper.transcribe import Segment
 
 audio_model = WhisperModel("tiny.en", device="cpu", compute_type="int8")
-transcription = ['']
-buffer = np.array([])
-
-def transcribe(SampleRate, data):
-    global buffer
-    if SampleRate * 3 >= len(buffer):
-        print("buffer big")
-        segments, info = audio_model.transcribe(buffer, beam_size=5)
-        result = (list(segments))
-        text = ""
-
-        if result and len(result) > 0:
-            text = result[0].text
-            print("Text:", text)
-        else:
-            text = ""
-            print("No text found")
-            print(result)
-
-        buffer = np.array([])
-        return(text)
-    else:
-        buffer = np.concatenate([buffer, data])
-        print("buffer small")
-        return None
-
-
-def normaliseData(audioInput, stream):
-    sr, y = audioInput
-
-    # Convert to mono if stereo
-    if y.ndim > 1:
-        y = y.mean(axis=1)
-
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
-
-    if stream is not None:
-        stream = np.concatenate([stream, y])
-    else:
-        stream = y
-
-    words = transcribe(sr, y)
-
-    # Return the stream as state and a string representation of the array for display
-    return stream, words,
-
-
-with gr.Blocks() as demo:
-    audioInput = gr.Audio(sources=["microphone"], streaming=True)
-    audioOutput = gr.Textbox(label="Output")
-    state = gr.State()
-
-    audioInput.stream(
-        fn=normaliseData,
-        inputs=[audioInput, state],
-        outputs=[state, audioOutput]  # try switching it around
    )
-    demo.launch()
 import gradio as gr
 import numpy as np
 from faster_whisper import WhisperModel
+import threading
+import time
+import scipy.signal as signal
 
+# Initialize the WhisperModel
 audio_model = WhisperModel("tiny.en", device="cpu", compute_type="int8")
 
+class AudioProcessor:
+    def __init__(self):
+        self.audio_buffer = np.array([])  # Stores raw audio for playback
+        self.sample_rate = 16000  # Default sample rate for whisper
+        self.lock = threading.Lock()  # Thread safety for buffer access
+        self.transcription = ['']  # List of transcription segments
+        self.min_process_length = 1 * self.sample_rate  # Process at least 1 second
+        self.max_buffer_size = 30 * self.sample_rate  # Maximum buffer size (30 seconds)
+        self.last_process_time = time.time()
+        self.process_interval = 1.0  # Process every 1 second
+
+    def add_audio(self, audio_data, sr):
+        """Add audio to the buffer and process if needed"""
+        with self.lock:
+            # Convert to mono if stereo
+            if audio_data.ndim > 1:
+                audio_data = audio_data.mean(axis=1)
+
+            # Keep original format without normalization
+            audio_data = audio_data.astype(np.float32)
+
+            # Resample properly if needed
+            if sr != self.sample_rate:
+                try:
+                    number_of_samples = int(len(audio_data) * self.sample_rate / sr)
+                    audio_data = signal.resample(audio_data, number_of_samples)
+                except Exception as e:
+                    print(f"Resampling error: {e}")
+                    # Fallback: linear interpolation at the new sample positions
+                    step = sr / self.sample_rate
+                    audio_data = np.interp(
+                        np.arange(0, len(audio_data), step),
+                        np.arange(len(audio_data)),
+                        audio_data
+                    )
+
+            # Add to buffer without renormalizing
+            if len(self.audio_buffer) == 0:
+                self.audio_buffer = audio_data
+            else:
+                self.audio_buffer = np.concatenate([self.audio_buffer, audio_data])
+
+            # Trim buffer if it gets too large
+            if len(self.audio_buffer) > self.max_buffer_size:
+                self.audio_buffer = self.audio_buffer[-self.max_buffer_size:]
+
+            # Check if we should process now
+            should_process = (
+                len(self.audio_buffer) >= self.min_process_length and
+                time.time() - self.last_process_time >= self.process_interval
+            )
+
+            if should_process:
+                self.last_process_time = time.time()
+                # Process the buffer in a separate thread to avoid blocking
+                threading.Thread(target=self._process_audio).start()
+
+            return len(self.audio_buffer)
 
+    def _process_audio(self):
+        """Process the current audio buffer (should be called in a separate thread)"""
+        with self.lock:
+            # Make a copy for processing
+            audio = self.audio_buffer.copy()
 
+        # Normalize for transcription
+        audio_norm = audio.astype(np.float32)
+        if np.max(np.abs(audio_norm)) > 0:
+            audio_norm = audio_norm / np.max(np.abs(audio_norm))
+
+        try:
+            # Transcribe with whisper
+            segments, info = audio_model.transcribe(audio_norm, beam_size=5)
+            result = list(segments)
+
+            if result:
+                with self.lock:
+                    # Update the transcription
+                    self.transcription = [seg.text for seg in result]
+        except Exception as e:
+            print(f"Transcription error: {e}")
+
+    def get_transcription(self):
+        """Get the current transcription text"""
+        with self.lock:
+            return " ".join(self.transcription)
+
+    def clear_buffer(self):
+        """Clear the audio buffer"""
+        with self.lock:
+            self.audio_buffer = np.array([])
+            self.transcription = ['']
+        return "Buffers cleared"
+
+    def get_playback_audio(self):
+        """Get properly formatted audio for Gradio playback"""
+        with self.lock:
+            if len(self.audio_buffer) == 0:
+                return None
+
+            # Make a copy and ensure proper format for Gradio
+            audio = self.audio_buffer.copy()
+
+            # Ensure audio is in the correct range for playback (-1 to 1)
+            if np.max(np.abs(audio)) > 0:
+                audio = audio / max(1.0, np.max(np.abs(audio)))
+
+            return (self.sample_rate, audio)
 
+# Create processor instance
+processor = AudioProcessor()
 
+def process_mic_audio(audio):
+    """Process audio from Gradio microphone and update transcription"""
+    if audio is None:
+        return gr.update(), gr.update()
+
+    sr, y = audio
+
+    # Add to processor and possibly trigger transcription
+    buffer_size = processor.add_audio(y, sr)
 
+    # Get current transcription
+    transcription = processor.get_transcription()
+
+    # Return status update and transcription
+    buffer_seconds = buffer_size / processor.sample_rate
+    return (
+        f"Buffer size: {buffer_size} samples ({buffer_seconds:.2f} seconds)",
+        transcription
+    )
 
+def clear_audio_buffer():
+    """Clear the audio buffer"""
+    return processor.clear_buffer(), gr.update(), ""
 
+def get_current_buffer():
+    """Get the current buffer for playback"""
+    return processor.get_playback_audio()
 
+# Create Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("# Live Speech Recognition with Buffer Playback")
+
+    with gr.Row():
+        audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Microphone Input")
+
+    with gr.Row():
+        status_output = gr.Textbox(label="Buffer Status", interactive=False)
+        buffer_audio = gr.Audio(label="Current Buffer (Click to Play)", interactive=False)
+
+    with gr.Row():
+        clear_btn = gr.Button("Clear Buffer")
+        play_btn = gr.Button("Get Buffer for Playback")
+
+    with gr.Row():
+        transcription_output = gr.Textbox(label="Live Transcription", lines=5, interactive=False)
+
+    # Connect components - removed the 'every' parameter for compatibility
+    audio_input.stream(
+        process_mic_audio,
+        audio_input,
+        [status_output, transcription_output]
    )
+
+    clear_btn.click(clear_audio_buffer, None, [status_output, buffer_audio, transcription_output])
+    play_btn.click(get_current_buffer, None, buffer_audio)
+
+    # Launch the interface
+    demo.launch()
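
A quick way to sanity-check the new buffering and resampling path without opening the browser UI is to drive `AudioProcessor` directly with synthetic audio. The sketch below is not part of the commit and makes two assumptions: the file above is saved as `app.py`, and `demo.launch()` is moved under an `if __name__ == "__main__":` guard so that importing the module does not block.

```python
import time
import numpy as np
from app import AudioProcessor  # assumes app.py with a guarded demo.launch()

proc = AudioProcessor()

# One second of a 440 Hz tone at 48 kHz, to exercise the resampling branch
sr = 48000
t = np.linspace(0, 1.0, sr, endpoint=False)
chunk = 0.5 * np.sin(2 * np.pi * 440.0 * t).astype(np.float32)

# Feed two chunks, spaced past process_interval so the background
# transcription thread actually gets scheduled on the second call
for _ in range(2):
    buffered = proc.add_audio(chunk, sr)
    print(f"buffer now holds {buffered} samples")
    time.sleep(1.5)

time.sleep(2.0)  # let the worker thread finish
print("transcription:", proc.get_transcription())
```

A pure tone won't produce transcription text, so the final print will normally be empty; the buffer counts are the point here, and real speech samples can be substituted to see text appear.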