from app.logger_config import logger as logging, DEBUG
import numpy as np
import gradio as gr
import asyncio
from fastrtc.webrtc import WebRTC
from fastrtc.utils import AdditionalOutputs
from pydub import AudioSegment
import time
import os
from gradio.utils import get_space
from app.utils import (
    raise_function,
    generate_coturn_config,
    read_and_stream_audio,
    stop_streaming,
    task,
)
from app.session_utils import (
    on_load,
    on_unload,
    get_active_session_hash_code,
    register_session_hash_code,
    reset_all_active_session_hash_code,
    get_active_task_flag_file,
)
from app.ui_utils import (
    SUPPORTED_LANGS_MAP,
    EXAMPLE_CONFIGS,
    apply_preset_if_example,
    reset_to_defaults,
    summarize_config,
    handle_additional_outputs,
    get_custom_theme,
    on_file_load,
)
import nemo.collections.asr as nemo_asr

# --------------------------------------------------------
# Initialization
# --------------------------------------------------------
reset_all_active_session_hash_code()
theme, css_style = get_custom_theme()

from omegaconf import OmegaConf

cfg = OmegaConf.load('app/config.yaml')
# logger.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

from app.canary_speech_engine import CanarySpeechEngine
from app.silero_vad_engine import Silero_Vad_Engine
from app.streaming_audio_processor import StreamingAudioProcessor, StreamingAudioProcessorConfig

asr_model = nemo_asr.models.ASRModel.from_pretrained(cfg.pretrained_name)
canary_speech_engine = CanarySpeechEngine(asr_model, cfg)
silero_vad_engine = Silero_Vad_Engine()
streaming_audio_processor_config = StreamingAudioProcessorConfig(
    read_size=4000,
    silence_threshold_chunks=1,
)
streamer = StreamingAudioProcessor(
    speech_engine=canary_speech_engine,
    vad_engine=silero_vad_engine,
    cfg=streaming_audio_processor_config,
)
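# Note: read_size is a chunk size in samples. Assuming 16 kHz mono input (the rate
# Canary and Silero VAD models typically expect; the actual value comes from
# app/config.yaml), read_size=4000 corresponds to roughly 0.25 s of audio per chunk.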

with gr.Blocks(theme=theme, css=css_style) as demo:
    session_hash_code = gr.State()
    session_hash_code_box = gr.Textbox(label="Session ID", interactive=False, visible=DEBUG)

    with gr.Accordion("📊 Active Sessions Hash", open=True, visible=DEBUG):
        sessions_table = gr.DataFrame(
            headers=["session_hash_code", "file", "start_time", "status"],
            interactive=False,
            wrap=True,
            max_height=200,
        )
        gr.Timer(3.0).tick(fn=get_active_session_hash_code, outputs=sessions_table)

    demo.load(fn=on_load, inputs=None, outputs=[session_hash_code, session_hash_code_box])
    demo.unload(on_unload)

    stop_streaming_flags = gr.State(value={"stop": False})
    active_filepath = gr.State(value=next(iter(EXAMPLE_CONFIGS)))

    with gr.Walkthrough(selected=0) as walkthrough:

        # === STEP 1 ===
        with gr.Step("Audio", id=0) as audio_source_step:
            gr.Markdown(
                """
### Step 1: Upload or Record an Audio File

You can upload an existing file or record directly from your microphone.

Accepted formats: **.wav**, **.mp3**, **.flac**

Maximum recommended length: **60 seconds**
"""
            )
            with gr.Group():
                with gr.Column():
                    main_audio = gr.Audio(
                        label="Audio Input",
                        sources=["upload", "microphone"],
                        type="filepath",
                        interactive=True
                    )
                    with gr.Accordion("Need a quick test? Try one of the sample audios below", open=True):
                        examples = gr.Examples(
                            examples=list(EXAMPLE_CONFIGS.keys()),
                            inputs=main_audio,
                            label=None,
                            examples_per_page=3
                        )
                        gr.Markdown(
                            """
🔹 **english_meeting.wav** – Short business meeting in English

🔹 **french_news.wav** – Excerpt from a French radio broadcast

🔹 **spanish_podcast.wav** – Segment from a Spanish-language podcast
"""
                        )

            btn_proceed_streaming = gr.Button("Proceed to Streaming", visible=False)
            ui_components_oload_audio = [active_filepath, btn_proceed_streaming]

            main_audio.change(fn=on_file_load, inputs=[main_audio], outputs=ui_components_oload_audio)
            # main_audio.stop_recording(fn=on_file_load, inputs=[main_audio], outputs=ui_components_oload_audio)
            # main_audio.clear(fn=on_file_load, inputs=[main_audio], outputs=ui_components_oload_audio)

            btn_proceed_streaming.click(lambda: gr.Walkthrough(selected=1), outputs=walkthrough)

        # === STEP 2 ===
        with gr.Step("Stream", id=1) as audio_stream:
            gr.Markdown("### Step 2: Start audio streaming")
            with gr.Group():
                with gr.Column():
                    webrtc_stream = WebRTC(
                        label="Live Stream",
                        mode="receive",
                        modality="audio",
                        rtc_configuration=generate_coturn_config(),
                        visible=True,
                        inputs=main_audio
                    )
                    start_stream_button = gr.Button("Start Streaming")

            webrtc_stream.stream(
                fn=read_and_stream_audio,
                inputs=[
                    active_filepath,
                    session_hash_code,
                    stop_streaming_flags,
                    gr.State(streaming_audio_processor_config.read_size),
                ],
                outputs=[webrtc_stream],
                trigger=start_stream_button.click,
                concurrency_id="audio_stream",
                concurrency_limit=10,
            )
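            # In "receive" mode the browser plays audio pushed from the server:
            # read_and_stream_audio (judging by its inputs) streams chunks of the
            # selected file over the WebRTC track, and generate_coturn_config()
            # supplies the STUN/TURN servers WebRTC typically needs to traverse NAT
            # when the app is hosted remotely.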

            status_message_stream = gr.Markdown("", elem_id="status-message-stream", visible=False)

            go_to_config = gr.Button("Go to Configuration", visible=False)
            go_to_config.click(lambda: gr.Walkthrough(selected=2), outputs=walkthrough)

        # === STEP 3 ===
        with gr.Step("Configuration", id=2):
            gr.Markdown("## Step 3: Configure the Task")

            task_type = gr.Radio(["Transcription", "Translation"], value="Transcription", label="Task Type")
            lang_source = gr.Dropdown(list(SUPPORTED_LANGS_MAP.keys()), value="French", label="Source Language")
            lang_target = gr.Dropdown(list(SUPPORTED_LANGS_MAP.keys()), value="English", label="Target Language", visible=False)

            with gr.Accordion("Advanced Configuration", open=False):
                chunk_secs = gr.Number(value=1.0, label="chunk_secs", precision=1)
                left_context_secs = gr.Number(value=20.0, label="left_context_secs", precision=1)
                right_context_secs = gr.Number(value=0.5, label="right_context_secs", precision=1)
                streaming_policy = gr.Dropdown(["waitk", "alignatt"], value="waitk", label="decoding.streaming_policy")
                alignatt_thr = gr.Number(value=8, label="alignatt_thr", precision=0)
                waitk_lagging = gr.Number(value=2, label="waitk_lagging", precision=0)
                exclude_sink_frames = gr.Number(value=8, label="exclude_sink_frames", precision=0)
                xatt_scores_layer = gr.Number(value=-2, label="xatt_scores_layer", precision=0)
                hallucinations_detector = gr.Checkbox(value=True, label="hallucinations_detector")

            with gr.Row():
                auto_apply_presets = gr.Checkbox(value=True, label="Auto-apply presets for sample audios")
                reset_btn = gr.Button("Reset to defaults")

            summary_box = gr.Textbox(label="Configuration Summary", lines=10, interactive=False)

            # --- Events ---
            task_type.change(
                fn=lambda t: gr.update(visible=(t == "Translation")),
                inputs=task_type,
                outputs=lang_target,
                queue=False
            )

            inputs_list = [
                task_type, lang_source, lang_target,
                chunk_secs, left_context_secs, right_context_secs,
                streaming_policy, alignatt_thr, waitk_lagging,
                exclude_sink_frames, xatt_scores_layer, hallucinations_detector
            ]

            for inp in inputs_list:
                inp.change(
                    fn=summarize_config,
                    inputs=inputs_list,
                    outputs=summary_box,
                    queue=False
                )

            # Apply presets when a sample audio is selected (if enabled)
            main_audio.change(
                fn=apply_preset_if_example,
                inputs=[main_audio, auto_apply_presets],
                outputs=[
                    task_type, lang_source, lang_target,
                    chunk_secs, left_context_secs, right_context_secs,
                    streaming_policy, alignatt_thr, waitk_lagging,
                    exclude_sink_frames, xatt_scores_layer, hallucinations_detector,
                    summary_box
                ],
                queue=False
            )

            # Reset to defaults
            reset_btn.click(
                fn=reset_to_defaults,
                inputs=None,
                outputs=[
                    task_type, lang_source, lang_target,
                    chunk_secs, left_context_secs, right_context_secs,
                    streaming_policy, alignatt_thr, waitk_lagging,
                    exclude_sink_frames, xatt_scores_layer, hallucinations_detector,
                    summary_box
                ],
                queue=False
            )

            go_to_task = gr.Button("Go to Task")
            go_to_task.click(lambda: gr.Walkthrough(selected=3), outputs=walkthrough)

        # === STEP 4 ===
        with gr.Step("Task", id=3) as task_step:
            gr.Markdown("## Step 4: Start the Task")
            with gr.Group():
                with gr.Column():
                    status_slider = gr.Slider(
                        0, 100,
                        value=0,
                        label="Streaming Progress",
                        interactive=False,
                        visible=False
                    )
                    stop_stream_button = gr.Button("Stop Streaming", visible=False)
                    transcription_output = gr.Textbox(
                        label="Transcription / Translation Result",
                        placeholder="Waiting for output...",
                        lines=10,
                        max_lines=10,
                        interactive=False,
                        visible=True,
                        autoscroll=True
                    )
                    start_task_button = gr.Button("Start Task", visible=True)
                    stop_task_button = gr.Button("Stop Task", visible=False)

            stop_stream_button.click(
                fn=stop_streaming,
                inputs=[session_hash_code, stop_streaming_flags],
                outputs=[stop_streaming_flags],
            )

            def stop_task_fn(session_hash_code):
                """Signal the running task to stop by removing its per-session flag file."""
                transcribe_active = get_active_task_flag_file(session_hash_code)
                if os.path.exists(transcribe_active):
                    os.remove(transcribe_active)
                yield "Task stopped by user."
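            # Stop mechanism (assumption based on get_active_task_flag_file usage):
            # deleting the flag file is what tells the generator driven by task() in
            # app.utils to stop; it presumably checks for the file between chunks and
            # exits once it is gone.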
            stop_task_button.click(
                fn=stop_task_fn,
                inputs=session_hash_code,
                outputs=transcription_output
            )

            # task(session_hash_code)
            def start_transcription(
                session_hash_code,
                stop_streaming_flags,
                task_type,
                lang_source,
                lang_target,
                chunk_secs,
                left_context_secs,
                right_context_secs,
                streaming_policy,
                alignatt_thr,
                waitk_lagging,
                exclude_sink_frames,
                xatt_scores_layer,
                hallucinations_detector
            ):
                """Stream transcription or translation results in real time."""
                accumulated = ""
                yield f"Starting {task_type.lower()}...\n\n", gr.update(visible=False), gr.update(visible=True)

                # Loop over the task() generator and stream partial results to the UI
                for msg in task(session_hash_code, streamer=streamer):
                    accumulated += msg
                    yield accumulated, gr.update(visible=False), gr.update(visible=True)

                yield accumulated + "\nDone.", gr.update(visible=True), gr.update(visible=False)

            start_task_button.click(
                fn=start_transcription,
                inputs=[
                    session_hash_code, stop_streaming_flags,
                    task_type, lang_source, lang_target,
                    chunk_secs, left_context_secs, right_context_secs,
                    streaming_policy, alignatt_thr, waitk_lagging,
                    exclude_sink_frames, xatt_scores_layer, hallucinations_detector
                ],
                outputs=[transcription_output, start_task_button, stop_task_button]
            )

            ui_components = [
                start_stream_button,
                stop_stream_button,
                go_to_config,
                audio_source_step,
                status_slider,
                walkthrough,
                status_message_stream
            ]

            webrtc_stream.on_additional_outputs(
                fn=handle_additional_outputs,
                inputs=[webrtc_stream],
                outputs=ui_components,
                concurrency_id="additional_outputs_audio_stream",
                concurrency_limit=10,
            )

            # def toggle_task_buttons():
            #     return (
            #         gr.update(visible=False),
            #         gr.update(visible=True),
            #         gr.update(visible=True)
            #     )
            # start_task_button.click(
            #     fn=toggle_task_buttons,
            #     inputs=None,
            #     outputs=[start_task_button, stop_task_button, stop_stream_button],
            #     queue=False
            # )


if __name__ == "__main__":
    demo.queue(max_size=10, api_open=False).launch(show_api=False, show_error=True, debug=DEBUG)
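# Sketch of an explicit-bind launch for container-style deployments (illustrative only;
# the host and port below are assumptions, not taken from this repo's configuration):
#   demo.queue(max_size=10, api_open=False).launch(
#       server_name="0.0.0.0", server_port=7860,
#       show_api=False, show_error=True, debug=DEBUG,
#   )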