import os

import gradio as gr
from dotenv import load_dotenv
from gradio_client import Client
from pydub import AudioSegment

load_dotenv()

from lang_list import TEXT_SOURCE_LANGUAGE_NAMES

HF_API = os.getenv("HF_API")
API_URL = os.getenv("API_URL")  # path to the SeamlessM4T API endpoint
DEFAULT_TARGET_LANGUAGE = "Western Persian"

DESCRIPTION = """
# SeamlessM4T + Speaker Diarization + Voice Activity Detection
This demo uses SeamlessM4T to generate captions for complete audio files, which can be of arbitrary length.
"""

DUPLICATE = """
To duplicate this repo, you have to request access to three gated repositories and accept all of their user conditions:
1. https://huggingface.co/pyannote/voice-activity-detection
2. https://hf.co/pyannote/segmentation
3. https://hf.co/pyannote/speaker-diarization
"""
from pyannote.audio import Pipeline

pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization", use_auth_token=HF_API
)
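# predict is a generator: after every diarized segment it yields the
# transcript accumulated so far, so the UI can update incrementally.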
def predict(
    target_language, number_of_speakers, audio_source, input_audio_mic, input_audio_file
):
    if audio_source == "microphone":
        input_data = input_audio_mic
    else:
        input_data = input_audio_file
    print(input_data)
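    # With number_of_speakers == 0, the pyannote pipeline estimates the
    # speaker count itself; otherwise the user-supplied count is enforced.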
    if number_of_speakers == 0:
        diarization = pipeline(input_data)
    else:
        diarization = pipeline(input_data, num_speakers=number_of_speakers)
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(f"start={turn.start}s stop={turn.end}s speaker_{speaker}")
    song = AudioSegment.from_file(input_data)
    client = Client(API_URL)
    output_text = ""
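    # Each diarized turn is exported to a temporary WAV file and sent to the
    # remote SeamlessM4T Space; the positional arguments below must match the
    # signature of its "/run" endpoint.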
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(turn)
        try:
            # pydub slices in milliseconds; pyannote turns are in seconds.
            clipped = song[turn.start * 1000 : turn.end * 1000]
            # The original bitrate=16000 has no effect on PCM WAV output;
            # resample to the 16 kHz that SeamlessM4T consumes instead.
            clipped = clipped.set_frame_rate(16000)
            clipped.export("my.wav", format="wav")
            _, result = client.predict(
                "ASR (Automatic Speech Recognition)",
                "file",  # str in 'Audio source' Radio component
                "my.wav",
                "my.wav",
                "text",
                target_language,
                target_language,
                api_name="/run",
            )
            current_text = (
                f"start: {turn.start:.1f} end: {turn.end:.1f}"
                f" text: {result} speaker: {speaker}"
            )
            output_text = output_text + "\n" + current_text
            yield output_text
        except Exception as e:
            print(e)
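# Show exactly one of the two audio input widgets, depending on which source
# ("file" or "microphone") is selected in the radio button.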
def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
    mic = audio_source == "microphone"
    return (
        gr.update(visible=mic, value=None),  # input_audio_mic
        gr.update(visible=not mic, value=None),  # input_audio_file
    )
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Group():
        with gr.Row():
            target_language = gr.Dropdown(
                choices=TEXT_SOURCE_LANGUAGE_NAMES,
                label="Output Language",
                value=DEFAULT_TARGET_LANGUAGE,
                interactive=True,
                info="Select your target language",
            )
            number_of_speakers = gr.Number(
                label="Number of Speakers",
                value=0,
                precision=0,
                info="Keep it at zero to let the model detect the number of speakers automatically",
            )
        with gr.Row() as audio_box:
            audio_source = gr.Radio(
                choices=["file", "microphone"], value="file", interactive=True
            )
            input_audio_mic = gr.Audio(
                label="Input speech",
                type="filepath",
                source="microphone",
                visible=False,
            )
            input_audio_file = gr.Audio(
                label="Input speech",
                type="filepath",
                source="upload",
                visible=True,
            )
    final_audio = gr.Audio(label="Output", visible=False)
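    # Swap the visible audio widget whenever the source selection changes.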
    audio_source.change(
        fn=update_audio_ui,
        inputs=audio_source,
        outputs=[input_audio_mic, input_audio_file],
        queue=False,
        api_name=False,
    )
    input_audio_mic.change(lambda x: x, input_audio_mic, final_audio)
    input_audio_file.change(lambda x: x, input_audio_file, final_audio)
    submit = gr.Button("Submit")
    text_output = gr.Textbox(
        label="Transcribed Text",
        value="",
        interactive=False,
        lines=10,
        scale=10,
        max_lines=10,
    )
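    # predict is a generator, so Gradio streams each yielded transcript chunk
    # into text_output as soon as it is produced.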
    submit.click(
        fn=predict,
        inputs=[
            target_language,
            number_of_speakers,
            audio_source,
            input_audio_mic,
            input_audio_file,
        ],
        outputs=[text_output],
        api_name="predict",
    )
    gr.Markdown(DUPLICATE)

demo.queue(max_size=50).launch()
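# A minimal sketch of how this Space's own "predict" endpoint could be called
# from another process once the app is running (the Space URL is a
# hypothetical placeholder):
#
#     from gradio_client import Client
#
#     client = Client("<your-space-url>")
#     job = client.submit(
#         "Western Persian",  # target_language
#         0,                  # number_of_speakers (0 = auto-detect)
#         "file",             # audio_source
#         None,               # input_audio_mic
#         "sample.wav",       # input_audio_file
#         api_name="/predict",
#     )
#     for transcript in job:  # iterate over the streamed partial transcripts
#         print(transcript)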