Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from whisperplus.pipelines.whisper import SpeechToTextPipeline | |
| from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline | |
| from whisperplus.utils.download_utils import download_and_convert_to_mp3 | |
| from whisperplus.utils.text_utils import format_speech_to_dialogue | |
| import subprocess | |
def install_package(package):
    """
    Install a package with pip, passing ``--no-build-isolation``.

    pip is invoked as ``<current interpreter> -m pip`` so the package is installed into the
    environment this script is running in, rather than into whichever ``pip`` executable
    happens to come first on ``PATH``.

    Args:
        package (str): The requirement to install, as accepted by ``pip install``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    import sys  # local import so this helper does not depend on the file's import block

    subprocess.check_call([sys.executable, '-m', 'pip', 'install', package, '--no-build-isolation'])


# Install flash-attn at import time, i.e. every time this script starts.
install_package('flash-attn')
def youtube_url_to_text(url, model_id, language_choice):
    """
    Transcribe the audio behind a video URL.

    The URL is handed to ``download_and_convert_to_mp3``, a ``SpeechToTextPipeline`` is built
    for ``model_id``, and that pipeline is run on the downloaded file.

    Args:
        url (str): The URL of the video to download and convert.
        model_id (str): The ID of the speech-to-text model to use.
        language_choice (str): The language passed to the pipeline as ``language``.

    Returns:
        transcript: The result of running the pipeline on the downloaded audio.
        mp3_path: The path returned by ``download_and_convert_to_mp3``.
    """
    mp3_path = download_and_convert_to_mp3(url)
    stt_pipeline = SpeechToTextPipeline(model_id)
    # The downloaded file's path is echoed to stdout before transcription starts.
    print(mp3_path)
    return stt_pipeline(audio_path=mp3_path, language=language_choice), mp3_path
def _as_speaker_count(value):
    """Return ``value`` as an ``int``, or ``None`` when no value was supplied."""
    return None if value is None else int(value)


def speaker_diarization(url, model_id, num_speakers, min_speaker, max_speaker):
    """
    Download the audio behind a video URL and produce a speaker-attributed dialogue.

    An ``ASRDiarizationPipeline`` is built from ``model_id`` and the
    ``pyannote/speaker-diarization`` diarizer, the URL is handed to
    ``download_and_convert_to_mp3``, the pipeline is run on the resulting file, and its output
    is formatted with ``format_speech_to_dialogue``.

    Args:
        url (str): The URL of the video to download and convert.
        model_id (str): The ID of the speech-to-text (ASR) model to use.
        num_speakers (int | float | None): Number of speakers, forwarded to the pipeline.
        min_speaker (int | float | None): Minimum number of speakers, forwarded to the pipeline.
        max_speaker (int | float | None): Maximum number of speakers, forwarded to the pipeline.

    Returns:
        dialogue: The pipeline output after ``format_speech_to_dialogue``.
        audio_path: The path returned by ``download_and_convert_to_mp3``.
    """
    pipeline = ASRDiarizationPipeline.from_pretrained(
        asr_model=model_id,
        diarizer_model="pyannote/speaker-diarization",
        chunk_length_s=30,
        device="cuda",
    )
    audio_path = download_and_convert_to_mp3(url)
    # UI number fields can deliver floats (e.g. 2.0); speaker counts are whole numbers, so
    # normalise them to int before they reach the pipeline. None is passed through unchanged.
    output_text = pipeline(
        audio_path,
        num_speakers=_as_speaker_count(num_speakers),
        min_speaker=_as_speaker_count(min_speaker),
        max_speaker=_as_speaker_count(max_speaker),
    )
    dialogue = format_speech_to_dialogue(output_text)
    return dialogue, audio_path
def youtube_url_to_text_app():
    """
    Build the "Youtube URL to Text" interface inside the currently open Gradio context.

    Creates the URL, language and model inputs, a button wired to ``youtube_url_to_text``,
    the text and audio outputs, and one cached example.
    """
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")

                language_choice = gr.Dropdown(
                    choices=[
                        "English",
                        "Turkish",
                        "Spanish",
                        "French",
                        "Chinese",
                        "Japanese",
                        "Korean",
                    ],
                    value="Turkish",
                    label="Language",
                )

                whisper_model_id = gr.Dropdown(
                    choices=[
                        "openai/whisper-large-v3",
                        "openai/whisper-large",
                        "openai/whisper-medium",
                        "openai/whisper-base",
                        "openai/whisper-small",
                        "openai/whisper-tiny",
                        # Needed by the example below: a dropdown value that is not among
                        # ``choices`` is rejected unless custom values are allowed.
                        "distil-whisper/distil-large-v3",
                    ],
                    value="openai/whisper-large-v3",
                    label="Whisper Model",
                )

                whisperplus_in_predict = gr.Button(value="Generator")

            with gr.Column():
                output_text = gr.Textbox(label="Output Text")
                output_audio = gr.Audio(label="Output Audio")

        # The button and the example feed the same components, so declare them once.
        inputs = [youtube_url_path, whisper_model_id, language_choice]
        outputs = [output_text, output_audio]

        whisperplus_in_predict.click(
            fn=youtube_url_to_text,
            inputs=inputs,
            outputs=outputs,
        )

        gr.Examples(
            examples=[
                [
                    "https://www.youtube.com/watch?v=di3rHkEZuUw",
                    "distil-whisper/distil-large-v3",
                    "English",
                ],
            ],
            fn=youtube_url_to_text,
            inputs=inputs,
            outputs=outputs,
            cache_examples=True,
        )
def speaker_diarization_app():
    """
    Build the "Speaker Diarization" interface inside the currently open Gradio context.

    Creates the URL, model and speaker-count inputs, a button wired to
    ``speaker_diarization``, the text and audio outputs, and one (uncached) example.
    """
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")

                whisper_model_id = gr.Dropdown(
                    choices=[
                        "openai/whisper-large-v3",
                        "distil-whisper/distil-large-v3",
                        "distil-whisper/distil-large-v2",
                    ],
                    value="distil-whisper/distil-large-v3",
                    label="Whisper Model",
                )

                # precision=0 makes Gradio round the field and hand the callback an int;
                # without it a count such as 2 arrives as the float 2.0.
                num_speakers = gr.Number(value=2, precision=0, label="Number of Speakers")
                min_speaker = gr.Number(value=1, precision=0, label="Minimum Number of Speakers")
                max_speaker = gr.Number(value=2, precision=0, label="Maximum Number of Speakers")

                whisperplus_in_predict = gr.Button(value="Generator")

            with gr.Column():
                output_text = gr.Textbox(label="Output Text")
                output_audio = gr.Audio(label="Output Audio")

        # The button and the example feed the same components, so declare them once.
        inputs = [youtube_url_path, whisper_model_id, num_speakers, min_speaker, max_speaker]
        outputs = [output_text, output_audio]

        whisperplus_in_predict.click(
            fn=speaker_diarization,
            inputs=inputs,
            outputs=outputs,
        )

        gr.Examples(
            examples=[
                [
                    "https://www.youtube.com/shorts/o8PgLUgte2k",
                    "distil-whisper/distil-large-v3",
                    2,
                    1,
                    2,
                ],
            ],
            fn=speaker_diarization,
            inputs=inputs,
            outputs=outputs,
            cache_examples=False,
        )
# Assemble the page: a title, a row of profile links, then one tab per tool.
with gr.Blocks() as gradio_app:
    # Page heading.
    gr.HTML(
        """
<h1 style='text-align: center'>
WhisperPlus: Advancing Speech-to-Text Processing 🚀
</h1>
""")

    # Author links shown under the heading.
    gr.HTML(
        """
<h3 style='text-align: center'>
Follow me for more!
<a href='https://twitter.com/kadirnar_ai' target='_blank'>Twitter</a> | <a href='https://github.com/kadirnar' target='_blank'>Github</a> | <a href='https://www.linkedin.com/in/kadir-nar/' target='_blank'>Linkedin</a> | <a href='https://www.huggingface.co/kadirnar/' target='_blank'>HuggingFace</a>
</h3>
""")

    # Both tabs live in a single column of a single row.
    with gr.Row(), gr.Column():
        with gr.Tab(label="Youtube URL to Text"):
            youtube_url_to_text_app()
        with gr.Tab(label="Speaker Diarization"):
            speaker_diarization_app()

# Start the server; this runs whenever the file is executed.
gradio_app.launch(debug=True)