Spaces:

kadirnar
/

Audio-WebUI

Runtime error

App Files Files Community

Audio-WebUI / app.py

kadirnar

Update app.py

4f7fe11 verified over 1 year ago

raw

history blame

7.5 kB

	import gradio as gr

	from whisperplus.pipelines.whisper import SpeechToTextPipeline
	from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
	from whisperplus.utils.download_utils import download_and_convert_to_mp3
	from whisperplus.utils.text_utils import format_speech_to_dialogue

	import subprocess

	def install_package(package):
	    """Install ``package`` with pip into the interpreter running this script.

	    pip is invoked as ``<sys.executable> -m pip`` rather than through whatever
	    ``pip`` executable is first on ``PATH``, so the package is installed for
	    the same Python that is executing this file.

	    Args:
	        package (str): Requirement passed to ``pip install``.

	    Raises:
	        subprocess.CalledProcessError: If pip exits with a non-zero status.
	    """
	    # Imported locally because the file's import block does not provide sys.
	    import sys

	    subprocess.check_call(
	        [sys.executable, '-m', 'pip', 'install', package, '--no-build-isolation']
	    )

	# Install flash-attn when this module is executed, before any UI is built.
	install_package(package='flash-attn')


	def youtube_url_to_text(url, model_id, language_choice):
	    """
	    Transcribe the audio behind a video URL.

	    The URL is handed to ``download_and_convert_to_mp3``; the path it returns is
	    printed and then passed to a ``SpeechToTextPipeline`` built for ``model_id``.

	    Args:
	        url (str): The URL of the video to download and convert.
	        model_id (str): The ID of the speech-to-text model to use.
	        language_choice (str): Passed to the pipeline as ``language``.

	    Returns:
	        tuple: ``(transcript, mp3_path)`` - the pipeline's output and the path
	        returned by the download helper.
	    """
	    mp3_path = download_and_convert_to_mp3(url)
	    speech_to_text = SpeechToTextPipeline(model_id)
	    # Echo the downloaded file's path to stdout.
	    print(mp3_path)

	    return speech_to_text(audio_path=mp3_path, language=language_choice), mp3_path


	def speaker_diarization(url, model_id, num_speakers, min_speaker, max_speaker):
	    """
	    Download a video's audio, run the ASR + diarization pipeline on it, and
	    return the result formatted as dialogue together with the audio path.

	    Args:
	        url (str): The URL of the video to download and convert.
	        model_id (str): The ID of the ASR model, passed as ``asr_model``.
	        num_speakers: Forwarded to the pipeline call as ``num_speakers``.
	        min_speaker: Forwarded to the pipeline call as ``min_speaker``.
	        max_speaker: Forwarded to the pipeline call as ``max_speaker``.

	    Returns:
	        dialogue: The pipeline output after ``format_speech_to_dialogue``.
	        audio_path: The path returned by ``download_and_convert_to_mp3``.
	    """

	    # The pipeline is built before the download; the device is fixed to "cuda".
	    pipeline = ASRDiarizationPipeline.from_pretrained(
	        asr_model=model_id,
	        diarizer_model="pyannote/speaker-diarization",
	        chunk_length_s=30,
	        device="cuda",
	    )

	    audio_path = download_and_convert_to_mp3(url)
	    output_text = pipeline(
	        audio_path, num_speakers=num_speakers, min_speaker=min_speaker, max_speaker=max_speaker)
	    dialogue = format_speech_to_dialogue(output_text)
	    return dialogue, audio_path


	def youtube_url_to_text_app():
	    """Build the "Youtube URL to Text" UI.

	    Inside a new ``gr.Blocks()`` context this creates a URL box, a language
	    dropdown, a model dropdown and a button in one column, and a text and an
	    audio output in a second column. The button is wired to
	    ``youtube_url_to_text`` and one example row is registered for the same
	    function. Returns nothing.
	    """
	    with gr.Blocks():
	        with gr.Row():
	            with gr.Column():
	                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")

	                language_choice = gr.Dropdown(
	                    choices=[
	                        "English",
	                        "Turkish",
	                        "Spanish",
	                        "French",
	                        "Chinese",
	                        "Japanese",
	                        "Korean",
	                    ],
	                    value="Turkish",
	                    label="Language",
	                )
	                whisper_model_id = gr.Dropdown(
	                    choices=[
	                        "openai/whisper-large-v3",
	                        "openai/whisper-large",
	                        "openai/whisper-medium",
	                        "openai/whisper-base",
	                        "openai/whisper-small",
	                        "openai/whisper-tiny",
	                        # The example row below selects this model, so it has
	                        # to be one of the dropdown's choices.
	                        "distil-whisper/distil-large-v3",
	                    ],
	                    value="openai/whisper-large-v3",
	                    label="Whisper Model",
	                )
	                whisperplus_in_predict = gr.Button(value="Generator")

	            with gr.Column():
	                output_text = gr.Textbox(label="Output Text")
	                output_audio = gr.Audio(label="Output Audio")

	        # Button click: (url, model, language) -> (transcript, audio path).
	        whisperplus_in_predict.click(
	            fn=youtube_url_to_text,
	            inputs=[
	                youtube_url_path,
	                whisper_model_id,
	                language_choice,
	            ],
	            outputs=[output_text, output_audio],
	        )
	        # Values are positional and follow the order of ``inputs``.
	        gr.Examples(
	            examples=[
	                [
	                    "https://www.youtube.com/watch?v=di3rHkEZuUw",
	                    "distil-whisper/distil-large-v3",
	                    "English",
	                ],
	            ],
	            fn=youtube_url_to_text,
	            inputs=[
	                youtube_url_path,
	                whisper_model_id,
	                language_choice,
	            ],
	            outputs=[output_text, output_audio],
	            cache_examples=True,
	        )


	def speaker_diarization_app():
	    """Build the "Speaker Diarization" UI.

	    Inside a new ``gr.Blocks()`` context this creates a URL box, a model
	    dropdown, three speaker-count number fields and a button in one column,
	    and a text and an audio output in a second column. The button and a single
	    uncached example row both target ``speaker_diarization``. Returns nothing.
	    """
	    model_choices = [
	        "openai/whisper-large-v3",
	        "distil-whisper/distil-large-v3",
	        "distil-whisper/distil-large-v2",
	    ]
	    example_row = [
	        "https://www.youtube.com/shorts/o8PgLUgte2k",
	        "distil-whisper/distil-large-v3",
	        2,
	        1,
	        2,
	    ]

	    with gr.Blocks():
	        with gr.Row():
	            with gr.Column():
	                url_box = gr.Text(label="Youtube URL", placeholder="Enter Youtube URL")

	                model_picker = gr.Dropdown(
	                    label="Whisper Model",
	                    choices=model_choices,
	                    value="distil-whisper/distil-large-v3",
	                )
	                speaker_count = gr.Number(label="Number of Speakers", value=2)
	                speaker_floor = gr.Number(label="Minimum Number of Speakers", value=1)
	                speaker_ceiling = gr.Number(label="Maximum Number of Speakers", value=2)
	                run_button = gr.Button(value="Generator")

	            with gr.Column():
	                dialogue_box = gr.Textbox(label="Output Text")
	                audio_player = gr.Audio(label="Output Audio")

	        # Same argument order as speaker_diarization's parameters.
	        fn_inputs = [url_box, model_picker, speaker_count, speaker_floor, speaker_ceiling]
	        fn_outputs = [dialogue_box, audio_player]

	        run_button.click(fn=speaker_diarization, inputs=fn_inputs, outputs=fn_outputs)
	        gr.Examples(
	            examples=[example_row],
	            fn=speaker_diarization,
	            inputs=fn_inputs,
	            outputs=fn_outputs,
	            cache_examples=False,
	        )


	# Top-level page: a title, a row of profile links, and one tab per tool.
	gradio_app = gr.Blocks()
	with gradio_app:
	    gr.HTML(
	        """
	        <h1 style='text-align: center'>
	        WhisperPlus: Advancing Speech-to-Text Processing 🚀
	        </h1>
	        """)
	    # Links are separated by a plain "|"; a backslash before it would be an
	    # invalid string escape and would show up verbatim in the rendered page.
	    gr.HTML(
	        """
	        <h3 style='text-align: center'>
	        Follow me for more!
	        <a href='https://twitter.com/kadirnar_ai' target='_blank'>Twitter</a> | <a href='https://github.com/kadirnar' target='_blank'>Github</a> | <a href='https://www.linkedin.com/in/kadir-nar/' target='_blank'>Linkedin</a> | <a href='https://www.huggingface.co/kadirnar/' target='_blank'>HuggingFace</a>
	        </h3>
	        """)
	    with gr.Row():
	        with gr.Column():
	            with gr.Tab(label="Youtube URL to Text"):
	                youtube_url_to_text_app()
	            with gr.Tab(label="Speaker Diarization"):
	                speaker_diarization_app()

	# Start the Gradio server for the assembled page.
	gradio_app.launch(debug=True)