Spaces:

mr-don88
/

translate-subtitles

Running

le quy don

Update app.py

9de0758 verified 4 months ago

10.3 kB

	import gradio as gr
	from transformers import MarianMTModel, MarianTokenizer, pipeline
	import pysrt
	import tempfile
	from tqdm import tqdm
	from langdetect import detect
	import os
	from datetime import timedelta

	# Display name (Vietnamese, as listed in the dropdowns) -> language code.
	# The pairs are kept in the order the choices are presented.
	LANGUAGE_MODELS = dict(
	    [
	        ("Tiếng Anh", "en"),
	        ("Tiếng Việt", "vi"),
	        ("Tiếng Pháp", "fr"),
	        ("Tiếng Đức", "de"),
	        ("Tiếng Tây Ban Nha", "es"),
	        ("Tiếng Bồ Đào Nha", "pt"),
	        ("Tiếng Ý", "it"),
	        ("Tiếng Nga", "ru"),
	        ("Tiếng Hà Lan", "nl"),
	        ("Tiếng Thụy Điển", "sv"),
	        ("Tiếng Phần Lan", "fi"),
	        ("Tiếng Đan Mạch", "da"),
	        ("Tiếng Na Uy", "no"),
	        ("Tiếng Ba Lan", "pl"),
	        ("Tiếng Séc", "cs"),
	        ("Tiếng Hungary", "hu"),
	        ("Tiếng Romania", "ro"),
	        ("Tiếng Hy Lạp", "el"),
	        ("Tiếng Thổ Nhĩ Kỳ", "tr"),
	        ("Tiếng Hindi", "hi"),
	        ("Tiếng Ả Rập", "ar"),
	        ("Tiếng Trung (Giản thể)", "zh"),
	        ("Tiếng Nhật", "ja"),
	        ("Tiếng Hàn", "ko"),
	    ]
	)

	# Reverse lookup: language code -> display name.
	LANGUAGE_CODES = dict(zip(LANGUAGE_MODELS.values(), LANGUAGE_MODELS.keys()))

	# Loaded translation models, keyed by "<source>-<target>", so that each
	# language pair is only loaded once per process.
	model_cache = dict()
	# Text-classification pipeline used for language identification.
	detector = pipeline(
	    task="text-classification",
	    model="papluca/xlm-roberta-base-language-detection",
	)

	def detect_subtitle_language(file_path):
	    """Guess the language of an .srt file from its first subtitles.

	    The text of the first 10 cues is joined and passed to ``langdetect``.
	    If ``langdetect`` raises, the module-level ``detector`` pipeline is used
	    on the first 512 characters instead.

	    Args:
	        file_path: Path of the subtitle file, opened with ``pysrt.open``.

	    Returns:
	        A lowercase primary language code such as ``"en"`` or ``"zh"``.
	        ``"en"`` is returned when there is no text to sample or when the
	        file cannot be read or classified.
	    """
	    try:
	        subs = pysrt.open(file_path)
	        sample_text = " ".join(sub.text for sub in subs[:10] if sub.text.strip())

	        if not sample_text:
	            return "en"  # nothing to classify: default to English

	        try:
	            lang_code = detect(sample_text)
	        except Exception:
	            # Fallback classifier; the input is capped to keep it short.
	            result = detector(sample_text[:512])[0]
	            lang_code = result['label'].split('__')[-1]

	        # langdetect reports regional variants such as "zh-cn" / "zh-tw",
	        # while the language table is keyed by the primary subtag ("zh").
	        return lang_code.split("-")[0].lower()
	    except Exception as e:
	        print(f"Error detecting language: {e}")
	        return "en"

	def get_model(source_lang, target_lang):
	    """Return the (cached) translation model(s) for a language pair.

	    A direct ``Helsinki-NLP/opus-mt-<source>-<target>`` model is tried first.
	    If it cannot be loaded and English is neither the source nor the target,
	    a two-step pivot through English is loaded instead.

	    Args:
	        source_lang: Source language code, e.g. ``"fr"``.
	        target_lang: Target language code, e.g. ``"vi"``.

	    Returns:
	        ``(model, tokenizer)`` for a direct model, or
	        ``((model1, tokenizer1), (model2, tokenizer2))`` for the
	        source -> English -> target pivot.

	    Raises:
	        gr.Error: If neither a direct model nor a pivot can be loaded.
	    """
	    model_key = f"{source_lang}-{target_lang}"
	    if model_key in model_cache:
	        return model_cache[model_key]

	    def load_pair(pair):
	        # Load one Marian checkpoint as a (model, tokenizer) tuple.
	        model_name = f"Helsinki-NLP/opus-mt-{pair}"
	        tokenizer = MarianTokenizer.from_pretrained(model_name)
	        model = MarianMTModel.from_pretrained(model_name)
	        return model, tokenizer

	    not_found = f"Không tìm thấy model dịch từ {source_lang} sang {target_lang}"

	    try:
	        model_cache[model_key] = load_pair(model_key)
	    except Exception as direct_error:
	        # Pivoting only helps when English is neither end: otherwise one leg
	        # is the pair that just failed and the other would be "en-en".
	        if "en" in (source_lang, target_lang):
	            raise gr.Error(not_found) from direct_error
	        try:
	            model_cache[model_key] = (
	                load_pair(f"{source_lang}-en"),
	                load_pair(f"en-{target_lang}"),
	            )
	        except Exception as pivot_error:
	            raise gr.Error(not_found) from pivot_error

	    return model_cache[model_key]

	def translate_text(text, model, tokenizer):
	    """Translate one piece of text with the given model/tokenizer pair.

	    Args:
	        text: Text to translate.
	        model: Object exposing ``generate(**encoded_inputs)``.
	        tokenizer: Callable that encodes ``text`` and exposes
	            ``batch_decode``.

	    Returns:
	        The first decoded sequence, with special tokens skipped.
	    """
	    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
	    generated_ids = model.generate(**encoded)
	    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
	    return decoded[0]

	def add_time_to_subtitle(input_file, hours, minutes, seconds):
	    """Delay every cue of an .srt file by the given amount and save a copy.

	    Args:
	        input_file: Uploaded subtitle file, either a path or an object
	            whose ``name`` attribute is the path.
	        hours: Whole hours to add; ``None`` counts as 0.
	        minutes: Whole minutes to add; ``None`` counts as 0.
	        seconds: Seconds to add, may be fractional; a missing or
	            unparsable value counts as 0.

	    Returns:
	        ``(output_path, status_message)`` where ``output_path`` is a new
	        temporary ``.srt`` file encoded as UTF-8.

	    Raises:
	        gr.Error: If no file was given, the file does not exist, or the
	            file cannot be processed.
	    """
	    if input_file is None:
	        raise gr.Error("Vui lòng upload file phụ đề!")

	    try:
	        if isinstance(input_file, (str, os.PathLike)):
	            file_path = input_file
	        else:
	            file_path = input_file.name

	        if not os.path.exists(file_path):
	            raise gr.Error("File không tồn tại hoặc không thể đọc!")

	        subs = pysrt.open(file_path)

	        # Convert the requested offset to milliseconds (fractional seconds
	        # are supported); empty inputs arrive as None and count as 0.
	        try:
	            seconds_float = float(seconds or 0)
	        except (TypeError, ValueError):
	            seconds_float = 0
	        hours_int = int(hours or 0)
	        minutes_int = int(minutes or 0)

	        total_milliseconds = int((hours_int * 3600 + minutes_int * 60 + seconds_float) * 1000)

	        # Only positive offsets are applied.
	        if total_milliseconds > 0:
	            for sub in subs:
	                sub.start.ordinal += total_milliseconds
	                sub.end.ordinal += total_milliseconds

	        # Reserve a temporary path and close the handle before pysrt writes
	        # to it, so no file descriptor is left open.
	        with tempfile.NamedTemporaryFile(suffix=".srt", delete=False) as tmp:
	            output_path = tmp.name
	        subs.save(output_path, encoding='utf-8')
	        return output_path, f"Đã thêm {hours_int}h {minutes_int}m {seconds_float}s vào file gốc"

	    except gr.Error:
	        # Already a user-facing message: do not wrap it a second time.
	        raise
	    except Exception as e:
	        raise gr.Error(f"Có lỗi xảy ra khi thêm thời gian: {str(e)}") from e

	def translate_subtitle(input_file, source_language, target_language, hours, minutes, seconds):
	    """Translate an .srt file and optionally delay all of its cues.

	    Args:
	        input_file: Uploaded subtitle file, either a path or an object
	            whose ``name`` attribute is the path.
	        source_language: Display name of the source language (a key of
	            ``LANGUAGE_MODELS``); unknown names fall back to English.
	        target_language: Display name of the target language (a key of
	            ``LANGUAGE_MODELS``).
	        hours: Whole hours to add; ``None`` counts as 0.
	        minutes: Whole minutes to add; ``None`` counts as 0.
	        seconds: Seconds to add, may be fractional; a missing or
	            unparsable value counts as 0.

	    Returns:
	        ``(output_path, status_message)`` where ``output_path`` is a new
	        temporary ``.srt`` file encoded as UTF-8.

	    Raises:
	        gr.Error: If no file was given, the file does not exist, the
	            language selection is unusable, or processing fails.
	    """
	    if input_file is None:
	        raise gr.Error("Vui lòng upload file phụ đề!")

	    try:
	        if isinstance(input_file, (str, os.PathLike)):
	            file_path = input_file
	        else:
	            file_path = input_file.name

	        if not os.path.exists(file_path):
	            raise gr.Error("File không tồn tại hoặc không thể đọc!")

	        source_code = LANGUAGE_MODELS.get(source_language, "en")
	        target_code = LANGUAGE_MODELS.get(target_language)
	        if target_code is None:
	            raise gr.Error("Vui lòng chọn ngôn ngữ đích!")
	        if source_code == target_code:
	            # There is no same-language model; fail early with a clear message.
	            raise gr.Error("Ngôn ngữ nguồn và ngôn ngữ đích phải khác nhau!")

	        model_info = get_model(source_code, target_code)

	        subs = pysrt.open(file_path)

	        # Convert the requested offset to milliseconds (fractional seconds
	        # are supported); empty inputs arrive as None and count as 0.
	        try:
	            seconds_float = float(seconds or 0)
	        except (TypeError, ValueError):
	            seconds_float = 0
	        hours_int = int(hours or 0)
	        minutes_int = int(minutes or 0)

	        total_milliseconds = int((hours_int * 3600 + minutes_int * 60 + seconds_float) * 1000)

	        # Only positive offsets are applied.
	        if total_milliseconds > 0:
	            for sub in subs:
	                sub.start.ordinal += total_milliseconds
	                sub.end.ordinal += total_milliseconds

	        # get_model returns one (model, tokenizer) pair for a direct model,
	        # or two pairs for a pivot through English; treat both as a
	        # sequence of translation stages applied in order.
	        if isinstance(model_info[0], tuple):
	            stages = tuple(model_info)
	        else:
	            stages = (model_info,)

	        for sub in tqdm(subs, desc="Đang dịch"):
	            if not sub.text.strip():
	                continue
	            text = sub.text
	            for model, tokenizer in stages:
	                text = translate_text(text, model, tokenizer)
	            sub.text = text

	        # Reserve a temporary path and close the handle before pysrt writes
	        # to it, so no file descriptor is left open.
	        with tempfile.NamedTemporaryFile(suffix=".srt", delete=False) as tmp:
	            output_path = tmp.name
	        subs.save(output_path, encoding='utf-8')
	        return output_path, f"Dịch từ {source_language} sang {target_language} thành công! Đã thêm {hours_int}h {minutes_int}m {seconds_float}s"

	    except gr.Error:
	        # Already a user-facing message: do not wrap it a second time.
	        raise
	    except Exception as e:
	        raise gr.Error(f"Có lỗi xảy ra: {str(e)}") from e

	# Gradio interface
	with gr.Blocks(title="Subtitle Translator Pro", theme="soft") as demo:
	    gr.Markdown("# 🎬 Subtitle Translator Pro")
	    gr.Markdown("Dịch phụ đề (.srt) giữa nhiều ngôn ngữ khác nhau")

	    with gr.Row():
	        # Left column: inputs and actions.
	        with gr.Column():
	            file_input = gr.File(label="Upload file phụ đề (.srt)", file_types=[".srt"])

	            with gr.Row():
	                source_lang = gr.Dropdown(
	                    choices=list(LANGUAGE_MODELS.keys()),
	                    value="Tiếng Anh",
	                    label="Ngôn ngữ nguồn",
	                    interactive=True
	                )
	                auto_detect = gr.Checkbox(label="Tự động phát hiện ngôn ngữ", value=True)

	            target_lang = gr.Dropdown(
	                choices=list(LANGUAGE_MODELS.keys()),
	                value="Tiếng Việt",
	                label="Ngôn ngữ đích"
	            )

	            with gr.Row():
	                hours = gr.Number(label="Giờ", value=0, precision=0, minimum=0)
	                minutes = gr.Number(label="Phút", value=0, precision=0, minimum=0, maximum=59)
	                seconds = gr.Number(label="Giây", value=0, minimum=0, step=0.1)

	            with gr.Row():
	                add_time_btn = gr.Button("Chỉ thêm thời gian", variant="secondary")
	                translate_btn = gr.Button("Dịch phụ đề", variant="primary")

	        # Right column: results.
	        with gr.Column():
	            file_output = gr.File(label="File phụ đề đã xử lý", interactive=False)
	            status = gr.Textbox(label="Trạng thái")

	    def on_file_upload(file, auto_detect_flag):
	        """Preselect the source language after an upload.

	        Returns a Dropdown update carrying the detected language, English
	        when detection fails or the code is not in ``LANGUAGE_CODES``, or
	        an update without a value when there is no file or auto-detection
	        is switched off.
	        """
	        if not (file and auto_detect_flag):
	            return gr.Dropdown()
	        try:
	            # The upload may arrive as a path or as an object exposing the
	            # path through ``.name``; accept both.
	            if isinstance(file, (str, os.PathLike)):
	                file_path = file
	            else:
	                file_path = file.name
	            lang_code = detect_subtitle_language(file_path)
	            # Regional variants such as "zh-cn" map to their primary code.
	            primary_code = lang_code.split("-")[0]
	            detected_lang = LANGUAGE_CODES.get(primary_code, "Tiếng Anh")
	            return gr.Dropdown(value=detected_lang)
	        except Exception:
	            return gr.Dropdown(value="Tiếng Anh")

	    # Run language detection whenever a file is uploaded.
	    file_input.upload(
	        fn=on_file_upload,
	        inputs=[file_input, auto_detect],
	        outputs=source_lang
	    )

	    # "Add time only" button.
	    add_time_btn.click(
	        fn=add_time_to_subtitle,
	        inputs=[file_input, hours, minutes, seconds],
	        outputs=[file_output, status]
	    )

	    # "Translate subtitles" button.
	    translate_btn.click(
	        fn=translate_subtitle,
	        inputs=[file_input, source_lang, target_lang, hours, minutes, seconds],
	        outputs=[file_output, status]
	    )

	    gr.Markdown("### Thông tin")
	    gr.Markdown("""
	- Hỗ trợ định dạng .srt
	- Tự động phát hiện ngôn ngữ nguồn
	- Dịch giữa 24 ngôn ngữ khác nhau
	- Hỗ trợ dịch qua tiếng Anh nếu không có model trực tiếp
	- Thêm thời gian vào tất cả phụ đề (hỗ trợ giây thập phân)
	- Có nút riêng để chỉ thêm thời gian trước khi dịch
	""")

	if __name__ == "__main__":
	    demo.launch()