Spaces:

f2ai
/

exp-audio-recorder

Sleeping

App Files Files Community

exp-audio-recorder / preprocess /Satang /ensembling.py

f2ai

Upload folder using huggingface_hub

ad93d56 verified 7 months ago

raw

history blame contribute delete

5.36 kB

	# Parameters
	# `os` is imported here because os.listdir() below runs before the file's
	# main import section; without it the script stops with a NameError.
	import os

	CHUNK_DURATION = 30  # seconds of audio per chunk passed to the ASR model
	SAMPLING_RATE = 16000  # Hz; passed to librosa.load() as the target rate
	folder_path = "/data/week_6/DATA/Audio Understanding (SCBx)/speechs/test"
	# os.listdir() returns entries in arbitrary order; sort so the row order of
	# the output CSV is reproducible between runs.
	audio_files = sorted(os.listdir(folder_path))
	# Display name -> Hugging Face checkpoint of each speech-to-text model.
	# The display name becomes the suffix of the "Transcription_<name>" column.
	model_paths = {
	    "monsoon": "scb10x/monsoon-whisper-medium-gigaspeech2",
	    "whisper": "openai/whisper-large-v3-turbo",
	    "Pathumma": "nectec/Pathumma-whisper-th-large-v3",
	    # Add more models here if needed
	}

	# Path of the causal LM (and tokenizer) used to merge the transcriptions.
	model_path = "/data/week_6/Models/qwen"
	# CSV file the final merged transcriptions are written to.
	output_name = "merged_transcriptions.csv"

	import os

	os.environ["HF_HOME"] = "/data/.cache"
	import librosa
	from transformers import (
	AutoProcessor,
	AutoModelForSpeechSeq2Seq,
	AutoModelForCausalLM,
	AutoTokenizer,
	pipeline,
	)
	import torch

	import pandas as pd
	from tqdm import tqdm


	# Never truncate cell contents when a DataFrame is printed, so full
	# transcriptions stay readable while debugging.
	pd.set_option("display.max_colwidth", None)


	# transcribe chunks
	def transcribe_audio_chunks(
	    audio, sr, model, processor, chunk_duration=None, min_chunk_samples=1000
	):
	    """Transcribe ``audio`` chunk by chunk and join the chunk texts.

	    The audio is cut into consecutive windows of ``chunk_duration`` seconds.
	    Each window is run through ``processor`` and ``model.generate``, and the
	    decoded texts are joined with single spaces.

	    Args:
	        audio: Sliceable sequence of samples (anything supporting ``len()``
	            and ``audio[a:b]``).
	        sr: Sampling rate of ``audio`` in Hz.
	        model: Speech seq2seq model exposing ``generate``, ``device`` and
	            ``config``.
	        processor: Callable returning an object with ``input_features``;
	            must also provide ``batch_decode``.
	        chunk_duration: Window length in seconds. Defaults to the
	            module-level ``CHUNK_DURATION`` when ``None``.
	        min_chunk_samples: Windows shorter than this many samples are
	            skipped rather than transcribed.

	    Returns:
	        The concatenated transcription, or ``""`` if no window was long
	        enough to transcribe.

	    Raises:
	        ValueError: If ``chunk_duration * sr`` is not a positive number of
	            samples.
	    """
	    if chunk_duration is None:
	        chunk_duration = CHUNK_DURATION
	    chunk_size = int(chunk_duration * sr)
	    if chunk_size <= 0:
	        raise ValueError(
	            f"chunk size must be positive, got {chunk_size} samples "
	            f"(chunk_duration={chunk_duration}, sr={sr})"
	        )

	    # Loop-invariant, so computed once. Falls back to 448 when the config
	    # has no ``max_target_positions``. The "- 4" presumably leaves room for
	    # the decoder's leading prompt tokens -- TODO confirm.
	    max_new_tokens = getattr(model.config, "max_target_positions", 448) - 4

	    text_chunks = []
	    with torch.no_grad():
	        for start in range(0, len(audio), chunk_size):
	            chunk = audio[start:start + chunk_size]

	            # Skip fragments too short to transcribe (typically the tail).
	            if len(chunk) < min_chunk_samples:
	                continue

	            input_features = processor(
	                chunk, sampling_rate=sr, return_tensors="pt"
	            ).input_features.to(model.device)

	            predicted_ids = model.generate(
	                input_features, max_new_tokens=max_new_tokens
	            )
	            text = processor.batch_decode(
	                predicted_ids, skip_special_tokens=True
	            )[0]
	            text_chunks.append(text)

	    return " ".join(text_chunks)


	# One row per audio file; each ASR model adds a "Transcription_<name>" column.
	df = pd.DataFrame({"File": audio_files})

	# Loop transcribing with each model.
	# NOTE: the loop variable must not be called `model_path`. That module-level
	# name holds the merging LLM's path and is read again further down when the
	# causal LM is loaded; reusing it here would leave it pointing at the last
	# ASR checkpoint.
	for model_name, asr_model_path in model_paths.items():
	    print(f"\n🚀 Transcribing with model: {model_name}")

	    processor = AutoProcessor.from_pretrained(asr_model_path)
	    model = AutoModelForSpeechSeq2Seq.from_pretrained(asr_model_path).to("cuda").eval()

	    # For wav2vec2 models, uncomment and use this:
	    # processor = Wav2Vec2Processor.from_pretrained(asr_model_path)
	    # model = Wav2Vec2ForCTC.from_pretrained(asr_model_path).to("cuda").eval()

	    transcriptions = []

	    for file_name in tqdm(audio_files, desc=f"[{model_name}] Transcribing"):
	        try:
	            file_path = os.path.join(folder_path, file_name)
	            audio, sr = librosa.load(file_path, sr=SAMPLING_RATE)
	            full_text = transcribe_audio_chunks(audio, sr, model, processor)
	        except Exception as e:
	            # Best effort: one unreadable file must not abort the whole run.
	            # The error text is stored in place of the transcription.
	            print(f"❌ Error on {file_name} with model {model_name}: {e}")
	            full_text = f"[ERROR] {e}"

	        transcriptions.append(full_text)

	    df[f"Transcription_{model_name}"] = transcriptions

	    # Drop this model before the next one is loaded so two checkpoints are
	    # not resident on the GPU at the same time.
	    del model, processor
	    torch.cuda.empty_cache()

	# Save to CSV
	# df.to_csv("multi_model_transcriptions.csv", index=False, encoding='utf-8-sig')


	# for merge DF
	""" monsson = pd.read_csv("/data/week_6/Code/SATANG/monsoon_transcription.csv")
	whisper = pd.read_csv("/data/week_6/Code/SATANG/whisper_transcription.csv")
	wav2vec = pd.read_csv("/data/week_6/Code/SATANG/wev2vec.csv")
	# Merge the three dataframes on the 'id' column
	merged_df = monsson.merge(whisper, on='File', suffixes=('_monsoon', '_whisper'))
	merged_df = merged_df.merge(wav2vec, on='File', suffixes=('', '_wav2vec'))
	merged_df = df.copy
	display(merged_df) """


	# Keep the file name plus one transcription column per ASR model that was run.
	# The transcription loop names its columns "Transcription_<model_name>", so the
	# list is derived from `model_paths`; a bare "Transcription" column is never
	# created and selecting it raises KeyError.
	df = df[["File", *(f"Transcription_{name}" for name in model_paths)]]

	# Load the tokenizer and the causal LM from `model_path`, placing the LM on the GPU
	tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
	model = AutoModelForCausalLM.from_pretrained(
	    model_path, trust_remote_code=True
	).to("cuda")

	# %%
	# Wrap model + tokenizer in a text-generation pipeline used to merge transcriptions
	generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
	merged_outputs = []

	# Merge the transcriptions of every file with the LLM.
	# Only the monsoon and whisper transcriptions are fed into the prompt.
	rows = zip(df["File"], df["Transcription_monsoon"], df["Transcription_whisper"])
	for file_name, monsoon, whisper in tqdm(rows, total=len(df)):
	    monsoon = str(monsoon)
	    whisper = str(whisper)

	    # Nothing to merge: keep an empty result so rows stay aligned with `df`.
	    if not monsoon.strip() and not whisper.strip():
	        merged_outputs.append("")
	        continue

	    prompt = f"""
	รวมข้อความต่อไปนี้จากสองระบบให้กลายเป็นข้อความเดียวที่สมบูรณ์ ชัดเจน และเป็นธรรมชาติที่สุดในภาษาไทย
	หากมีคำผิดหรือเสียงเพี้ยนให้แก้ไขให้เหมาะสมและคงความหมายเดิม

	📂 ชื่อไฟล์: {file_name}

	🌀 Whisper:
	{whisper}

	🌧️ Monsoon:
	{monsoon}

	✅ ข้อความรวม (ภาษาไทย):
	"""

	    try:
	        # return_full_text=False makes the pipeline return only the newly
	        # generated text. That is more reliable than removing the prompt
	        # with str.replace(), which leaves the prompt in place whenever the
	        # decoded output does not reproduce it character for character.
	        response = generator(
	            prompt,
	            max_new_tokens=256,
	            temperature=0.3,
	            return_full_text=False,
	        )
	        merged = response[0]["generated_text"].strip()
	    except Exception as e:
	        # Best effort: report the failure and store the error text so the
	        # remaining files are still processed.
	        print(f"❌ Error merging {file_name}: {e}")
	        merged = f"[ERROR] {e}"

	    merged_outputs.append(merged)

	# Save the output
	df["Merged_transcription"] = merged_outputs
	df.to_csv(output_name, index=False, encoding="utf-8-sig")