Spaces:

f2ai
/

exp-audio-recorder

Sleeping

App Files Files Community

exp-audio-recorder / asr /Golf /asr_pathumma.py

f2ai

Upload folder using huggingface_hub

ad93d56 verified 7 months ago

raw

history blame contribute delete

3.13 kB

	import os
	from pydub import AudioSegment
	from transformers import pipeline
	import torch
	import csv
	import time
	from tqdm import tqdm


	# ======== Configuration ========
	# Name of the Whisper model to use.
	model_name = "nectec/Pathumma-whisper-th-large-v3"

	# Folder that holds the source audio files.
	input_folder = "/kaggle/input/audio-understanding/speechs/speechs/test"

	# CSV file the results are saved to.
	output_csv = "asr.csv"
	# ===============================

	# Start timing; the elapsed time is reported at the end of the script.
	start_time = time.perf_counter()

	# Choose GPU or CPU: device index 0 with bfloat16 when CUDA is available,
	# otherwise device -1 with float32.
	if torch.cuda.is_available():
	    device = 0
	    torch_dtype = torch.bfloat16
	else:
	    device = -1
	    torch_dtype = torch.float32

	# Load the Pathumma model as an automatic-speech-recognition pipeline.
	pipe = pipeline(
	    task="automatic-speech-recognition",
	    model=model_name,
	    device=device,
	    torch_dtype=torch_dtype,
	)

	# Set the language and task by storing the tokenizer's decoder prompt ids
	# on the model config as forced decoder ids.
	lang = "th"
	task = "transcribe"
	decoder_prompt_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task=task)
	pipe.model.config.forced_decoder_ids = decoder_prompt_ids


	# One {"id", "transcription"} row per input file, in processing order.
	results = []

	# Every .wav file in the input folder, visited in sorted order.
	wav_files = sorted(f for f in os.listdir(input_folder) if f.endswith(".wav"))

	# Each file is cut into 27-second slices (value in milliseconds).
	# The value never changes, so it is set once instead of once per file.
	chunk_length_ms = 27000

	for filename in tqdm(wav_files, desc=":open_file_folder: Processing files"):
	    full_path = os.path.join(input_folder, filename)

	    try:
	        audio = AudioSegment.from_file(full_path)
	    except Exception as e:
	        # Best effort: record a placeholder row and move on to the next file.
	        print(f"\n:x: Error loading {filename}: {e}")
	        results.append({"id": filename, "transcription": "[ERROR: Cannot load file]"})
	        continue

	    # Ceiling division so a trailing partial slice is still processed.
	    num_chunks = (len(audio) + chunk_length_ms - 1) // chunk_length_ms

	    # Per-chunk texts; joined once at the end instead of repeated `+=`.
	    pieces = []

	    for i in tqdm(
	        range(num_chunks), desc=f":loud_sound: Chunks for {filename}", leave=False
	    ):
	        start = i * chunk_length_ms
	        chunk = audio[start : start + chunk_length_ms]
	        chunk_path = f"temp_chunk_{i}.wav"

	        try:
	            # The export is inside the try so that a failed write is treated
	            # like any other chunk error instead of aborting the whole run
	            # and losing the results gathered so far.
	            chunk.export(chunk_path, format="wav")
	            output = pipe(chunk_path)
	            pieces.append(output["text"].strip())
	        except Exception as e:
	            print(f"\n:x: Error on chunk {i} of {filename}: {e}")
	            pieces.append("[ERROR]")
	        finally:
	            # Always delete the temporary file, even on an interrupt; it may
	            # not exist if the export failed before creating it.
	            if os.path.exists(chunk_path):
	                os.remove(chunk_path)

	    results.append({"id": filename, "transcription": " ".join(pieces).strip()})

	# Write all collected rows to the CSV file: a header line, then one line
	# per entry in `results`.
	with open(output_csv, mode="w", newline="", encoding="utf-8") as csv_file:
	    csv_writer = csv.DictWriter(csv_file, fieldnames=["id", "transcription"])
	    csv_writer.writeheader()
	    csv_writer.writerows(results)

	# Report the time elapsed since `start_time` was taken.
	end_time = time.perf_counter()
	elapsed_time = end_time - start_time

	print(f"\n:white_check_mark: All done! Time taken: {elapsed_time:.2f} seconds")
	print(f":page_facing_up: Results saved to {output_csv}")