import os
from pydub import AudioSegment
from transformers import pipeline
import torch
import csv
import time
from tqdm import tqdm


# ======== Configuration ========
# Whisper model checkpoint to use
model_name = "nectec/Pathumma-whisper-th-large-v3"
# Folder containing the source audio files
input_folder = "/kaggle/input/audio-understanding/speechs/speechs/test"
# CSV file to write the results to
output_csv = "asr.csv"
# ===============================

start_time = time.perf_counter()

# Use the GPU (with bfloat16) if available, otherwise fall back to the CPU (float32)
device = 0 if torch.cuda.is_available() else -1
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

# Load the Pathumma Whisper model as an ASR pipeline
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model_name,
    torch_dtype=torch_dtype,
    device=device,
)

# Set the language and task so Whisper transcribes in Thai instead of auto-detecting
lang = "th"
task = "transcribe"
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
    language=lang, task=task
)


# List to collect the per-file results
results = []

# Iterate over every .wav file in the input folder
wav_files = sorted([f for f in os.listdir(input_folder) if f.endswith(".wav")])

for filename in tqdm(wav_files, desc=":open_file_folder: Processing files"):
    full_path = os.path.join(input_folder, filename)

    try:
        audio = AudioSegment.from_file(full_path)
    except Exception as e:
        print(f"\n:x: Error loading {filename}: {e}")
        results.append({"id": filename, "transcription": "[ERROR: Cannot load file]"})
        continue

    chunk_length_ms = 27000  # split the audio into 27-second chunks (fits within Whisper's 30-second window)
    full_transcription = ""

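    # Ceiling division: number of fixed-length chunks needed to cover the whole file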
    num_chunks = (len(audio) + chunk_length_ms - 1) // chunk_length_ms

    for i in tqdm(
        range(num_chunks), desc=f":loud_sound: Chunks for {filename}", leave=False
    ):
        start = i * chunk_length_ms
        chunk = audio[start : start + chunk_length_ms]
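        # Export the chunk to a temporary wav file so the ASR pipeline can read it from disk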
        chunk_path = f"temp_chunk_{i}.wav"
        chunk.export(chunk_path, format="wav")

        try:
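            # Transcribe the chunk and append its text to the running transcript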
            output = pipe(chunk_path)
            full_transcription += output["text"].strip() + " "
        except Exception as e:
            print(f"\n:x: Error on chunk {i} of {filename}: {e}")
            full_transcription += "[ERROR] "

        os.remove(chunk_path)  # delete the temporary chunk file

    results.append({"id": filename, "transcription": full_transcription.strip()})

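# Write all transcriptions to the output CSV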
with open(output_csv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["id", "transcription"])
    writer.writeheader()
    for row in results:
        writer.writerow(row)

end_time = time.perf_counter()
elapsed_time = end_time - start_time

print(f"\n:white_check_mark: All done! Time taken: {elapsed_time:.2f} seconds")
print(f":page_facing_up: Results saved to {output_csv}")