import os

# Set the Hugging Face cache location before importing transformers
os.environ["HF_HOME"] = "/data/.cache"

import librosa
import pandas as pd
import torch
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoTokenizer,
    pipeline,
)
# For wav2vec2 models, also import:
# from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

pd.set_option("display.max_colwidth", None)

# Parameters
CHUNK_DURATION = 30  # seconds
SAMPLING_RATE = 16000
folder_path = "/data/week_6/DATA/Audio Understanding (SCBx)/speechs/test"
audio_files = os.listdir(folder_path)
model_paths = {
    "monsoon": "scb10x/monsoon-whisper-medium-gigaspeech2",
    "whisper": "openai/whisper-large-v3-turbo",
    "Pathumma": "nectec/Pathumma-whisper-th-large-v3",
    # Add more models here if needed
}
llm_model_path = "/data/week_6/Models/qwen"  # LLM used to merge the transcriptions
output_name = "merged_transcriptions.csv"


# Transcribe one audio array in fixed-length chunks and join the pieces
def transcribe_audio_chunks(audio, sr, model, processor):
    chunk_size = CHUNK_DURATION * sr
    total_chunks = int(len(audio) / chunk_size) + 1
    text_chunks = []
    for i in range(total_chunks):
        start = i * chunk_size
        end = min((i + 1) * chunk_size, len(audio))
        chunk = audio[start:end]
        # Skip empty or near-empty trailing chunks
        if len(chunk) < 1000:
            continue
        input_features = processor(
            chunk, sampling_rate=sr, return_tensors="pt"
        ).input_features.to(model.device)
        # Cap generation at the model's decoder limit (fall back to Whisper's 448)
        max_target = (
            model.config.max_target_positions
            if hasattr(model.config, "max_target_positions")
            else 448
        )
        max_new_tokens = max_target - 4
        with torch.no_grad():
            predicted_ids = model.generate(
                input_features, max_new_tokens=max_new_tokens
            )
        text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        text_chunks.append(text)
    return " ".join(text_chunks)


df = pd.DataFrame({"File": audio_files})

# Transcribe every file with each ASR model
for model_name, model_path in model_paths.items():
    print(f"\n🚀 Transcribing with model: {model_name}")
    processor = AutoProcessor.from_pretrained(model_path)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path).to("cuda").eval()
    # For wav2vec2 models, uncomment and use this instead:
    # processor = Wav2Vec2Processor.from_pretrained(model_path)
    # model = Wav2Vec2ForCTC.from_pretrained(model_path).to("cuda").eval()

    transcriptions = []
    for file_name in tqdm(audio_files, desc=f"[{model_name}] Transcribing"):
        try:
            file_path = os.path.join(folder_path, file_name)
            audio, sr = librosa.load(file_path, sr=SAMPLING_RATE)
            full_text = transcribe_audio_chunks(audio, sr, model, processor)
        except Exception as e:
            print(f"❌ Error on {file_name} with model {model_name}: {e}")
            full_text = f"[ERROR] {e}"
        transcriptions.append(full_text)
    df[f"Transcription_{model_name}"] = transcriptions

# Optionally save the per-model transcriptions
# df.to_csv("multi_model_transcriptions.csv", index=False, encoding="utf-8-sig")

# Alternative: merge transcriptions loaded from previously saved CSVs
"""
monsoon = pd.read_csv("/data/week_6/Code/SATANG/monsoon_transcription.csv")
whisper = pd.read_csv("/data/week_6/Code/SATANG/whisper_transcription.csv")
wav2vec = pd.read_csv("/data/week_6/Code/SATANG/wev2vec.csv")
# Merge the three dataframes on the 'File' column
merged_df = monsoon.merge(whisper, on='File', suffixes=('_monsoon', '_whisper'))
merged_df = merged_df.merge(wav2vec, on='File', suffixes=('', '_wav2vec'))
merged_df = df.copy()
display(merged_df)
"""

df = df[["File", "Transcription_monsoon", "Transcription_whisper", "Transcription_Pathumma"]]

# Load the merging LLM and its tokenizer
tokenizer = AutoTokenizer.from_pretrained(llm_model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(llm_model_path, trust_remote_code=True)
model.to("cuda")

# %%
# Create the text-generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

merged_outputs = []

# Loop through each row and merge the two transcriptions with the LLM
for _, row in tqdm(df.iterrows(), total=len(df)):
    file_name = row["File"]
    monsoon = str(row["Transcription_monsoon"])
    whisper = str(row["Transcription_whisper"])
    # Nothing to merge if both transcriptions are empty
    if not monsoon.strip() and not whisper.strip():
        merged_outputs.append("")
        continue
    # Thai prompt: merge the two system outputs into a single complete, clear, and
    # natural Thai text, correcting mis-heard words while preserving the original meaning
    prompt = f"""
รวมข้อความต่อไปนี้จากสองระบบให้กลายเป็นข้อความเดียวที่สมบูรณ์ ชัดเจน และเป็นธรรมชาติที่สุดในภาษาไทย
หากมีคำผิดหรือเสียงเพี้ยนให้แก้ไขให้เหมาะสมและคงความหมายเดิม

📂 ชื่อไฟล์: {file_name}
🌀 Whisper: {whisper}
🌧️ Monsoon: {monsoon}

✅ ข้อความรวม (ภาษาไทย):
"""
    try:
        response = generator(prompt, max_new_tokens=256, temperature=0.3)
        # The pipeline returns the prompt plus the completion; keep only the completion
        merged = response[0]["generated_text"].replace(prompt, "").strip()
    except Exception as e:
        merged = f"[ERROR] {e}"
    # Print for debugging
    # print(f"Processed {file_name}: {merged}...")
    merged_outputs.append(merged)

# Save the merged output
df["Merged_transcription"] = merged_outputs
df.to_csv(output_name, index=False, encoding="utf-8-sig")