Spaces:
Sleeping
Sleeping
# Imports come first: the parameter section below calls os.listdir(), so `os`
# must already be bound when the parameters are evaluated.
import os

# Set the cache location before the Hugging Face libraries are imported.
# NOTE(review): presumably HF_HOME is read when those libraries are imported,
# which is why it is assigned ahead of them - confirm against the HF docs.
os.environ["HF_HOME"] = "/data/.cache"

import librosa
import pandas as pd
import torch
from tqdm import tqdm
from transformers import (
    AutoProcessor,
    AutoModelForSpeechSeq2Seq,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)

# Show full cell contents when a DataFrame is printed.
pd.set_option("display.max_colwidth", None)

# Parameters
CHUNK_DURATION = 30  # seconds
SAMPLING_RATE = 16000
transcriptions = []
folder_path = "/data/week_6/DATA/Audio Understanding (SCBx)/speechs/test"
audio_files = os.listdir(folder_path)
# Speech-to-text checkpoints, keyed by the short name used in output columns.
model_paths = {
    "monsoon": "scb10x/monsoon-whisper-medium-gigaspeech2",
    "whisper": "openai/whisper-large-v3-turbo",
    "Pathumma": "nectec/Pathumma-whisper-th-large-v3",
    # Add more models here if needed
}
# Checkpoint of the causal LM that merges the transcriptions.
model_path = "/data/week_6/Models/qwen"
output_name = "merged_transcriptions.csv"
def transcribe_audio_chunks(audio, sr, model, processor):
    """Transcribe ``audio`` window by window and join the partial texts.

    The signal is cut into consecutive windows of ``CHUNK_DURATION * sr``
    samples. Windows shorter than 1000 samples are skipped. Every remaining
    window is passed through ``processor`` and ``model.generate``, and the
    decoded texts are joined with single spaces.

    Args:
        audio: Sliceable sequence of audio samples.
        sr: Sampling rate of ``audio``; also forwarded to ``processor``.
        model: Generation model exposing ``config``, ``device`` and
            ``generate``.
        processor: Callable producing ``input_features`` and offering
            ``batch_decode``.

    Returns:
        str: The per-window transcriptions joined by ``" "`` (an empty string
        when every window was skipped).
    """
    samples_per_chunk = CHUNK_DURATION * sr
    n_samples = len(audio)
    pieces = []

    for index in range(n_samples // samples_per_chunk + 1):
        offset = index * samples_per_chunk
        # Slicing clamps at the end of the signal, so the last window may be
        # shorter than the others (or empty).
        segment = audio[offset:offset + samples_per_chunk]
        if len(segment) < 1000:
            continue

        encoded = processor(segment, sampling_rate=sr, return_tensors="pt")
        features = encoded.input_features.to(model.device)

        # Fall back to 448 when the config does not declare a decoder limit.
        # NOTE(review): the "- 4" presumably leaves room for decoder prompt
        # tokens - confirm.
        decoder_limit = getattr(model.config, "max_target_positions", 448)
        token_budget = decoder_limit - 4

        with torch.no_grad():
            generated_ids = model.generate(features, max_new_tokens=token_budget)

        decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
        pieces.append(decoded[0])

    return " ".join(pieces)
# One row per audio file; every model adds a "Transcription_<name>" column.
df = pd.DataFrame({"File": audio_files})

# Loop transcribing with each model.
# The loop variable is deliberately not named `model_path`: that module-level
# name holds the causal-LM checkpoint loaded further down, and rebinding it
# here would make the later load pick up the last speech checkpoint instead.
for model_name, asr_model_path in model_paths.items():
    print(f"\n🚀 Transcribing with model: {model_name}")
    processor = AutoProcessor.from_pretrained(asr_model_path)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(asr_model_path).to("cuda").eval()
    # For wav2vec2 models, uncomment and use this:
    # processor = Wav2Vec2Processor.from_pretrained(asr_model_path)
    # model = Wav2Vec2ForCTC.from_pretrained(asr_model_path).to("cuda").eval()

    transcriptions = []
    for file_name in tqdm(audio_files, desc=f"[{model_name}] Transcribing"):
        try:
            file_path = os.path.join(folder_path, file_name)
            audio, sr = librosa.load(file_path, sr=SAMPLING_RATE)
            full_text = transcribe_audio_chunks(audio, sr, model, processor)
        except Exception as e:
            # Best effort: record the failure in the table and keep going so
            # one bad file does not abort the whole run.
            print(f"❌ Error on {file_name} with model {model_name}: {e}")
            full_text = f"[ERROR] {e}"
        transcriptions.append(full_text)

    df[f"Transcription_{model_name}"] = transcriptions

    # Drop this model before the next one is loaded; otherwise the old model
    # stays referenced while the new one is being moved to the GPU.
    del model, processor
    torch.cuda.empty_cache()
# Save to CSV
# df.to_csv("multi_model_transcriptions.csv", index=False, encoding='utf-8-sig')

# Disabled alternative, kept for reference: build the table by merging
# per-model CSV files instead of transcribing in this run.
# monsson = pd.read_csv("/data/week_6/Code/SATANG/monsoon_transcription.csv")
# whisper = pd.read_csv("/data/week_6/Code/SATANG/whisper_transcription.csv")
# wav2vec = pd.read_csv("/data/week_6/Code/SATANG/wev2vec.csv")
# # Merge the three dataframes on the 'File' column
# merged_df = monsson.merge(whisper, on='File', suffixes=('_monsoon', '_whisper'))
# merged_df = merged_df.merge(wav2vec, on='File', suffixes=('', '_wav2vec'))
# merged_df = df.copy
# display(merged_df)

# Keep the file name plus one transcription column per configured model.
# The column names are derived from `model_paths` so they always match what
# the transcription loop created; a hard-coded "Transcription" column does not
# exist in this table and would raise KeyError.
df = df[["File"] + [f"Transcription_{name}" for name in model_paths]]
# Load the causal language model and its tokenizer from `model_path`.
# NOTE(review): `model_path` must hold the causal-LM checkpoint at this point;
# verify that no earlier code has rebound the name.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
).to("cuda")
# %%
# Wrap model and tokenizer in a text-generation pipeline.
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
# Merged text per row, in the same order as the rows of `df`.
merged_outputs = []

# Ask the generator to fuse the monsoon and whisper transcriptions of each file.
for _, record in tqdm(df.iterrows(), total=len(df)):
    file_name = record["File"]
    monsoon = str(record["Transcription_monsoon"])
    whisper = str(record["Transcription_whisper"])

    # Nothing to merge when both transcriptions are blank.
    if not (monsoon.strip() or whisper.strip()):
        merged_outputs.append("")
        continue

    prompt = f"""
รวมข้อความต่อไปนี้จากสองระบบให้กลายเป็นข้อความเดียวที่สมบูรณ์ ชัดเจน และเป็นธรรมชาติที่สุดในภาษาไทย
หากมีคำผิดหรือเสียงเพี้ยนให้แก้ไขให้เหมาะสมและคงความหมายเดิม
📂 ชื่อไฟล์: {file_name}
🌀 Whisper:
{whisper}
🌧️ Monsoon:
{monsoon}
✅ ข้อความรวม (ภาษาไทย):
"""

    try:
        generated = generator(prompt, max_new_tokens=256, temperature=0.3)[0][
            "generated_text"
        ]
        # The prompt text is removed from the generated text so only the
        # model's continuation is stored.
        merged_text = generated.replace(prompt, "").strip()
    except Exception as err:
        # Best effort: store the failure for this row and continue.
        merged_text = f"[ERROR] {err}"

    # Print for debugging
    # print(f"Processed {file_name}: {merged_text}...")
    merged_outputs.append(merged_text)

# Attach the merged texts and write the table to disk.
df["Merged_transcription"] = merged_outputs
df.to_csv(output_name, index=False, encoding="utf-8-sig")