Spaces:

f2ai
/

exp-audio-recorder

Sleeping

File size: 5,362 Bytes

ad93d56

# Parameters
# `os` must be imported before the directory listing below; the file's main
# import block only appears further down, so relying on it raised NameError.
import os

CHUNK_DURATION = 30  # seconds of audio per chunk fed to the ASR model
SAMPLING_RATE = 16000  # target sample rate (Hz) used when loading audio
transcriptions = []  # placeholder; rebuilt for every model in the ASR loop
folder_path = "/data/week_6/DATA/Audio Understanding (SCBx)/speechs/test"
# Sorted so the row order of the output is deterministic (os.listdir returns
# entries in arbitrary order).
audio_files = sorted(os.listdir(folder_path))
# Display name -> checkpoint identifier of each speech-to-text model to run.
model_paths = {
    "monsoon": "scb10x/monsoon-whisper-medium-gigaspeech2",
    "whisper": "openai/whisper-large-v3-turbo",
    "Pathumma": "nectec/Pathumma-whisper-th-large-v3",
    # Add more models here if needed
}

# Checkpoint of the causal language model used to merge the transcriptions.
model_path = "/data/week_6/Models/qwen"
# CSV file that receives the final table.
output_name = "merged_transcriptions.csv"

import os

os.environ["HF_HOME"] = "/data/.cache"
import librosa
from transformers import (
    AutoProcessor,
    AutoModelForSpeechSeq2Seq,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)
import torch

import pandas as pd
from tqdm import tqdm


# HF_HOME is already set immediately after `import os` above, before the
# Hugging Face libraries are imported; assigning it a second time here was
# redundant, so only the pandas display option remains.
# Show full cell contents (no truncation) when DataFrames are printed.
pd.set_option("display.max_colwidth", None)


# transcribe chunks
def transcribe_audio_chunks(
    audio, sr, model, processor, chunk_duration=None, min_chunk_samples=1000
):
    """Transcribe *audio* in fixed-length chunks and join the chunk texts.

    Args:
        audio: Sequence of samples; it is sliced and measured with ``len``.
        sr: Sample rate of *audio*; forwarded to the processor as
            ``sampling_rate`` and used to convert seconds to samples.
        model: Object exposing ``device``, ``config`` and ``generate``.
        processor: Callable returning an object with ``input_features``;
            must also provide ``batch_decode``.
        chunk_duration: Chunk length in seconds. ``None`` (default) uses the
            module-level ``CHUNK_DURATION``.
        min_chunk_samples: Chunks shorter than this many samples are skipped.

    Returns:
        The decoded chunk texts joined with single spaces; an empty string
        when no chunk is long enough.

    Raises:
        ValueError: If the resulting chunk size is not positive.
    """
    if chunk_duration is None:
        chunk_duration = CHUNK_DURATION
    chunk_size = int(chunk_duration * sr)
    if chunk_size <= 0:
        raise ValueError("chunk_duration * sr must be a positive sample count")

    # Loop-invariant generation budget: the model's maximum target length
    # (448 when the config does not declare one) minus 4.
    # NOTE(review): the 4-token margin presumably leaves room for decoder
    # prompt tokens - confirm.
    max_new_tokens = getattr(model.config, "max_target_positions", 448) - 4

    text_chunks = []
    for start in range(0, len(audio), chunk_size):
        chunk = audio[start : start + chunk_size]

        # Ignore a trailing fragment that is too short to transcribe.
        if len(chunk) < min_chunk_samples:
            continue

        input_features = processor(
            chunk, sampling_rate=sr, return_tensors="pt"
        ).input_features.to(model.device)

        with torch.no_grad():
            predicted_ids = model.generate(
                input_features, max_new_tokens=max_new_tokens
            )
        text_chunks.append(
            processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        )

    return " ".join(text_chunks)


# One row per audio file; every ASR model adds its own transcription column.
df = pd.DataFrame({"File": audio_files})

# Loop transcribing with each model
# The loop variable is deliberately not named `model_path`: that module-level
# name holds the language-model checkpoint loaded later in this script, and
# reusing it here would silently replace it with the last ASR checkpoint.
for model_name, asr_model_path in model_paths.items():
    print(f"\n🚀 Transcribing with model: {model_name}")

    processor = AutoProcessor.from_pretrained(asr_model_path)
    model = (
        AutoModelForSpeechSeq2Seq.from_pretrained(asr_model_path).to("cuda").eval()
    )

    # For wav2vec2 models, uncomment and use this:
    # processor = Wav2Vec2Processor.from_pretrained(asr_model_path)
    # model = Wav2Vec2ForCTC.from_pretrained(asr_model_path).to("cuda").eval()

    transcriptions = []

    for file_name in tqdm(audio_files, desc=f"[{model_name}] Transcribing"):
        try:
            file_path = os.path.join(folder_path, file_name)
            audio, sr = librosa.load(file_path, sr=SAMPLING_RATE)
            full_text = transcribe_audio_chunks(audio, sr, model, processor)

        except Exception as e:
            # Best effort: record the failure in the table and keep going so
            # one unreadable file does not abort the whole run.
            print(f"❌ Error on {file_name} with model {model_name}: {e}")
            full_text = f"[ERROR] {e}"

        transcriptions.append(full_text)

    df[f"Transcription_{model_name}"] = transcriptions

    # Drop this model before the next one is loaded so two checkpoints are
    # never resident on the GPU at the same time.
    del model, processor
    torch.cuda.empty_cache()

# Save to CSV
# df.to_csv("multi_model_transcriptions.csv", index=False, encoding='utf-8-sig')


# for merge DF
# Disabled legacy snippet: the code below is wrapped in a bare string literal,
# so it is evaluated as a no-op expression and never executed. It read three
# per-model CSV files and merged them on the "File" column.
# NOTE(review): before re-enabling, check `merged_df = df.copy` (no call
# parentheses, so it binds the method rather than a copy) and the file name
# "wev2vec.csv", which looks like a typo - confirm.
""" monsson = pd.read_csv("/data/week_6/Code/SATANG/monsoon_transcription.csv")
whisper = pd.read_csv("/data/week_6/Code/SATANG/whisper_transcription.csv")
wav2vec = pd.read_csv("/data/week_6/Code/SATANG/wev2vec.csv")
# Merge the three dataframes on the 'id' column
merged_df = monsson.merge(whisper, on='File', suffixes=('_monsoon', '_whisper'))
merged_df = merged_df.merge(wav2vec, on='File', suffixes=('', '_wav2vec')) 
merged_df = df.copy
display(merged_df) """


# Keep the file name plus one transcription column per ASR model. The names
# are derived from `model_paths` so they always match the columns created by
# the transcription loop; the previously hard-coded "Transcription" column is
# never created and made this selection raise KeyError.
df = df[["File", *(f"Transcription_{name}" for name in model_paths)]]

# Load model and tokenizer
# `model_path` is expected to point at the causal language model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
model.to("cuda")

# %%
# Create pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
# Collects one merged transcription per row of `df`, in row order.
merged_outputs = []

# Loop through each row
# For every file, ask the language model to fuse the Monsoon and Whisper
# transcriptions into a single Thai text.
# NOTE(review): only these two columns are placed in the prompt; any other
# transcription column in `df` is carried through to the CSV unused - confirm
# that this is intended.
for _, row in tqdm(df.iterrows(), total=len(df)):
    file_name = row["File"]
    monsoon = str(row["Transcription_monsoon"])
    whisper = str(row["Transcription_whisper"])

    # Nothing to merge when both transcriptions are blank.
    if not monsoon.strip() and not whisper.strip():
        merged_outputs.append("")
        continue

    # Thai instruction, roughly: "Merge the following texts from two systems
    # into one complete, clear and natural Thai text; fix misspellings or
    # mis-heard sounds while preserving the original meaning."
    prompt = f"""
รวมข้อความต่อไปนี้จากสองระบบให้กลายเป็นข้อความเดียวที่สมบูรณ์ ชัดเจน และเป็นธรรมชาติที่สุดในภาษาไทย
หากมีคำผิดหรือเสียงเพี้ยนให้แก้ไขให้เหมาะสมและคงความหมายเดิม

📂 ชื่อไฟล์: {file_name}

🌀 Whisper:
{whisper}

🌧️ Monsoon:
{monsoon}

✅ ข้อความรวม (ภาษาไทย):
"""

    try:
        # return_full_text=False makes the pipeline return only the newly
        # generated continuation, so the prompt no longer has to be removed
        # from the output with a string replace.
        response = generator(
            prompt, max_new_tokens=256, temperature=0.3, return_full_text=False
        )
        merged = response[0]["generated_text"].strip()
    except Exception as e:
        # Best effort, matching the transcription loop: report the failure,
        # store a marker in the table and continue with the next row.
        print(f"❌ Error merging {file_name}: {e}")
        merged = f"[ERROR] {e}"
    # Print for debugging
    # print(f"Processed {file_name}: {merged}...")

    merged_outputs.append(merged)

# Save the output
df["Merged_transcription"] = merged_outputs
# utf-8-sig writes a byte-order mark at the start of the file.
df.to_csv(output_name, index=False, encoding="utf-8-sig")