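# NOTE: the next three lines appear to be web-page header text captured along
# with the file; they are kept as comments so the script remains valid Python.
# f2ai's picture
# Upload folder using huggingface_hub
# ad93d56 verified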
# Parameters
# `os` is imported right here because the directory listing below executes
# before the file's main import section is reached.
import os

CHUNK_DURATION = 30  # seconds of audio handed to the ASR model per chunk
SAMPLING_RATE = 16000  # Hz; every audio file is loaded at this rate
transcriptions = []  # placeholder; rebuilt from scratch for every ASR model
folder_path = "/data/week_6/DATA/Audio Understanding (SCBx)/speechs/test"
# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the resulting table reproducible between runs.
audio_files = sorted(os.listdir(folder_path))
# Display name -> checkpoint id of each speech-to-text model to run.
model_paths = {
    "monsoon": "scb10x/monsoon-whisper-medium-gigaspeech2",
    "whisper": "openai/whisper-large-v3-turbo",
    "Pathumma": "nectec/Pathumma-whisper-th-large-v3",
    # Add more models here if needed
}
# Checkpoint directory of the causal LM that merges the transcriptions.
model_path = "/data/week_6/Models/qwen"
# CSV file the final table is written to.
output_name = "merged_transcriptions.csv"
import os
os.environ["HF_HOME"] = "/data/.cache"
import librosa
from transformers import (
AutoProcessor,
AutoModelForSpeechSeq2Seq,
AutoModelForCausalLM,
AutoTokenizer,
pipeline,
)
import torch
import pandas as pd
from tqdm import tqdm
# HF_HOME is already set before `transformers` is imported; the second,
# identical assignment that used to sit here was redundant and is dropped.
# Do not truncate long transcription strings when a DataFrame is displayed.
pd.set_option("display.max_colwidth", None)
# transcribe chunks
def transcribe_audio_chunks(
    audio,
    sr,
    model,
    processor,
    chunk_duration=None,
    min_chunk_samples=1000,
):
    """Transcribe *audio* in fixed-length chunks and join the chunk texts.

    Args:
        audio: Sliceable sequence of samples supporting ``len()`` (e.g. the
            array returned by ``librosa.load``).
        sr: Sampling rate of *audio* in Hz.
        model: Speech-to-text model exposing ``config``, ``device`` and
            ``generate``.
        processor: Matching processor; called on each chunk to build
            ``input_features`` and used to decode the generated ids.
        chunk_duration: Chunk length in seconds. ``None`` (the default)
            falls back to the module-level ``CHUNK_DURATION``.
        min_chunk_samples: Chunks shorter than this many samples are
            skipped instead of being sent to the model.

    Returns:
        The decoded text of all processed chunks joined with single spaces;
        an empty string when no chunk was long enough.

    Raises:
        ValueError: If the resulting chunk size is not positive.
    """
    if chunk_duration is None:
        chunk_duration = CHUNK_DURATION
    chunk_size = int(chunk_duration * sr)
    if chunk_size <= 0:
        raise ValueError(
            f"chunk size must be positive, got {chunk_size} samples "
            f"(chunk_duration={chunk_duration}, sr={sr})"
        )

    # The generation budget depends only on the model, so compute it once.
    # Four positions are held back from the decoder limit (448 when the
    # config does not declare one).
    max_target = getattr(model.config, "max_target_positions", 448)
    max_new_tokens = max_target - 4

    text_chunks = []
    # Stepping by chunk_size visits every non-empty chunk exactly once.
    for start in range(0, len(audio), chunk_size):
        chunk = audio[start:start + chunk_size]
        if len(chunk) < min_chunk_samples:
            continue

        input_features = processor(
            chunk, sampling_rate=sr, return_tensors="pt"
        ).input_features.to(model.device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            predicted_ids = model.generate(
                input_features, max_new_tokens=max_new_tokens
            )

        text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        text_chunks.append(text)

    return " ".join(text_chunks)
# One row per audio file; each ASR model adds its own transcription column.
df = pd.DataFrame({"File": audio_files})

# Loop transcribing with each model
# NOTE: the loop variable must not be called `model_path`. That name holds the
# checkpoint path of the merging LLM that is loaded further down, and reusing
# it here would silently replace that path with the last ASR checkpoint.
for model_name, asr_model_path in model_paths.items():
    print(f"\n🚀 Transcribing with model: {model_name}")
    processor = AutoProcessor.from_pretrained(asr_model_path)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(asr_model_path).to("cuda").eval()
    # For wav2vec2 models, uncomment and use this:
    # processor = Wav2Vec2Processor.from_pretrained(asr_model_path)
    # model = Wav2Vec2ForCTC.from_pretrained(asr_model_path).to("cuda").eval()

    transcriptions = []
    for file_name in tqdm(audio_files, desc=f"[{model_name}] Transcribing"):
        try:
            file_path = os.path.join(folder_path, file_name)
            audio, sr = librosa.load(file_path, sr=SAMPLING_RATE)
            full_text = transcribe_audio_chunks(audio, sr, model, processor)
        except Exception as e:
            # Best effort: record the failure in the cell and keep going so a
            # single bad file does not abort the whole run.
            print(f"❌ Error on {file_name} with model {model_name}: {e}")
            full_text = f"[ERROR] {e}"
        # Appended on success and on failure alike so rows stay aligned with
        # `audio_files`.
        transcriptions.append(full_text)
    df[f"Transcription_{model_name}"] = transcriptions

    # Drop this model before the next checkpoint is loaded; otherwise the old
    # and the new model occupy GPU memory at the same time.
    del model, processor
    torch.cuda.empty_cache()
# Save to CSV
# df.to_csv("multi_model_transcriptions.csv", index=False, encoding='utf-8-sig')
# for merge DF
# The string literal below is disabled legacy code (merging per-model CSV
# files); it is a no-op expression and is kept only for reference.
""" monsson = pd.read_csv("/data/week_6/Code/SATANG/monsoon_transcription.csv")
whisper = pd.read_csv("/data/week_6/Code/SATANG/whisper_transcription.csv")
wav2vec = pd.read_csv("/data/week_6/Code/SATANG/wev2vec.csv")
# Merge the three dataframes on the 'id' column
merged_df = monsson.merge(whisper, on='File', suffixes=('_monsoon', '_whisper'))
merged_df = merged_df.merge(wav2vec, on='File', suffixes=('', '_wav2vec'))
merged_df = df.copy
display(merged_df) """
# Keep the file name plus one transcription column per configured ASR model.
# A bare "Transcription" column was only produced by the disabled CSV merge
# above; the table built in this script never has it, so selecting it raised
# KeyError. `.copy()` makes the result an independent frame that can safely
# receive new columns.
transcription_columns = [f"Transcription_{name}" for name in model_paths]
df = df[["File", *transcription_columns]].copy()
# Load the text-generation model and its tokenizer from `model_path`.
# NOTE(review): `model_path` must still hold the LLM checkpoint path at this
# point; verify that no earlier code in the script rebinds that name.
_llm_load_options = {"trust_remote_code": True}
tokenizer = AutoTokenizer.from_pretrained(model_path, **_llm_load_options)
model = AutoModelForCausalLM.from_pretrained(model_path, **_llm_load_options)
model.to("cuda")  # run generation on the GPU
# %%
# Wrap model + tokenizer in a text-generation pipeline
generator = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=model,
)
def _usable_transcription(value):
    """Return *value* as text, or "" when it holds no real transcription.

    Missing values (None / NaN), blank strings and the "[ERROR] ..." markers
    written when a transcription failed are all treated as empty, so they are
    never handed to the LLM as if they were spoken content.
    """
    if value is None or (isinstance(value, float) and value != value):
        return ""
    text = str(value)
    stripped = text.strip()
    if not stripped or stripped.startswith("[ERROR]"):
        return ""
    return text


merged_outputs = []
# Loop through each row and ask the LLM to merge the two transcriptions.
for _, row in tqdm(df.iterrows(), total=len(df)):
    file_name = row["File"]
    monsoon = _usable_transcription(row["Transcription_monsoon"])
    whisper = _usable_transcription(row["Transcription_whisper"])

    # Nothing to merge: keep the row but leave the merged text empty.
    if not monsoon and not whisper:
        merged_outputs.append("")
        continue

    prompt = f"""
รวมข้อความต่อไปนี้จากสองระบบให้กลายเป็นข้อความเดียวที่สมบูรณ์ ชัดเจน และเป็นธรรมชาติที่สุดในภาษาไทย
หากมีคำผิดหรือเสียงเพี้ยนให้แก้ไขให้เหมาะสมและคงความหมายเดิม
📂 ชื่อไฟล์: {file_name}
🌀 Whisper:
{whisper}
🌧️ Monsoon:
{monsoon}
✅ ข้อความรวม (ภาษาไทย):
"""
    try:
        response = generator(
            prompt,
            max_new_tokens=256,
            # temperature only has an effect when sampling is enabled.
            do_sample=True,
            temperature=0.3,
            # Ask the pipeline for the continuation only, instead of removing
            # the prompt afterwards with str.replace (which fails silently if
            # the echoed prompt differs from the input by even one character).
            return_full_text=False,
        )
        merged = response[0]["generated_text"].strip()
    except Exception as e:
        # Best effort, mirroring the ASR loop: log, record the failure in the
        # cell and continue with the next row.
        print(f"❌ Error on {file_name} while merging: {e}")
        merged = f"[ERROR] {e}"
    # Print for debugging
    # print(f"Processed {file_name}: {merged}...")
    merged_outputs.append(merged)

# Save the output
df["Merged_transcription"] = merged_outputs
df.to_csv(output_name, index=False, encoding="utf-8-sig")