Spaces: running on a T4 GPU runtime.
| import librosa | |
| from transformers import AutoFeatureExtractor, Wav2Vec2BertModel | |
| import soundfile as sf | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
# Pretrained w2v-BERT 2.0 checkpoint, loaded once at import time
# (same checkpoint as before).
# NOTE(review): neither `feature_extractor` nor `model` is referenced by the
# MFCC functions below — presumably they are used elsewhere in the notebook;
# confirm before removing.
model_id = "facebook/w2v-bert-2.0"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = Wav2Vec2BertModel.from_pretrained(model_id)
def load_and_resample_audio(file_path, target_sample_rate=16000):
    """Load an audio file and return it as a mono signal at *target_sample_rate*.

    Args:
        file_path: Path to any audio file readable by ``soundfile``.
        target_sample_rate: Desired sample rate in Hz (default 16 kHz,
            matching the w2v-BERT feature extractor).

    Returns:
        Tuple ``(audio, target_sample_rate)`` where ``audio`` is a 1-D
        float numpy array.
    """
    audio_input, sample_rate = sf.read(file_path)
    # sf.read returns shape (frames, channels) for multi-channel files.
    # librosa.resample operates along the LAST axis by default, which for
    # stereo input would "resample" across channels instead of time —
    # average to mono first so the time axis is the last (only) axis.
    if audio_input.ndim > 1:
        audio_input = audio_input.mean(axis=1)
    if sample_rate != target_sample_rate:
        audio_input = librosa.resample(
            audio_input, orig_sr=sample_rate, target_sr=target_sample_rate
        )
    return audio_input, target_sample_rate
def calculate_mfcc(audio_data, sample_rate):
    """Compute a fixed-length MFCC summary vector for one audio clip.

    Extracts 13 MFCCs and averages them over time, so clips of different
    durations yield comparable 13-element vectors.
    """
    coeffs = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=13)
    # Collapse the time dimension (axis 1 of the (n_mfcc, frames) matrix).
    return coeffs.mean(axis=1)
def calculate_similarity(mfccs1, mfccs2):
    """Return the cosine similarity between two 1-D feature vectors.

    Computed directly with numpy instead of sklearn's pairwise
    ``cosine_similarity`` — a single vector pair needs no (1, n) reshape
    round-trip or heavy sklearn import.

    Args:
        mfccs1: First feature vector (any array-like).
        mfccs2: Second feature vector (same length as ``mfccs1``).

    Returns:
        Cosine similarity in [-1, 1] as a float; 0.0 if either vector has
        zero norm (same convention sklearn uses for zero vectors).
    """
    v1 = np.asarray(mfccs1, dtype=float).ravel()
    v2 = np.asarray(mfccs2, dtype=float).ravel()
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0.0:
        return 0.0
    return float(np.dot(v1, v2) / denom)
def mfcc_similarty_check(original: str, recorded: str):
    """Compare two audio files by their averaged MFCC features.

    Args:
        original: Path to the reference (correct pronunciation) recording.
        recorded: Path to the user's recording.

    Returns:
        Tuple ``(distance, accuracy_percentage)``: the Euclidean distance
        between the MFCC vectors and their cosine similarity scaled to a
        0–100 percentage.
    """
    reference_audio, _ = load_and_resample_audio(original)
    candidate_audio, sr = load_and_resample_audio(recorded)

    # Summarise each clip as a single averaged MFCC vector.
    reference_mfcc = calculate_mfcc(reference_audio.flatten(), sr)
    candidate_mfcc = calculate_mfcc(candidate_audio.flatten(), sr)

    # Euclidean distance between the two feature vectors.
    distance = np.linalg.norm(reference_mfcc.flatten() - candidate_mfcc.flatten())
    # Cosine similarity expressed as a percentage.
    accuracy_percentage = calculate_similarity(reference_mfcc, candidate_mfcc) * 100
    return distance, accuracy_percentage