Spaces:
Sleeping
Sleeping
| # models/embeddings/audio_embedding_model.py | |
| import torch | |
| import librosa | |
| from typing import List | |
| from transformers import AutoProcessor, AutoModel | |
| from utils.logger import logger | |
| from config.model_configs import AUDIO_EMBEDDING_MODEL | |
class AudioEmbeddingModel:
    """Generates L2-normalized embeddings for audio files via a CLAP-style model.

    The model checkpoint is taken from ``AUDIO_EMBEDDING_MODEL`` and placed on
    CUDA when available, otherwise CPU. Embeddings are unit-length vectors, so
    cosine similarity between them reduces to a dot product.
    """

    def __init__(self):
        # Prefer GPU when present; the processor stays on CPU (it only does
        # feature extraction), only the model is moved to the device.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Loading Audio Embedding Model '{AUDIO_EMBEDDING_MODEL}' to device: {self.device}")
        self.processor = AutoProcessor.from_pretrained(AUDIO_EMBEDDING_MODEL)
        self.model = AutoModel.from_pretrained(AUDIO_EMBEDDING_MODEL).to(self.device)
        logger.info("Audio Embedding Model loaded successfully.")

    def get_embeddings(self, audio_paths: List[str]) -> List[List[float]]:
        """Embed each audio file in *audio_paths*.

        Args:
            audio_paths: Paths to audio files readable by ``librosa.load``.

        Returns:
            One L2-normalized embedding (list of floats) per successfully
            loaded clip. Clips that fail to load are logged and skipped, so
            the result may be SHORTER than ``audio_paths`` and indices do not
            align 1:1 with the input when any clip fails — callers that need
            path/embedding pairing must not rely on positional alignment.
        """
        if not audio_paths:
            return []
        # The model expects a fixed sampling rate; resample every clip to it
        # at load time rather than trusting the files' native rates.
        target_sr = self.processor.feature_extractor.sampling_rate
        waveforms = []
        for audio_path in audio_paths:
            try:
                waveform, _ = librosa.load(audio_path, sr=target_sr)
                waveforms.append(waveform)
            except Exception as e:
                # Best-effort: a single unreadable file must not abort the batch.
                logger.warning(f"Could not load audio {audio_path}: {e}. Skipping.")
                continue
        if not waveforms:
            return []
        inputs = self.processor(
            audios=waveforms,
            sampling_rate=target_sr,
            return_tensors="pt",
            padding=True,
        ).to(self.device)
        with torch.no_grad():
            audio_features = self.model.get_audio_features(**inputs)
        # Normalize to unit length so downstream similarity is a plain dot product.
        embeddings = audio_features / audio_features.norm(p=2, dim=-1, keepdim=True)
        embeddings_list = embeddings.cpu().tolist()
        logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(waveforms)} audio clips.")
        return embeddings_list