import torch
import torchaudio
import gradio as gr
import numpy as np
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

# Pretrained wav2vec2 checkpoint fine-tuned for gender recognition on LibriSpeech
MODEL_NAME = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME).to(device)

label2id = {"female": 0, "male": 1}
id2label = {0: "Female", 1: "Male"}


def preprocess_audio(audio):
    """Convert stereo to mono, normalize, resample to 16 kHz, and pad short clips."""
    if audio is None:
        return None
    sr, audio_data = audio
    if audio_data is None:
        return None
    # Gradio returns multi-channel audio as (samples, channels); average the channels to mono.
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)
    # Gradio's numpy audio is typically int16 PCM; scale to float32 in [-1, 1].
    audio_data = audio_data.astype(np.float32)
    peak = np.abs(audio_data).max()
    if peak > 0:
        audio_data = audio_data / peak
    # Resample to the 16 kHz rate the model expects.
    audio_tensor = torch.from_numpy(audio_data)
    if sr != 16000:
        audio_tensor = torchaudio.transforms.Resample(sr, 16000)(audio_tensor)
    audio_data_resampled = audio_tensor.numpy()
    # Pad clips shorter than one second so the feature extractor has enough samples.
    min_length = 16000
    if audio_data_resampled.shape[0] < min_length:
        padding = np.zeros(min_length - audio_data_resampled.shape[0], dtype=audio_data_resampled.dtype)
        audio_data_resampled = np.concatenate([audio_data_resampled, padding])
    return audio_data_resampled
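
# For example, a 2-second 44.1 kHz stereo clip (shape (88200, 2)) comes back from
# preprocess_audio as a mono float32 array of 32000 samples at 16 kHz.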


def predict_gender(audio):
    if audio is None:
        return {"Error": "No audio provided."}
    audio_data = preprocess_audio(audio)
    if audio_data is None:
        return {"Error": "Invalid audio input."}
    inputs = feature_extractor(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)
    # Move each tensor in the inputs dictionary onto the model's device.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    # Softmax turns the logits into class probabilities.
    scores = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()
    return {id2label[0]: scores[0], id2label[1]: scores[1]}


demo = gr.Interface(
    fn=predict_gender,
    inputs=gr.Audio(type="numpy"),
    outputs=gr.Label(num_top_classes=2),
    title="Voice Gender Detection",
    description="Use the microphone option and speak into it to predict the speaker's gender from their voice in real time.",
)
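
# Optional offline smoke test, a minimal sketch rather than part of the app itself:
# it feeds predict_gender a synthetic one-second 220 Hz tone shaped like the
# (sample_rate, int16 array) tuple that gr.Audio(type="numpy") delivers. The
# RUN_SMOKE_TEST flag is a hypothetical convenience, not an original setting.
RUN_SMOKE_TEST = False
if RUN_SMOKE_TEST:
    sr = 16000
    t = np.linspace(0, 1, sr, endpoint=False)
    tone = (np.sin(2 * np.pi * 220 * t) * 32767).astype(np.int16)
    print(predict_gender((sr, tone)))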

demo.launch(debug=False, share=True)