{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ESC-50 linear probe on `mispeech/dashengtokenizer` embeddings\n",
    "\n",
    "Each ESC-50 clip is encoded with the pretrained model and pooled into one vector; the vectors are cached in `esc50_features.pkl`. A single linear layer is then trained and evaluated once per dataset fold, and the mean validation accuracy over folds is reported."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install transformers torch torchaudio librosa pandas scikit-learn tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "import urllib.request\n",
    "import zipfile\n",
    "\n",
    "import librosa\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "from sklearn.metrics import accuracy_score\n",
    "from tqdm import tqdm\n",
    "from transformers import AutoModel"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Paths\n",
    "ESC50_URL = 'https://github.com/karoldvl/ESC-50/archive/master.zip'\n",
    "ESC50_DIR = 'ESC-50'\n",
    "FEATURES_CACHE = 'esc50_features.pkl'\n",
    "\n",
    "# Feature extraction\n",
    "MODEL_NAME = 'mispeech/dashengtokenizer'\n",
    "SAMPLE_RATE = 16000  # clips are resampled to this rate when loaded\n",
    "\n",
    "# Linear probe\n",
    "NUM_CLASSES = 50  # 50 ESC-50 classes\n",
    "NUM_EPOCHS = 10\n",
    "BATCH_SIZE = 32\n",
    "LEARNING_RATE = 1e-3\n",
    "SEED = 42\n",
    "\n",
    "DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset download"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_esc50():\n",
    "    \"\"\"Download and unpack ESC-50 into 'ESC-50/' unless that directory already exists.\n",
    "\n",
    "    The archive unpacks to 'ESC-50-master', which is renamed to 'ESC-50'.\n",
    "    The temporary 'esc50.zip' is removed even when download or extraction fails.\n",
    "    \"\"\"\n",
    "    if os.path.exists(ESC50_DIR):\n",
    "        return\n",
    "\n",
    "    print(\"Downloading ESC-50 dataset...\")\n",
    "    zip_path = 'esc50.zip'\n",
    "    try:\n",
    "        urllib.request.urlretrieve(ESC50_URL, zip_path)\n",
    "        with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n",
    "            zip_ref.extractall('.')\n",
    "        os.rename('ESC-50-master', ESC50_DIR)\n",
    "    finally:\n",
    "        # Do not leave a partial or stale archive behind.\n",
    "        if os.path.exists(zip_path):\n",
    "            os.remove(zip_path)\n",
    "    print(\"ESC-50 dataset downloaded and extracted\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature extraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _pool_embedding(encoded):\n",
    "    \"\"\"Reduce the encoder output for one clip to a float32 NumPy vector.\n",
    "\n",
    "    Dict outputs are unwrapped by trying the keys 'last_hidden_state',\n",
    "    'embeddings' and 'audio' in that order, falling back to the first value.\n",
    "    Tensors with more than two dimensions are averaged over dim 1\n",
    "    (assumed to be the time/frame axis -- TODO confirm against the model).\n",
    "    \"\"\"\n",
    "    if isinstance(encoded, dict):\n",
    "        for key in ('last_hidden_state', 'embeddings', 'audio'):\n",
    "            if key in encoded:\n",
    "                encoded = encoded[key]\n",
    "                break\n",
    "        else:\n",
    "            encoded = next(iter(encoded.values()))\n",
    "\n",
    "    # Autocast can return reduced precision; pool and store in float32 so the\n",
    "    # cache has the same dtype whether it was produced on CPU or GPU.\n",
    "    encoded = encoded.float()\n",
    "    if encoded.dim() > 2:\n",
    "        encoded = encoded.mean(dim=1)\n",
    "    return encoded.squeeze().cpu().numpy()\n",
    "\n",
    "\n",
    "def extract_features():\n",
    "    \"\"\"Extract and save features for all ESC-50 audio files.\n",
    "\n",
    "    Returns:\n",
    "        dict with keys 'features' (one row per clip), 'labels', 'folds'\n",
    "        and 'embedding_dim'. The dict is cached in 'esc50_features.pkl' and\n",
    "        loaded from there on later calls.\n",
    "\n",
    "    Raises:\n",
    "        RuntimeError: if not a single clip could be processed.\n",
    "    \"\"\"\n",
    "    if os.path.exists(FEATURES_CACHE):\n",
    "        print(\"Features already extracted, loading from file...\")\n",
    "        # NOTE: pickle can execute arbitrary code; only load a cache this notebook wrote.\n",
    "        with open(FEATURES_CACHE, 'rb') as f:\n",
    "            return pickle.load(f)\n",
    "\n",
    "    # NOTE: trust_remote_code=True runs Python code shipped with the model repository.\n",
    "    model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
    "    model.eval()\n",
    "    model.to(DEVICE)\n",
    "\n",
    "    df = pd.read_csv(os.path.join(ESC50_DIR, 'meta', 'esc50.csv'))\n",
    "    audio_dir = os.path.join(ESC50_DIR, 'audio')\n",
    "\n",
    "    features_list, labels_list, folds_list = [], [], []\n",
    "    failed = []\n",
    "    # Mixed precision only on CUDA; on CPU autocast is switched off explicitly.\n",
    "    use_amp = DEVICE.type == 'cuda'\n",
    "\n",
    "    print(\"Extracting features...\")\n",
    "    rows = zip(df['filename'], df['target'], df['fold'])\n",
    "    for filename, label, fold in tqdm(rows, total=len(df)):\n",
    "        try:\n",
    "            audio, _ = librosa.load(os.path.join(audio_dir, filename), sr=SAMPLE_RATE)\n",
    "            audio_tensor = torch.tensor(audio).float().unsqueeze(0).to(DEVICE)\n",
    "\n",
    "            with torch.no_grad(), torch.autocast(device_type=DEVICE.type, enabled=use_amp):\n",
    "                encoded = model.encode(audio_tensor)\n",
    "            embedding = _pool_embedding(encoded)\n",
    "        except Exception as e:\n",
    "            # Best effort: report the file and keep going.\n",
    "            print(f\"Error processing {filename}: {e}\")\n",
    "            failed.append(filename)\n",
    "            continue\n",
    "\n",
    "        features_list.append(embedding)\n",
    "        labels_list.append(label)\n",
    "        folds_list.append(fold)\n",
    "\n",
    "    if not features_list:\n",
    "        raise RuntimeError(\"No features could be extracted; see the errors printed above\")\n",
    "    if failed:\n",
    "        print(f\"Warning: {len(failed)} of {len(df)} files failed and are missing from the cache\")\n",
    "\n",
    "    features_data = {\n",
    "        'features': np.array(features_list),\n",
    "        'labels': np.array(labels_list),\n",
    "        'folds': np.array(folds_list),\n",
    "        'embedding_dim': features_list[0].shape[0]\n",
    "    }\n",
    "\n",
    "    with open(FEATURES_CACHE, 'wb') as f:\n",
    "        pickle.dump(features_data, f)\n",
    "\n",
    "    print(f\"Features extracted: {len(features_list)} samples, embedding dim: {features_data['embedding_dim']}\")\n",
    "    return features_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download dataset and extract features\n",
    "download_esc50()\n",
    "features_data = extract_features()\n",
    "\n",
    "X = features_data['features']\n",
    "y = features_data['labels']\n",
    "folds = features_data['folds']\n",
    "embedding_dim = features_data['embedding_dim']\n",
    "\n",
    "assert len(X) == len(y) == len(folds), \"features, labels and folds must be aligned\"\n",
    "\n",
    "print(f\"Features shape: {X.shape}, Labels shape: {y.shape}\")\n",
    "print(f\"Folds: {np.unique(folds)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Linear probe with cross-validation\n",
    "\n",
    "For each fold a fresh `nn.Linear` classifier is trained on the remaining folds and scored on the held-out one. The accuracy reported per fold is the validation accuracy after the last epoch."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_linear_probe(X_train, y_train, X_val, y_val, seed=SEED):\n",
    "    \"\"\"Train a single linear layer and return its final-epoch validation accuracy.\n",
    "\n",
    "    Args:\n",
    "        X_train, X_val: 2-D feature arrays, one row per clip.\n",
    "        y_train, y_val: integer class labels aligned with the feature rows.\n",
    "        seed: seeds both the weight initialisation and the per-epoch shuffling.\n",
    "\n",
    "    Returns:\n",
    "        Validation accuracy after the last epoch.\n",
    "    \"\"\"\n",
    "    torch.manual_seed(seed)\n",
    "    shuffle_rng = torch.Generator().manual_seed(seed)\n",
    "\n",
    "    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)\n",
    "    y_train_tensor = torch.tensor(y_train, dtype=torch.long)\n",
    "    X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(DEVICE)\n",
    "\n",
    "    # Single linear layer\n",
    "    classifier = nn.Linear(X_train_tensor.shape[1], NUM_CLASSES).to(DEVICE)\n",
    "    optimizer = torch.optim.Adam(classifier.parameters(), lr=LEARNING_RATE)\n",
    "    criterion = nn.CrossEntropyLoss()\n",
    "\n",
    "    n_train = len(X_train_tensor)\n",
    "    val_acc = float('nan')\n",
    "\n",
    "    for epoch in range(NUM_EPOCHS):\n",
    "        classifier.train()\n",
    "        # New sample order every epoch so mini-batches differ between epochs.\n",
    "        order = torch.randperm(n_train, generator=shuffle_rng)\n",
    "\n",
    "        loss_sum = 0.0\n",
    "        train_preds, train_labels = [], []\n",
    "\n",
    "        for start in range(0, n_train, BATCH_SIZE):\n",
    "            idx = order[start:start + BATCH_SIZE]\n",
    "            batch_features = X_train_tensor[idx].to(DEVICE)\n",
    "            batch_labels = y_train_tensor[idx].to(DEVICE)\n",
    "\n",
    "            logits = classifier(batch_features)\n",
    "            loss = criterion(logits, batch_labels)\n",
    "\n",
    "            optimizer.zero_grad()\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "\n",
    "            # Weight by batch size so a short last batch does not skew the mean.\n",
    "            loss_sum += loss.item() * len(idx)\n",
    "            train_preds.extend(logits.argmax(dim=1).cpu().numpy())\n",
    "            train_labels.extend(batch_labels.cpu().numpy())\n",
    "\n",
    "        train_acc = accuracy_score(train_labels, train_preds)\n",
    "\n",
    "        classifier.eval()\n",
    "        with torch.no_grad():\n",
    "            val_preds = classifier(X_val_tensor).argmax(dim=1).cpu().numpy()\n",
    "        val_acc = accuracy_score(y_val, val_preds)\n",
    "\n",
    "        print(f\"Epoch {epoch+1}/{NUM_EPOCHS} - Train Loss: {loss_sum/n_train:.4f} - Train Acc: {train_acc:.4f} - Val Acc: {val_acc:.4f}\")\n",
    "\n",
    "    return val_acc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cross validation over the folds present in the data (1..5 for ESC-50)\n",
    "accuracies = []\n",
    "\n",
    "for fold in sorted(int(f) for f in np.unique(folds)):\n",
    "    print(f\"\\n=== Fold {fold} ===\")\n",
    "\n",
    "    # Split data based on fold\n",
    "    val_mask = folds == fold\n",
    "    X_train, y_train = X[~val_mask], y[~val_mask]\n",
    "    X_val, y_val = X[val_mask], y[val_mask]\n",
    "\n",
    "    print(f\"Train: {X_train.shape}, Val: {X_val.shape}\")\n",
    "\n",
    "    # A different but fixed seed per fold keeps runs reproducible.\n",
    "    val_acc = train_linear_probe(X_train, y_train, X_val, y_val, seed=SEED + fold)\n",
    "\n",
    "    # Store final validation accuracy for this fold\n",
    "    accuracies.append(val_acc)\n",
    "    print(f\"Fold {fold} final validation accuracy: {val_acc:.4f}\")\n",
    "\n",
    "# Calculate average accuracy\n",
    "mean_acc = np.mean(accuracies)\n",
    "std_acc = np.std(accuracies)\n",
    "print(\"\\n=== Cross-Validation Results ===\")\n",
    "print(f\"Mean accuracy: {mean_acc:.4f} ± {std_acc:.4f}\")\n",
    "print(f\"Individual fold accuracies: {[f'{acc:.4f}' for acc in accuracies]}\")"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}