{ "cells": [ { "cell_type": "code", "execution_count": 8, "id": "4daac0c9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found 1750 files belonging to 25 classes.\n", "Found 375 files belonging to 25 classes.\n", "Found 375 files belonging to 25 classes.\n" ] } ], "source": [ "import tensorflow as tf\n", "from tensorflow import keras\n", "from tensorflow.keras import layers\n", "import os\n", "\n", "BASE_DIR = r\"D:\\Guvi\\SmartVision_AI\\smartvision_dataset\"\n", "IMG_SIZE = (224, 224)\n", "BATCH_SIZE = 32\n", "IMG_SIZE = (224, 224)\n", "\n", "NUM_CLASSES = 25\n", "\n", "train_dir = os.path.join(BASE_DIR, \"classification\", \"train\")\n", "val_dir = os.path.join(BASE_DIR, \"classification\", \"val\")\n", "test_dir = os.path.join(BASE_DIR, \"classification\", \"test\")\n", "\n", "train_ds = tf.keras.utils.image_dataset_from_directory(\n", " train_dir,\n", " image_size=IMG_SIZE,\n", " batch_size=BATCH_SIZE,\n", " shuffle=True\n", ")\n", "\n", "val_ds = tf.keras.utils.image_dataset_from_directory(\n", " val_dir,\n", " image_size=IMG_SIZE,\n", " batch_size=BATCH_SIZE,\n", " shuffle=False\n", ")\n", "\n", "test_ds = tf.keras.utils.image_dataset_from_directory(\n", " test_dir,\n", " image_size=IMG_SIZE,\n", " batch_size=BATCH_SIZE,\n", " shuffle=False\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e690c322", "metadata": {}, "outputs": [], "source": [ "# 1.4. Data augmentation block (applied only on training data)\n", "data_augmentation = keras.Sequential(\n", " [\n", " layers.RandomFlip(\"horizontal\"), # random horizontal flip\n", " layers.RandomRotation(0.04), # ~ ±15° (15/360 ≈ 0.04)\n", " layers.RandomZoom(0.1), # random zoom\n", " layers.RandomContrast(0.2), # ±20% contrast\n", " # Brightness jitter using Lambda + tf.image\n", " layers.Lambda(\n", " lambda x: tf.image.random_brightness(x, max_delta=0.2)\n", " ),\n", " # Optional: light color jitter via saturation\n", " layers.Lambda(\n", " lambda x: tf.image.random_saturation(x, lower=0.8, upper=1.2)\n", " ),\n", " ],\n", " name=\"data_augmentation\",\n", ")\n", "\n", "# Normalization layer (0–1 scaling or ImageNet style)\n", "normalization = layers.Rescaling(1./255)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "88323a0f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/25\n", "\u001b[1m55/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 5s/step - accuracy: 0.0405 - loss: 3.4605" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m55/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m328s\u001b[0m 6s/step - accuracy: 0.0429 - loss: 3.4206 - val_accuracy: 0.0373 - val_loss: 3.2323 - learning_rate: 1.0000e-04\n", "Epoch 2/25\n", "\u001b[1m55/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6s/step - accuracy: 0.0474 - loss: 3.2988" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. 
This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m55/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m457s\u001b[0m 8s/step - accuracy: 0.0486 - loss: 3.2914 - val_accuracy: 0.0533 - val_loss: 3.1938 - learning_rate: 1.0000e-04\n", "Epoch 3/25\n", "\u001b[1m55/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 18s/step - accuracy: 0.0463 - loss: 3.2775 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m55/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1232s\u001b[0m 22s/step - accuracy: 0.0486 - loss: 3.2567 - val_accuracy: 0.0853 - val_loss: 3.1689 - learning_rate: 1.0000e-04\n", "Epoch 4/25\n", "\u001b[1m55/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 19s/step - accuracy: 0.0568 - loss: 3.2323 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m55/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1278s\u001b[0m 23s/step - accuracy: 0.0543 - loss: 3.2274 - val_accuracy: 0.1360 - val_loss: 3.1451 - learning_rate: 1.0000e-04\n", "Epoch 5/25\n", "\u001b[1m55/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 16s/step - accuracy: 0.0526 - loss: 3.1936 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m55/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1076s\u001b[0m 19s/step - accuracy: 0.0623 - loss: 3.1870 - val_accuracy: 0.1520 - val_loss: 3.1223 - learning_rate: 1.0000e-04\n", "Epoch 6/25\n", "\u001b[1m55/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 11s/step - accuracy: 0.0762 - loss: 3.1579 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. 
\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m55/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m757s\u001b[0m 14s/step - accuracy: 0.0811 - loss: 3.1483 - val_accuracy: 0.1867 - val_loss: 3.0975 - learning_rate: 1.0000e-04\n", "Epoch 7/25\n", "\u001b[1m55/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 13s/step - accuracy: 0.1051 - loss: 3.1299 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m55/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m900s\u001b[0m 16s/step - accuracy: 0.1029 - loss: 3.1283 - val_accuracy: 0.2107 - val_loss: 3.0750 - learning_rate: 1.0000e-04\n", "Epoch 8/25\n", "\u001b[1m55/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 11s/step - accuracy: 0.1321 - loss: 3.1018 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m55/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m799s\u001b[0m 15s/step - accuracy: 0.1343 - loss: 3.0993 - val_accuracy: 0.2373 - val_loss: 3.0532 - learning_rate: 1.0000e-04\n", "Epoch 9/25\n", "\u001b[1m50/55\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━\u001b[0m \u001b[1m1:06\u001b[0m 13s/step - accuracy: 0.1195 - loss: 3.0798" ] } ], "source": [ "# 2.1: Model 1 - VGG16\n", "\n", "def build_vgg16_model():\n", " inputs = keras.Input(shape=(*IMG_SIZE, 3))\n", " x = data_augmentation(inputs) # train only\n", " x = normalization(x)\n", "\n", " base_model = keras.applications.VGG16(\n", " include_top=False,\n", " weights=\"imagenet\",\n", " input_tensor=x\n", " )\n", " base_model.trainable = False # freeze convolutional base\n", "\n", " x = layers.GlobalAveragePooling2D()(base_model.output)\n", " x = layers.Dense(256, activation=\"relu\")(x)\n", " x = layers.Dropout(0.5)(x)\n", " outputs = layers.Dense(NUM_CLASSES, activation=\"softmax\")(x)\n", "\n", " model = keras.Model(inputs, outputs, name=\"VGG16_smartvision\")\n", " return model\n", "def compile_and_train(model, model_name, train_ds, val_ds, epochs=25, lr=1e-4):\n", " model.compile(\n", " optimizer=keras.optimizers.Adam(learning_rate=lr),\n", " loss=\"sparse_categorical_crossentropy\",\n", " metrics=[\"accuracy\"]\n", " )\n", "\n", " callbacks = [\n", " keras.callbacks.ModelCheckpoint(\n", " filepath=f\"{model_name}_best.h5\",\n", " monitor=\"val_accuracy\",\n", " save_best_only=True,\n", " mode=\"max\"\n", " ),\n", " keras.callbacks.EarlyStopping(\n", " monitor=\"val_accuracy\",\n", " patience=5,\n", " restore_best_weights=True\n", " ),\n", " keras.callbacks.ReduceLROnPlateau(\n", " monitor=\"val_loss\",\n", " factor=0.5,\n", " patience=2,\n", " min_lr=1e-6,\n", " verbose=1\n", " )\n", " ]\n", "\n", " history = model.fit(\n", " 
train_ds,\n", " validation_data=val_ds,\n", " epochs=epochs,\n", " callbacks=callbacks\n", " )\n", " return history\n", "\n", "vgg16_model = build_vgg16_model()\n", "history_vgg16 = compile_and_train(vgg16_model, \"vgg16\", train_ds, val_ds, epochs=25)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "3e7696bc", "metadata": {}, "outputs": [], "source": [ "class_names = train_ds.class_names\n", "NUM_CLASSES = len(class_names)\n", "print(class_names)" ] }, { "cell_type": "code", "execution_count": null, "id": "3b3417aa", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import time\n", "import json\n", "import os\n", "from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support\n", "\n", "def evaluate_and_collect_metrics(model, model_name, test_ds, class_names, weights_path=None):\n", " # If you saved best weights, load them\n", " if weights_path is not None and os.path.exists(weights_path):\n", " model.load_weights(weights_path)\n", " print(f\"✅ Loaded best weights from {weights_path}\")\n", "\n", " y_true = []\n", " y_pred = []\n", " y_pred_probs = []\n", "\n", " # ----- measure inference time -----\n", " total_time = 0.0\n", " total_images = 0\n", "\n", " for images, labels in test_ds:\n", " images_np = images.numpy()\n", " batch_size = images_np.shape[0]\n", "\n", " start = time.perf_counter()\n", " probs = model.predict(images_np, verbose=0)\n", " end = time.perf_counter()\n", "\n", " total_time += (end - start)\n", " total_images += batch_size\n", "\n", " preds = np.argmax(probs, axis=1)\n", "\n", " y_true.extend(labels.numpy())\n", " y_pred.extend(preds)\n", " y_pred_probs.append(probs)\n", "\n", " y_true = np.array(y_true)\n", " y_pred = np.array(y_pred)\n", " y_pred_probs = np.concatenate(y_pred_probs, axis=0)\n", "\n", " # ----- basic metrics -----\n", " acc = (y_true == y_pred).mean()\n", "\n", " precision, recall, f1, _ = precision_recall_fscore_support(\n", " y_true, y_pred, average=\"weighted\", zero_division=0\n", " )\n", "\n", " # ----- top-5 accuracy -----\n", " top5_correct = 0\n", " for i, label in enumerate(y_true):\n", " top5 = np.argsort(y_pred_probs[i])[-5:]\n", " if label in top5:\n", " top5_correct += 1\n", " top5_acc = top5_correct / len(y_true)\n", "\n", " # ----- inference time -----\n", " avg_time_per_image = total_time / total_images # seconds\n", " imgs_per_second = 1.0 / avg_time_per_image if avg_time_per_image > 0 else 0.0\n", "\n", " # ----- model size -----\n", " # Save weights temporarily to compute size\n", " temp_weights = f\"{model_name}_temp_for_size.weights.h5\" \n", " model.save_weights(temp_weights)\n", " size_mb = os.path.getsize(temp_weights) / (1024 * 1024)\n", " os.remove(temp_weights)\n", "\n", " # ----- classification report & confusion matrix (for plots) -----\n", " print(f\"\\n=== {model_name.upper()} – Classification Report ===\")\n", " print(classification_report(y_true, y_pred, target_names=class_names, zero_division=0))\n", "\n", " cm = confusion_matrix(y_true, y_pred)\n", " print(f\"\\nConfusion matrix shape: {cm.shape}\")\n", "\n", " metrics = {\n", " \"model_name\": model_name,\n", " \"accuracy\": float(acc),\n", " \"precision_weighted\": float(precision),\n", " \"recall_weighted\": float(recall),\n", " \"f1_weighted\": float(f1),\n", " \"top5_accuracy\": float(top5_acc),\n", " \"avg_inference_time_sec_per_image\": float(avg_time_per_image),\n", " \"images_per_second\": float(imgs_per_second),\n", " \"model_size_mb\": float(size_mb),\n", " \"num_parameters\": 
{ "cell_type": "code", "execution_count": null, "id": "6c01d2cc", "metadata": {}, "outputs": [], "source": [ "vgg_metrics, vgg_cm = evaluate_and_collect_metrics(\n", " vgg16_model, \"vgg16\", test_ds, class_names, \"vgg16_best.weights.h5\"\n", ")\n", "with open(\"vgg16_metrics.json\", \"w\") as f:\n", " json.dump(vgg_metrics, f, indent=2)" ] }, { "cell_type": "code", "execution_count": null, "id": "6e91352d", "metadata": {}, "outputs": [], "source": [ "# 2.2: Model 2 - ResNet50\n", "def build_resnet50_model():\n", " inputs = keras.Input(shape=(*IMG_SIZE, 3))\n", " x = data_augmentation(inputs)\n", " # ResNet50's ImageNet weights also expect caffe-style preprocessing, not 0-1 scaling\n", " x = keras.applications.resnet50.preprocess_input(x)\n", "\n", " base_model = keras.applications.ResNet50(\n", " include_top=False,\n", " weights=\"imagenet\",\n", " input_tensor=x\n", " )\n", "\n", " # Freeze all, then unfreeze the last 20 layers (this also makes their BatchNorm layers trainable)\n", " for layer in base_model.layers:\n", " layer.trainable = False\n", " for layer in base_model.layers[-20:]:\n", " layer.trainable = True\n", "\n", " x = layers.GlobalAveragePooling2D()(base_model.output)\n", " x = layers.Dense(256, activation=\"relu\")(x)\n", " x = layers.Dropout(0.5)(x)\n", " outputs = layers.Dense(NUM_CLASSES, activation=\"softmax\")(x)\n", "\n", " model = keras.Model(inputs, outputs, name=\"ResNet50_smartvision\")\n", " return model\n", "\n", "resnet_model = build_resnet50_model()\n", "history_resnet = compile_and_train(resnet_model, \"resnet50\", train_ds, val_ds, epochs=25, lr=1e-4)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "aab6167c", "metadata": {}, "outputs": [], "source": [ "# 2.3: Model 3 - MobileNetV2\n", "\n", "def build_mobilenetv2_model():\n", " inputs = keras.Input(shape=(*IMG_SIZE, 3))\n", " x = data_augmentation(inputs)\n", " # MobileNetV2's ImageNet weights expect inputs scaled to [-1, 1]\n", " x = keras.applications.mobilenet_v2.preprocess_input(x)\n", "\n", " base_model = keras.applications.MobileNetV2(\n", " include_top=False,\n", " weights=\"imagenet\",\n", " input_tensor=x\n", " )\n", " base_model.trainable = False # keep it light & fast\n", "\n", " x = layers.GlobalAveragePooling2D()(base_model.output)\n", " x = layers.Dense(128, activation=\"relu\")(x)\n", " x = layers.Dropout(0.3)(x)\n", " outputs = layers.Dense(NUM_CLASSES, activation=\"softmax\")(x)\n", "\n", " model = keras.Model(inputs, outputs, name=\"MobileNetV2_smartvision\")\n", " return model\n", "\n", "mobilenet_model = build_mobilenetv2_model()\n", "history_mobilenet = compile_and_train(mobilenet_model, \"mobilenetv2\", train_ds, val_ds, epochs=20, lr=1e-4)\n" ] }, 
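{ "cell_type": "code", "execution_count": null, "id": "9d5a7b3c", "metadata": {}, "outputs": [], "source": [ "# Sketch mirroring the VGG16 metrics cell above for the other two models trained so far,\n", "# so every model gets a <name>_metrics.json for the comparison step. Reuses only\n", "# evaluate_and_collect_metrics and the checkpoint names defined in this notebook.\n", "for mdl, name in [(resnet_model, \"resnet50\"), (mobilenet_model, \"mobilenetv2\")]:\n", " m, _ = evaluate_and_collect_metrics(\n", " mdl, name, test_ds, class_names, f\"{name}_best.weights.h5\"\n", " )\n", " with open(f\"{name}_metrics.json\", \"w\") as f:\n", " json.dump(m, f, indent=2)\n" ] }, 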
{ "cell_type": "code", "execution_count": null, "id": "d4f51125", "metadata": {}, "outputs": [], "source": [ "# 2.4: Model 4 - EfficientNetB0\n", "\n", "from tensorflow.keras import mixed_precision\n", "mixed_precision.set_global_policy(\"mixed_float16\") # for GPU speed\n", "\n", "def build_efficientnetb0_model():\n", " inputs = keras.Input(shape=(*IMG_SIZE, 3))\n", " x = data_augmentation(inputs)\n", " # EfficientNetB0 from keras.applications rescales/normalizes internally, so it\n", " # expects raw 0-255 pixels - do NOT add an extra Rescaling(1./255) here\n", "\n", " base_model = keras.applications.EfficientNetB0(\n", " include_top=False,\n", " weights=\"imagenet\",\n", " input_tensor=x\n", " )\n", "\n", " # Fine-tune: unfreeze the top 30 layers, keep the rest frozen\n", " for layer in base_model.layers[:-30]:\n", " layer.trainable = False\n", " for layer in base_model.layers[-30:]:\n", " layer.trainable = True\n", "\n", " x = layers.GlobalAveragePooling2D()(base_model.output)\n", " x = layers.BatchNormalization()(x)\n", " x = layers.Dense(256, activation=\"relu\")(x)\n", " x = layers.Dropout(0.4)(x)\n", " outputs = layers.Dense(NUM_CLASSES, activation=\"softmax\", dtype=\"float32\")(x) # keep the output in float32 under mixed precision\n", "\n", " model = keras.Model(inputs, outputs, name=\"EfficientNetB0_smartvision\")\n", " return model\n", "\n", "effnet_model = build_efficientnetb0_model()\n", "history_effnet = compile_and_train(effnet_model, \"efficientnetb0\", train_ds, val_ds, epochs=30, lr=5e-5)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "0064b8f3", "metadata": {}, "outputs": [], "source": [ "# 2.5: Model Comparison & Selection\n", "\n", "from sklearn.metrics import classification_report, confusion_matrix\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "def evaluate_on_test(model, test_ds, model_name):\n", " y_true = []\n", " y_pred = []\n", "\n", " for images, labels in test_ds:\n", " preds = model.predict(images, verbose=0)\n", " y_true.extend(labels.numpy())\n", " y_pred.extend(np.argmax(preds, axis=1))\n", "\n", " print(f\"\\n=== {model_name} TEST REPORT ===\")\n", " print(classification_report(y_true, y_pred, target_names=class_names, zero_division=0))\n", "\n", " cm = confusion_matrix(y_true, y_pred)\n", " plt.figure(figsize=(10, 8))\n", " sns.heatmap(cm, annot=False, cmap=\"Blues\",\n", " xticklabels=class_names,\n", " yticklabels=class_names)\n", " plt.title(f\"{model_name} - Confusion Matrix\")\n", " plt.xlabel(\"Predicted\")\n", " plt.ylabel(\"True\")\n", " plt.show()\n", "\n", "# Load the best checkpointed weights, then evaluate each model\n", "vgg16_model.load_weights(\"vgg16_best.weights.h5\")\n", "resnet_model.load_weights(\"resnet50_best.weights.h5\")\n", "mobilenet_model.load_weights(\"mobilenetv2_best.weights.h5\")\n", "effnet_model.load_weights(\"efficientnetb0_best.weights.h5\")\n", "\n", "evaluate_on_test(vgg16_model, test_ds, \"VGG16\")\n", "evaluate_on_test(resnet_model, test_ds, \"ResNet50\")\n", "evaluate_on_test(mobilenet_model, test_ds, \"MobileNetV2\")\n", "evaluate_on_test(effnet_model, test_ds, \"EfficientNetB0\")\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }