sergio-sanz-rodriguez committed on
Commit 2bf4af2 · 1 Parent(s): 95510f9

updated app with 101 + unknown trained classes

Files changed (2):
  1. app.py +7 -46
  2. vision_transformer.py +353 -0
app.py CHANGED
@@ -31,36 +31,17 @@ effnetb0_model = create_effnetb0(
     num_classes=2
 )
 
-# Load the ViT-Base/16 transformer with input image of 224x224 pixels
-vitbase_model_1 = create_vitbase_model(
+# Load the ViT-Base/16 transformer with input image of 384x384 pixels and 101 + unknown classes
+vitbase_model = create_vitbase_model(
     model_weights_dir=".",
-    model_weights_name="vitbase16_5.pth",
-    img_size=224,
-    num_classes=num_classes,
-    compile=False
-)
-
-# Specify manual transforms for model_1
-transforms_1 = v2.Compose([
-    v2.Resize((242, 242)),
-    v2.CenterCrop((224, 224)),
-    v2.ToImage(),
-    v2.ToDtype(torch.float32, scale=True),
-    v2.Normalize(mean=[0.485, 0.456, 0.406],
-                 std=[0.229, 0.224, 0.225])
-])
-
-# Load the ViT-Base/16 transformer with input image of 384x384 pixels
-vitbase_model_2 = create_vitbase_model(
-    model_weights_dir=".",
-    model_weights_name="vitbase16_2_2024-12-31.pth",
+    model_weights_name="vitbase16_102_2025-01-03.pth",
     img_size=384,
     num_classes=num_classes,
     compile=True
 )
 
 # Specify manual transforms for model_2
-transforms_2 = v2.Compose([
+transforms = v2.Compose([
     v2.Resize(384), #v2.Resize((384, 384)),
     v2.CenterCrop((384, 384)),
     v2.ToImage(),
@@ -69,13 +50,10 @@ transforms_2 = v2.Compose([
                  std=[0.229, 0.224, 0.225])
 ])
 
+
 # Put models into evaluation mode and turn on inference mode
 effnetb0_model.eval()
-vitbase_model_1.eval()
-vitbase_model_2.eval()
-
-# Specify default ViT model
-default_model = "Vision Transformer - 384x384 pixels (higher accuracy, slower predictions)" # "Vision Transformer - 224x224 pixels (lower accuracy, faster predictions)"
+vitbase_model.eval()
 
 # Predict function
 def predict(image) -> Tuple[Dict, str, str]:
@@ -86,14 +64,6 @@ def predict(image) -> Tuple[Dict, str, str]:
     # Start the timer
     start_time = timer()
 
-    # Select the appropriate model based on the user's choice
-    if default_model == "Vision Transformer - 384x384 pixels (higher accuracy, slower predictions)":
-        vitbase_model = vitbase_model_2
-        transforms = transforms_2
-    else:
-        vitbase_model = vitbase_model_1
-        transforms = transforms_1
-
     # Transform the target image and add a batch dimension
     image = transforms(image).unsqueeze(0)
 
@@ -104,14 +74,13 @@ def predict(image) -> Tuple[Dict, str, str]:
     if effnetb0_model(image)[:,1].cpu() >= 0.9981166124343872:
 
         # Pass the transformed image through the model and turn the prediction logits into prediction probabilities
-        pred_probs = torch.softmax(vitbase_model(image), dim=1) # 101 classes
+        pred_probs = torch.softmax(vitbase_model(image), dim=1)
 
         # Calculate entropy
         entropy = -torch.sum(pred_probs * torch.log(pred_probs), dim=1).item()
 
         # Create a prediction label and prediction probability dictionary for each prediction class
         pred_classes_and_probs = {class_names[i]: float(pred_probs[0][i]) for i in range(num_classes)}
-        pred_classes_and_probs["unknown"] = 0.0
 
         # Get the top predicted class
         top_class = max(pred_classes_and_probs, key=pred_classes_and_probs.get)
@@ -164,14 +133,6 @@ A cutting-edge Vision Transformer (ViT) model to classify 101 delicious food typ
 # Configure the upload image area
 upload_input = gr.Image(type="pil", label="Upload Image", sources=['upload'], show_label=True, mirror_webcam=False)
 
-# Configure the dropdown option
-#model_dropdown = gr.Dropdown(
-#    choices=["Vision Transformer - 384x384 pixels (higher accuracy, slower predictions)",
-#             "Vision Transformer - 224x224 pixels (lower accuracy, faster predictions)"],
-#    value="Vision Transformer - 384x384 pixels (higher accuracy, slower predictions)",
-#    label="Select Model:"
-#)
-
 # Configure the sample image area
 food_vision_examples = [["examples/" + example] for example in os.listdir("examples")]
 
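The updated predict path scores a single ViT head over 102 classes (101 foods plus the trained "unknown" class) and still computes the entropy of the softmax distribution. As a rough illustration of how such an entropy score can flag ambiguous inputs, here is a minimal, self-contained sketch; the random 102-way logits and the ENTROPY_THRESHOLD value are illustrative assumptions, not values taken from the app.

import torch

def softmax_entropy(pred_probs: torch.Tensor) -> float:
    # Shannon entropy of a softmax distribution (batch of one image)
    return -torch.sum(pred_probs * torch.log(pred_probs), dim=1).item()

# Illustrative 102-way output: 101 food classes + "unknown" (hypothetical logits)
logits = torch.randn(1, 102)
pred_probs = torch.softmax(logits, dim=1)

ENTROPY_THRESHOLD = 2.0  # illustrative cut-off, not the value used by the app
if softmax_entropy(pred_probs) > ENTROPY_THRESHOLD:
    print("High entropy: the prediction is ambiguous")
else:
    print("Top class index:", pred_probs.argmax(dim=1).item())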
vision_transformer.py CHANGED
@@ -1,8 +1,361 @@
 
1
+ import os
2
+ import random
3
  import torch
4
  import torchvision
5
  import torch._dynamo
6
+ import matplotlib.pyplot as plt
7
+ from typing import List
8
  from torch import nn
9
+ from torch.utils.data import DataLoader
10
  from torch.nn.init import trunc_normal_, xavier_normal_, zeros_, orthogonal_, kaiming_normal_
11
+ from torchvision import datasets
12
+ from torchvision.transforms import v2
13
+
14
+ def display_random_images(dataset: torch.utils.data.dataset.Dataset, # or torchvision.datasets.ImageFolder?
15
+ classes: List[str] = None,
16
+ n: int = 10,
17
+ display_shape: bool = True,
18
+ rows: int = 5,
19
+ cols: int = 5,
20
+ seed: int = None):
21
+
22
+
23
+ """Displays a number of random images from a given dataset.
24
+
25
+ Args:
26
+ dataset (torch.utils.data.dataset.Dataset): Dataset to select random images from.
27
+ classes (List[str], optional): Names of the classes. Defaults to None.
28
+ n (int, optional): Number of images to display. Defaults to 10.
29
+ display_shape (bool, optional): Whether to display the shape of the image tensors. Defaults to True.
30
+ rows: number of rows of the subplot
31
+ cols: number of columns of the subplot
32
+ seed (int, optional): The seed to set before drawing random images. Defaults to None.
33
+
34
+ Usage:
35
+ display_random_images(train_data,
36
+ n=16,
37
+ classes=class_names,
38
+ rows=4,
39
+ cols=4,
40
+ display_shape=False,
41
+ seed=None)
42
+ """
43
+
44
+ # Setup the range to select images
45
+ n = min(n, len(dataset))
46
+ # Adjust display if n too high
47
+ if n > rows*cols:
48
+ n = rows*cols
49
+ #display_shape = False
50
+ print(f"For display purposes, n shouldn't be larger than {rows*cols}, setting to {n} and removing shape display.")
51
+
52
+ # Set random seed
53
+ if seed:
54
+ random.seed(seed)
55
+
56
+ # Get random sample indexes
57
+ random_samples_idx = random.sample(range(len(dataset)), k=n)
58
+
59
+ # Setup plot
60
+ plt.figure(figsize=(cols*4, rows*4))
61
+
62
+ #Loop through samples and display random samples
63
+ for i, targ_sample in enumerate(random_samples_idx):
64
+ targ_image, targ_label = dataset[targ_sample][0], dataset[targ_sample][1]
65
+
66
+ # 7. Adjust image tensor shape for plotting: [color_channels, height, width] -> [color_channels, height, width]
67
+ targ_image_adjust = targ_image.permute(1, 2, 0)
68
+
69
+ # Plot adjusted samples
70
+ plt.subplot(rows, cols, i+1)
71
+ plt.imshow(targ_image_adjust)
72
+ plt.axis("off")
73
+ if classes:
74
+ title = f"class: {classes[targ_label]}"
75
+ if display_shape:
76
+ title = title + f"\nshape: {targ_image_adjust.shape}"
77
+ plt.title(title)
78
+
79
+ def create_dataloaders(
80
+ train_dir: str,
81
+ test_dir: str,
82
+ train_transform: v2.Compose,
83
+ test_transform: v2.Compose,
84
+ batch_size: int,
85
+ num_workers: int=os.cpu_count()
86
+ ):
87
+ """Creates training and testing DataLoaders.
88
+
89
+ Takes in a training directory and testing directory path and turns
90
+ them into PyTorch Datasets and then into PyTorch DataLoaders.
91
+
92
+ Args:
93
+ train_dir: Path to training directory.
94
+ test_dir: Path to testing directory.
95
+ train_transform: torchvision transforms to perform on training data.
96
+ test_transform: torchvision transforms to perform on test data.
97
+ batch_size: Number of samples per batch in each of the DataLoaders.
98
+ num_workers: An integer for number of workers per DataLoader.
99
+
100
+ Returns:
101
+ A tuple of (train_dataloader, test_dataloader, class_names).
102
+ Where class_names is a list of the target classes.
103
+ Example usage:
104
+ train_dataloader, test_dataloader, class_names = \
105
+ = create_dataloaders(train_dir=path/to/train_dir,
106
+ test_dir=path/to/test_dir,
107
+ transform=some_transform,
108
+ batch_size=32,
109
+ num_workers=4)
110
+ """
111
+ # Use ImageFolder to create dataset(s)
112
+ train_data = datasets.ImageFolder(train_dir, transform=train_transform)
113
+ test_data = datasets.ImageFolder(test_dir, transform=test_transform)
114
+
115
+ # Get class names
116
+ class_names = train_data.classes
117
+
118
+ # Turn images into data loaders
119
+ train_dataloader = DataLoader(
120
+ train_data,
121
+ batch_size=batch_size,
122
+ shuffle=True,
123
+ num_workers=num_workers,
124
+ pin_memory=True, #enables fast data transfers to CUDA-enabled GPU
125
+ )
126
+ test_dataloader = DataLoader(
127
+ test_data,
128
+ batch_size=batch_size,
129
+ shuffle=False,
130
+ num_workers=num_workers,
131
+ pin_memory=True, #enables fast data transfers to CUDA-enabled GPU
132
+ )
133
+
134
+ return train_dataloader, test_dataloader, class_names
135
+
136
+ def create_dataloader_for_vit(
137
+ vit_model: str="bitbase16",
138
+ train_dir: str="./",
139
+ test_dir: str="./",
140
+ batch_size: int=64,
141
+ aug: bool=True,
142
+ display_imgs: bool=True,
143
+ num_workers: int=os.cpu_count()
144
+ ):
145
+
146
+ """
147
+ Creates data loaders for the training and test datasets to be used to traing visiton transformers.
148
+
149
+ Args:
150
+ vit_model (str): The name of the ViT model to use. Default is "bitbase16".
151
+ train_dir (str): The path to the training dataset directory. Default is TRAIN_DIR.
152
+ test_dir (str): The path to the test dataset directory. Default is TEST_DIR.
153
+ batch_size (int): The batch size for the data loaders. Default is BATCH_SIZE.
154
+ aug (bool): Whether to apply data augmentation or not. Default is True.
155
+ display_imgs (bool): Whether to display sample images or not. Default is True.
156
+
157
+ Returns:
158
+ train_dataloader (torch.utils.data.DataLoader): The data loader for the training dataset.
159
+ test_dataloader (torch.utils.data.DataLoader): The data loader for the test dataset.
160
+ class_names (list): A list of class names.
161
+ """
162
+
163
+ IMG_SIZE = 224
164
+ IMG_SIZE_2 = 384
165
+
166
+ # Manual transforms for the training dataset
167
+ manual_transforms = v2.Compose([
168
+ v2.RandomCrop((IMG_SIZE, IMG_SIZE)),
169
+ v2.ToImage(),
170
+ v2.ToDtype(torch.float32, scale=True),
171
+ ])
172
+
173
+ # ViT-Base/16 transforms
174
+ if vit_model == "vitbase16":
175
+
176
+ # Manual transforms for the training dataset
177
+ if aug:
178
+ manual_transforms_train_vitb = v2.Compose([
179
+ v2.TrivialAugmentWide(),
180
+ v2.Resize((256, 256)),
181
+ v2.RandomCrop((IMG_SIZE, IMG_SIZE)),
182
+ v2.ToImage(),
183
+ v2.ToDtype(torch.float32, scale=True),
184
+ v2.Normalize(mean=[0.485, 0.456, 0.406],
185
+ std=[0.229, 0.224, 0.225])
186
+ ])
187
+ else:
188
+ manual_transforms_train_vitb = v2.Compose([
189
+ v2.Resize((256, 256)),
190
+ v2.CenterCrop((IMG_SIZE, IMG_SIZE)),
191
+ v2.ToImage(),
192
+ v2.ToDtype(torch.float32, scale=True),
193
+ v2.Normalize(mean=[0.485, 0.456, 0.406],
194
+ std=[0.229, 0.224, 0.225])
195
+ ])
196
+
197
+ # Manual transforms for the test dataset
198
+ manual_transforms_test_vitb = v2.Compose([
199
+ v2.Resize((256, 256)),
200
+ v2.CenterCrop((IMG_SIZE, IMG_SIZE)),
201
+ v2.ToImage(),
202
+ v2.ToDtype(torch.float32, scale=True),
203
+ v2.Normalize(mean=[0.485, 0.456, 0.406],
204
+ std=[0.229, 0.224, 0.225])
205
+ ])
206
+
207
+ # Create data loaders for ViT-Base
208
+ train_dataloader, test_dataloader, class_names = create_dataloaders(
209
+ train_dir=train_dir,
210
+ test_dir=test_dir,
211
+ train_transform=manual_transforms_train_vitb,
212
+ test_transform=manual_transforms_test_vitb,
213
+ batch_size=batch_size,
214
+ num_workers=num_workers
215
+ )
216
+
217
+ if vit_model == "vitbase16_2":
218
+
219
+ # Manual transforms for the training dataset
220
+ if aug:
221
+ manual_transforms_train_vitb = v2.Compose([
222
+ v2.TrivialAugmentWide(),
223
+ v2.Resize((IMG_SIZE_2, IMG_SIZE_2)),
224
+ v2.CenterCrop((IMG_SIZE_2, IMG_SIZE_2)),
225
+ v2.ToImage(),
226
+ v2.ToDtype(torch.float32, scale=True),
227
+ v2.Normalize(mean=[0.485, 0.456, 0.406],
228
+ std=[0.229, 0.224, 0.225])
229
+ ])
230
+ else:
231
+ manual_transforms_train_vitb = v2.Compose([
232
+ v2.Resize((IMG_SIZE_2, IMG_SIZE_2)),
233
+ v2.CenterCrop((IMG_SIZE_2, IMG_SIZE_2)),
234
+ v2.ToImage(),
235
+ v2.ToDtype(torch.float32, scale=True),
236
+ v2.Normalize(mean=[0.485, 0.456, 0.406],
237
+ std=[0.229, 0.224, 0.225])
238
+ ])
239
+
240
+ # Manual transforms for the test dataset
241
+ manual_transforms_test_vitb = v2.Compose([
242
+ v2.Resize((IMG_SIZE_2, IMG_SIZE_2)),
243
+ v2.CenterCrop((IMG_SIZE_2, IMG_SIZE_2)),
244
+ v2.ToImage(),
245
+ v2.ToDtype(torch.float32, scale=True),
246
+ v2.Normalize(mean=[0.485, 0.456, 0.406],
247
+ std=[0.229, 0.224, 0.225])
248
+ ])
249
+
250
+ # Create data loaders for ViT-Base
251
+ train_dataloader, test_dataloader, class_names = create_dataloaders(
252
+ train_dir=train_dir,
253
+ test_dir=test_dir,
254
+ train_transform=manual_transforms_train_vitb,
255
+ test_transform=manual_transforms_test_vitb,
256
+ batch_size=batch_size,
257
+ num_workers=num_workers
258
+ )
259
+
260
+ # ViT-Large/16 transforms
261
+ elif vit_model == "vitlarge16":
262
+
263
+ # Manual transforms for the training dataset
264
+ if aug:
265
+ manual_transforms_train_vitl = v2.Compose([
266
+ v2.TrivialAugmentWide(),
267
+ v2.Resize((242, 242)),
268
+ v2.RandomCrop((IMG_SIZE, IMG_SIZE)),
269
+ v2.ToImage(),
270
+ v2.ToDtype(torch.float32, scale=True),
271
+ v2.Normalize(mean=[0.485, 0.456, 0.406],
272
+ std=[0.229, 0.224, 0.225])
273
+ ])
274
+ else:
275
+ manual_transforms_train_vitl = v2.Compose([
276
+ v2.Resize((242, 242)),
277
+ v2.CenterCrop((IMG_SIZE, IMG_SIZE)),
278
+ v2.ToImage(),
279
+ v2.ToDtype(torch.float32, scale=True),
280
+ v2.Normalize(mean=[0.485, 0.456, 0.406],
281
+ std=[0.229, 0.224, 0.225])
282
+ ])
283
+
284
+ # Manual transforms for the test dataset
285
+ manual_transforms_test_vitl = v2.Compose([
286
+ v2.Resize((242, 242)),
287
+ v2.CenterCrop((IMG_SIZE, IMG_SIZE)),
288
+ v2.ToImage(),
289
+ v2.ToDtype(torch.float32, scale=True),
290
+ v2.Normalize(mean=[0.485, 0.456, 0.406],
291
+ std=[0.229, 0.224, 0.225])
292
+ ])
293
+
294
+ # Create data loaders for ViT-Large/16
295
+ train_dataloader, test_dataloader, class_names = create_dataloaders(
296
+ train_dir=train_dir,
297
+ test_dir=test_dir,
298
+ train_transform=manual_transforms_train_vitl,
299
+ test_transform=manual_transforms_test_vitl,
300
+ batch_size=batch_size,
301
+ num_workers=num_workers
302
+ )
303
+
304
+ # ViT-Large/32 transforms
305
+ else:
306
+ # Manual transforms for the training dataset
307
+ if aug:
308
+ manual_transforms_train_vitl = v2.Compose([
309
+ v2.TrivialAugmentWide(),
310
+ v2.Resize((256, 256)),
311
+ v2.RandomCrop((IMG_SIZE, IMG_SIZE)),
312
+ v2.ToImage(),
313
+ v2.ToDtype(torch.float32, scale=True),
314
+ v2.Normalize(mean=[0.485, 0.456, 0.406],
315
+ std=[0.229, 0.224, 0.225])
316
+ ])
317
+ else:
318
+ manual_transforms_train_vitl = v2.Compose([
319
+ v2.Resize((256, 256)),
320
+ v2.CenterCrop((IMG_SIZE, IMG_SIZE)),
321
+ v2.ToImage(),
322
+ v2.ToDtype(torch.float32, scale=True),
323
+ v2.Normalize(mean=[0.485, 0.456, 0.406],
324
+ std=[0.229, 0.224, 0.225])
325
+ ])
326
+
327
+ # Manual transforms for the test dataset
328
+ manual_transforms_test_vitl = v2.Compose([
329
+ v2.Resize((256, 256)),
330
+ v2.CenterCrop((IMG_SIZE, IMG_SIZE)),
331
+ v2.ToImage(),
332
+ v2.ToDtype(torch.float32, scale=True),
333
+ v2.Normalize(mean=[0.485, 0.456, 0.406],
334
+ std=[0.229, 0.224, 0.225])
335
+ ])
336
+
337
+ # Create data loaders for ViT-Large/32
338
+ train_dataloader, test_dataloader, class_names = create_dataloaders(
339
+ train_dir=train_dir,
340
+ test_dir=test_dir,
341
+ train_transform=manual_transforms_train_vitl,
342
+ test_transform=manual_transforms_test_vitl,
343
+ batch_size=batch_size,
344
+ num_workers=num_workers
345
+ )
346
+
347
+ # Display images
348
+ if display_imgs:
349
+ train_data = datasets.ImageFolder(train_dir, transform=manual_transforms)
350
+ display_random_images(train_data,
351
+ n=25,
352
+ classes=class_names,
353
+ rows=5,
354
+ cols=5,
355
+ display_shape=False,
356
+ seed=None)
357
+
358
+ return train_dataloader, test_dataloader, class_names
359
 
360
 
361
  # Create Pytorch's default ViT models
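For reference, a minimal usage sketch of the new create_dataloader_for_vit helper added in this commit; the directory paths below are placeholders (not part of the commit) and assume ImageFolder-style folders with one subdirectory per class.

from vision_transformer import create_dataloader_for_vit

# Placeholder directories; each must contain one subfolder per class.
train_dataloader, test_dataloader, class_names = create_dataloader_for_vit(
    vit_model="vitbase16_2",   # 384x384 ViT-Base/16 transforms
    train_dir="data/train",
    test_dir="data/test",
    batch_size=64,
    aug=True,
    display_imgs=False,        # skip the matplotlib preview grid
)
print(f"{len(class_names)} classes, {len(train_dataloader)} train batches, {len(test_dataloader)} test batches")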