Ryan Chesler committed
Commit 12ea31e · 1 Parent(s): 697b917

Clean up repository structure and update for pip install

Demo.ipynb CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3fbac9002e8052dd97f8de19ede6a60fa119d237a08aa13e09fc294c73708489
-size 1085041
+oid sha256:002e6edf2b37d18f5eb2499fa653b8543b95964b122be8726cf214a2cf5500ba
+size 848913
README.md CHANGED
@@ -134,7 +134,12 @@ git clone https://huggingface.co/nvidia/nemotron-table-structure-v1
 ```
 git clone git@hf.co:nvidia/nemotron-table-structure-v1
 ```
-
+Optional:
+This can be installed as a package using pip
+```
+cd nemotron-table-structure-v1
+pip install -e .
+```
 2. Run the model using the following code:
 
 ```
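Since the README now documents an editable pip install, a quick import-level sanity check may be useful here. The snippet below is a minimal sketch, not taken from the repository: the entry point `define_model` and the wrapper's `preprocess`/`forward` behavior are inferred from `nemotron_table_structure_v1/model.py` in this commit, and the package import path plus the default config resolution are assumptions.

```
# Minimal sketch (assumed API, inferred from nemotron_table_structure_v1/model.py)
import numpy as np
from nemotron_table_structure_v1.model import define_model

model = define_model(verbose=True)                # builds YOLOX and loads weights.pth
image = np.zeros((1200, 800, 3), dtype=np.uint8)  # H x W x 3 uint8 page image
x = model.preprocess(image)                       # resize + pad, preserving aspect ratio
preds = model(x, orig_sizes=image.shape)          # [{"labels", "boxes", "scores"}]
```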
model.py DELETED
@@ -1,222 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-import sys
-import torch
-import importlib
-import numpy as np
-import numpy.typing as npt
-import torch.nn as nn
-import torch.nn.functional as F
-from typing import Dict, List, Tuple, Union
-from yolox.boxes import postprocess
-
-
-def define_model(config_name: str = "page_element_v3", verbose: bool = True) -> nn.Module:
-    """
-    Defines and initializes the model based on the configuration.
-
-    Args:
-        config_name (str): Configuration name. Defaults to "page_element_v3".
-        verbose (bool): Whether to print verbose output. Defaults to True.
-
-    Returns:
-        torch.nn.Module: The initialized YOLOX model.
-    """
-    # Load model from exp_file
-    sys.path.append(os.path.dirname(config_name))
-    exp_module = importlib.import_module(os.path.basename(config_name).split(".")[0])
-
-    config = exp_module.Exp()
-    model = config.get_model()
-
-    # Load weights
-    if verbose:
-        print(" -> Loading weights from", config.ckpt)
-
-    ckpt = torch.load(config.ckpt, map_location="cpu", weights_only=False)
-    model.load_state_dict(ckpt["model"], strict=True)
-
-    model = YoloXWrapper(model, config)
-    return model.eval().to(config.device)
-
-
-def resize_pad(img: torch.Tensor, size: tuple) -> torch.Tensor:
-    """
-    Resizes and pads an image to a given size.
-    The goal is to preserve the aspect ratio of the image.
-
-    Args:
-        img (torch.Tensor[C x H x W]): The image to resize and pad.
-        size (tuple[2]): The size to resize and pad the image to.
-
-    Returns:
-        torch.Tensor: The resized and padded image.
-    """
-    img = img.float()
-    _, h, w = img.shape
-    scale = min(size[0] / h, size[1] / w)
-    nh = int(h * scale)
-    nw = int(w * scale)
-    img = F.interpolate(
-        img.unsqueeze(0), size=(nh, nw), mode="bilinear", align_corners=False
-    ).squeeze(0)
-    img = torch.clamp(img, 0, 255)
-    pad_b = size[0] - nh
-    pad_r = size[1] - nw
-    img = F.pad(img, (0, pad_r, 0, pad_b), value=114.0)
-    return img
-
-
-class YoloXWrapper(nn.Module):
-    """
-    Wrapper for YoloX models.
-    """
-    def __init__(self, model: nn.Module, config) -> None:
-        """
-        Constructor
-
-        Args:
-            model (torch model): Yolo model.
-            config (Config): Config object containing model parameters.
-        """
-        super().__init__()
-        self.model = model
-        self.config = config
-
-        # Copy config parameters
-        self.device = config.device
-        self.img_size = config.size
-        self.min_bbox_size = config.min_bbox_size
-        self.normalize_boxes = config.normalize_boxes
-        self.conf_thresh = config.conf_thresh
-        self.iou_thresh = config.iou_thresh
-        self.class_agnostic = config.class_agnostic
-        self.threshold = config.threshold
-        self.labels = config.labels
-        self.num_classes = config.num_classes
-
-    def reformat_input(
-        self,
-        x: torch.Tensor,
-        orig_sizes: Union[torch.Tensor, List, Tuple, npt.NDArray]
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Reformats the input data and original sizes to the correct format.
-
-        Args:
-            x (torch.Tensor[BS x C x H x W]): Input image batch.
-            orig_sizes (torch.Tensor or list or np.ndarray): Original image sizes.
-        Returns:
-            torch tensor [BS x C x H x W]: Input image batch.
-            torch tensor [BS x 2]: Original image sizes (before resizing and padding).
-        """
-        # Convert image size to tensor
-        if isinstance(orig_sizes, (list, tuple)):
-            orig_sizes = np.array(orig_sizes)
-        if orig_sizes.shape[-1] == 3:  # remove channel
-            orig_sizes = orig_sizes[..., :2]
-        if isinstance(orig_sizes, np.ndarray):
-            orig_sizes = torch.from_numpy(orig_sizes).to(self.device)
-
-        # Add batch dimension if not present
-        if len(x.size()) == 3:
-            x = x.unsqueeze(0)
-        if len(orig_sizes.size()) == 1:
-            orig_sizes = orig_sizes.unsqueeze(0)
-
-        return x, orig_sizes
-
-    def preprocess(self, image: Union[torch.Tensor, npt.NDArray]) -> torch.Tensor:
-        """
-        YoloX preprocessing function:
-        - Resizes to the longest edge to img_size while preserving the aspect ratio
-        - Pads the shortest edge to img_size
-
-        Args:
-            image (torch tensor or np array [H x W x 3]): Input images in uint8 format.
-
-        Returns:
-            torch tensor [3 x H x W]: Processed image.
-        """
-        if not isinstance(image, torch.Tensor):
-            image = torch.from_numpy(image)
-        image = image.permute(2, 0, 1)  # [H, W, 3] -> [3, H, W]
-        image = resize_pad(image, self.img_size)
-        return image.float()
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        orig_sizes: Union[torch.Tensor, List, Tuple, npt.NDArray]
-    ) -> List[Dict[str, torch.Tensor]]:
-        """
-        Forward pass of the model.
-        Applies NMS and reformats the predictions.
-
-        Args:
-            x (torch.Tensor[BS x C x H x W]): Input image batch.
-            orig_sizes (torch.Tensor or list or np.ndarray): Original image sizes.
-
-        Returns:
-            list[dict]: List of prediction dictionaries. Each dictionary contains:
-                - labels (torch.Tensor[N]): Class labels
-                - boxes (torch.Tensor[N x 4]): Bounding boxes
-                - scores (torch.Tensor[N]): Confidence scores.
-        """
-        x, orig_sizes = self.reformat_input(x, orig_sizes)
-
-        # Scale to 0-255 if in range 0-1
-        if x.max() <= 1:
-            x *= 255
-
-        pred_boxes = self.model(x.to(self.device))
-
-        # NMS
-        pred_boxes = postprocess(
-            pred_boxes,
-            self.config.num_classes,
-            self.conf_thresh,
-            self.iou_thresh,
-            class_agnostic=self.class_agnostic,
-        )
-
-        # Reformat output
-        preds = []
-        for i, (p, size) in enumerate(zip(pred_boxes, orig_sizes)):
-            if p is None:  # No detections
-                preds.append({
-                    "labels": torch.empty(0),
-                    "boxes": torch.empty((0, 4)),
-                    "scores": torch.empty(0),
-                })
-                continue
-
-            p = p.view(-1, p.size(-1))
-            ratio = min(self.img_size[0] / size[0], self.img_size[1] / size[1])
-            boxes = p[:, :4] / ratio
-
-            # Clip
-            boxes[:, [0, 2]] = torch.clamp(boxes[:, [0, 2]], 0, size[1])
-            boxes[:, [1, 3]] = torch.clamp(boxes[:, [1, 3]], 0, size[0])
-
-            # Remove too small
-            kept = (
-                (boxes[:, 2] - boxes[:, 0] > self.min_bbox_size) &
-                (boxes[:, 3] - boxes[:, 1] > self.min_bbox_size)
-            )
-            boxes = boxes[kept]
-            p = p[kept]
-
-            # Normalize to 0-1
-            if self.normalize_boxes:
-                boxes[:, [0, 2]] /= size[1]
-                boxes[:, [1, 3]] /= size[0]
-
-            scores = p[:, 4] * p[:, 5]
-            labels = p[:, 6]
-
-            preds.append({"labels": labels, "boxes": boxes, "scores": scores})
-
-        return preds
nemotron_table_structure_v1/model.py CHANGED
@@ -36,10 +36,11 @@ def define_model(config_name: str = "page_element_v3", verbose: bool = True) ->
     if verbose:
         print(" -> Loading weights from", config.ckpt)
 
-    # Use importlib.resources to locate 'weights.pth' inside the module's directory (nmtron_page_elements_v3)
-    with importlib.resources.path("table_structure_v1", "weights.pth") as weights_path:
-        ckpt = torch.load(str(weights_path), map_location="cpu", weights_only=False)
-        model.load_state_dict(ckpt["model"], strict=True)
+    # Find package directory and load weights (nemotron_table_structure_v1)
+    package_dir = os.path.dirname(os.path.abspath(__file__))
+    weights_path = os.path.join(package_dir, "weights.pth")
+    state_dict = torch.load(weights_path, map_location="cpu", weights_only=False)
+    model.load_state_dict(state_dict["model"], strict=True)
 
     model = YoloXWrapper(model, config)
     return model.eval().to(config.device)
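The committed fix reads `weights.pth` relative to `__file__`, which also works for editable installs. The package argument in the old call ("table_structure_v1") does not match the installed package directory (`nemotron_table_structure_v1`), which is presumably why it was replaced. For reference only, the sketch below shows the equivalent lookup via the modern `importlib.resources.files` API with the matching package name; this is an assumption, not part of the commit, and it requires Python 3.9+.

```
# Alternative sketch (not in the repo): importlib.resources with the matching package name
import torch
from importlib.resources import as_file, files

with as_file(files("nemotron_table_structure_v1") / "weights.pth") as weights_path:
    state_dict = torch.load(weights_path, map_location="cpu", weights_only=False)
```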
post_processing/table_struct_pp.py DELETED
@@ -1,222 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-import re
-from typing import List, Union, Optional, Literal
-import numpy as np
-import numpy.typing as npt
-import pandas as pd
-
-
-def assign_boxes(
-    box: Union[List[float], npt.NDArray[np.float64]],
-    candidate_boxes: npt.NDArray[np.float64],
-    delta: float = 2.0,
-    min_overlap: float = 0.25,
-    mode: Literal["cell", "row", "column"] = "cell",
-) -> npt.NDArray[np.int_]:
-    """
-    Assigns the best candidate boxes to a reference `box` based on overlap.
-
-    If mode is "cell", the overlap is calculated using surface area overlap.
-    If mode is "row", the overlap is calculated using row height overlap.
-    If mode is "column", the overlap is calculated using column width overlap.
-
-    If delta > 1, it will look for multiple matches,
-    using candidates with score >= max_overlap / delta.
-
-    Args:
-        box (list or numpy.ndarray): Reference bounding box [x_min, y_min, x_max, y_max].
-        candidate_boxes (numpy.ndarray [N, 4]): Array of candidate bounding boxes.
-        delta (float, optional): Factor for matches relative to the best overlap. Defaults to 2.0.
-        min_overlap (float, optional): Minimum required overlap for a match. Defaults to 0.25.
-        mode (str, optional): Mode to assign boxes ("cell", "row", or "column"). Defaults to "cell".
-
-    Returns:
-        numpy.ndarray [M]: Indices of the matched boxes sorted by decreasing overlap.
-            Returns an empty array if no matches are found.
-    """
-    if not len(candidate_boxes):
-        return np.array([], dtype=np.int_)
-
-    x0_1, y0_1, x1_1, y1_1 = box
-    x0_2, y0_2, x1_2, y1_2 = (
-        candidate_boxes[:, 0],
-        candidate_boxes[:, 1],
-        candidate_boxes[:, 2],
-        candidate_boxes[:, 3],
-    )
-
-    # Intersection
-    inter_y0 = np.maximum(y0_1, y0_2)
-    inter_y1 = np.minimum(y1_1, y1_2)
-    inter_x0 = np.maximum(x0_1, x0_2)
-    inter_x1 = np.minimum(x1_1, x1_2)
-
-    if mode == "cell":
-        inter_area = np.maximum(0, inter_y1 - inter_y0) * np.maximum(0, inter_x1 - inter_x0)
-        box_area = (y1_1 - y0_1) * (x1_1 - x0_1)
-        overlap = inter_area / (box_area + 1e-6)
-    elif mode == "row":
-        inter_area = np.maximum(0, inter_y1 - inter_y0)
-        box_area = y1_1 - y0_1
-        overlap = inter_area / (box_area + 1e-6)
-    elif mode == "column":
-        inter_area = np.maximum(0, inter_x1 - inter_x0)
-        box_area = x1_1 - x0_1
-        overlap = inter_area / (box_area + 1e-6)
-    else:
-        raise ValueError(f"Invalid mode: {mode}")
-
-    max_overlap = np.max(overlap)
-    if max_overlap <= min_overlap:  # No match
-        return np.array([], dtype=np.int_)
-
-    n = len(np.where(overlap >= (max_overlap / delta))[0]) if delta > 1 else 1
-    matches = np.argsort(-overlap)[:n]
-    return matches
-
-
-def merge_text_in_cell(df_cell: pd.DataFrame) -> pd.DataFrame:
-    """
-    Merges text from multiple rows into a single cell and recalculates its bounding box.
-    Values are sorted by rounded (y, x) coordinates.
-
-    Args:
-        df_cell (pandas.DataFrame): DataFrame containing cells to merge.
-
-    Returns:
-        pandas.DataFrame: Updated DataFrame with merged text and a single bounding box.
-    """
-    boxes = np.stack(df_cell["box"].values)
-
-    df_cell["x"] = (boxes[:, 0] - boxes[:, 0].min()) // 10
-    df_cell["y"] = (boxes[:, 1] - boxes[:, 1].min()) // 10
-    df_cell = df_cell.sort_values(["y", "x"])
-
-    text = " ".join(df_cell["text"].values.tolist())
-    df_cell["text"] = text
-    df_cell = df_cell.head(1)
-    df_cell["box"] = df_cell["cell"]
-    df_cell.drop(["x", "y"], axis=1, inplace=True)
-
-    return df_cell
-
-
-def remove_empty_row(mat: List[List[str]]) -> List[List[str]]:
-    """
-    Remove empty rows from a matrix.
-
-    Args:
-        mat (list[list]): The matrix to remove empty rows from.
-
-    Returns:
-        list[list]: The matrix with empty rows removed.
-    """
-    mat_filter = []
-    for row in mat:
-        if max([len(c) for c in row]):
-            mat_filter.append(row)
-    return mat_filter
-
-
-def build_markdown(
-    df: pd.DataFrame,
-    remove_empty: bool = True,
-    n_rows: Optional[int] = None,
-    repeat_single: bool = False,
-) -> Union[List[List[str]], npt.NDArray[np.str_]]:
-    """
-    Convert a dataframe into a markdown table.
-
-    Args:
-        df (pandas.DataFrame): The dataframe to convert with columns 'col_ids',
-            'row_ids', and 'text'.
-        remove_empty (bool, optional): Whether to remove empty rows & cols. Defaults to True.
-        n_rows (int, optional): Number of rows. Inferred from df if None. Defaults to None.
-        repeat_single (bool, optional): Whether to repeat single element in rows.
-            Defaults to False.
-
-    Returns:
-        list[list[str]] or numpy.ndarray: A list of lists or array representing the markdown table.
-    """
-    df = df.reset_index(drop=True)
-    n_cols = max([np.max(c) for c in df['col_ids'].values])
-    if n_rows is None:
-        n_rows = max([np.max(c) for c in df['row_ids'].values])
-    else:
-        n_rows = max(
-            n_rows - 1,
-            max([np.max(c) for c in df['row_ids'].values])
-        )
-
-    mat = np.empty((n_rows + 1, n_cols + 1), dtype=str).tolist()
-
-    for i in range(len(df)):
-        if isinstance(df["row_ids"][i], int) or isinstance(df["col_ids"][i], int):
-            continue
-        for r in df["row_ids"][i]:
-            for c in df["col_ids"][i]:
-                mat[r][c] = (mat[r][c] + " " + df["text"][i]).strip()
-
-    # Remove empty rows & columns
-    if remove_empty:
-        mat = remove_empty_row(mat)
-        mat = np.array(remove_empty_row(np.array(mat).T.tolist())).T.tolist()
-
-    if repeat_single:
-        new_mat = []
-        for row in mat:
-            if sum([len(c) > 0 for c in row]) == 1:
-                txt = [c for c in row if len(c)][0]
-                new_mat.append([txt for _ in range(len(row))])
-            else:
-                new_mat.append(row)
-        mat = np.array(new_mat)
-
-    return mat
-
-
-def display_markdown(
-    data: List[List[str]], show: bool = True, use_header: bool = True
-) -> str:
-    """
-    Convert a list of lists of strings into a markdown table.
-    If show is True, use_header will be set to True.
-
-    Args:
-        data (list[list[str]]): The table data. The first sublist should contain headers.
-        show (bool, optional): Whether to display the table. Defaults to True.
-        use_header (bool, optional): Whether to use the first sublist as headers. Defaults to True.
-
-    Returns:
-        str: A markdown-formatted table as a string.
-    """
-    if show:
-        use_header = True
-    data = [[re.sub(r'\n', ' ', c) for c in row] for row in data]
-
-    if not len(data):
-        return "EMPTY TABLE"
-
-    max_cols = max(len(row) for row in data)
-    data = [row + [""] * (max_cols - len(row)) for row in data]
-
-    if use_header:
-        header = "| " + " | ".join(data[0]) + " |"
-        separator = "| " + " | ".join(["---"] * max_cols) + " |"
-        body = "\n".join("| " + " | ".join(row) + " |" for row in data[1:])
-        markdown_table = (
-            f"{header}\n{separator}\n{body}" if body else f"{header}\n{separator}"
-        )
-
-        if show:
-            from IPython.display import display, Markdown
-            markdown_table = re.sub(r'\$', r'\\$', markdown_table)
-            markdown_table = re.sub(r'\%', r'\\%', markdown_table)
-            display(Markdown(markdown_table))
-
-    else:
-        markdown_table = "\n".join("| " + " | ".join(row) + " |" for row in data)
-
-    return markdown_table
post_processing/wbf.py DELETED
@@ -1,324 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-# Adapted from:
-# https://github.com/ZFTurbo/Weighted-Boxes-Fusion/blob/master/ensemble_boxes/ensemble_boxes_wbf.py
-
-import warnings
-from typing import Dict, List, Tuple, Union, Literal
-import numpy as np
-import numpy.typing as npt
-
-
-def prefilter_boxes(
-    boxes: List[npt.NDArray[np.float64]],
-    scores: List[npt.NDArray[np.float64]],
-    labels: List[npt.NDArray[np.int_]],
-    weights: List[float],
-    thr: float,
-    class_agnostic: bool = False,
-) -> Dict[Union[str, int], npt.NDArray[np.float64]]:
-    """
-    Reformats and filters boxes.
-    Output is a dict of boxes to merge separately.
-
-    Args:
-        boxes (list[np array[n x 4]]): List of boxes. One list per model.
-        scores (list[np array[n]]): List of confidences.
-        labels (list[np array[n]]): List of labels.
-        weights (list): Model weights.
-        thr (float): Confidence threshold
-        class_agnostic (bool, optional): Merge boxes from different classes. Defaults to False.
-
-    Returns:
-        dict[np array [? x 8]]: Filtered boxes.
-    """
-    # Create dict with boxes stored by its label
-    new_boxes = dict()
-
-    for t in range(len(boxes)):
-        assert len(boxes[t]) == len(scores[t]), "len(boxes) != len(scores)"
-        assert len(boxes[t]) == len(labels[t]), "len(boxes) != len(labels)"
-
-        for j in range(len(boxes[t])):
-            score = scores[t][j]
-            if score < thr:
-                continue
-            label = int(labels[t][j])
-            box_part = boxes[t][j]
-            x1 = float(box_part[0])
-            y1 = float(box_part[1])
-            x2 = float(box_part[2])
-            y2 = float(box_part[3])
-
-            # Box data checks
-            if x2 < x1:
-                warnings.warn("X2 < X1 value in box. Swap them.")
-                x1, x2 = x2, x1
-            if y2 < y1:
-                warnings.warn("Y2 < Y1 value in box. Swap them.")
-                y1, y2 = y2, y1
-
-            array = np.array([x1, x2, y1, y2])
-            if array.min() < 0 or array.max() > 1:
-                warnings.warn("Coordinates outside [0, 1]")
-                array = np.clip(array, 0, 1)
-                x1, x2, y1, y2 = array
-
-            if (x2 - x1) * (y2 - y1) == 0.0:
-                warnings.warn("Zero area box skipped: {}.".format(box_part))
-                continue
-
-            # [label, score, weight, model index, x1, y1, x2, y2]
-            b = [int(label), float(score) * weights[t], weights[t], t, x1, y1, x2, y2]
-
-            label_k = "*" if class_agnostic else label
-            if label_k not in new_boxes:
-                new_boxes[label_k] = []
-            new_boxes[label_k].append(b)
-
-    # Sort each list in dict by score and transform it to numpy array
-    for k in new_boxes:
-        current_boxes = np.array(new_boxes[k])
-        new_boxes[k] = current_boxes[current_boxes[:, 1].argsort()[::-1]]
-
-    return new_boxes
-
-
-def merge_labels(
-    labels: npt.NDArray[np.int_], confs: npt.NDArray[np.float64]
-) -> int:
-    """
-    Custom function for merging labels.
-    If all labels are the same, return the unique value.
-    Else, return the label of the most confident non-title (class 2) box.
-
-    Args:
-        labels (np array [n]): Labels.
-        confs (np array [n]): Confidence.
-
-    Returns:
-        int: Label.
-    """
-    if len(np.unique(labels)) == 1:
-        return labels[0]
-    else:  # Most confident and not a title
-        confs = confs[confs != 2]
-        labels = labels[labels != 2]
-        return labels[np.argmax(confs)]
-
-
-def get_weighted_box(
-    boxes: npt.NDArray[np.float64], conf_type: Literal["avg", "max"] = "avg"
-) -> npt.NDArray[np.float64]:
-    """
-    Merges boxes by using the weighted fusion.
-
-    Args:
-        boxes (np array [n x 8]): Boxes to merge.
-        conf_type (str, optional): Confidence merging type. Defaults to "avg".
-
-    Returns:
-        np array [8]: Merged box.
-    """
-    box = np.zeros(8, dtype=np.float32)
-    conf = 0
-    conf_list = []
-    w = 0
-    for b in boxes:
-        box[4:] += b[1] * b[4:]
-        conf += b[1]
-        conf_list.append(b[1])
-        w += b[2]
-
-    box[0] = merge_labels(
-        np.array([b[0] for b in boxes]), np.array([b[1] for b in boxes])
-    )
-
-    box[1] = np.max(conf_list) if conf_type == "max" else np.mean(conf_list)
-    box[2] = w
-    box[3] = -1  # model index field is retained for consistency but is not used.
-    box[4:] /= conf
-    return box
-
-
-def get_biggest_box(
-    boxes: npt.NDArray[np.float64], conf_type: Literal["avg", "max"] = "avg"
-) -> npt.NDArray[np.float64]:
-    """
-    Merges boxes by using the biggest box.
-
-    Args:
-        boxes (np array [n x 8]): Boxes to merge.
-        conf_type (str, optional): Confidence merging type. Defaults to "avg".
-
-    Returns:
-        np array [8]: Merged box.
-    """
-    box = np.zeros(8, dtype=np.float32)
-    box[4:] = boxes[0][4:]
-    conf_list = []
-    w = 0
-    for b in boxes:
-        box[4] = min(box[4], b[4])
-        box[5] = min(box[5], b[5])
-        box[6] = max(box[6], b[6])
-        box[7] = max(box[7], b[7])
-        conf_list.append(b[1])
-        w += b[2]
-
-    box[0] = merge_labels(
-        np.array([b[0] for b in boxes]), np.array([b[1] for b in boxes])
-    )
-    # print(box[0], np.array([b[0] for b in boxes]))
-
-    box[1] = np.max(conf_list) if conf_type == "max" else np.mean(conf_list)
-    box[2] = w
-    box[3] = -1  # model index field is retained for consistency but is not used.
-    return box
-
-
-def find_matching_box_fast(
-    boxes_list: npt.NDArray[np.float64],
-    new_box: npt.NDArray[np.float64],
-    match_iou: float,
-) -> Tuple[int, float]:
-    """
-    Reimplementation of find_matching_box with numpy instead of loops.
-    Gives significant speed up for larger arrays (~100x).
-    This was previously the bottleneck since the function is called for every entry in the array.
-
-    Args:
-        boxes_list (np.ndarray): Array of boxes with shape (N, 8).
-        new_box (np.ndarray): New box to match with shape (8,).
-        match_iou (float): IoU threshold for matching.
-
-    Returns:
-        Tuple[int, float]: Index of best matching box (-1 if no match) and IoU value.
-    """
-
-    def bb_iou_array(
-        boxes: npt.NDArray[np.float64], new_box: npt.NDArray[np.float64]
-    ) -> npt.NDArray[np.float64]:
-        # bb interesection over union
-        xA = np.maximum(boxes[:, 0], new_box[0])
-        yA = np.maximum(boxes[:, 1], new_box[1])
-        xB = np.minimum(boxes[:, 2], new_box[2])
-        yB = np.minimum(boxes[:, 3], new_box[3])
-
-        interArea = np.maximum(xB - xA, 0) * np.maximum(yB - yA, 0)
-
-        # compute the area of both the prediction and ground-truth rectangles
-        boxAArea = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
-        boxBArea = (new_box[2] - new_box[0]) * (new_box[3] - new_box[1])
-
-        iou = interArea / (boxAArea + boxBArea - interArea)
-
-        return iou
-
-    if boxes_list.shape[0] == 0:
-        return -1, match_iou
-
-    ious = bb_iou_array(boxes_list[:, 4:], new_box[4:])
-    # ious[boxes[:, 0] != new_box[0]] = -1
-
-    best_idx = np.argmax(ious)
-    best_iou = ious[best_idx]
-
-    if best_iou <= match_iou:
-        best_iou = match_iou
-        best_idx = -1
-
-    return best_idx, best_iou
-
-
-def weighted_boxes_fusion(
-    boxes_list: List[npt.NDArray[np.float64]],
-    labels_list: List[npt.NDArray[np.int_]],
-    scores_list: List[npt.NDArray[np.float64]],
-    iou_thr: float = 0.5,
-    skip_box_thr: float = 0.0,
-    conf_type: Literal["avg", "max"] = "avg",
-    merge_type: Literal["weighted", "biggest"] = "weighted",
-    class_agnostic: bool = False,
-) -> Tuple[npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.int_]]:
-    """
-    Custom WBF implementation that supports a class_agnostic mode and a biggest box fusion.
-    Boxes are expected to be in normalized (x0, y0, x1, y1) format.
-
-    Args:
-        boxes_list (list[np.ndarray[n x 4]]): List of boxes. One list per model.
-        labels_list (list[np.ndarray[n]]): List of labels.
-        scores_list (list[np.ndarray[n]]): List of confidences.
-        iou_thr (float, optional): IoU threshold for matching. Defaults to 0.55.
-        skip_box_thr (float, optional): Exclude boxes with score < skip_box_thr. Defaults to 0.0.
-        conf_type (str, optional): Confidence merging type ("avg" or "max"). Defaults to "avg".
-        merge_type (str, optional): Merge type ("weighted" or "biggest"). Defaults to "weighted".
-        class_agnostic (bool, optional): Merge boxes from different classes. Defaults to False.
-
-    Returns:
-        numpy.ndarray [N x 4]: Array of bounding boxes.
-        numpy.ndarray [N]: Array of labels.
-        numpy.ndarray [N]: Array of scores.
-    """
-    weights = np.ones(len(boxes_list))
-
-    assert conf_type in ["avg", "max"], 'Conf type must be "avg" or "max"'
-    assert merge_type in ["weighted", "biggest"], 'Conf type must be "weighted" or "biggest"'
-
-    filtered_boxes = prefilter_boxes(
-        boxes_list,
-        scores_list,
-        labels_list,
-        weights,
-        skip_box_thr,
-        class_agnostic=class_agnostic,
-    )
-    if len(filtered_boxes) == 0:
-        return np.zeros((0, 4)), np.zeros((0,)), np.zeros((0,))
-
-    overall_boxes = []
-    for label in filtered_boxes:
-        boxes = filtered_boxes[label]
-        clusters = []
-
-        # Clusterize boxes
-        for j in range(len(boxes)):
-            ids = [i for i in range(len(boxes)) if i != j]
-            index, best_iou = find_matching_box_fast(boxes[ids], boxes[j], iou_thr)
-
-            if index != -1:
-                index = ids[index]
-                cluster_idx = [
-                    clust_idx
-                    for clust_idx, clust in enumerate(clusters)
-                    if (j in clust or index in clust)
-                ]
-                if len(cluster_idx):
-                    cluster_idx = cluster_idx[0]
-                    clusters[cluster_idx] = list(
-                        set(clusters[cluster_idx] + [index, j])
-                    )
-                else:
-                    clusters.append([index, j])
-            else:
-                clusters.append([j])
-
-        for j, c in enumerate(clusters):
-            if merge_type == "weighted":
-                weighted_box = get_weighted_box(boxes[c], conf_type)
-            elif merge_type == "biggest":
-                weighted_box = get_biggest_box(boxes[c], conf_type)
-
-            if conf_type == "max":
-                weighted_box[1] = weighted_box[1] / weights.max()
-            else:  # avg
-                weighted_box[1] = weighted_box[1] * len(c) / weights.sum()
-            overall_boxes.append(weighted_box)
-
-    overall_boxes = np.array(overall_boxes)
-    overall_boxes = overall_boxes[overall_boxes[:, 1].argsort()[::-1]]
-    boxes = overall_boxes[:, 4:]
-    scores = overall_boxes[:, 1]
-    labels = overall_boxes[:, 0]
-    return boxes, labels, scores
table_structure_v1.py DELETED
@@ -1,81 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-import torch
-import torch.nn as nn
-from typing import List, Tuple
-
-
-class Exp:
-    """
-    Configuration class for the table structure model.
-
-    This class contains all configuration parameters for the YOLOX-based
-    table structure detection model, including architecture settings, inference
-    parameters, and class-specific thresholds.
-    """
-
-    def __init__(self) -> None:
-        """Initialize the configuration with default parameters."""
-        self.name: str = "page-element-v3"
-        self.ckpt: str = "weights.pth"
-        self.device: str = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-        # YOLOX architecture parameters
-        self.act: str = "silu"
-        self.depth: float = 1.00
-        self.width: float = 1.00
-        self.labels: List[str] = [
-            "border",  # not used
-            "cell",
-            "row",
-            "column",
-            "header"  # not used
-        ]
-        self.num_classes: int = len(self.labels)
-
-        # Inference parameters
-        self.size: Tuple[int, int] = (1024, 1024)
-        self.min_bbox_size: int = 0
-        self.normalize_boxes: bool = True
-
-        # NMS & thresholding. These can be updated
-        self.conf_thresh: float = 0.01
-        self.iou_thresh: float = 0.25
-        self.class_agnostic: bool = False
-
-        self.threshold: float = 0.05
-
-    def get_model(self) -> nn.Module:
-        """
-        Get the YOLOX model.
-
-        Builds and returns a YOLOX model with the configured architecture.
-        Also updates batch normalization parameters for optimal inference.
-
-        Returns:
-            nn.Module: The YOLOX model with configured parameters.
-        """
-        from yolox import YOLOX, YOLOPAFPN, YOLOXHead
-
-        # Build model
-        if getattr(self, "model", None) is None:
-            in_channels = [256, 512, 1024]
-            backbone = YOLOPAFPN(
-                self.depth, self.width, in_channels=in_channels, act=self.act
-            )
-            head = YOLOXHead(
-                self.num_classes, self.width, in_channels=in_channels, act=self.act
-            )
-            self.model = YOLOX(backbone, head)
-
-        # Update batch-norm parameters
-        def init_yolo(M: nn.Module) -> None:
-            for m in M.modules():
-                if isinstance(m, nn.BatchNorm2d):
-                    m.eps = 1e-3
-                    m.momentum = 0.03
-
-        self.model.apply(init_yolo)
-
-        return self.model
yolox/__init__.py DELETED
@@ -1,10 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-#!/usr/bin/env python3
-# -*- coding:utf-8 -*-
-# Copyright (c) Megvii Inc. All rights reserved.
-
-from .yolo_head import YOLOXHead
-from .yolo_pafpn import YOLOPAFPN
-from .yolox import YOLOX
yolox/boxes.py DELETED
@@ -1,58 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-#!/usr/bin/env python3
-# Copyright (c) Megvii Inc. All rights reserved.
-
-import torch
-import torchvision
-
-
-def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45, class_agnostic=False):
-    """
-    Copied from YOLOX/yolox/utils/boxes.py
-    """
-    box_corner = prediction.new(prediction.shape)
-    box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
-    box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
-    box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
-    box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
-    prediction[:, :, :4] = box_corner[:, :, :4]
-
-    output = [None for _ in range(len(prediction))]
-    for i, image_pred in enumerate(prediction):
-
-        # If none are remaining => process next image
-        if not image_pred.size(0):
-            continue
-        # Get score and class with highest confidence
-        class_conf, class_pred = torch.max(image_pred[:, 5: 5 + num_classes], 1, keepdim=True)
-
-        conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze()
-        # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
-        detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1)
-        detections = detections[conf_mask]
-        if not detections.size(0):
-            continue
-
-        if class_agnostic:
-            nms_out_index = torchvision.ops.nms(
-                detections[:, :4],
-                detections[:, 4] * detections[:, 5],
-                nms_thre,
-            )
-        else:
-            nms_out_index = torchvision.ops.batched_nms(
-                detections[:, :4],
-                detections[:, 4] * detections[:, 5],
-                detections[:, 6],
-                nms_thre,
-            )
-
-        detections = detections[nms_out_index]
-        if output[i] is None:
-            output[i] = detections
-        else:
-            output[i] = torch.cat((output[i], detections))
-
-    return output
yolox/darknet.py DELETED
@@ -1,182 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-# Copyright (c) Megvii Inc. All rights reserved.
-
-from torch import nn
-
-from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck
-
-
-class Darknet(nn.Module):
-    # number of blocks from dark2 to dark5.
-    depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]}
-
-    def __init__(
-        self,
-        depth,
-        in_channels=3,
-        stem_out_channels=32,
-        out_features=("dark3", "dark4", "dark5"),
-    ):
-        """
-        Args:
-            depth (int): depth of darknet used in model, usually use [21, 53] for this param.
-            in_channels (int): number of input channels, for example, use 3 for RGB image.
-            stem_out_channels (int): number of output channels of darknet stem.
-                It decides channels of darknet layer2 to layer5.
-            out_features (Tuple[str]): desired output layer name.
-        """
-        super().__init__()
-        assert out_features, "please provide output features of Darknet"
-        self.out_features = out_features
-        self.stem = nn.Sequential(
-            BaseConv(in_channels, stem_out_channels, ksize=3, stride=1, act="lrelu"),
-            *self.make_group_layer(stem_out_channels, num_blocks=1, stride=2),
-        )
-        in_channels = stem_out_channels * 2  # 64
-
-        num_blocks = Darknet.depth2blocks[depth]
-        # create darknet with `stem_out_channels` and `num_blocks` layers.
-        # to make model structure more clear, we don't use `for` statement in python.
-        self.dark2 = nn.Sequential(
-            *self.make_group_layer(in_channels, num_blocks[0], stride=2)
-        )
-        in_channels *= 2  # 128
-        self.dark3 = nn.Sequential(
-            *self.make_group_layer(in_channels, num_blocks[1], stride=2)
-        )
-        in_channels *= 2  # 256
-        self.dark4 = nn.Sequential(
-            *self.make_group_layer(in_channels, num_blocks[2], stride=2)
-        )
-        in_channels *= 2  # 512
-
-        self.dark5 = nn.Sequential(
-            *self.make_group_layer(in_channels, num_blocks[3], stride=2),
-            *self.make_spp_block([in_channels, in_channels * 2], in_channels * 2),
-        )
-
-    def make_group_layer(self, in_channels: int, num_blocks: int, stride: int = 1):
-        "starts with conv layer then has `num_blocks` `ResLayer`"
-        return [
-            BaseConv(in_channels, in_channels * 2, ksize=3, stride=stride, act="lrelu"),
-            *[(ResLayer(in_channels * 2)) for _ in range(num_blocks)],
-        ]
-
-    def make_spp_block(self, filters_list, in_filters):
-        m = nn.Sequential(
-            *[
-                BaseConv(in_filters, filters_list[0], 1, stride=1, act="lrelu"),
-                BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"),
-                SPPBottleneck(
-                    in_channels=filters_list[1],
-                    out_channels=filters_list[0],
-                    activation="lrelu",
-                ),
-                BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"),
-                BaseConv(filters_list[1], filters_list[0], 1, stride=1, act="lrelu"),
-            ]
-        )
-        return m
-
-    def forward(self, x):
-        outputs = {}
-        x = self.stem(x)
-        outputs["stem"] = x
-        x = self.dark2(x)
-        outputs["dark2"] = x
-        x = self.dark3(x)
-        outputs["dark3"] = x
-        x = self.dark4(x)
-        outputs["dark4"] = x
-        x = self.dark5(x)
-        outputs["dark5"] = x
-        return {k: v for k, v in outputs.items() if k in self.out_features}
-
-
-class CSPDarknet(nn.Module):
-    def __init__(
-        self,
-        dep_mul,
-        wid_mul,
-        out_features=("dark3", "dark4", "dark5"),
-        depthwise=False,
-        act="silu",
-    ):
-        super().__init__()
-        assert out_features, "please provide output features of Darknet"
-        self.out_features = out_features
-        Conv = DWConv if depthwise else BaseConv
-
-        base_channels = int(wid_mul * 64)  # 64
-        base_depth = max(round(dep_mul * 3), 1)  # 3
-
-        # stem
-        self.stem = Focus(3, base_channels, ksize=3, act=act)
-
-        # dark2
-        self.dark2 = nn.Sequential(
-            Conv(base_channels, base_channels * 2, 3, 2, act=act),
-            CSPLayer(
-                base_channels * 2,
-                base_channels * 2,
-                n=base_depth,
-                depthwise=depthwise,
-                act=act,
-            ),
-        )
-
-        # dark3
-        self.dark3 = nn.Sequential(
-            Conv(base_channels * 2, base_channels * 4, 3, 2, act=act),
-            CSPLayer(
-                base_channels * 4,
-                base_channels * 4,
-                n=base_depth * 3,
-                depthwise=depthwise,
-                act=act,
-            ),
-        )
-
-        # dark4
-        self.dark4 = nn.Sequential(
-            Conv(base_channels * 4, base_channels * 8, 3, 2, act=act),
-            CSPLayer(
-                base_channels * 8,
-                base_channels * 8,
-                n=base_depth * 3,
-                depthwise=depthwise,
-                act=act,
-            ),
-        )
-
-        # dark5
-        self.dark5 = nn.Sequential(
-            Conv(base_channels * 8, base_channels * 16, 3, 2, act=act),
-            SPPBottleneck(base_channels * 16, base_channels * 16, activation=act),
-            CSPLayer(
-                base_channels * 16,
-                base_channels * 16,
-                n=base_depth,
-                shortcut=False,
-                depthwise=depthwise,
-                act=act,
-            ),
-        )
-
-    def forward(self, x):
-        outputs = {}
-        x = self.stem(x)
-        outputs["stem"] = x
-        x = self.dark2(x)
-        outputs["dark2"] = x
-        x = self.dark3(x)
-        outputs["dark3"] = x
-        x = self.dark4(x)
-        outputs["dark4"] = x
-        x = self.dark5(x)
-        outputs["dark5"] = x
-        return {k: v for k, v in outputs.items() if k in self.out_features}
yolox/network_blocks.py DELETED
@@ -1,213 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-# Copyright (c) Megvii Inc. All rights reserved.
-
-import torch
-import torch.nn as nn
-
-
-class SiLU(nn.Module):
-    """export-friendly version of nn.SiLU()"""
-
-    @staticmethod
-    def forward(x):
-        return x * torch.sigmoid(x)
-
-
-def get_activation(name="silu", inplace=True):
-    if name == "silu":
-        module = nn.SiLU(inplace=inplace)
-    elif name == "relu":
-        module = nn.ReLU(inplace=inplace)
-    elif name == "lrelu":
-        module = nn.LeakyReLU(0.1, inplace=inplace)
-    else:
-        raise AttributeError("Unsupported act type: {}".format(name))
-    return module
-
-
-class BaseConv(nn.Module):
-    """A Conv2d -> Batchnorm -> silu/leaky relu block"""
-
-    def __init__(
-        self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu"
-    ):
-        super().__init__()
-        # same padding
-        pad = (ksize - 1) // 2
-        self.conv = nn.Conv2d(
-            in_channels,
-            out_channels,
-            kernel_size=ksize,
-            stride=stride,
-            padding=pad,
-            groups=groups,
-            bias=bias,
-        )
-        self.bn = nn.BatchNorm2d(out_channels)
-        self.act = get_activation(act, inplace=True)
-
-    def forward(self, x):
-        return self.act(self.bn(self.conv(x)))
-
-    def fuseforward(self, x):
-        return self.act(self.conv(x))
-
-
-class DWConv(nn.Module):
-    """Depthwise Conv + Conv"""
-
-    def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"):
-        super().__init__()
-        self.dconv = BaseConv(
-            in_channels,
-            in_channels,
-            ksize=ksize,
-            stride=stride,
-            groups=in_channels,
-            act=act,
-        )
-        self.pconv = BaseConv(
-            in_channels, out_channels, ksize=1, stride=1, groups=1, act=act
-        )
-
-    def forward(self, x):
-        x = self.dconv(x)
-        return self.pconv(x)
-
-
-class Bottleneck(nn.Module):
-    # Standard bottleneck
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        shortcut=True,
-        expansion=0.5,
-        depthwise=False,
-        act="silu",
-    ):
-        super().__init__()
-        hidden_channels = int(out_channels * expansion)
-        Conv = DWConv if depthwise else BaseConv
-        self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
-        self.conv2 = Conv(hidden_channels, out_channels, 3, stride=1, act=act)
-        self.use_add = shortcut and in_channels == out_channels
-
-    def forward(self, x):
-        y = self.conv2(self.conv1(x))
-        if self.use_add:
-            y = y + x
-        return y
-
-
-class ResLayer(nn.Module):
-    "Residual layer with `in_channels` inputs."
-
-    def __init__(self, in_channels: int):
-        super().__init__()
-        mid_channels = in_channels // 2
-        self.layer1 = BaseConv(
-            in_channels, mid_channels, ksize=1, stride=1, act="lrelu"
-        )
-        self.layer2 = BaseConv(
-            mid_channels, in_channels, ksize=3, stride=1, act="lrelu"
-        )
-
-    def forward(self, x):
-        out = self.layer2(self.layer1(x))
-        return x + out
-
-
-class SPPBottleneck(nn.Module):
-    """Spatial pyramid pooling layer used in YOLOv3-SPP"""
-
-    def __init__(
-        self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu"
-    ):
-        super().__init__()
-        hidden_channels = in_channels // 2
-        self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation)
-        self.m = nn.ModuleList(
-            [
-                nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
-                for ks in kernel_sizes
-            ]
-        )
-        conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
-        self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation)
-
-    def forward(self, x):
-        x = self.conv1(x)
-        x = torch.cat([x] + [m(x) for m in self.m], dim=1)
-        x = self.conv2(x)
-        return x
-
-
-class CSPLayer(nn.Module):
-    """C3 in yolov5, CSP Bottleneck with 3 convolutions"""
-
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        n=1,
-        shortcut=True,
-        expansion=0.5,
-        depthwise=False,
-        act="silu",
-    ):
-        """
-        Args:
-            in_channels (int): input channels.
-            out_channels (int): output channels.
-            n (int): number of Bottlenecks. Default value: 1.
-        """
-        # ch_in, ch_out, number, shortcut, groups, expansion
-        super().__init__()
-        hidden_channels = int(out_channels * expansion)  # hidden channels
-        self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
-        self.conv2 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
-        self.conv3 = BaseConv(2 * hidden_channels, out_channels, 1, stride=1, act=act)
-        module_list = [
-            Bottleneck(
-                hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act
-            )
-            for _ in range(n)
-        ]
-        self.m = nn.Sequential(*module_list)
-
-    def forward(self, x):
-        x_1 = self.conv1(x)
-        x_2 = self.conv2(x)
-        x_1 = self.m(x_1)
-        x = torch.cat((x_1, x_2), dim=1)
-        return self.conv3(x)
-
-
-class Focus(nn.Module):
-    """Focus width and height information into channel space."""
-
-    def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"):
-        super().__init__()
-        self.conv = BaseConv(in_channels * 4, out_channels, ksize, stride, act=act)
-
-    def forward(self, x):
-        # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2)
-        patch_top_left = x[..., ::2, ::2]
-        patch_top_right = x[..., ::2, 1::2]
-        patch_bot_left = x[..., 1::2, ::2]
-        patch_bot_right = x[..., 1::2, 1::2]
-        x = torch.cat(
-            (
-                patch_top_left,
-                patch_bot_left,
-                patch_top_right,
-                patch_bot_right,
-            ),
-            dim=1,
-        )
-        return self.conv(x)
yolox/yolo_fpn.py DELETED
@@ -1,87 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-# Copyright (c) Megvii Inc. All rights reserved.
-
-import torch
-import torch.nn as nn
-
-from .darknet import Darknet
-from .network_blocks import BaseConv
-
-
-class YOLOFPN(nn.Module):
-    """
-    YOLOFPN module. Darknet 53 is the default backbone of this model.
-    """
-
-    def __init__(
-        self,
-        depth=53,
-        in_features=["dark3", "dark4", "dark5"],
-    ):
-        super().__init__()
-
-        self.backbone = Darknet(depth)
-        self.in_features = in_features
-
-        # out 1
-        self.out1_cbl = self._make_cbl(512, 256, 1)
-        self.out1 = self._make_embedding([256, 512], 512 + 256)
-
-        # out 2
-        self.out2_cbl = self._make_cbl(256, 128, 1)
-        self.out2 = self._make_embedding([128, 256], 256 + 128)
-
-        # upsample
-        self.upsample = nn.Upsample(scale_factor=2, mode="nearest")
-
-    def _make_cbl(self, _in, _out, ks):
-        return BaseConv(_in, _out, ks, stride=1, act="lrelu")
-
-    def _make_embedding(self, filters_list, in_filters):
-        m = nn.Sequential(
-            *[
-                self._make_cbl(in_filters, filters_list[0], 1),
-                self._make_cbl(filters_list[0], filters_list[1], 3),
-                self._make_cbl(filters_list[1], filters_list[0], 1),
-                self._make_cbl(filters_list[0], filters_list[1], 3),
-                self._make_cbl(filters_list[1], filters_list[0], 1),
-            ]
-        )
-        return m
-
-    def load_pretrained_model(self, filename="./weights/darknet53.mix.pth"):
-        with open(filename, "rb") as f:
-            state_dict = torch.load(f, map_location="cpu")
-        print("loading pretrained weights...")
-        self.backbone.load_state_dict(state_dict)
-
-    def forward(self, inputs):
-        """
-        Args:
-            inputs (Tensor): input image.
-
-        Returns:
-            Tuple[Tensor]: FPN output features..
-        """
-        # backbone
-        out_features = self.backbone(inputs)
-        x2, x1, x0 = [out_features[f] for f in self.in_features]
-
-        # yolo branch 1
-        x1_in = self.out1_cbl(x0)
-        x1_in = self.upsample(x1_in)
-        x1_in = torch.cat([x1_in, x1], 1)
-        out_dark4 = self.out1(x1_in)
-
-        # yolo branch 2
-        x2_in = self.out2_cbl(out_dark4)
-        x2_in = self.upsample(x2_in)
-        x2_in = torch.cat([x2_in, x2], 1)
-        out_dark3 = self.out2(x2_in)
-
-        outputs = (out_dark3, out_dark4, x0)
-        return outputs
yolox/yolo_head.py DELETED
@@ -1,238 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-#!/usr/bin/env python3
-# -*- coding:utf-8 -*-
-# Copyright (c) Megvii Inc. All rights reserved.
-
-import torch
-import torch.nn as nn
-from .network_blocks import BaseConv, DWConv
-
-
-_TORCH_VER = [int(x) for x in torch.__version__.split(".")[:2]]
-
-
-def meshgrid(*tensors):
-    """
-    Copied from YOLOX/yolox/utils/compat.py
-    """
-    if _TORCH_VER >= [1, 10]:
-        return torch.meshgrid(*tensors, indexing="ij")
-    else:
-        return torch.meshgrid(*tensors)
-
-
-def bboxes_iou(bboxes_a, bboxes_b, xyxy=True):
-    """
-    Copied from YOLOX/yolox/utils/boxes.py
-    """
-    if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4:
-        raise IndexError
-
-    if xyxy:
-        tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2])
-        br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:])
-        area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1)
-        area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1)
-    else:
-        tl = torch.max(
-            (bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2),
-            (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2),
-        )
-        br = torch.min(
-            (bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2),
-            (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2),
-        )
-
-        area_a = torch.prod(bboxes_a[:, 2:], 1)
-        area_b = torch.prod(bboxes_b[:, 2:], 1)
-    en = (tl < br).type(tl.type()).prod(dim=2)
-    area_i = torch.prod(br - tl, 2) * en  # * ((tl < br).all())
-    return area_i / (area_a[:, None] + area_b - area_i)
-
-
-class YOLOXHead(nn.Module):
-    def __init__(
-        self,
-        num_classes,
-        width=1.0,
-        strides=[8, 16, 32],
-        in_channels=[256, 512, 1024],
-        act="silu",
-        depthwise=False,
-    ):
-        """
-        Args:
-            act (str): activation type of conv. Defalut value: "silu".
-            depthwise (bool): whether apply depthwise conv in conv branch. Defalut value: False.
-        """
-        super().__init__()
-
-        self.num_classes = num_classes
-        self.decode_in_inference = True  # for deploy, set to False
-
-        self.cls_convs = nn.ModuleList()
-        self.reg_convs = nn.ModuleList()
-        self.cls_preds = nn.ModuleList()
-        self.reg_preds = nn.ModuleList()
-        self.obj_preds = nn.ModuleList()
-        self.stems = nn.ModuleList()
-        Conv = DWConv if depthwise else BaseConv
-
-        for i in range(len(in_channels)):
-            self.stems.append(
-                BaseConv(
-                    in_channels=int(in_channels[i] * width),
-                    out_channels=int(256 * width),
-                    ksize=1,
-                    stride=1,
-                    act=act,
-                )
-            )
-            self.cls_convs.append(
-                nn.Sequential(
-                    *[
-                        Conv(
-                            in_channels=int(256 * width),
-                            out_channels=int(256 * width),
-                            ksize=3,
-                            stride=1,
-                            act=act,
-                        ),
-                        Conv(
-                            in_channels=int(256 * width),
-                            out_channels=int(256 * width),
-                            ksize=3,
-                            stride=1,
-                            act=act,
-                        ),
-                    ]
-                )
-            )
-            self.reg_convs.append(
-                nn.Sequential(
-                    *[
-                        Conv(
-                            in_channels=int(256 * width),
-                            out_channels=int(256 * width),
-                            ksize=3,
-                            stride=1,
-                            act=act,
-                        ),
-                        Conv(
-                            in_channels=int(256 * width),
-                            out_channels=int(256 * width),
-                            ksize=3,
-                            stride=1,
-                            act=act,
-                        ),
-                    ]
-                )
-            )
-            self.cls_preds.append(
-                nn.Conv2d(
-                    in_channels=int(256 * width),
-                    out_channels=self.num_classes,
-                    kernel_size=1,
-                    stride=1,
-                    padding=0,
-                )
-            )
-            self.reg_preds.append(
-                nn.Conv2d(
-                    in_channels=int(256 * width),
-                    out_channels=4,
-                    kernel_size=1,
-                    stride=1,
-                    padding=0,
-                )
-            )
-            self.obj_preds.append(
-                nn.Conv2d(
-                    in_channels=int(256 * width),
-                    out_channels=1,
-                    kernel_size=1,
-                    stride=1,
-                    padding=0,
-                )
-            )
-
-        self.use_l1 = False
-        self.l1_loss = nn.L1Loss(reduction="none")
-        self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction="none")
-        self.iou_loss = None
-        self.strides = strides
-        self.grids = [torch.zeros(1)] * len(in_channels)
-
-    def forward(self, xin, labels=None, imgs=None):
-        outputs = []
-        for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate(
-            zip(self.cls_convs, self.reg_convs, self.strides, xin)
-        ):
-            x = self.stems[k](x)
-            cls_x = x
-            reg_x = x
-
-            cls_feat = cls_conv(cls_x)
-            cls_output = self.cls_preds[k](cls_feat)
-
-            reg_feat = reg_conv(reg_x)
-            reg_output = self.reg_preds[k](reg_feat)
-            obj_output = self.obj_preds[k](reg_feat)
-
-            output = torch.cat(
-                [reg_output, obj_output.sigmoid(), cls_output.sigmoid()], 1
-            )
-
-            outputs.append(output)
-
-        self.hw = [x.shape[-2:] for x in outputs]
-        # [batch, n_anchors_all, 85]
-        outputs = torch.cat(
-            [x.flatten(start_dim=2) for x in outputs], dim=2
-        ).permute(0, 2, 1)
-        if self.decode_in_inference:
-            return self.decode_outputs(outputs, dtype=xin[0].type())
-        else:
-            return outputs
-
-    def get_output_and_grid(self, output, k, stride, dtype):
-        grid = self.grids[k]
-
-        batch_size = output.shape[0]
-        n_ch = 5 + self.num_classes
-        hsize, wsize = output.shape[-2:]
-        if grid.shape[2:4] != output.shape[2:4]:
-            yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)])
-            grid = torch.stack((xv, yv), 2).view(1, 1, hsize, wsize, 2).type(dtype)
-            self.grids[k] = grid
-
-        output = output.view(batch_size, 1, n_ch, hsize, wsize)
-        output = output.permute(0, 1, 3, 4, 2).reshape(
-            batch_size, hsize * wsize, -1
-        )
-        grid = grid.view(1, -1, 2)
-        output[..., :2] = (output[..., :2] + grid) * stride
-        output[..., 2:4] = torch.exp(output[..., 2:4]) * stride
-        return output, grid
-
-    def decode_outputs(self, outputs, dtype):
-        grids = []
-        strides = []
-        for (hsize, wsize), stride in zip(self.hw, self.strides):
-            yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)])
-            grid = torch.stack((xv, yv), 2).view(1, -1, 2)
-            grids.append(grid)
-            shape = grid.shape[:2]
-            strides.append(torch.full((*shape, 1), stride))
-
-        grids = torch.cat(grids, dim=1).type(dtype)
-        strides = torch.cat(strides, dim=1).type(dtype)
-
-        outputs = torch.cat([
-            (outputs[..., 0:2] + grids) * strides,
-            torch.exp(outputs[..., 2:4]) * strides,
-            outputs[..., 4:]
-        ], dim=-1)
-        return outputs
yolox/yolo_pafpn.py DELETED
@@ -1,119 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-# Copyright (c) Megvii Inc. All rights reserved.
-
-import torch
-import torch.nn as nn
-
-from .darknet import CSPDarknet
-from .network_blocks import BaseConv, CSPLayer, DWConv
-
-
-class YOLOPAFPN(nn.Module):
-    """
-    YOLOv3 model. Darknet 53 is the default backbone of this model.
-    """
-
-    def __init__(
-        self,
-        depth=1.0,
-        width=1.0,
-        in_features=("dark3", "dark4", "dark5"),
-        in_channels=[256, 512, 1024],
-        depthwise=False,
-        act="silu",
-    ):
-        super().__init__()
-        self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act)
-        self.in_features = in_features
-        self.in_channels = in_channels
-        Conv = DWConv if depthwise else BaseConv
-
-        self.upsample = nn.Upsample(scale_factor=2, mode="nearest")
-        self.lateral_conv0 = BaseConv(
-            int(in_channels[2] * width), int(in_channels[1] * width), 1, 1, act=act
-        )
-        self.C3_p4 = CSPLayer(
-            int(2 * in_channels[1] * width),
-            int(in_channels[1] * width),
-            round(3 * depth),
-            False,
-            depthwise=depthwise,
-            act=act,
-        )  # cat
-
-        self.reduce_conv1 = BaseConv(
-            int(in_channels[1] * width), int(in_channels[0] * width), 1, 1, act=act
-        )
-        self.C3_p3 = CSPLayer(
-            int(2 * in_channels[0] * width),
-            int(in_channels[0] * width),
-            round(3 * depth),
-            False,
-            depthwise=depthwise,
-            act=act,
-        )
-
-        # bottom-up conv
-        self.bu_conv2 = Conv(
-            int(in_channels[0] * width), int(in_channels[0] * width), 3, 2, act=act
-        )
-        self.C3_n3 = CSPLayer(
-            int(2 * in_channels[0] * width),
-            int(in_channels[1] * width),
-            round(3 * depth),
-            False,
-            depthwise=depthwise,
-            act=act,
-        )
-
-        # bottom-up conv
-        self.bu_conv1 = Conv(
-            int(in_channels[1] * width), int(in_channels[1] * width), 3, 2, act=act
-        )
-        self.C3_n4 = CSPLayer(
-            int(2 * in_channels[1] * width),
-            int(in_channels[2] * width),
-            round(3 * depth),
-            False,
-            depthwise=depthwise,
-            act=act,
-        )
-
-    def forward(self, input):
-        """
-        Args:
-            inputs: input images.
-
-        Returns:
-            Tuple[Tensor]: FPN feature.
-        """
-
-        # backbone
-        out_features = self.backbone(input)
-        features = [out_features[f] for f in self.in_features]
-        [x2, x1, x0] = features
-
-        fpn_out0 = self.lateral_conv0(x0)  # 1024->512/32
-        f_out0 = self.upsample(fpn_out0)  # 512/16
-        f_out0 = torch.cat([f_out0, x1], 1)  # 512->1024/16
-        f_out0 = self.C3_p4(f_out0)  # 1024->512/16
-
-        fpn_out1 = self.reduce_conv1(f_out0)  # 512->256/16
-        f_out1 = self.upsample(fpn_out1)  # 256/8
-        f_out1 = torch.cat([f_out1, x2], 1)  # 256->512/8
-        pan_out2 = self.C3_p3(f_out1)  # 512->256/8
-
-        p_out1 = self.bu_conv2(pan_out2)  # 256->256/16
-        p_out1 = torch.cat([p_out1, fpn_out1], 1)  # 256->512/16
-        pan_out1 = self.C3_n3(p_out1)  # 512->512/16
-
-        p_out0 = self.bu_conv1(pan_out1)  # 512->512/32
-        p_out0 = torch.cat([p_out0, fpn_out0], 1)  # 512->1024/32
-        pan_out0 = self.C3_n4(p_out0)  # 1024->1024/32
-
-        outputs = (pan_out2, pan_out1, pan_out0)
-        return outputs
yolox/yolox.py DELETED
@@ -1,35 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-# Copyright (c) Megvii Inc. All rights reserved.
-
-import torch.nn as nn
-
-from .yolo_head import YOLOXHead
-from .yolo_pafpn import YOLOPAFPN
-
-
-class YOLOX(nn.Module):
-    """
-    YOLOX model module. The module list is defined by create_yolov3_modules function.
-    The network returns loss values from three YOLO layers during training
-    and detection results during test.
-    """
-
-    def __init__(self, backbone=None, head=None):
-        super().__init__()
-        if backbone is None:
-            backbone = YOLOPAFPN()
-        if head is None:
-            head = YOLOXHead(80)
-
-        self.backbone = backbone
-        self.head = head
-
-    def forward(self, x, targets=None):
-        assert not self.training, "Training mode not supported, please refer to the YOLOX repo"
-        fpn_outs = self.backbone(x)
-        outputs = self.head(fpn_outs)
-        return outputs