diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index e567036d..665ebbd6 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -33,7 +33,7 @@ jobs: run: pip install -e .[dev] - name: Install latest luxonis-ml - run: pip install luxonis-ml[all]@git+https://github.com/luxonis/luxonis-ml.git@main --upgrade --no-deps --force-reinstall + run: pip install luxonis-ml[all]@git+https://github.com/luxonis/luxonis-ml.git@main --upgrade --force-reinstall - name: Authenticate to Google Cloud id: google-auth diff --git a/configs/detection_heavy_model.yaml b/configs/detection_heavy_model.yaml index b56bdba6..e19621c2 100644 --- a/configs/detection_heavy_model.yaml +++ b/configs/detection_heavy_model.yaml @@ -1,4 +1,5 @@ # Example configuration for training a predefined heavy detection model +# NOTE: This example downloads pretrained COCO weights and training parameters are already prepared for fine tuning model: name: detection_heavy @@ -22,6 +23,9 @@ trainer: keep_aspect_ratio: true normalize: active: true + params: + mean: [0., 0., 0.] + std: [1, 1, 1] batch_size: 8 epochs: &epochs 300 diff --git a/configs/detection_light_model.yaml b/configs/detection_light_model.yaml index 6cd85a0f..3a1f6205 100644 --- a/configs/detection_light_model.yaml +++ b/configs/detection_light_model.yaml @@ -23,6 +23,9 @@ trainer: keep_aspect_ratio: true normalize: active: true + params: + mean: [0., 0., 0.] + std: [1, 1, 1] batch_size: 8 epochs: &epochs 300 diff --git a/configs/instance_segmentation_heavy_model.yaml b/configs/instance_segmentation_heavy_model.yaml new file mode 100644 index 00000000..c1395f0c --- /dev/null +++ b/configs/instance_segmentation_heavy_model.yaml @@ -0,0 +1,54 @@ +# Example configuration for training a predefined heavy instance segmentation model + +model: + name: instance_segmentation_heavy + predefined_model: + name: InstanceSegmentationModel + params: + variant: heavy + loss_params: + bbox_loss_weight: 60 # Should be 7.5 * accumulate_grad_batches for best results + class_loss_weight: 4 # Should be 0.5 * accumulate_grad_batches for best results + dfl_loss_weight: 12 # Should be 1.5 * accumulate_grad_batches for best results + +loader: + params: + dataset_name: coco_test + +trainer: + preprocessing: + train_image_size: [384, 512] + keep_aspect_ratio: true + normalize: + active: true + params: + mean: [0., 0., 0.] 
+ std: [1, 1, 1] + + batch_size: 8 + epochs: &epochs 300 + accumulate_grad_batches: 8 # For best results, always accumulate gradients to effectively use 64 batch size + n_workers: 8 + validation_interval: 10 + n_log_images: 8 + + callbacks: + - name: EMACallback + params: + decay: 0.9999 + use_dynamic_decay: True + decay_tau: 2000 + - name: ExportOnTrainEnd + - name: TestOnTrainEnd + + training_strategy: + name: "TripleLRSGDStrategy" + params: + warmup_epochs: 3 + warmup_bias_lr: 0.1 + warmup_momentum: 0.8 + lr: 0.01 + lre: 0.0001 + momentum: 0.937 + weight_decay: 0.0005 + nesterov: True \ No newline at end of file diff --git a/configs/instance_segmentation_light_model.yaml b/configs/instance_segmentation_light_model.yaml new file mode 100644 index 00000000..1d1736e4 --- /dev/null +++ b/configs/instance_segmentation_light_model.yaml @@ -0,0 +1,54 @@ +# Example configuration for training a predefined light instance segmentation model + +model: + name: instance_segmentation_light + predefined_model: + name: InstanceSegmentationModel + params: + variant: light + loss_params: + bbox_loss_weight: 60 # Should be 7.5 * accumulate_grad_batches for best results + class_loss_weight: 4 # Should be 0.5 * accumulate_grad_batches for best results + dfl_loss_weight: 12 # Should be 1.5 * accumulate_grad_batches for best results + +loader: + params: + dataset_name: coco_test + +trainer: + preprocessing: + train_image_size: [384, 512] + keep_aspect_ratio: true + normalize: + active: true + params: + mean: [0., 0., 0.] + std: [1, 1, 1] + + batch_size: 8 + epochs: &epochs 300 + accumulate_grad_batches: 8 # For best results, always accumulate gradients to effectively use 64 batch size + n_workers: 8 + validation_interval: 10 + n_log_images: 8 + + callbacks: + - name: EMACallback + params: + decay: 0.9999 + use_dynamic_decay: True + decay_tau: 2000 + - name: ExportOnTrainEnd + - name: TestOnTrainEnd + + training_strategy: + name: "TripleLRSGDStrategy" + params: + warmup_epochs: 3 + warmup_bias_lr: 0.1 + warmup_momentum: 0.8 + lr: 0.01 + lre: 0.0001 + momentum: 0.937 + weight_decay: 0.0005 + nesterov: True \ No newline at end of file diff --git a/configs/keypoint_bbox_heavy_model.yaml b/configs/keypoint_bbox_heavy_model.yaml index 10527921..4fabc83b 100644 --- a/configs/keypoint_bbox_heavy_model.yaml +++ b/configs/keypoint_bbox_heavy_model.yaml @@ -6,6 +6,13 @@ model: name: KeypointDetectionModel params: variant: heavy + loss_params: + iou_type: "siou" + n_warmup_epochs: 0 # No assigner warmup + iou_loss_weight: 60 # Should be 7.5 * accumulate_grad_batches for best results + class_loss_weight: 4 # Should be 0.5 * accumulate_grad_batches for best results + regr_kpts_loss_weight: 96 # Should be 12 * accumulate_grad_batches for best results + vis_kpts_loss_weight: 16 # Should be 2 * accumulate_grad_batches for best results loader: params: @@ -17,29 +24,34 @@ trainer: keep_aspect_ratio: true normalize: active: true + params: + mean: [0., 0., 0.] 
+ std: [1, 1, 1] batch_size: 8 - epochs: &epochs 200 + epochs: &epochs 300 n_workers: 4 validation_interval: 10 n_log_images: 8 + accumulate_grad_batches: 8 # For best results, always accumulate gradients to effectively use 64 batch size callbacks: + - name: EMACallback + params: + decay: 0.9999 + use_dynamic_decay: True + decay_tau: 2000 - name: ExportOnTrainEnd - name: TestOnTrainEnd - optimizer: - name: SGD - params: - lr: 0.006 - momentum: 0.937 + training_strategy: + name: "TripleLRSGDStrategy" + params: + warmup_epochs: 3 + warmup_bias_lr: 0.1 + warmup_momentum: 0.8 + lr: 0.01 + lre: 0.0001 + momentum: 0.937 weight_decay: 0.0005 - dampening: 0.0 - nesterov: true - - scheduler: - name: CosineAnnealingLR - params: - T_max: *epochs - eta_min: 0.00001 - last_epoch: -1 + nesterov: True \ No newline at end of file diff --git a/configs/keypoint_bbox_light_model.yaml b/configs/keypoint_bbox_light_model.yaml index 57042b04..303dca31 100644 --- a/configs/keypoint_bbox_light_model.yaml +++ b/configs/keypoint_bbox_light_model.yaml @@ -6,6 +6,13 @@ model: name: KeypointDetectionModel params: variant: light + loss_params: + iou_type: "siou" + n_warmup_epochs: 0 # No assigner warmup + iou_loss_weight: 60 # Should be 7.5 * accumulate_grad_batches for best results + class_loss_weight: 4 # Should be 0.5 * accumulate_grad_batches for best results + regr_kpts_loss_weight: 96 # Should be 12 * accumulate_grad_batches for best results + vis_kpts_loss_weight: 16 # Should be 2 * accumulate_grad_batches for best results loader: params: @@ -17,29 +24,34 @@ trainer: keep_aspect_ratio: true normalize: active: true + params: + mean: [0., 0., 0.] + std: [1, 1, 1] batch_size: 8 - epochs: &epochs 200 + epochs: &epochs 300 n_workers: 4 validation_interval: 10 n_log_images: 8 + accumulate_grad_batches: 8 # For best results, always accumulate gradients to effectively use 64 batch size callbacks: + - name: EMACallback + params: + decay: 0.9999 + use_dynamic_decay: True + decay_tau: 2000 - name: ExportOnTrainEnd - name: TestOnTrainEnd - optimizer: - name: SGD - params: - lr: 0.006 - momentum: 0.937 + training_strategy: + name: "TripleLRSGDStrategy" + params: + warmup_epochs: 3 + warmup_bias_lr: 0.1 + warmup_momentum: 0.8 + lr: 0.01 + lre: 0.0001 + momentum: 0.937 weight_decay: 0.0005 - dampening: 0.0 - nesterov: true - - scheduler: - name: CosineAnnealingLR - params: - T_max: *epochs - eta_min: 0.00001 - last_epoch: -1 + nesterov: True diff --git a/luxonis_train/assigners/tal_assigner.py b/luxonis_train/assigners/tal_assigner.py index b289fbd6..c40b4934 100644 --- a/luxonis_train/assigners/tal_assigner.py +++ b/luxonis_train/assigners/tal_assigner.py @@ -254,10 +254,4 @@ def _get_final_assignments( torch.full_like(assigned_scores, 0), ) - assigned_labels = torch.where( - mask_pos_sum.bool(), - assigned_labels, - torch.full_like(assigned_labels, self.n_classes), - ) - return assigned_labels, assigned_bboxes, assigned_scores diff --git a/luxonis_train/attached_modules/losses/README.md b/luxonis_train/attached_modules/losses/README.md index aa1b9ca6..ffe218d4 100644 --- a/luxonis_train/attached_modules/losses/README.md +++ b/luxonis_train/attached_modules/losses/README.md @@ -12,6 +12,8 @@ List of all the available loss functions. 
- [`AdaptiveDetectionLoss`](#adaptivedetectionloss) - [`EfficientKeypointBBoxLoss`](#efficientkeypointbboxloss) - [`FOMOLocalizationLoss`](#fomolocalizationLoss) +- [`PrecisionDFLDetectionLoss`](#precisiondfldetectionloss) +- [`PrecisionDFLSegmentationLoss`](#precisiondflsegmentationloss) + ## `CrossEntropyLoss` @@ -97,7 +99,7 @@ Keypoint Similarity Loss](https://arxiv.org/ftp/arxiv/papers/2204/2204.06806.pdf | `class_loss_weight` | `float` | `1.0` | Weight used for the classification sub-loss | | `iou_loss_weight` | `float` | `2.5` | Weight used for the `IoU` sub-loss | | `regr_kpts_loss_weight` | `float` | `1.5` | Weight used for the `OKS` sub-loss | -| `vis_kpts_loss_weight` | `float` | `1.0` | Weight used for the keypoint visibility sub-loss | +| `vis_kpts_loss_weight` | `float` | `2.0` | Weight used for the keypoint visibility sub-loss | | `sigmas` | `list[float] \ None` | `None` | Sigmas used in `KeypointLoss` for `OKS` metric. If `None` then use COCO ones if possible or default ones | | `area_factor` | `float \| None` | `None` | Factor by which we multiply bounding box area which is used in `KeypointLoss.` If `None` then use default one | @@ -120,4 +122,30 @@ Adapted from [here](https://arxiv.org/abs/2108.07610). | Key | Type | Default value | Description | | --------------- | ------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `object_weight` | `float` | `1000` | Weight for the objects in the loss calculation. Training with a larger `object_weight` in the loss parameters may result in more false positives (FP), but it will improve accuracy. | +| `object_weight` | `float` | `500` | Weight for the objects in the loss calculation. Training with a larger `object_weight` in the loss parameters may result in more false positives (FP), but it will improve accuracy. | + +## `PrecisionDFLDetectionLoss` + +Adapted from [here](https://arxiv.org/pdf/2207.02696.pdf) and [here](https://arxiv.org/pdf/2209.02976.pdf). + +**Parameters:** + +| Key | Type | Default value | Description | | ------------------- | ------- | ------------- | ------------------------------------------ | +| `tal_topk` | `int` | `10` | Number of anchors considered in selection. | +| `class_loss_weight` | `float` | `0.5` | Weight for classification loss. | +| `bbox_loss_weight` | `float` | `7.5` | Weight for bbox loss. | +| `dfl_loss_weight` | `float` | `1.5` | Weight for DFL loss. | + +## `PrecisionDFLSegmentationLoss` + +Adapted from [here](https://arxiv.org/pdf/2207.02696.pdf) and [here](https://arxiv.org/pdf/2209.02976.pdf). + +**Parameters:** + +| Key | Type | Default value | Description | | ------------------- | ------- | ------------- | ------------------------------------------ | +| `tal_topk` | `int` | `10` | Number of anchors considered in selection. | +| `class_loss_weight` | `float` | `0.5` | Weight for classification loss. | +| `bbox_loss_weight` | `float` | `7.5` | Weight for bbox and segmentation loss. | +| `dfl_loss_weight` | `float` | `1.5` | Weight for DFL loss.
| diff --git a/luxonis_train/attached_modules/losses/__init__.py b/luxonis_train/attached_modules/losses/__init__.py index ff0bafc8..32b33174 100644 --- a/luxonis_train/attached_modules/losses/__init__.py +++ b/luxonis_train/attached_modules/losses/__init__.py @@ -7,6 +7,8 @@ from .ohem_bce_with_logits import OHEMBCEWithLogitsLoss from .ohem_cross_entropy import OHEMCrossEntropyLoss from .ohem_loss import OHEMLoss +from .precision_dfl_detection_loss import PrecisionDFLDetectionLoss +from .precision_dlf_segmentation_loss import PrecisionDFLSegmentationLoss from .reconstruction_segmentation_loss import ReconstructionSegmentationLoss from .sigmoid_focal_loss import SigmoidFocalLoss from .smooth_bce_with_logits import SmoothBCEWithLogitsLoss @@ -26,4 +28,6 @@ "OHEMCrossEntropyLoss", "OHEMBCEWithLogitsLoss", "FOMOLocalizationLoss", + "PrecisionDFLDetectionLoss", + "PrecisionDFLSegmentationLoss", ] diff --git a/luxonis_train/attached_modules/losses/adaptive_detection_loss.py b/luxonis_train/attached_modules/losses/adaptive_detection_loss.py index 6a7f57f2..5e212249 100644 --- a/luxonis_train/attached_modules/losses/adaptive_detection_loss.py +++ b/luxonis_train/attached_modules/losses/adaptive_detection_loss.py @@ -56,9 +56,9 @@ def __init__( @type reduction: Literal["sum", "mean"] @param reduction: Reduction type for loss. @type class_loss_weight: float - @param class_loss_weight: Weight of classification loss. + @param class_loss_weight: Weight of classification loss. Defaults to 1.0. For optimal results, multiply with accumulate_grad_batches. @type iou_loss_weight: float - @param iou_loss_weight: Weight of IoU loss. + @param iou_loss_weight: Weight of IoU loss. Defaults to 2.5. For optimal results, multiply with accumulate_grad_batches. """ super().__init__(**kwargs) @@ -133,6 +133,11 @@ def forward( assigned_scores: Tensor, mask_positive: Tensor, ) -> tuple[Tensor, dict[str, Tensor]]: + assigned_labels = torch.where( + mask_positive > 0, + assigned_labels, + torch.full_like(assigned_labels, self.n_classes), + ) one_hot_label = F.one_hot(assigned_labels.long(), self.n_classes + 1)[ ..., :-1 ] diff --git a/luxonis_train/attached_modules/losses/efficient_keypoint_bbox_loss.py b/luxonis_train/attached_modules/losses/efficient_keypoint_bbox_loss.py index 98630742..09cf7124 100644 --- a/luxonis_train/attached_modules/losses/efficient_keypoint_bbox_loss.py +++ b/luxonis_train/attached_modules/losses/efficient_keypoint_bbox_loss.py @@ -38,7 +38,7 @@ def __init__( iou_loss_weight: float = 7.5, viz_pw: float = 1.0, regr_kpts_loss_weight: float = 12, - vis_kpts_loss_weight: float = 1.0, + vis_kpts_loss_weight: float = 2.0, sigmas: list[float] | None = None, area_factor: float | None = None, **kwargs: Any, @@ -57,11 +57,11 @@ def __init__( @type class_loss_weight: float @param class_loss_weight: Weight of classification loss for bounding boxes. @type regr_kpts_loss_weight: float - @param regr_kpts_loss_weight: Weight of regression loss for keypoints. + @param regr_kpts_loss_weight: Weight of regression loss for keypoints. Defaults to 12.0. For optimal results, multiply with accumulate_grad_batches. @type vis_kpts_loss_weight: float - @param vis_kpts_loss_weight: Weight of visibility loss for keypoints. + @param vis_kpts_loss_weight: Weight of visibility loss for keypoints. Defaults to 2.0. For optimal results, multiply with accumulate_grad_batches. @type iou_loss_weight: float - @param iou_loss_weight: Weight of IoU loss. + @param iou_loss_weight: Weight of IoU loss. Defaults to 7.5.
For optimal results, multiply with accumulate_grad_batches. @type sigmas: list[float] | None @param sigmas: Sigmas used in keypoint loss for OKS metric. If None then use COCO ones if possible or default ones. Defaults to C{None}. @type area_factor: float | None @@ -188,6 +188,11 @@ def forward( pred_kpts: Tensor, area: Tensor, ) -> tuple[Tensor, dict[str, Tensor]]: + assigned_labels = torch.where( + mask_positive > 0, + assigned_labels, + torch.full_like(assigned_labels, self.n_classes), + ) device = pred_bboxes.device sigmas = self.sigmas.to(device) d = (gt_kpts[..., 0] - pred_kpts[..., 0]).pow(2) + ( diff --git a/luxonis_train/attached_modules/losses/precision_dfl_detection_loss.py b/luxonis_train/attached_modules/losses/precision_dfl_detection_loss.py new file mode 100644 index 00000000..14817ce4 --- /dev/null +++ b/luxonis_train/attached_modules/losses/precision_dfl_detection_loss.py @@ -0,0 +1,292 @@ +import logging +from typing import Any, cast + +import torch +import torch.nn.functional as F +from torch import Tensor, nn +from torchvision.ops import box_convert + +from luxonis_train.assigners import TaskAlignedAssigner +from luxonis_train.enums import TaskType +from luxonis_train.nodes import PrecisionBBoxHead +from luxonis_train.utils import ( + Labels, + Packet, + anchors_for_fpn_features, + bbox2dist, + bbox_iou, + dist2bbox, +) + +from .base_loss import BaseLoss + +logger = logging.getLogger(__name__) + + +class PrecisionDFLDetectionLoss( + BaseLoss[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor] +): + node: PrecisionBBoxHead + supported_tasks: list[TaskType] = [TaskType.BOUNDINGBOX] + + def __init__( + self, + tal_topk: int = 10, + class_loss_weight: float = 0.5, + bbox_loss_weight: float = 7.5, + dfl_loss_weight: float = 1.5, + **kwargs: Any, + ): + """BBox loss adapted from U{Real-Time Flying Object Detection with YOLOv8 + } and from U{YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications + }. + Code is adapted from U{https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/models}. + + @type tal_topk: int + @param tal_topk: Number of anchors considered in selection. Defaults to 10. + @type class_loss_weight: float + @param class_loss_weight: Weight for classification loss. Defaults to 0.5. For optimal results, multiply with accumulate_grad_batches. + @type bbox_loss_weight: float + @param bbox_loss_weight: Weight for bbox loss. Defaults to 7.5. For optimal results, multiply with accumulate_grad_batches. + @type dfl_loss_weight: float + @param dfl_loss_weight: Weight for DFL loss. Defaults to 1.5. For optimal results, multiply with accumulate_grad_batches. 
+ """ + super().__init__(**kwargs) + self.stride = self.node.stride + self.grid_cell_size = self.node.grid_cell_size + self.grid_cell_offset = self.node.grid_cell_offset + self.original_img_size = self.original_in_shape[1:] + + self.class_loss_weight = class_loss_weight + self.bbox_loss_weight = bbox_loss_weight + self.dfl_loss_weight = dfl_loss_weight + + self.assigner = TaskAlignedAssigner( + n_classes=self.n_classes, topk=tal_topk, alpha=0.5, beta=6.0 + ) + self.bbox_loss = CustomBboxLoss(self.node.reg_max) + self.proj = torch.arange(self.node.reg_max, dtype=torch.float) + self.bce = nn.BCEWithLogitsLoss(reduction="none") + + def prepare( + self, inputs: Packet[Tensor], labels: Labels + ) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: + feats = self.get_input_tensors(inputs, "features") + self._init_parameters(feats) + batch_size = feats[0].shape[0] + pred_distri, pred_scores = torch.cat( + [xi.view(batch_size, self.node.no, -1) for xi in feats], 2 + ).split((self.node.reg_max * 4, self.n_classes), 1) + target = self.get_label(labels) + pred_distri = pred_distri.permute(0, 2, 1).contiguous() + pred_scores = pred_scores.permute(0, 2, 1).contiguous() + + target = self._preprocess_bbox_target(target, batch_size) + + pred_bboxes = self.decode_bbox(self.anchor_points_strided, pred_distri) + + gt_labels = target[:, :, :1] + gt_xyxy = target[:, :, 1:] + mask_gt = (gt_xyxy.sum(-1, keepdim=True) > 0).float() + + _, assigned_bboxes, assigned_scores, mask_positive, _ = self.assigner( + pred_scores.detach().sigmoid(), + (pred_bboxes.detach() * self.stride_tensor).type(gt_xyxy.dtype), + self.anchor_points, + gt_labels, + gt_xyxy, + mask_gt, + ) + + return ( + pred_distri, + pred_bboxes, + pred_scores, + assigned_bboxes / self.stride_tensor, + assigned_scores, + mask_positive, + ) + + def forward( + self, + pred_distri: Tensor, + pred_bboxes: Tensor, + pred_scores: Tensor, + assigned_bboxes: Tensor, + assigned_scores: Tensor, + mask_positive: Tensor, + ): + max_assigned_scores_sum = max(assigned_scores.sum().item(), 1) + loss_cls = ( + self.bce(pred_scores, assigned_scores) + ).sum() / max_assigned_scores_sum + if mask_positive.sum(): + loss_iou, loss_dfl = self.bbox_loss( + pred_distri, + pred_bboxes, + self.anchor_points_strided, + assigned_bboxes, + assigned_scores, + max_assigned_scores_sum, + mask_positive, + ) + else: + loss_iou = torch.tensor(0.0).to(pred_distri.device) + loss_dfl = torch.tensor(0.0).to(pred_distri.device) + + loss = ( + self.class_loss_weight * loss_cls + + self.bbox_loss_weight * loss_iou + + self.dfl_loss_weight * loss_dfl + ) + sub_losses = { + "class": loss_cls.detach(), + "iou": loss_iou.detach(), + "dfl": loss_dfl.detach(), + } + + return loss, sub_losses + + def _preprocess_bbox_target( + self, target: Tensor, batch_size: int + ) -> Tensor: + sample_ids, counts = cast( + tuple[Tensor, Tensor], + torch.unique(target[:, 0].int(), return_counts=True), + ) + c_max = int(counts.max()) if counts.numel() > 0 else 0 + out_target = torch.zeros(batch_size, c_max, 5, device=target.device) + out_target[:, :, 0] = -1 + for id, count in zip(sample_ids, counts): + out_target[id, :count] = target[target[:, 0] == id][:, 1:] + + scaled_target = out_target[:, :, 1:5] * self.gt_bboxes_scale + out_target[..., 1:] = box_convert(scaled_target, "xywh", "xyxy") + + return out_target + + def decode_bbox(self, anchor_points: Tensor, pred_dist: Tensor) -> Tensor: + """Decode predicted object bounding box coordinates from anchor + points and distribution. 
+ + @type anchor_points: Tensor + @param anchor_points: Anchor points tensor of shape [N, 4] where + N is the number of anchors. + @type pred_dist: Tensor + @param pred_dist: Predicted distribution tensor of shape + [batch_size, N, 4 * reg_max] where N is the number of + anchors. + @rtype: Tensor + """ + if self.node.dfl: + batch_size, num_anchors, num_channels = pred_dist.shape + dist_probs = pred_dist.view( + batch_size, num_anchors, 4, num_channels // 4 + ).softmax(dim=3) + dist_transformed = dist_probs.matmul( + self.proj.to(anchor_points.device).type(pred_dist.dtype) + ) + return dist2bbox(dist_transformed, anchor_points, out_format="xyxy") + + def _init_parameters(self, features: list[Tensor]): + if not hasattr(self, "gt_bboxes_scale"): + _, self.anchor_points, _, self.stride_tensor = ( + anchors_for_fpn_features( + features, + self.stride, + self.grid_cell_size, + self.grid_cell_offset, + multiply_with_stride=True, + ) + ) + self.gt_bboxes_scale = torch.tensor( + [ + self.original_img_size[1], + self.original_img_size[0], + self.original_img_size[1], + self.original_img_size[0], + ], + device=features[0].device, + ) + self.anchor_points_strided = ( + self.anchor_points / self.stride_tensor + ) + + +class CustomBboxLoss(nn.Module): + def __init__(self, reg_max: int = 16): + """BBox loss that combines IoU and DFL losses. + + @type reg_max: int + @param reg_max: Maximum number of regression channels. Defaults + to 16. + """ + super().__init__() + self.dist_loss = CustomDFLoss(reg_max) if reg_max > 1 else None + + def forward( + self, + pred_dist: Tensor, + pred_bboxes: Tensor, + anchors: Tensor, + targets: Tensor, + scores: Tensor, + total_score: Tensor, + fg_mask: Tensor, + ) -> tuple[Tensor, Tensor]: + score_weights = scores.sum(dim=-1)[fg_mask].unsqueeze(dim=-1) + + iou_vals = bbox_iou( + pred_bboxes[fg_mask], + targets[fg_mask], + iou_type="ciou", + element_wise=True, + ).unsqueeze(dim=-1) + iou_loss_val = ((1.0 - iou_vals) * score_weights).sum() / total_score + + if self.dist_loss is not None: + offset_targets = bbox2dist( + targets, anchors, self.dist_loss.reg_max - 1 + ) + dfl_loss_val = ( + self.dist_loss( + pred_dist[fg_mask].view(-1, self.dist_loss.reg_max), + offset_targets[fg_mask], + ) + * score_weights + ) + dfl_loss_val = dfl_loss_val.sum() / total_score + else: + dfl_loss_val = torch.zeros(1, device=pred_dist.device) + + return iou_loss_val, dfl_loss_val + + +class CustomDFLoss(nn.Module): + def __init__(self, reg_max: int = 16): + """DFL loss that combines classification and regression losses. + + @type reg_max: int + @param reg_max: Maximum number of regression channels. Defaults + to 16. 
+ """ + super().__init__() + self.reg_max = reg_max + + def __call__(self, pred_dist: Tensor, targets: Tensor) -> Tensor: + targets = targets.clamp(0, self.reg_max - 1 - 0.01) + left_target = targets.floor().long() + right_target = left_target + 1 + weight_left = right_target - targets + weight_right = 1.0 - weight_left + + left_val = F.cross_entropy( + pred_dist, left_target.view(-1), reduction="none" + ).view(left_target.shape) + right_val = F.cross_entropy( + pred_dist, right_target.view(-1), reduction="none" + ).view(left_target.shape) + + return (left_val * weight_left + right_val * weight_right).mean( + dim=-1, keepdim=True + ) diff --git a/luxonis_train/attached_modules/losses/precision_dlf_segmentation_loss.py b/luxonis_train/attached_modules/losses/precision_dlf_segmentation_loss.py new file mode 100644 index 00000000..54f960a9 --- /dev/null +++ b/luxonis_train/attached_modules/losses/precision_dlf_segmentation_loss.py @@ -0,0 +1,248 @@ +import logging +from typing import Any + +import torch +import torch.nn.functional as F +from torch import Tensor +from torchvision.ops import box_convert + +from luxonis_train.attached_modules.losses.precision_dfl_detection_loss import ( + PrecisionDFLDetectionLoss, +) +from luxonis_train.enums import TaskType +from luxonis_train.nodes import PrecisionSegmentBBoxHead +from luxonis_train.utils import ( + Labels, + Packet, + apply_bounding_box_to_masks, +) + +logger = logging.getLogger(__name__) + + +class PrecisionDFLSegmentationLoss(PrecisionDFLDetectionLoss): + node: PrecisionSegmentBBoxHead + supported_tasks: list[TaskType] = [ + TaskType.BOUNDINGBOX, + TaskType.INSTANCE_SEGMENTATION, + ] + + def __init__( + self, + tal_topk: int = 10, + class_loss_weight: float = 0.5, + bbox_loss_weight: float = 7.5, + dfl_loss_weight: float = 1.5, + **kwargs: Any, + ): + """Instance Segmentation and BBox loss adapted from U{Real-Time Flying Object Detection with YOLOv8 + } and from U{YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications + }. + Code is adapted from U{https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/models}. + + @type tal_topk: int + @param tal_topk: Number of anchors considered in selection. Defaults to 10. + @type class_loss_weight: float + @param class_loss_weight: Weight for classification loss. Defaults to 0.5. For optimal results, multiply with accumulate_grad_batches. + @type bbox_loss_weight: float + @param bbox_loss_weight: Weight for bbox loss. Defaults to 7.5. For optimal results, multiply with accumulate_grad_batches. + @type dfl_loss_weight: float + @param dfl_loss_weight: Weight for DFL loss. Defaults to 1.5. For optimal results, multiply with accumulate_grad_batches. 
+ """ + super().__init__( + tal_topk=tal_topk, + class_loss_weight=class_loss_weight, + bbox_loss_weight=bbox_loss_weight, + dfl_loss_weight=dfl_loss_weight, + **kwargs, + ) + + def prepare( + self, inputs: Packet[Tensor], labels: Labels + ) -> tuple[ + Tensor, + Tensor, + Tensor, + Tensor, + Tensor, + Tensor, + Tensor, + Tensor, + Tensor, + Tensor, + Tensor, + ]: + det_feats = self.get_input_tensors(inputs, "features") + proto = self.get_input_tensors(inputs, "prototypes")[0] + pred_mask = self.get_input_tensors(inputs, "mask_coeficients")[0] + self._init_parameters(det_feats) + batch_size, _, mask_h, mask_w = proto.shape + pred_distri, pred_scores = torch.cat( + [xi.view(batch_size, self.node.no, -1) for xi in det_feats], 2 + ).split((self.node.reg_max * 4, self.n_classes), 1) + target_bbox = self.get_label(labels, TaskType.BOUNDINGBOX) + img_idx = target_bbox[:, 0].unsqueeze(-1) + target_masks = self.get_label(labels, TaskType.INSTANCE_SEGMENTATION) + if tuple(target_masks.shape[-2:]) != (mask_h, mask_w): + target_masks = F.interpolate( + target_masks.unsqueeze(0), (mask_h, mask_w), mode="nearest" + ).squeeze(0) + + pred_distri = pred_distri.permute(0, 2, 1).contiguous() + pred_scores = pred_scores.permute(0, 2, 1).contiguous() + pred_mask = pred_mask.permute(0, 2, 1).contiguous() + + target_bbox = self._preprocess_bbox_target(target_bbox, batch_size) + + pred_bboxes = self.decode_bbox(self.anchor_points_strided, pred_distri) + + gt_labels = target_bbox[:, :, :1] + gt_xyxy = target_bbox[:, :, 1:] + mask_gt = (gt_xyxy.sum(-1, keepdim=True) > 0).float() + + _, assigned_bboxes, assigned_scores, mask_positive, assigned_gt_idx = ( + self.assigner( + pred_scores.detach().sigmoid(), + (pred_bboxes.detach() * self.stride_tensor).type( + gt_xyxy.dtype + ), + self.anchor_points, + gt_labels, + gt_xyxy, + mask_gt, + ) + ) + + return ( + pred_distri, + pred_bboxes, + pred_scores, + assigned_bboxes, + assigned_scores, + mask_positive, + assigned_gt_idx, + pred_mask, + proto, + target_masks, + img_idx, + ) + + def forward( + self, + pred_distri: Tensor, + pred_bboxes: Tensor, + pred_scores: Tensor, + assigned_bboxes: Tensor, + assigned_scores: Tensor, + mask_positive: Tensor, + assigned_gt_idx: Tensor, + pred_masks: Tensor, + proto: Tensor, + target_masks: Tensor, + img_idx: Tensor, + ): + max_assigned_scores_sum = max(assigned_scores.sum().item(), 1) + loss_cls = ( + self.bce(pred_scores, assigned_scores) + ).sum() / max_assigned_scores_sum + if mask_positive.sum(): + loss_iou, loss_dfl = self.bbox_loss( + pred_distri, + pred_bboxes, + self.anchor_points_strided, + assigned_bboxes / self.stride_tensor, + assigned_scores, + max_assigned_scores_sum, + mask_positive, + ) + else: + loss_iou = torch.tensor(0.0).to(pred_distri.device) + loss_dfl = torch.tensor(0.0).to(pred_distri.device) + + loss_seg = self.compute_segmentation_loss( + mask_positive, + target_masks, + assigned_gt_idx, + assigned_bboxes, + img_idx, + proto, + pred_masks, + ) + + loss = ( + self.class_loss_weight * loss_cls + + self.bbox_loss_weight * loss_iou + + self.dfl_loss_weight * loss_dfl + + self.bbox_loss_weight * loss_seg + ) + sub_losses = { + "class": loss_cls.detach(), + "iou": loss_iou.detach(), + "dfl": loss_dfl.detach(), + "seg": loss_seg.detach(), + } + + return loss, sub_losses + + def compute_segmentation_loss( + self, + fg_mask: torch.Tensor, + gt_masks: torch.Tensor, + gt_idx: torch.Tensor, + bboxes: torch.Tensor, + batch_ids: torch.Tensor, + proto: torch.Tensor, + pred_masks: torch.Tensor, + ) -> torch.Tensor: + 
"""Compute the segmentation loss for the entire batch. + + @type fg_mask: torch.Tensor + @param fg_mask: Foreground mask. Shape: (B, N_anchor). + @type gt_masks: torch.Tensor + @param gt_masks: Ground truth masks. Shape: (n, H, W). + @type gt_idx: torch.Tensor + @param gt_idx: Ground truth mask indices. Shape: (B, N_anchor). + @type bboxes: torch.Tensor + @param bboxes: Ground truth bounding boxes in xyxy format. + Shape: (B, N_anchor, 4). + @type batch_ids: torch.Tensor + @param batch_ids: Batch indices. Shape: (n, 1). + @type proto: torch.Tensor + @param proto: Prototype masks. Shape: (B, 32, H, W). + @type pred_masks: torch.Tensor + @param pred_masks: Predicted mask coefficients. Shape: (B, + N_anchor, 32). + """ + _, _, h, w = proto.shape + total_loss = 0 + bboxes_norm = bboxes / self.gt_bboxes_scale + bbox_area = box_convert(bboxes_norm, in_fmt="xyxy", out_fmt="xywh")[ + ..., 2: + ].prod(2) + bboxes_scaled = bboxes_norm * torch.tensor( + [w, h, w, h], device=proto.device + ) + + for img_idx, data in enumerate( + zip(fg_mask, gt_idx, pred_masks, proto, bboxes_scaled, bbox_area) + ): + fg, gt, pred, pr, bbox, area = data + if fg.any(): + mask_ids = gt[fg] + gt_mask = gt_masks[batch_ids.view(-1) == img_idx][mask_ids] + + # Compute individual image mask loss + pred_mask = torch.einsum("in,nhw->ihw", pred[fg], pr) + loss = F.binary_cross_entropy_with_logits( + pred_mask, gt_mask, reduction="none" + ) + total_loss += ( + apply_bounding_box_to_masks(loss, bbox[fg]).mean( + dim=(1, 2) + ) + / area[fg] + ).sum() + else: + total_loss += (proto * 0).sum() + (pred_masks * 0).sum() + + return total_loss / fg_mask.sum() diff --git a/luxonis_train/attached_modules/metrics/mean_average_precision.py b/luxonis_train/attached_modules/metrics/mean_average_precision.py index c082ee39..53d1a7f9 100644 --- a/luxonis_train/attached_modules/metrics/mean_average_precision.py +++ b/luxonis_train/attached_modules/metrics/mean_average_precision.py @@ -1,5 +1,6 @@ from typing import Any +import torch import torchmetrics.detection as detection from torch import Tensor from torchvision.ops import box_convert @@ -14,18 +15,30 @@ class MeanAveragePrecision( BaseMetric[list[dict[str, Tensor]], list[dict[str, Tensor]]] ): """Compute the Mean-Average-Precision (mAP) and Mean-Average-Recall - (mAR) for object detection predictions. + (mAR) for object detection predictions and instance segmentation. Adapted from U{Mean-Average-Precision (mAP) and Mean-Average-Recall (mAR) }. 
""" - supported_tasks: list[TaskType] = [TaskType.BOUNDINGBOX] + supported_tasks: list[TaskType] = [ + TaskType.BOUNDINGBOX, + TaskType.INSTANCE_SEGMENTATION, + ] def __init__(self, **kwargs: Any): super().__init__(**kwargs) - self.metric = detection.MeanAveragePrecision() + self.is_segmentation = (self.node.tasks is not None) and ( + TaskType.INSTANCE_SEGMENTATION in self.node.tasks + ) + + if self.is_segmentation: + iou_type = ("bbox", "segm") + else: + iou_type = "bbox" + + self.metric = detection.MeanAveragePrecision(iou_type=iou_type) # type: ignore def update( self, @@ -37,29 +50,53 @@ def update( def prepare( self, inputs: Packet[Tensor], labels: Labels ) -> tuple[list[dict[str, Tensor]], list[dict[str, Tensor]]]: - box_label = self.get_label(labels) - output_nms = self.get_input_tensors(inputs) - + box_label = self.get_label(labels, TaskType.BOUNDINGBOX) + mask_label = ( + self.get_label(labels, TaskType.INSTANCE_SEGMENTATION) + if self.is_segmentation + else None + ) + + output_nms_bboxes = self.get_input_tensors( + inputs, TaskType.BOUNDINGBOX + ) + output_nms_masks = ( + self.get_input_tensors(inputs, TaskType.INSTANCE_SEGMENTATION) + if self.is_segmentation + else None + ) image_size = self.original_in_shape[1:] output_list: list[dict[str, Tensor]] = [] label_list: list[dict[str, Tensor]] = [] - for i in range(len(output_nms)): - output_list.append( - { - "boxes": output_nms[i][:, :4], - "scores": output_nms[i][:, 4], - "labels": output_nms[i][:, 5].int(), - } - ) - + for i in range(len(output_nms_bboxes)): + # Prepare predictions + pred = { + "boxes": output_nms_bboxes[i][:, :4], + "scores": output_nms_bboxes[i][:, 4], + "labels": output_nms_bboxes[i][:, 5].int(), + } + if self.is_segmentation: + pred["masks"] = output_nms_masks[i].to( # type: ignore + dtype=torch.bool + ) # Predicted masks (M, H, W) + output_list.append(pred) + + # Prepare ground truth curr_label = box_label[box_label[:, 0] == i] curr_bboxs = box_convert(curr_label[:, 2:], "xywh", "xyxy") curr_bboxs[:, 0::2] *= image_size[1] curr_bboxs[:, 1::2] *= image_size[0] - label_list.append( - {"boxes": curr_bboxs, "labels": curr_label[:, 1].int()} - ) + + gt = { + "boxes": curr_bboxs, + "labels": curr_label[:, 1].int(), + } + if self.is_segmentation: + gt["masks"] = mask_label[box_label[:, 0] == i].to( # type: ignore + dtype=torch.bool + ) + label_list.append(gt) return output_list, label_list @@ -69,21 +106,52 @@ def reset(self) -> None: def compute(self) -> tuple[Tensor, dict[str, Tensor]]: metric_dict: dict[str, Tensor] = self.metric.compute() - del metric_dict["classes"] - del metric_dict["map_per_class"] - del metric_dict["mar_100_per_class"] - for key in list(metric_dict.keys()): - if "map" in key: - map = metric_dict[key] - mar_key = key.replace("map", "mar") - if mar_key in metric_dict: - mar = metric_dict[mar_key] - metric_dict[key.replace("map", "f1")] = ( - 2 * (map * mar) / (map + mar) - ) - - map = metric_dict.pop("map") + if self.is_segmentation: + keys_to_remove = [ + "classes", + "bbox_map_per_class", + "bbox_mar_100_per_class", + "segm_map_per_class", + "segm_mar_100_per_class", + ] + for key in keys_to_remove: + if key in metric_dict: + del metric_dict[key] + + for key in list(metric_dict.keys()): + if "map" in key: + map_metric = metric_dict[key] + mar_key = key.replace("map", "mar") + if mar_key in metric_dict: + mar_metric = metric_dict[mar_key] + metric_dict[key.replace("map", "f1")] = ( + 2 + * (map_metric * mar_metric) + / (map_metric + mar_metric) + ) + + scalar = metric_dict.get("segm_map", 
torch.tensor(0.0)) + else: + del metric_dict["classes"] + del metric_dict["map_per_class"] + del metric_dict["mar_100_per_class"] + + for key in list(metric_dict.keys()): + if "map" in key: + map_metric = metric_dict[key] + mar_key = key.replace("map", "mar") + if mar_key in metric_dict: + mar_metric = metric_dict[mar_key] + metric_dict[key.replace("map", "f1")] = ( + 2 + * (map_metric * mar_metric) + / (map_metric + mar_metric) + ) + + scalar = metric_dict.pop("map", torch.tensor(0.0)) + # WARNING: fix DDP pl.log error - map = map.to(self.device) metric_dict = {k: v.to(self.device) for k, v in metric_dict.items()} - return map, metric_dict + scalar = scalar.to(self.device) + + return scalar, metric_dict diff --git a/luxonis_train/attached_modules/visualizers/__init__.py b/luxonis_train/attached_modules/visualizers/__init__.py index 50b90471..1bd65f50 100644 --- a/luxonis_train/attached_modules/visualizers/__init__.py +++ b/luxonis_train/attached_modules/visualizers/__init__.py @@ -1,6 +1,7 @@ from .base_visualizer import BaseVisualizer from .bbox_visualizer import BBoxVisualizer from .classification_visualizer import ClassificationVisualizer +from .instance_segmentation_visualizer import InstanceSegmentationVisualizer from .keypoint_visualizer import KeypointVisualizer from .multi_visualizer import MultiVisualizer from .segmentation_visualizer import SegmentationVisualizer @@ -23,6 +24,7 @@ "KeypointVisualizer", "MultiVisualizer", "SegmentationVisualizer", + "InstanceSegmentationVisualizer", "combine_visualizations", "draw_bounding_box_labels", "draw_keypoint_labels", diff --git a/luxonis_train/attached_modules/visualizers/instance_segmentation_visualizer.py b/luxonis_train/attached_modules/visualizers/instance_segmentation_visualizer.py new file mode 100644 index 00000000..829cfbb8 --- /dev/null +++ b/luxonis_train/attached_modules/visualizers/instance_segmentation_visualizer.py @@ -0,0 +1,262 @@ +import logging + +import torch +from torch import Tensor + +from luxonis_train.enums import TaskType +from luxonis_train.utils import Labels, Packet + +from .base_visualizer import BaseVisualizer +from .utils import ( + Color, + draw_bounding_box_labels, + draw_bounding_boxes, + draw_segmentation_labels, + get_color, +) + +logger = logging.getLogger(__name__) + + +class InstanceSegmentationVisualizer(BaseVisualizer[Tensor, Tensor]): + """Visualizer for instance segmentation tasks, supporting the + visualization of predicted and ground truth bounding boxes and + instance segmentation masks.""" + + supported_tasks: list[TaskType] = [ + TaskType.INSTANCE_SEGMENTATION, + TaskType.BOUNDINGBOX, + ] + + def __init__( + self, + labels: dict[int, str] | list[str] | None = None, + draw_labels: bool = True, + colors: dict[str, Color] | list[Color] | None = None, + fill: bool = False, + width: int | None = None, + font: str | None = None, + font_size: int | None = None, + alpha: float = 0.6, + **kwargs, + ): + """Visualizer for instance segmentation tasks. + + @type labels: dict[int, str] | list[str] | None + @param labels: Dictionary mapping class indices to class labels. + @type draw_labels: bool + @param draw_labels: Whether to draw class labels on the + visualizations. + @type colors: dict[str, L{Color}] | list[L{Color}] | None + @param colors: Dictionary mapping class labels to colors. + @type fill: bool | None + @param fill: Whether to fill the bounding box with color. + @type width: int | None + @param width: Width of the bounding box lines.
+ @type font: str | None + @param font: Font of the class labels. + @type font_size: int | None + @param font_size: Font size of the class labels. + @type alpha: float + @param alpha: Alpha value of the segmentation masks. Defaults to + C{0.6}. + """ + super().__init__(**kwargs) + + if isinstance(labels, list): + labels = {i: label for i, label in enumerate(labels)} + + self.bbox_labels = labels or { + i: label for i, label in enumerate(self.class_names) + } + + if colors is None: + colors = { + label: get_color(i) for i, label in self.bbox_labels.items() + } + if isinstance(colors, list): + colors = { + self.bbox_labels[i]: color for i, color in enumerate(colors) + } + + self.colors = colors + self.fill = fill + self.width = width + self.font = font + self.font_size = font_size + self.draw_labels = draw_labels + self.alpha = alpha + + def prepare( + self, inputs: Packet[Tensor], labels: Labels + ) -> tuple[Tensor, Tensor, list[Tensor], list[Tensor]]: + # Override the prepare base method + target_bboxes = self.get_label(labels, TaskType.BOUNDINGBOX) + target_masks = self.get_label(labels, TaskType.INSTANCE_SEGMENTATION) + predicted_bboxes = self.get_input_tensors(inputs, TaskType.BOUNDINGBOX) + predicted_masks = self.get_input_tensors( + inputs, TaskType.INSTANCE_SEGMENTATION + ) + + return target_bboxes, target_masks, predicted_bboxes, predicted_masks + + def draw_predictions( + self, + canvas: Tensor, + pred_bboxes: list[Tensor], + pred_masks: list[Tensor], + width: int | None, + label_dict: dict[int, str], + color_dict: dict[str, Color], + draw_labels: bool, + alpha: float, + ) -> Tensor: + viz = torch.zeros_like(canvas) + + for i in range(len(canvas)): + viz[i] = canvas[i].clone() + image_bboxes = pred_bboxes[i] + image_masks = pred_masks[i] + prediction_classes = image_bboxes[..., 5].int() + + cls_labels = ( + [label_dict[int(c)] for c in prediction_classes] + if draw_labels and label_dict is not None + else None + ) + cls_colors = ( + [color_dict[label_dict[int(c)]] for c in prediction_classes] + if color_dict is not None and label_dict is not None + else None + ) + + *_, H, W = canvas.shape + width = width or max(1, int(min(H, W) / 100)) + + try: + viz[i] = draw_segmentation_labels( + viz[i], + image_masks, + colors=cls_colors, + alpha=alpha, + ).to(canvas.device) + + viz[i] = draw_bounding_boxes( + viz[i], + image_bboxes[:, :4], + width=width, + labels=cls_labels, + colors=cls_colors, + ).to(canvas.device) + except ValueError as e: + logger.warning( + f"Failed to draw bounding boxes or masks: {e}. Skipping visualization."
+ ) + viz[i] = canvas[i] + + return viz + + @staticmethod + def draw_targets( + canvas: Tensor, + target_bboxes: Tensor, + target_masks: Tensor, + width: int | None, + label_dict: dict[int, str], + color_dict: dict[str, Color], + draw_labels: bool, + alpha: float, + ) -> Tensor: + viz = torch.zeros_like(canvas) + + for i in range(len(canvas)): + viz[i] = canvas[i].clone() + image_bboxes = target_bboxes[target_bboxes[:, 0] == i] + image_masks = target_masks[target_bboxes[:, 0] == i] + target_classes = image_bboxes[:, 1].int() + + cls_labels = ( + [label_dict[int(c)] for c in target_classes] + if draw_labels and label_dict is not None + else None + ) + cls_colors = ( + [color_dict[label_dict[int(c)]] for c in target_classes] + if color_dict is not None and label_dict is not None + else None + ) + + *_, H, W = canvas.shape + width = width or max(1, int(min(H, W) / 100)) + + viz[i] = draw_segmentation_labels( + viz[i], + image_masks, + alpha=alpha, + colors=cls_colors, + ).to(canvas.device) + viz[i] = draw_bounding_box_labels( + viz[i], + image_bboxes[:, 2:], + width=width, + labels=cls_labels if cls_labels else None, + colors=cls_colors, + ).to(canvas.device) + + return viz + + def forward( + self, + label_canvas: Tensor, + prediction_canvas: Tensor, + target_bboxes: Tensor | None, + target_masks: Tensor | None, + predicted_bboxes: list[Tensor], + predicted_masks: list[Tensor], + ) -> tuple[Tensor, Tensor] | Tensor: + """Creates visualizations of the predicted and target bounding + boxes and instance masks. + + @type label_canvas: Tensor + @param label_canvas: Tensor containing the target + visualizations. + @type prediction_canvas: Tensor + @param prediction_canvas: Tensor containing the predicted + visualizations. + @type target_bboxes: Tensor | None + @param target_bboxes: Tensor containing the target bounding + boxes. + @type target_masks: Tensor | None + @param target_masks: Tensor containing the target instance + masks. + @type predicted_bboxes: list[Tensor] + @param predicted_bboxes: List of tensors containing the + predicted bounding boxes. + @type predicted_masks: list[Tensor] + @param predicted_masks: List of tensors containing the predicted + instance masks. + """ + predictions_viz = self.draw_predictions( + prediction_canvas, + predicted_bboxes, + predicted_masks, + self.width, + self.bbox_labels, + self.colors, + self.draw_labels, + self.alpha, + ) + if target_bboxes is None or target_masks is None: + return predictions_viz + + targets_viz = self.draw_targets( + label_canvas, + target_bboxes, + target_masks, + self.width, + self.bbox_labels, + self.colors, + self.draw_labels, + self.alpha, + ) + return targets_viz, predictions_viz diff --git a/luxonis_train/config/predefined_models/README.md b/luxonis_train/config/predefined_models/README.md index f19a21da..35dfb198 100644 --- a/luxonis_train/config/predefined_models/README.md +++ b/luxonis_train/config/predefined_models/README.md @@ -10,6 +10,7 @@ models which can be used instead. - [`KeypointDetectionModel`](#keypointdetectionmodel) - [`ClassificationModel`](#classificationmodel) - [`FOMOModel`](#fomomodel) +- [`InstanceSegmentationModel`](#instancesegmentationmodel) - [`AnomalyDetectionModel`](#anomalydetectionmodel) **Parameters:** @@ -25,7 +26,7 @@ models which can be used instead. ## `SegmentationModel` -The `SegmentationModel` allows for both `"light"` and `"heavy"` variants, where the `"heavy"` variant is more accurate, and the `"light"` variant is faster. 
+The `SegmentationModel` supports `"light"` and `"heavy"` variants, with `"light"` optimized for speed and `"heavy"` for accuracy. See an example configuration file using this predefined model [here](../../../configs/segmentation_light_model.yaml) for the `"light"` variant, and [here](../../../configs/segmentation_heavy_model.yaml) for the `"heavy"` variant. @@ -66,7 +67,7 @@ FPS (frames per second) for `light` and `heavy` variants on different devices wi ## `DetectionModel` -The `DetectionModel` allows for `"light"`, `"medium"` and `"heavy"` variants, where the `"heavy"` variant is more accurate, and the `"light"` variant is faster. +The `DetectionModel` supports `"light"`, `"medium"`, and `"heavy"` variants, with `"light"` optimized for speed, `"heavy"` for accuracy, and `"medium"` offering a balance between the two. See an example configuration file using this predefined model [here](../../../configs/detection_light_model.yaml) for the `"light"` variant, and [here](../../../configs/detection_heavy_model.yaml) for the `"heavy"` variant. @@ -116,7 +117,7 @@ FPS (frames per second) for `light`, `medium` and `heavy` variants on different ## `KeypointDetectionModel` -The `KeypointDetectionModel` allows for `"light"`, `"medium"` and `"heavy"` variants, where the `"heavy"` variant is more accurate, and the `"light"` variant is faster. +The `KeypointDetectionModel` supports `"light"`, `"medium"`, and `"heavy"` variants, with `"light"` optimized for speed, `"heavy"` for accuracy, and `"medium"` offering a balance between the two. See an example configuration file using this predefined model [here](../../../configs/keypoint_bbox_light_model.yaml) for the `"light"` variant, and [here](../../../configs/keypoint_bbox_heavy_model.yaml) for the `"heavy"` variant. @@ -161,7 +162,7 @@ FPS (frames per second) for `light`, `medium` and `heavy` variants on different ## `ClassificationModel` -The `ClassificationModel` allows for both `"light"` and `"heavy"` variants, where the `"heavy"` variant is more accurate, and the `"light"` variant is faster. Can be used for multi-class and multi-label tasks. +The `ClassificationModel` supports `"light"` and `"heavy"` variants, with `"light"` optimized for speed and `"heavy"` for accuracy. See an example configuration file using this predefined model [here](../../../configs/classification_light_model.yaml) for the `"light"` variant, and [here](../../../configs/classification_heavy_model.yaml) for the `"heavy"` variant. @@ -200,7 +201,7 @@ FPS (frames per second) for `light` and `heavy` variants on different devices wi ## `FOMOModel` -The `FOMOModel` allows for both `"light"` and `"heavy"` variants, where the `"heavy"` variant is more accurate, and the `"light"` variant is faster. +The `FOMOModel` supports `"light"` and `"heavy"` variants, with `"light"` optimized for speed and `"heavy"` for accuracy. There is a trade-off in this simple model: training with a larger `object_weight` in the loss parameters may result in more false positives (FP), but it will improve accuracy. You can also use `use_nms: True` in the `head_params` to enable NMS which can reduce FP, but it will also reduce TP for close neighbors. @@ -240,9 +241,50 @@ For larger heatmaps and improved accuracy, you can adjust the `attach_index` in | `visualizer_params` | `dict` | `{}` | Additional parameters for the visualizer. | | `task_name` | `str \| None` | `None` | Custom task name for the model head. 
| +## `InstanceSegmentationModel` + +The `InstanceSegmentationModel` supports `"light"`, `"medium"`, and `"heavy"` variants, with `"light"` optimized for speed, `"heavy"` for accuracy, and `"medium"` offering a balance between the two. + +See an example configuration file using this predefined model [here](../../../configs/instance_segmentation_light_model.yaml) for the `"light"` variant, and [here](../../../configs/instance_segmentation_heavy_model.yaml) for the `"heavy"` variant. + +### Performance Metrics + +FPS (frames per second) for `light`, `medium` and `heavy` variants on different devices with image size 384x512: + +| Variant | RVC2 FPS | RVC4 FPS | +| ------------ | -------- | -------- | +| **`light`** | 15 | 131 | +| **`medium`** | 9 | 116 | +| **`heavy`** | 3 | 82 | + +**Components:** + +| Name | Alias | Function | +| --------------------------------------------------------------------------------------------------------------- | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- | +| [`EfficientRep`](../../nodes/README.md#efficientrep) | `"instance_segmentation_backbone"` | Backbone of the model. Available variants: `"light"` (`EfficientRep-N`), `"medium"` (`EfficientRep-S`), and `"heavy"` (`EfficientRep-L`) | +| [`RepPANNeck`](../../nodes/README.md#reppanneck) | `"instance_segmentation_neck"` | Neck of the model | +| [`PrecisionSegmentBBoxHead`](../../nodes/README.md#precisionsegmentbboxhead) | `"instance_segmentation_head"` | Head of the model for instance segmentation | +| [`PrecisionDFLSegmentationLoss`](../../attached_modules/losses/README.md#precisiondflsegmentationloss) | `"instance_segmentation_loss"` | Loss function for training instance segmentation models | +| [`MeanAveragePrecision`](../../attached_modules/metrics/README.md#meanaverageprecision) | `"instance_segmentation_map"` | Main metric of the model, measuring mean average precision | +| [`InstanceSegmentationVisualizer`](../../attached_modules/visualizers/README.md#instancesegmentationvisualizer) | `"instance_segmentation_visualizer"` | Visualizer for displaying instance segmentation results | + +**Parameters:** + +| Key | Type | Default value | Description | +| ------------------- | ------------------------------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------ | +| `variant` | `Literal["light", "medium", "heavy"]` | `"light"` | Defines the variant of the model. `"light"` uses `EfficientRep-N`, `"medium"` uses `EfficientRep-S`, `"heavy"` uses `EfficientRep-L` | +| `use_neck` | `bool` | `True` | Whether to include the neck in the model | +| `backbone` | `str` | `"EfficientRep"` | Name of the node to be used as a backbone | +| `backbone_params` | `dict` | `{}` | Additional parameters to the backbone | +| `neck_params` | `dict` | `{}` | Additional parameters to the neck | +| `head_params` | `dict` | `{}` | Additional parameters to the head | +| `loss_params` | `dict` | `{}` | Additional parameters to the loss function | +| `visualizer_params` | `dict` | `{}` | Additional parameters to the visualizer | +| `task_name` | `str \| None` | `None` | Custom task name for the head | + ## `AnomalyDetectionModel` -The `AnomalyDetectionModel` allows for both `"light"` and `"heavy"` variants, where the `"heavy"` variant is more accurate, and the `"light"` variant is faster. 
+The `AnomalyDetectionModel` supports `"light"` and `"heavy"` variants, with `"light"` optimized for speed and `"heavy"` for accuracy. ### Performance Metrics diff --git a/luxonis_train/config/predefined_models/__init__.py b/luxonis_train/config/predefined_models/__init__.py index a52db8bb..7bec15b0 100644 --- a/luxonis_train/config/predefined_models/__init__.py +++ b/luxonis_train/config/predefined_models/__init__.py @@ -3,6 +3,7 @@ from .classification_model import ClassificationModel from .detection_fomo_model import FOMOModel from .detection_model import DetectionModel +from .instance_segmentation_model import InstanceSegmentationModel from .keypoint_detection_model import KeypointDetectionModel from .segmentation_model import SegmentationModel @@ -14,4 +15,5 @@ "SegmentationModel", "AnomalyDetectionModel", "FOMOModel", + "InstanceSegmentationModel", ] diff --git a/luxonis_train/config/predefined_models/instance_segmentation_model.py b/luxonis_train/config/predefined_models/instance_segmentation_model.py new file mode 100644 index 00000000..25490590 --- /dev/null +++ b/luxonis_train/config/predefined_models/instance_segmentation_model.py @@ -0,0 +1,165 @@ +from typing import Literal, TypeAlias + +from pydantic import BaseModel + +from luxonis_train.config import ( + AttachedModuleConfig, + LossModuleConfig, + MetricModuleConfig, + ModelNodeConfig, + Params, +) + +from .base_predefined_model import BasePredefinedModel + +VariantLiteral: TypeAlias = Literal["light", "medium", "heavy"] + + +class InstanceSegmentationVariant(BaseModel): + backbone: str + backbone_params: Params + neck_params: Params + + +def get_variant(variant: VariantLiteral) -> InstanceSegmentationVariant: + """Returns the specific variant configuration for the + InstanceSegmentationModel.""" + variants = { + "light": InstanceSegmentationVariant( + backbone="EfficientRep", + backbone_params={"variant": "n"}, + neck_params={"variant": "n"}, + ), + "medium": InstanceSegmentationVariant( + backbone="EfficientRep", + backbone_params={"variant": "s"}, + neck_params={"variant": "s"}, + ), + "heavy": InstanceSegmentationVariant( + backbone="EfficientRep", + backbone_params={"variant": "l"}, + neck_params={"variant": "l"}, + ), + } + + if variant not in variants: + raise ValueError( + f"Instance segmentation variant should be one of {list(variants.keys())}, got '{variant}'." 
+ ) + + return variants[variant] + + +class InstanceSegmentationModel(BasePredefinedModel): + def __init__( + self, + variant: VariantLiteral = "light", + use_neck: bool = True, + backbone: str | None = None, + backbone_params: Params | None = None, + neck_params: Params | None = None, + head_params: Params | None = None, + loss_params: Params | None = None, + visualizer_params: Params | None = None, + task_name: str = "", + enable_confusion_matrix: bool = True, + confusion_matrix_params: Params | None = None, + ): + var_config = get_variant(variant) + + self.use_neck = use_neck + self.backbone_params = ( + backbone_params + if backbone is not None or backbone_params is not None + else var_config.backbone_params + ) or {} + self.backbone = backbone or var_config.backbone + self.neck_params = neck_params or var_config.neck_params + self.head_params = head_params or {} + self.loss_params = loss_params or {} + self.visualizer_params = visualizer_params or {} + self.task_name = task_name + self.enable_confusion_matrix = enable_confusion_matrix + self.confusion_matrix_params = confusion_matrix_params or {} + + @property + def nodes(self) -> list[ModelNodeConfig]: + """Defines the model nodes, including backbone, neck, and + head.""" + nodes = [ + ModelNodeConfig( + name=self.backbone, + alias=f"{self.task_name}/{self.backbone}", + freezing=self.backbone_params.pop("freezing", {}), + params=self.backbone_params, + ), + ] + if self.use_neck: + nodes.append( + ModelNodeConfig( + name="RepPANNeck", + alias=f"{self.task_name}/RepPANNeck", + inputs=[f"{self.task_name}/{self.backbone}"], + freezing=self.neck_params.pop("freezing", {}), + params=self.neck_params, + ) + ) + + nodes.append( + ModelNodeConfig( + name="PrecisionSegmentBBoxHead", + alias=f"{self.task_name}/PrecisionSegmentBBoxHead", + freezing=self.head_params.pop("freezing", {}), + inputs=[f"{self.task_name}/RepPANNeck"] + if self.use_neck + else [f"{self.backbone}-{self.task_name}"], + params=self.head_params, + ) + ) + return nodes + + @property + def losses(self) -> list[LossModuleConfig]: + """Defines the loss module for the instance segmentation + task.""" + return [ + LossModuleConfig( + name="PrecisionDFLSegmentationLoss", + attached_to=f"{self.task_name}/PrecisionSegmentBBoxHead", + params=self.loss_params, + weight=1.0, + ) + ] + + @property + def metrics(self) -> list[MetricModuleConfig]: + """Defines the metrics used for evaluation.""" + metrics = [ + MetricModuleConfig( + name="MeanAveragePrecision", + attached_to=f"{self.task_name}/PrecisionSegmentBBoxHead", + is_main_metric=True, + ), + ] + if self.enable_confusion_matrix: + metrics.append( + MetricModuleConfig( + name="ConfusionMatrix", + alias=f"{self.task_name}/ConfusionMatrix", + attached_to=f"{self.task_name}/PrecisionSegmentBBoxHead", + params={**self.confusion_matrix_params}, + ) + ) + return metrics + + @property + def visualizers(self) -> list[AttachedModuleConfig]: + """Defines the visualizer used for the instance segmentation + task.""" + return [ + AttachedModuleConfig( + name="InstanceSegmentationVisualizer", + attached_to=f"{self.task_name}/PrecisionSegmentBBoxHead", + params=self.visualizer_params, + ) + ] diff --git a/luxonis_train/loaders/base_loader.py b/luxonis_train/loaders/base_loader.py index 752a15d2..db97ac00 100644 --- a/luxonis_train/loaders/base_loader.py +++ b/luxonis_train/loaders/base_loader.py @@ -75,7 +75,7 @@ class ConfigItem: @type image_source: str @param image_source: Name of the image source. 
Only relevant for datasets with multiple image sources, e.g. C{"left"} and C{"right"}. This parameter defines which of these sources is used for - visualizations. + visualizations. @type keep_aspect_ratio: bool @param keep_aspect_ratio: Whether to keep the aspect ratio of the output image after resizing. @@ -226,7 +226,7 @@ def get(self, idx: int) -> tuple[Tensor | dict[str, Tensor], Labels]: @type idx: int @param idx: Sample index. @rtype: L{LuxonisLoaderTorchOutput} - @return: Sample's data in L{LuxonisLoaderTorchOutput} format + @return: Sample's data in L{LuxonisLoaderTorchOutput} format. """ ... diff --git a/luxonis_train/loaders/utils.py b/luxonis_train/loaders/utils.py index 9c9e1d45..aed4df94 100644 --- a/luxonis_train/loaders/utils.py +++ b/luxonis_train/loaders/utils.py @@ -44,8 +44,10 @@ def collate_fn( new_ann[:, 1:] = ann label_box.append(new_ann) out_labels[task] = torch.cat(label_box, 0) + elif task_type == "instance_segmentation": - out_labels[task] = torch.cat(annos, 0) + masks = [label[task] for label in labels] + out_labels[task] = torch.cat(masks, 0) else: out_labels[task] = torch.stack(annos, 0) diff --git a/luxonis_train/models/luxonis_lightning.py b/luxonis_train/models/luxonis_lightning.py index d2d3fe87..17aea732 100644 --- a/luxonis_train/models/luxonis_lightning.py +++ b/luxonis_train/models/luxonis_lightning.py @@ -594,9 +594,17 @@ def export_onnx(self, save_path: str, **kwargs) -> list[str]: idx += 1 else: output_names = [] + running_i = {} # for case where export_output_names should be used but output node's output is split into multiple subnodes for node_name, output_name, i in output_order: if node_name in export_output_names_dict: - output_names.append(export_output_names_dict[node_name][i]) + running_i[node_name] = ( + running_i.get(node_name, -1) + 1 + ) # if not present default to 0 otherwise add 1 + output_names.append( + export_output_names_dict[node_name][ + running_i[node_name] + ] + ) else: output_names.append(f"{node_name}/{output_name}/{i}") diff --git a/luxonis_train/nodes/README.md b/luxonis_train/nodes/README.md index 7e2540ee..ab139d04 100644 --- a/luxonis_train/nodes/README.md +++ b/luxonis_train/nodes/README.md @@ -29,6 +29,8 @@ arbitrarily as long as the two nodes are compatible with each other. We've group - [`DDRNetSegmentationHead`](#ddrnetsegmentationhead) - [`DiscSubNetHead`](#discsubnet) - [`FOMOHead`](#fomohead) + - [`PrecisionBBoxHead`](#precisionbboxhead) + - [`PrecisionSegmentBBoxHead`](#precisionsegmentbboxhead) Every node takes these parameters: | Key | Type | Default value | Description | @@ -239,7 +241,7 @@ Adapted from [here](https://arxiv.org/pdf/2209.02976.pdf). | Key | Type | Default value | Description | | -------------------- | ------- | ------------- | --------------------------------------------------------------------- | -| `n_heads` | `bool` | `3` | Number of output heads | +| `n_heads` | `int` | `3` | Number of output heads | | `conf_thres` | `float` | `0.25` | Confidence threshold for non-maxima-suppression (used for evaluation) | | `iou_thres` | `float` | `0.45` | `IoU` threshold for non-maxima-suppression (used for evaluation) | | `max_det` | `int` | `300` | Maximum number of detections retained after NMS | @@ -290,3 +292,33 @@ Adapted from [here](https://arxiv.org/abs/2108.07610). | `num_conv_layers` | `int` | `3` | Number of convolutional layers to use in the model. | | `conv_channels` | `int` | `16` | Number of output channels for each convolutional layer. 
| | `use_nms` | `bool` | `False` | If True, enable NMS. This can reduce FP, but it will also reduce TP for close neighbors. | + +## `PrecisionBBoxHead` + +Adapted from [here](https://arxiv.org/pdf/2207.02696.pdf) and [here](https://arxiv.org/pdf/2209.02976.pdf). + +**Parameters:** + +| Key | Type | Default value | Description | +| ------------ | ------- | ------------- | ------------------------------------------------------------------------- | +| `reg_max` | `int` | `16` | Maximum number of regression channels | +| `n_heads` | `int` | `3` | Number of output heads | +| `conf_thres` | `float` | `0.25` | Confidence threshold for non-maxima-suppression (used for evaluation) | +| `iou_thres` | `float` | `0.45` | IoU threshold for non-maxima-suppression (used for evaluation) | +| `max_det` | `int` | `300` | Max number of detections for non-maxima-suppression (used for evaluation) | + +## `PrecisionSegmentBBoxHead` + +Adapted from [here](https://arxiv.org/pdf/2207.02696.pdf) and [here](https://arxiv.org/pdf/2209.02976.pdf). + +**Parameters:** + +| Key | Type | Default value | Description | +| ------------ | ------- | ------------- | -------------------------------------------------------------------------- | +| `reg_max` | `int` | `16` | Maximum number of regression channels. | +| `n_heads` | `int` | `3` | Number of output heads. | +| `conf_thres` | `float` | `0.25` | Confidence threshold for non-maxima-suppression (used for evaluation). | +| `iou_thres` | `float` | `0.45` | IoU threshold for non-maxima-suppression (used for evaluation). | +| `max_det` | `int` | `300` | Max number of detections for non-maxima-suppression (used for evaluation). | +| `n_masks` | `int` | `32` | Number of output instance segmentation masks. | +| `n_proto` | `int` | `256` | Number of intermediate channels in the prototype generator. | diff --git a/luxonis_train/nodes/backbones/ddrnet/blocks.py b/luxonis_train/nodes/backbones/ddrnet/blocks.py index ce78503c..910969f2 100644 --- a/luxonis_train/nodes/backbones/ddrnet/blocks.py +++ b/luxonis_train/nodes/backbones/ddrnet/blocks.py @@ -135,9 +135,8 @@ def __init__( @type inter_mode: str @param inter_mode: Interpolation mode for upscaling. Defaults to "bilinear". - - @raises ValueError: If the lengths of `kernel_sizes` and `strides` - are not the same. + @raises ValueError: If the lengths of C{kernel_sizes} and + C{strides} are not the same. """ super().__init__() diff --git a/luxonis_train/nodes/blocks/__init__.py b/luxonis_train/nodes/blocks/__init__.py index ce0181c9..71228fbd 100644 --- a/luxonis_train/nodes/blocks/__init__.py +++ b/luxonis_train/nodes/blocks/__init__.py @@ -1,4 +1,5 @@ from .blocks import ( + DFL, AttentionRefinmentBlock, BasicResNetBlock, BlockRepeater, @@ -6,9 +7,11 @@ ConvModule, CSPStackRepBlock, DropPath, + DWConvModule, EfficientDecoupledBlock, FeatureFusionBlock, RepVGGBlock, + SegProto, SpatialPyramidPoolingBlock, SqueezeExciteBlock, UpBlock, @@ -32,4 +35,7 @@ "Bottleneck", "UpscaleOnline", "DropPath", + "SegProto", + "DWConvModule", + "DFL", ] diff --git a/luxonis_train/nodes/blocks/blocks.py b/luxonis_train/nodes/blocks/blocks.py index b32c1292..fa9912a8 100644 --- a/luxonis_train/nodes/blocks/blocks.py +++ b/luxonis_train/nodes/blocks/blocks.py @@ -81,6 +81,85 @@ def _initialize_weights_and_biases(self, prior_prob: float) -> None: module.weight = nn.Parameter(w, requires_grad=True) + +class SegProto(nn.Module): + def __init__(self, in_ch, mid_ch=256, out_ch=32): + """Initializes the segmentation prototype generator.
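+ Used by C{PrecisionSegmentBBoxHead} to turn its first (largest) input feature map into C{out_ch} prototype masks, which are later combined with per-detection mask coefficients (see C{refine_and_apply_masks}).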
+ + @type in_ch: int + @param in_ch: Number of input channels. + @type mid_ch: int + @param mid_ch: Number of intermediate channels. Defaults to 256. + @type out_ch: int + @param out_ch: Number of output channels. Defaults to 32. + """ + super().__init__() + self.conv1 = ConvModule( + in_channels=in_ch, + out_channels=mid_ch, + kernel_size=3, + stride=1, + padding=1, + activation=nn.SiLU(), + ) + self.upsample = nn.ConvTranspose2d( + in_channels=mid_ch, + out_channels=mid_ch, + kernel_size=2, + stride=2, + bias=True, + ) + self.conv2 = ConvModule( + in_channels=mid_ch, + out_channels=mid_ch, + kernel_size=3, + stride=1, + padding=1, + activation=nn.SiLU(), + ) + self.conv3 = ConvModule( + in_channels=mid_ch, + out_channels=out_ch, + kernel_size=1, + stride=1, + padding=0, + activation=nn.SiLU(), + ) + + def forward(self, x): + """Defines the forward pass of the segmentation prototype + generator. + + @type x: torch.Tensor + @param x: Input tensor. + @rtype: torch.Tensor + @return: Processed tensor. + """ + return self.conv3(self.conv2(self.upsample(self.conv1(x)))) + + +class DFL(nn.Module): + def __init__(self, reg_max: int = 16): + """The DFL (Distribution Focal Loss) module processes input + tensors by applying softmax over a specified dimension and + projecting the resulting tensor to produce output logits. + + @type reg_max: int + @param reg_max: Maximum number of regression outputs. Defaults + to 16. + """ + super().__init__() + self.proj_conv = nn.Conv2d(reg_max, 1, kernel_size=1, bias=False) + self.proj_conv.weight.data.copy_( + torch.arange(reg_max, dtype=torch.float32).view(1, reg_max, 1, 1) + ) + self.proj_conv.requires_grad_(False) + + def forward(self, x: Tensor) -> Tensor: + bs, _, h, w = x.size() + x = F.softmax(x.view(bs, 4, -1, h * w).permute(0, 2, 1, 3), dim=1) + return self.proj_conv(x)[:, 0].view(bs, 4, h, w) + + class ConvModule(nn.Sequential): def __init__( self, @@ -134,6 +213,51 @@ def __init__( ) +class DWConvModule(ConvModule): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + padding: int = 0, + dilation: int = 1, + bias: bool = False, + activation: nn.Module | None = None, + ): + """Depth-wise Conv2d + BN + Activation. + + @type in_channels: int + @param in_channels: Number of input channels. + @type out_channels: int + @param out_channels: Number of output channels. + @type kernel_size: int + @param kernel_size: Kernel size. + @type stride: int + @param stride: Stride. Defaults to 1. + @type padding: int + @param padding: Padding. Defaults to 0. + @type dilation: int + @param dilation: Dilation. Defaults to 1. + @type bias: bool + @param bias: Whether to use bias. Defaults to False. + @type activation: L{nn.Module} | None + @param activation: Activation function. If None then nn.Relu. 
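+ @note: The convolution is made depth-wise by setting C{groups} to C{math.gcd(in_channels, out_channels)} in the underlying C{ConvModule}.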
+ """ + + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=math.gcd(in_channels, out_channels), + bias=bias, + activation=activation, + ) + + class UpBlock(nn.Sequential): def __init__( self, diff --git a/luxonis_train/nodes/heads/__init__.py b/luxonis_train/nodes/heads/__init__.py index e5abd973..6ebcf816 100644 --- a/luxonis_train/nodes/heads/__init__.py +++ b/luxonis_train/nodes/heads/__init__.py @@ -6,6 +6,8 @@ from .efficient_bbox_head import EfficientBBoxHead from .efficient_keypoint_bbox_head import EfficientKeypointBBoxHead from .fomo_head import FOMOHead +from .precision_bbox_head import PrecisionBBoxHead +from .precision_seg_bbox_head import PrecisionSegmentBBoxHead from .segmentation_head import SegmentationHead __all__ = [ @@ -18,4 +20,6 @@ "DDRNetSegmentationHead", "DiscSubNetHead", "FOMOHead", + "PrecisionBBoxHead", + "PrecisionSegmentBBoxHead", ] diff --git a/luxonis_train/nodes/heads/efficient_bbox_head.py b/luxonis_train/nodes/heads/efficient_bbox_head.py index 95ebe1be..ea3ad988 100644 --- a/luxonis_train/nodes/heads/efficient_bbox_head.py +++ b/luxonis_train/nodes/heads/efficient_bbox_head.py @@ -82,27 +82,13 @@ def __init__( in_channels=self.in_channels[i], ) self.heads.append(curr_head) - if ( - self.export_output_names is None - or len(self.export_output_names) != self.n_heads - ): - if ( - self.export_output_names is not None - and len(self.export_output_names) != self.n_heads - ): - logger.warning( - f"Number of provided output names ({len(self.export_output_names)}) " - f"does not match number of heads ({self.n_heads}). " - f"Using default names." - ) - self._export_output_names = [ - f"output{i+1}_yolov6r2" for i in range(self.n_heads) - ] if initialize_weights: self.initialize_weights() - if download_weights: + if ( + download_weights and self.name == "EfficientBBoxHead" + ): # skip download on classes that inherit this one weights_path = self.get_variant_weights(initialize_weights) if weights_path: self.load_checkpoint(path=weights_path, strict=False) @@ -111,6 +97,8 @@ def __init__( f"No checkpoint available for {self.name}, skipping." ) + self.check_export_output_names() + def initialize_weights(self) -> None: for m in self.modules(): if isinstance(m, nn.Conv2d): @@ -142,6 +130,24 @@ def get_variant_weights(self, initialize_weights: bool) -> str | None: else: return None + def check_export_output_names(self): + if ( + self.export_output_names is None + or len(self.export_output_names) != self.n_heads + ): + if ( + self.export_output_names is not None + and len(self.export_output_names) != self.n_heads + ): + logger.warning( + f"Number of provided output names ({len(self.export_output_names)}) " + f"does not match number of heads ({self.n_heads}). " + f"Using default names." 
+ ) + self._export_output_names = [ + f"output{i + 1}_yolov6r2" for i in range(self.n_heads) + ] + def forward( self, inputs: list[Tensor] ) -> tuple[list[Tensor], list[Tensor], list[Tensor]]: diff --git a/luxonis_train/nodes/heads/efficient_keypoint_bbox_head.py b/luxonis_train/nodes/heads/efficient_keypoint_bbox_head.py index b027b939..f9506547 100644 --- a/luxonis_train/nodes/heads/efficient_keypoint_bbox_head.py +++ b/luxonis_train/nodes/heads/efficient_keypoint_bbox_head.py @@ -1,3 +1,4 @@ +import logging from typing import Any, Literal import torch @@ -14,6 +15,8 @@ from .efficient_bbox_head import EfficientBBoxHead +logger = logging.getLogger(__name__) + class EfficientKeypointBBoxHead(EfficientBBoxHead): tasks: list[TaskType] = [TaskType.KEYPOINTS, TaskType.BOUNDINGBOX] @@ -67,6 +70,28 @@ def __init__( self._export_output_names = None + self.check_export_output_names() + + def check_export_output_names(self): + if ( + self.export_output_names is None + or len(self.export_output_names) != self.n_heads + ): + if ( + self.export_output_names is not None + and len(self.export_output_names) != self.n_heads + ): + logger.warning( + f"Number of provided output names ({len(self.export_output_names)}) " + f"does not match number of heads ({self.n_heads}). " + f"Using default names." + ) + self._export_output_names = [ + f"output{i + 1}_yolov6" for i in range(self.n_heads) + ] + [ + f"kpt_output{i + 1}" for i in range(self.n_heads) + ] # export names are applied on sorter output names + def forward( self, inputs: list[Tensor] ) -> tuple[list[Tensor], list[Tensor], list[Tensor], list[Tensor]]: @@ -75,7 +100,7 @@ def forward( ( _, self.anchor_points, - _, + self.n_anchors_list, self.stride_tensor, ) = anchors_for_fpn_features( features, @@ -99,22 +124,18 @@ def wrap( features, cls_score_list, reg_distri_list, kpt_list = output bs = features[0].shape[0] if self.export: - outputs: list[Tensor] = [] - for out_cls, out_reg, out_kpts in zip( - cls_score_list, reg_distri_list, kpt_list, strict=True + det_outputs: list[Tensor] = [] + kpt_outputs: list[Tensor] = [] + for i, (out_cls, out_reg, out_kpt) in enumerate( + zip(cls_score_list, reg_distri_list, kpt_list, strict=True) ): - chunks = torch.split(out_kpts, 3, dim=1) - modified_chunks: list[Tensor] = [] - for chunk in chunks: - x = chunk[:, 0:1, :, :] - y = chunk[:, 1:2, :, :] - v = torch.sigmoid(chunk[:, 2:3, :, :]) - modified_chunk = torch.cat([x, y, v], dim=1) - modified_chunks.append(modified_chunk) - out_kpts_modified = torch.cat(modified_chunks, dim=1) - out = torch.cat([out_reg, out_cls, out_kpts_modified], dim=1) - outputs.append(out) - return {"outputs": outputs} + conf, _ = out_cls.max(1, keepdim=True) + out = torch.cat([out_reg, conf, out_cls], dim=1) + det_outputs.append(out) + kpt_outputs.append( + self._dist2kpts(out_kpt.view(bs, self.nk, -1), bs, i) + ) + return {"boundingbox": det_outputs, "keypoints": kpt_outputs} cls_tensor = torch.cat( [cls_score_list[i].flatten(2) for i in range(len(cls_score_list))], @@ -142,8 +163,13 @@ def wrap( "distributions": [reg_tensor], "keypoints_raw": [kpt_tensor], } - - pred_kpt = self._dist2kpts(kpt_tensor) + pred_kpt = torch.cat( + [ + self._dist2kpts(kpt_list[i].view(bs, self.nk, -1), bs, i) + for i in range(len(kpt_list)) + ], + dim=2, + ).permute(0, 2, 1) detections = self._process_to_bbox_and_kps( (features, cls_tensor, reg_tensor, pred_kpt) ) @@ -159,26 +185,18 @@ def wrap( "keypoints_raw": [kpt_tensor], } - def _dist2kpts(self, kpts: Tensor) -> Tensor: + def _dist2kpts(self, kpts: Tensor, 
batch_size: int, index: int) -> Tensor: """Decodes keypoints.""" - y = kpts.clone() - - anchor_points_transposed = self.anchor_points.transpose(0, 1) - stride_tensor = self.stride_tensor.squeeze(-1) - - stride_tensor = stride_tensor.view(1, -1, 1) - anchor_points_x = anchor_points_transposed[0].view(1, -1, 1) - anchor_points_y = anchor_points_transposed[1].view(1, -1, 1) - - y[:, :, 0::3] = ( - y[:, :, 0::3] * 2.0 + (anchor_points_x - 0.5) - ) * stride_tensor - y[:, :, 1::3] = ( - y[:, :, 1::3] * 2.0 + (anchor_points_y - 0.5) - ) * stride_tensor - y[:, :, 2::3] = y[:, :, 2::3].sigmoid() - - return y + anchors = self.anchor_points.split(self.n_anchors_list, dim=0) + kpt_predictions = kpts.view(batch_size, self.n_keypoints, 3, -1) + grid_coords = ( + kpt_predictions[:, :, :2] * 2.0 + + (anchors[index].transpose(1, 0) - 0.5) + ) * self.stride[index] + decoded_kpts = torch.cat( + (grid_coords, kpt_predictions[:, :, 2:3].sigmoid()), 2 + ) + return decoded_kpts.view(batch_size, self.nk, -1) def _process_to_bbox_and_kps( self, output: tuple[list[Tensor], Tensor, Tensor, Tensor] diff --git a/luxonis_train/nodes/heads/precision_bbox_head.py b/luxonis_train/nodes/heads/precision_bbox_head.py new file mode 100644 index 00000000..e42189db --- /dev/null +++ b/luxonis_train/nodes/heads/precision_bbox_head.py @@ -0,0 +1,314 @@ +import logging +import math +from typing import Any, Literal + +import torch +from torch import Tensor, nn + +from luxonis_train.enums import TaskType +from luxonis_train.nodes.blocks import DFL, ConvModule, DWConvModule +from luxonis_train.nodes.heads import BaseHead +from luxonis_train.utils import ( + Packet, + anchors_for_fpn_features, + dist2bbox, + non_max_suppression, +) + +logger = logging.getLogger(__name__) + + +class PrecisionBBoxHead(BaseHead[list[Tensor], list[Tensor]]): + in_channels: list[int] + tasks: list[TaskType] = [TaskType.BOUNDINGBOX] + parser = "YOLO" + + def __init__( + self, + reg_max: int = 16, + n_heads: Literal[2, 3, 4] = 3, + conf_thres: float = 0.25, + iou_thres: float = 0.45, + max_det: int = 300, + **kwargs: Any, + ): + """ + Adapted from U{Real-Time Flying Object Detection with YOLOv8 + } and from U{YOLOv6: A Single-Stage Object Detection Framework + for Industrial Applications + }. + + @type reg_max: int + @param reg_max: Maximum number of regression channels. + @type n_heads: Literal[2, 3, 4] + @param n_heads: Number of output heads. + @type conf_thres: float + @param conf_thres: Confidence threshold for NMS. + @type iou_thres: float + @param iou_thres: IoU threshold for NMS. + @type max_det: int + @param max_det: Maximum number of detections retained after NMS.
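+ @note: Each head predicts C{4 * reg_max} box-regression logits per cell (a distribution over C{reg_max} bins for each box side, decoded by the C{DFL} module) together with C{n_classes} class logits.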
+ """ + super().__init__(**kwargs) + self.reg_max = reg_max + self.no = self.n_classes + reg_max * 4 + self.n_heads = n_heads + self.conf_thres = conf_thres + self.iou_thres = iou_thres + self.grid_cell_offset = 0.5 + self.grid_cell_size = 5.0 + self.max_det = max_det + + reg_channels = max((16, self.in_channels[0] // 4, reg_max * 4)) + cls_channels = max(self.in_channels[0], min(self.n_classes, 100)) + + self.detection_heads = nn.ModuleList( + nn.Sequential( + # Regression branch + nn.Sequential( + ConvModule( + x, + reg_channels, + kernel_size=3, + padding=1, + activation=nn.SiLU(), + ), + ConvModule( + reg_channels, + reg_channels, + kernel_size=3, + padding=1, + activation=nn.SiLU(), + ), + nn.Conv2d(reg_channels, 4 * self.reg_max, kernel_size=1), + ), + # Classification branch + nn.Sequential( + nn.Sequential( + DWConvModule( + x, + x, + kernel_size=3, + padding=1, + activation=nn.SiLU(), + ), + ConvModule( + x, + cls_channels, + kernel_size=1, + activation=nn.SiLU(), + ), + ), + nn.Sequential( + DWConvModule( + cls_channels, + cls_channels, + kernel_size=3, + padding=1, + activation=nn.SiLU(), + ), + ConvModule( + cls_channels, + cls_channels, + kernel_size=1, + activation=nn.SiLU(), + ), + ), + nn.Conv2d(cls_channels, self.n_classes, kernel_size=1), + ), + ) + for x in self.in_channels + ) + + self.stride = self._fit_stride_to_n_heads() + self.dfl = DFL(reg_max) if reg_max > 1 else nn.Identity() + self.bias_init() + self.initialize_weights() + + self.check_export_output_names() + + def check_export_output_names(self): + if ( + self.export_output_names is None + or len(self.export_output_names) != self.n_heads + ): + if ( + self.export_output_names is not None + and len(self.export_output_names) != self.n_heads + ): + logger.warning( + f"Number of provided output names ({len(self.export_output_names)}) " + f"does not match number of heads ({self.n_heads}). " + f"Using default names." 
+ ) + self._export_output_names = [ + f"output{i + 1}_yolov8" for i in range(self.n_heads) + ] + + def forward(self, x: list[Tensor]) -> tuple[list[Tensor], list[Tensor]]: + cls_outputs = [] + reg_outputs = [] + for i in range(self.n_heads): + reg_output = self.detection_heads[i][0](x[i]) # type: ignore + cls_output = self.detection_heads[i][1](x[i]) # type: ignore + reg_outputs.append(reg_output) + cls_outputs.append(cls_output) + return reg_outputs, cls_outputs + + def wrap( + self, output: tuple[list[Tensor], list[Tensor]] + ) -> Packet[Tensor]: + reg_outputs, cls_outputs = ( + output # ([bs, 4*reg_max, h_f, w_f]), ([bs, n_classes, h_f, w_f]) + ) + features = [ + torch.cat((reg, cls), dim=1) + for reg, cls in zip(reg_outputs, cls_outputs) + ] + if self.training: + return { + "features": features, + } + + if self.export: + return { + "boundingbox": self._prepare_bbox_export( + reg_outputs, cls_outputs + ) + } + + boxes = non_max_suppression( + self._prepare_bbox_inference_output(reg_outputs, cls_outputs), + n_classes=self.n_classes, + conf_thres=self.conf_thres, + iou_thres=self.iou_thres, + bbox_format="xyxy", + max_det=self.max_det, + predicts_objectness=False, + ) + + return { + "features": features, + "boundingbox": boxes, + } + + def _fit_stride_to_n_heads(self): + """Returns correct stride for number of heads and attach + index.""" + stride = torch.tensor( + [ + self.original_in_shape[1] / x[2] # type: ignore + for x in self.in_sizes[: self.n_heads] + ], + dtype=torch.int, + ) + return stride + + def _prepare_bbox_and_cls( + self, reg_outputs: list[Tensor], cls_outputs: list[Tensor] + ) -> list[Tensor]: + """Extract classification and bounding box tensors.""" + output = [] + for i in range(self.n_heads): + box = self.dfl(reg_outputs[i]) + cls = cls_outputs[i].sigmoid() + conf = cls.max(1, keepdim=True)[0] + output.append( + torch.cat([box, conf, cls], dim=1) + ) # [bs, 4 + 1 + n_classes, h_f, w_f] + return output + + def _prepare_bbox_export( + self, reg_outputs: list[Tensor], cls_outputs: list[Tensor] + ) -> list[Tensor]: + """Prepare the output for export.""" + return self._prepare_bbox_and_cls(reg_outputs, cls_outputs) + + def _prepare_bbox_inference_output( + self, reg_outputs: list[Tensor], cls_outputs: list[Tensor] + ) -> Tensor: + """Perform inference on predicted bounding boxes and class + probabilities.""" + processed_outputs = self._prepare_bbox_and_cls( + reg_outputs, cls_outputs + ) + box_dists = [] + class_probs = [] + for feature in processed_outputs: + bs, _, h, w = feature.size() + reshaped = feature.view(bs, -1, h * w) + box_dist = reshaped[:, :4, :] + cls = reshaped[:, 5:, :] + box_dists.append(box_dist) + class_probs.append(cls) + + box_dists = torch.cat(box_dists, dim=2) + class_probs = torch.cat(class_probs, dim=2) + + _, anchor_points, _, strides = anchors_for_fpn_features( + processed_outputs, self.stride, 0.5 + ) + + pred_bboxes = dist2bbox( + box_dists, anchor_points.transpose(0, 1), out_format="xyxy", dim=1 + ) * strides.transpose(0, 1) + + base_output = [ + pred_bboxes.permute(0, 2, 1), # [BS, H*W, 4] + torch.ones( + (box_dists.shape[0], pred_bboxes.shape[2], 1), + dtype=pred_bboxes.dtype, + device=pred_bboxes.device, + ), + class_probs.permute(0, 2, 1), # [BS, H*W, n_classes] + ] + + output_merged = torch.cat( + base_output, dim=-1 + ) # [BS, H*W, 4 + 1 + n_classes] + return output_merged + + def bias_init(self): + """Initialize biases for the detection heads. + + Assumes detection_heads structure with separate regression and + classification branches. 
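+ Regression biases are set to C{1.0}; classification biases are set to C{log(5 / n_classes / (H / stride) ** 2)}, a YOLOv8-style prior that roughly corresponds to expecting around 5 objects per image at initialization.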
+ """ + for head, stride in zip(self.detection_heads, self.stride): + reg_branch = head[0] # type: ignore + cls_branch = head[1] # type: ignore + + reg_conv = reg_branch[-1] + reg_conv.bias.data[:] = 1.0 + + cls_conv = cls_branch[-1] + cls_conv.bias.data[: self.n_classes] = math.log( + 5 / self.n_classes / (self.original_in_shape[1] / stride) ** 2 + ) + + def initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + pass + elif isinstance(m, nn.BatchNorm2d): + m.eps = 0.001 + m.momentum = 0.03 + elif isinstance( + m, (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU) + ): + m.inplace = True + + def get_custom_head_config(self) -> dict: + """Returns custom head configuration. + + @rtype: dict + @return: Custom head configuration. + """ + return { + "subtype": "yolov8", + "iou_threshold": self.iou_thres, + "conf_threshold": self.conf_thres, + "max_det": self.max_det, + } diff --git a/luxonis_train/nodes/heads/precision_seg_bbox_head.py b/luxonis_train/nodes/heads/precision_seg_bbox_head.py new file mode 100644 index 00000000..3518b46f --- /dev/null +++ b/luxonis_train/nodes/heads/precision_seg_bbox_head.py @@ -0,0 +1,247 @@ +import logging +from typing import Any, Literal + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from luxonis_train.enums import TaskType +from luxonis_train.nodes.blocks import ConvModule, SegProto +from luxonis_train.utils import ( + Packet, + apply_bounding_box_to_masks, + non_max_suppression, +) + +from .precision_bbox_head import PrecisionBBoxHead + +logger = logging.getLogger(__name__) + + +class PrecisionSegmentBBoxHead(PrecisionBBoxHead): + tasks: list[TaskType] = [ + TaskType.INSTANCE_SEGMENTATION, + TaskType.BOUNDINGBOX, + ] + parser: str = "YOLOExtendedParser" + + def __init__( + self, + n_heads: Literal[2, 3, 4] = 3, + n_masks: int = 32, + n_proto: int = 256, + conf_thres: float = 0.25, + iou_thres: float = 0.45, + max_det: int = 300, + **kwargs: Any, + ): + """ + Head for instance segmentation and object detection. + Adapted from U{Real-Time Flying Object Detection with YOLOv8 + } and from U{YOLOv6: A Single-Stage Object Detection Framework + for Industrial Applications + }. + + @type n_heads: Literal[2, 3, 4] + @param n_heads: Number of output heads. Defaults to 3. + @type n_masks: int + @param n_masks: Number of masks. + @type n_proto: int + @param n_proto: Number of prototypes for segmentation. + @type conf_thres: flaot + @param conf_thres: Confidence threshold for NMS. + @type iou_thres: float + @param iou_thres: IoU threshold for NMS. + @type max_det: int + @param max_det: Maximum number of detections retained after NMS. 
+ """ + super().__init__( + n_heads=n_heads, + conf_thres=conf_thres, + iou_thres=iou_thres, + max_det=max_det, + **kwargs, + ) + + self.n_masks = n_masks + mid_ch = max(self.in_channels[0] // 4, self.n_masks) + self.mask_layers = nn.ModuleList( + nn.Sequential( + ConvModule(x, mid_ch, 3, 1, 1, activation=nn.SiLU()), + ConvModule(mid_ch, mid_ch, 3, 1, 1, activation=nn.SiLU()), + nn.Conv2d(mid_ch, self.n_masks, 1, 1), + ) + for x in self.in_channels + ) + + self.n_proto = n_proto + self.proto = SegProto(self.in_channels[0], self.n_proto, self.n_masks) + + self.check_export_output_names() + + def check_export_output_names(self): + if ( + self.export_output_names is None + or len(self.export_output_names) != self.n_heads + ): + if ( + self.export_output_names is not None + and len(self.export_output_names) != self.n_heads + ): + logger.warning( + f"Number of provided output names ({len(self.export_output_names)}) " + f"does not match number of heads ({self.n_heads}). " + f"Using default names." + ) + self._export_output_names = ( + [f"output{i + 1}_yolov8" for i in range(self.n_heads)] + + [f"output{i + 1}_masks" for i in range(self.n_heads)] + + ["protos_output"] + ) # export names are applied on sorter output names + + def forward( + self, inputs: list[Tensor] + ) -> tuple[tuple[list[Tensor], list[Tensor]], Tensor, list[Tensor]]: + prototypes = self.proto(inputs[0]) + mask_coefficients = [ + self.mask_layers[i](inputs[i]) for i in range(self.n_heads) + ] + + det_outs = super().forward(inputs) + + return det_outs, prototypes, mask_coefficients + + def wrap( + self, + output: tuple[tuple[list[Tensor], list[Tensor]], Tensor, list[Tensor]], + ) -> Packet[Tensor]: + det_feats, prototypes, mask_coefficients = output + + if self.export: + pred_bboxes = self._prepare_bbox_export(*det_feats) # type: ignore + return { + "boundingbox": pred_bboxes, + "masks": mask_coefficients, + "prototypes": [prototypes], + } + + det_feats_combined = [ + torch.cat((reg, cls), dim=1) for reg, cls in zip(*det_feats) + ] + mask_coefficients = torch.cat( + [ + coef.view(coef.size(0), self.n_masks, -1) + for coef in mask_coefficients + ], + dim=2, + ) + + if self.training: + return { + "features": det_feats_combined, + "prototypes": [prototypes], + "mask_coeficients": [mask_coefficients], + } + + pred_bboxes = self._prepare_bbox_inference_output(*det_feats) # type: ignore + preds_combined = torch.cat( + [pred_bboxes, mask_coefficients.permute(0, 2, 1)], dim=-1 + ) + preds = non_max_suppression( + preds_combined, + n_classes=self.n_classes, + conf_thres=self.conf_thres, + iou_thres=self.iou_thres, + bbox_format="xyxy", + max_det=self.max_det, + predicts_objectness=False, + ) + + results = { + "features": det_feats_combined, + "prototypes": [prototypes], + "mask_coeficients": [mask_coefficients], + "boundingbox": [], + "instance_segmentation": [], + } + + for i, pred in enumerate(preds): + results["instance_segmentation"].append( + refine_and_apply_masks( + prototypes[i], + pred[:, 6:], + pred[:, :4], + self.original_in_shape[-2:], + upsample=True, + ) + ) + results["boundingbox"].append(pred[:, :6]) + + return results + + def get_custom_head_config(self) -> dict: + """Returns custom head configuration. + + @rtype: dict + @return: Custom head configuration. 
+ """ + return { + "subtype": "yolov8", + "iou_threshold": self.iou_thres, + "conf_threshold": self.conf_thres, + "max_det": self.max_det, + } + + +def refine_and_apply_masks( + mask_prototypes, + predicted_masks, + bounding_boxes, + target_shape, + upsample=False, +): + """Refine and apply masks to bounding boxes based on the mask head + outputs. + + @type mask_prototypes: torch.Tensor + @param mask_prototypes: Tensor of shape [mask_dim, mask_height, + mask_width]. + @type predicted_masks: torch.Tensor + @param predicted_masks: Tensor of shape [num_masks, mask_dim], where + num_masks is the number of detected masks. + @type bounding_boxes: torch.Tensor + @param bounding_boxes: Tensor of shape [num_masks, 4], containing + bounding box coordinates. + @type target_shape: tuple + @param target_shape: Tuple (height, width) representing the + dimensions of the original image. + @type upsample: bool + @param upsample: If True, upsample the masks to the target image + dimensions. Default is False. + @rtype: torch.Tensor + @return: A binary mask tensor of shape [num_masks, height, width], + where the masks are cropped according to their respective + bounding boxes. + """ + if predicted_masks.size(0) == 0 or bounding_boxes.size(0) == 0: + img_h, img_w = target_shape + return torch.zeros(0, img_h, img_w, dtype=torch.uint8) + + channels, proto_h, proto_w = mask_prototypes.shape + img_h, img_w = target_shape + masks_combined = ( + predicted_masks @ mask_prototypes.float().view(channels, -1) + ).view(-1, proto_h, proto_w) + w_scale, h_scale = proto_w / img_w, proto_h / img_h + scaled_boxes = bounding_boxes.clone() + scaled_boxes[:, [0, 2]] *= w_scale + scaled_boxes[:, [1, 3]] *= h_scale + cropped_masks = apply_bounding_box_to_masks(masks_combined, scaled_boxes) + if upsample: + cropped_masks = F.interpolate( + cropped_masks.unsqueeze(0), + size=target_shape, + mode="bilinear", + align_corners=False, + ).squeeze(0) + return (cropped_masks > 0).to(cropped_masks.dtype) diff --git a/luxonis_train/utils/__init__.py b/luxonis_train/utils/__init__.py index 2f2b550a..8e12a214 100644 --- a/luxonis_train/utils/__init__.py +++ b/luxonis_train/utils/__init__.py @@ -1,5 +1,6 @@ from .boundingbox import ( anchors_for_fpn_features, + apply_bounding_box_to_masks, bbox2dist, bbox_iou, compute_iou_loss, @@ -44,4 +45,5 @@ "traverse_graph", "insert_class", "get_attribute_check_none", + "apply_bounding_box_to_masks", ] diff --git a/luxonis_train/utils/boundingbox.py b/luxonis_train/utils/boundingbox.py index e72360c3..ff2af2cf 100644 --- a/luxonis_train/utils/boundingbox.py +++ b/luxonis_train/utils/boundingbox.py @@ -19,6 +19,7 @@ def dist2bbox( distance: Tensor, anchor_points: Tensor, out_format: BBoxFormatType = "xyxy", + dim: int = -1, ) -> Tensor: """Transform distance (ltrb) to box ("xyxy", "xywh" or "cxcywh"). @@ -29,12 +30,14 @@ def dist2bbox( @type out_format: BBoxFormatType @param out_format: BBox output format. Defaults to "xyxy". @rtype: Tensor + @param dim: Dimension to split distance tensor. Defaults to -1. 
+ @rtype: Tensor @return: BBoxes in correct format """ - lt, rb = torch.split(distance, 2, -1) + lt, rb = torch.split(distance, 2, dim=dim) x1y1 = anchor_points - lt x2y2 = anchor_points + rb - bbox = torch.cat([x1y1, x2y2], -1) + bbox = torch.cat([x1y1, x2y2], dim=dim) if out_format in ["xyxy", "xywh", "cxcywh"]: bbox = box_convert(bbox, in_fmt="xyxy", out_fmt=out_format) else: @@ -401,6 +404,39 @@ def anchors_for_fpn_features( ) +def apply_bounding_box_to_masks( + masks: Tensor, bounding_boxes: Tensor +) -> Tensor: + """Crops the given masks to the regions specified by the + corresponding bounding boxes. + + @type masks: Tensor + @param masks: Masks tensor of shape [n, h, w]. + @type bounding_boxes: Tensor + @param bounding_boxes: Bounding boxes tensor of shape [n, 4]. + @rtype: Tensor + @return: Cropped masks tensor of shape [n, h, w]. + """ + _, mask_height, mask_width = masks.shape + left, top, right, bottom = torch.split( + bounding_boxes[:, :, None], 1, dim=1 + ) + width_indices = torch.arange( + mask_width, device=masks.device, dtype=left.dtype + )[None, None, :] + height_indices = torch.arange( + mask_height, device=masks.device, dtype=left.dtype + )[None, :, None] + + cropped_masks = masks * ( + (width_indices >= left) + & (width_indices < right) + & (height_indices >= top) + & (height_indices < bottom) + ) + return cropped_masks + + def compute_iou_loss( pred_bboxes: Tensor, target_bboxes: Tensor, diff --git a/tests/integration/parking_lot.json b/tests/integration/parking_lot.json index bf3e3835..b9dde963 100644 --- a/tests/integration/parking_lot.json +++ b/tests/integration/parking_lot.json @@ -11,24 +11,11 @@ "name": "image", "dtype": "float32", "input_type": "image", - "shape": [ - 1, - 3, - 256, - 320 - ], + "shape": [1, 3, 256, 320], "layout": "NCHW", "preprocessing": { - "mean": [ - 123.675, - 116.28, - 103.53 - ], - "scale": [ - 58.395, - 57.12, - 57.375 - ], + "mean": [123.675, 116.28, 103.53], + "scale": [58.395, 57.12, 57.375], "reverse_channels": null, "interleaved_to_planar": null, "dai_type": "RGB888p" @@ -39,90 +26,68 @@ { "name": "BiSeNetHead/brand/segmentation/0", "dtype": "float32", - "shape": [ - 1, - 23, - 256, - 320 - ], + "shape": [1, 23, 256, 320], "layout": "NCHW" }, { - "name": "EfficientKeypointBBoxHead/outputs/0", + "name": "SegmentationHead/color/segmentation/0", "dtype": "float32", - "shape": [ - 1, - 14, - 32, - 40 - ], + "shape": [1, 4, 256, 320], "layout": "NCHW" }, { - "name": "EfficientKeypointBBoxHead/outputs/1", + "name": "output1_yolov6r2", "dtype": "float32", - "shape": [ - 1, - 14, - 16, - 20 - ], + "shape": [1, 8, 32, 40], "layout": "NCHW" }, { - "name": "EfficientKeypointBBoxHead/outputs/2", + "name": "output2_yolov6r2", "dtype": "float32", - "shape": [ - 1, - 14, - 8, - 10 - ], + "shape": [1, 8, 16, 20], + "layout": "NCHW" + }, + { + "name": "output3_yolov6r2", + "dtype": "float32", + "shape": [1, 8, 8, 10], "layout": "NCDE" }, { - "name": "SegmentationHead/color/segmentation/0", + "name": "output1_yolov6", "dtype": "float32", - "shape": [ - 1, - 4, - 256, - 320 - ], + "shape": [1, 6, 32, 40], "layout": "NCHW" }, { - "name": "output1_yolov6r2", + "name": "output2_yolov6", "dtype": "float32", - "shape": [ - 1, - 8, - 32, - 40 - ], + "shape": [1, 6, 16, 20], "layout": "NCHW" }, { - "name": "output2_yolov6r2", + "name": "output3_yolov6", "dtype": "float32", - "shape": [ - 1, - 8, - 16, - 20 - ], + "shape": [1, 6, 8, 10], "layout": "NCHW" }, { - "name": "output3_yolov6r2", + "name": "kpt_output1", "dtype": "float32", - "shape": [ - 1, - 8, - 
8, - 10 - ], - "layout": "NCDE" + "shape": [1, 9, 1280], + "layout": "NCD" + }, + { + "name": "kpt_output2", + "dtype": "float32", + "shape": [1, 9, 320], + "layout": "NCD" + }, + { + "name": "kpt_output3", + "dtype": "float32", + "shape": [1, 9, 80], + "layout": "NCD" } ], "heads": [ @@ -132,47 +97,23 @@ "metadata": { "postprocessor_path": null, "classes": [ - "background", - "alfa-romeo", - "buick", - "ducati", - "harley", - "ferrari", - "infiniti", - "jeep", - "land-rover", - "roll-royce", - "yamaha", - "aprilia", - "bmw", - "dodge", - "honda", - "moto", - "piaggio", - "isuzu", - "Kawasaki", - "truimph", - "pontiac", - "saab", - "chrysler" + "background", "alfa-romeo", "buick", "ducati", "harley", + "ferrari", "infiniti", "jeep", "land-rover", "roll-royce", + "yamaha", "aprilia", "bmw", "dodge", "honda", "moto", + "piaggio", "isuzu", "Kawasaki", "truimph", "pontiac", + "saab", "chrysler" ], "n_classes": 23, "is_softmax": false }, - "outputs": [ - "BiSeNetHead/brand/segmentation/0" - ] + "outputs": ["BiSeNetHead/brand/segmentation/0"] }, { "name": "BiSeNetHead_0", "parser": "SegmentationParser", "metadata": { "postprocessor_path": null, - "classes": [ - "motorbike", - "car", - "background" - ], + "classes": ["motorbike", "car", "background"], "n_classes": 3, "is_softmax": false }, @@ -183,11 +124,7 @@ "parser": "YOLO", "metadata": { "postprocessor_path": null, - "classes": [ - "motorbike", - "car", - "background" - ], + "classes": ["motorbike", "car", "background"], "n_classes": 3, "iou_threshold": 0.45, "conf_threshold": 0.25, @@ -206,9 +143,7 @@ "parser": "YOLOExtendedParser", "metadata": { "postprocessor_path": null, - "classes": [ - "motorbike" - ], + "classes": ["motorbike"], "n_classes": 1, "iou_threshold": 0.45, "conf_threshold": 0.25, @@ -218,9 +153,12 @@ "n_keypoints": 3 }, "outputs": [ - "EfficientKeypointBBoxHead/outputs/0", - "EfficientKeypointBBoxHead/outputs/1", - "EfficientKeypointBBoxHead/outputs/2" + "output1_yolov6", + "output2_yolov6", + "output3_yolov6", + "kpt_output1", + "kpt_output2", + "kpt_output3" ] }, { @@ -228,19 +166,12 @@ "parser": "SegmentationParser", "metadata": { "postprocessor_path": null, - "classes": [ - "background", - "blue", - "green", - "red" - ], + "classes": ["background", "blue", "green", "red"], "n_classes": 4, "is_softmax": false }, - "outputs": [ - "SegmentationHead/color/segmentation/0" - ] + "outputs": ["SegmentationHead/color/segmentation/0"] } ] } -} +} \ No newline at end of file diff --git a/tests/integration/test_detection.py b/tests/integration/test_detection.py index 6360ec79..24e12e2b 100644 --- a/tests/integration/test_detection.py +++ b/tests/integration/test_detection.py @@ -110,6 +110,7 @@ def test_backbones( ): opts = get_opts_backbone(backbone) opts["loader.params.dataset_name"] = parking_lot_dataset.identifier + opts["trainer.epochs"] = 1 train_and_test(config, opts) @@ -121,4 +122,5 @@ def test_variants( ): opts = get_opts_variant(variant) opts["loader.params.dataset_name"] = parking_lot_dataset.identifier + opts["trainer.epochs"] = 1 train_and_test(config, opts) diff --git a/tests/integration/test_simple.py b/tests/integration/test_simple.py index 72331fcc..cef74ba3 100644 --- a/tests/integration/test_simple.py +++ b/tests/integration/test_simple.py @@ -58,6 +58,8 @@ def clear_files(): "detection_light_model", "keypoint_bbox_heavy_model", "keypoint_bbox_light_model", + "instance_segmentation_heavy_model", + "instance_segmentation_light_model", ], ) def test_predefined_models( diff --git 
a/tests/unittests/test_assigners/test_tal_assigner.py b/tests/unittests/test_assigners/test_tal_assigner.py index cb94b62d..ab64302b 100644 --- a/tests/unittests/test_assigners/test_tal_assigner.py +++ b/tests/unittests/test_assigners/test_tal_assigner.py @@ -34,6 +34,12 @@ def test_forward(): pred_scores, pred_bboxes, anchor_points, gt_labels, gt_bboxes, mask_gt ) + labels = torch.where( + mask, + labels, + torch.full_like(labels, n_classes), + ) + assert labels.shape == (batch_size, n_anchors) assert bboxes.shape == (batch_size, n_anchors, 4) assert scores.shape == (