diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index e567036d..665ebbd6 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -33,7 +33,7 @@ jobs: run: pip install -e .[dev] - name: Install latest luxonis-ml - run: pip install luxonis-ml[all]@git+https://github.com/luxonis/luxonis-ml.git@main --upgrade --no-deps --force-reinstall + run: pip install luxonis-ml[all]@git+https://github.com/luxonis/luxonis-ml.git@main --upgrade --force-reinstall - name: Authenticate to Google Cloud id: google-auth diff --git a/configs/detection_heavy_model.yaml b/configs/detection_heavy_model.yaml index b56bdba6..e19621c2 100644 --- a/configs/detection_heavy_model.yaml +++ b/configs/detection_heavy_model.yaml @@ -1,4 +1,5 @@ # Example configuration for training a predefined heavy detection model +# NOTE: This example downloads pretrained COCO weights and training parameters are already prepared for fine tuning model: name: detection_heavy @@ -22,6 +23,9 @@ trainer: keep_aspect_ratio: true normalize: active: true + params: + mean: [0., 0., 0.] + std: [1, 1, 1] batch_size: 8 epochs: &epochs 300 diff --git a/configs/detection_light_model.yaml b/configs/detection_light_model.yaml index 6cd85a0f..3a1f6205 100644 --- a/configs/detection_light_model.yaml +++ b/configs/detection_light_model.yaml @@ -23,6 +23,9 @@ trainer: keep_aspect_ratio: true normalize: active: true + params: + mean: [0., 0., 0.] + std: [1, 1, 1] batch_size: 8 epochs: &epochs 300 diff --git a/configs/instance_segmentation_heavy_model.yaml b/configs/instance_segmentation_heavy_model.yaml new file mode 100644 index 00000000..c1395f0c --- /dev/null +++ b/configs/instance_segmentation_heavy_model.yaml @@ -0,0 +1,54 @@ +# Example configuration for training a predefined heavy instance segmentation model + +model: + name: instance_segmentation_heavy + predefined_model: + name: InstanceSegmentationModel + params: + variant: heavy + loss_params: + bbox_loss_weight: 60 # Should be 7.5 * accumulate_grad_batches for best results + class_loss_weight: 4 # Should be 0.5 * accumulate_grad_batches for best results + dfl_loss_weight: 12 # Should be 1.5 * accumulate_grad_batches for best results + +loader: + params: + dataset_name: coco_test + +trainer: + preprocessing: + train_image_size: [384, 512] + keep_aspect_ratio: true + normalize: + active: true + params: + mean: [0., 0., 0.] 
+ std: [1, 1, 1] + + batch_size: 8 + epochs: &epochs 300 + accumulate_grad_batches: 8 # For best results, always accumulate gradients to effectively use 64 batch size + n_workers: 8 + validation_interval: 10 + n_log_images: 8 + + callbacks: + - name: EMACallback + params: + decay: 0.9999 + use_dynamic_decay: True + decay_tau: 2000 + - name: ExportOnTrainEnd + - name: TestOnTrainEnd + + training_strategy: + name: "TripleLRSGDStrategy" + params: + warmup_epochs: 3 + warmup_bias_lr: 0.1 + warmup_momentum: 0.8 + lr: 0.01 + lre: 0.0001 + momentum: 0.937 + weight_decay: 0.0005 + nesterov: True \ No newline at end of file diff --git a/configs/instance_segmentation_light_model.yaml b/configs/instance_segmentation_light_model.yaml new file mode 100644 index 00000000..1d1736e4 --- /dev/null +++ b/configs/instance_segmentation_light_model.yaml @@ -0,0 +1,54 @@ +# Example configuration for training a predefined light instance segmentation model + +model: + name: instance_segmentation_light + predefined_model: + name: InstanceSegmentationModel + params: + variant: light + loss_params: + bbox_loss_weight: 60 # Should be 7.5 * accumulate_grad_batches for best results + class_loss_weight: 4 # Should be 0.5 * accumulate_grad_batches for best results + dfl_loss_weight: 12 # Should be 1.5 * accumulate_grad_batches for best results + +loader: + params: + dataset_name: coco_test + +trainer: + preprocessing: + train_image_size: [384, 512] + keep_aspect_ratio: true + normalize: + active: true + params: + mean: [0., 0., 0.] + std: [1, 1, 1] + + batch_size: 8 + epochs: &epochs 300 + accumulate_grad_batches: 8 # For best results, always accumulate gradients to effectively use 64 batch size + n_workers: 8 + validation_interval: 10 + n_log_images: 8 + + callbacks: + - name: EMACallback + params: + decay: 0.9999 + use_dynamic_decay: True + decay_tau: 2000 + - name: ExportOnTrainEnd + - name: TestOnTrainEnd + + training_strategy: + name: "TripleLRSGDStrategy" + params: + warmup_epochs: 3 + warmup_bias_lr: 0.1 + warmup_momentum: 0.8 + lr: 0.01 + lre: 0.0001 + momentum: 0.937 + weight_decay: 0.0005 + nesterov: True \ No newline at end of file diff --git a/configs/keypoint_bbox_heavy_model.yaml b/configs/keypoint_bbox_heavy_model.yaml index 10527921..4fabc83b 100644 --- a/configs/keypoint_bbox_heavy_model.yaml +++ b/configs/keypoint_bbox_heavy_model.yaml @@ -6,6 +6,13 @@ model: name: KeypointDetectionModel params: variant: heavy + loss_params: + iou_type: "siou" + n_warmup_epochs: 0 # No assigner warmup + iou_loss_weight: 60 # Should be 7.5 * accumulate_grad_batches for best results + class_loss_weight: 4 # Should be 0.5 * accumulate_grad_batches for best results + regr_kpts_loss_weight: 96 # Should be 12 * accumulate_grad_batches for best results + vis_kpts_loss_weight: 16 # Should be 2 * accumulate_grad_batches for best results loader: params: @@ -17,29 +24,34 @@ trainer: keep_aspect_ratio: true normalize: active: true + params: + mean: [0., 0., 0.] 
+ std: [1, 1, 1] batch_size: 8 - epochs: &epochs 200 + epochs: &epochs 300 n_workers: 4 validation_interval: 10 n_log_images: 8 + accumulate_grad_batches: 8 # For best results, always accumulate gradients to effectively use 64 batch size callbacks: + - name: EMACallback + params: + decay: 0.9999 + use_dynamic_decay: True + decay_tau: 2000 - name: ExportOnTrainEnd - name: TestOnTrainEnd - optimizer: - name: SGD - params: - lr: 0.006 - momentum: 0.937 + training_strategy: + name: "TripleLRSGDStrategy" + params: + warmup_epochs: 3 + warmup_bias_lr: 0.1 + warmup_momentum: 0.8 + lr: 0.01 + lre: 0.0001 + momentum: 0.937 weight_decay: 0.0005 - dampening: 0.0 - nesterov: true - - scheduler: - name: CosineAnnealingLR - params: - T_max: *epochs - eta_min: 0.00001 - last_epoch: -1 + nesterov: True \ No newline at end of file diff --git a/configs/keypoint_bbox_light_model.yaml b/configs/keypoint_bbox_light_model.yaml index 57042b04..303dca31 100644 --- a/configs/keypoint_bbox_light_model.yaml +++ b/configs/keypoint_bbox_light_model.yaml @@ -6,6 +6,13 @@ model: name: KeypointDetectionModel params: variant: light + loss_params: + iou_type: "siou" + n_warmup_epochs: 0 # No assigner warmup + iou_loss_weight: 60 # Should be 7.5 * accumulate_grad_batches for best results + class_loss_weight: 4 # Should be 0.5 * accumulate_grad_batches for best results + regr_kpts_loss_weight: 96 # Should be 12 * accumulate_grad_batches for best results + vis_kpts_loss_weight: 16 # Should be 2 * accumulate_grad_batches for best results loader: params: @@ -17,29 +24,34 @@ trainer: keep_aspect_ratio: true normalize: active: true + params: + mean: [0., 0., 0.] + std: [1, 1, 1] batch_size: 8 - epochs: &epochs 200 + epochs: &epochs 300 n_workers: 4 validation_interval: 10 n_log_images: 8 + accumulate_grad_batches: 8 # For best results, always accumulate gradients to effectively use 64 batch size callbacks: + - name: EMACallback + params: + decay: 0.9999 + use_dynamic_decay: True + decay_tau: 2000 - name: ExportOnTrainEnd - name: TestOnTrainEnd - optimizer: - name: SGD - params: - lr: 0.006 - momentum: 0.937 + training_strategy: + name: "TripleLRSGDStrategy" + params: + warmup_epochs: 3 + warmup_bias_lr: 0.1 + warmup_momentum: 0.8 + lr: 0.01 + lre: 0.0001 + momentum: 0.937 weight_decay: 0.0005 - dampening: 0.0 - nesterov: true - - scheduler: - name: CosineAnnealingLR - params: - T_max: *epochs - eta_min: 0.00001 - last_epoch: -1 + nesterov: True diff --git a/luxonis_train/assigners/tal_assigner.py b/luxonis_train/assigners/tal_assigner.py index b289fbd6..c40b4934 100644 --- a/luxonis_train/assigners/tal_assigner.py +++ b/luxonis_train/assigners/tal_assigner.py @@ -254,10 +254,4 @@ def _get_final_assignments( torch.full_like(assigned_scores, 0), ) - assigned_labels = torch.where( - mask_pos_sum.bool(), - assigned_labels, - torch.full_like(assigned_labels, self.n_classes), - ) - return assigned_labels, assigned_bboxes, assigned_scores diff --git a/luxonis_train/attached_modules/losses/README.md b/luxonis_train/attached_modules/losses/README.md index aa1b9ca6..ffe218d4 100644 --- a/luxonis_train/attached_modules/losses/README.md +++ b/luxonis_train/attached_modules/losses/README.md @@ -12,6 +12,8 @@ List of all the available loss functions. 
- [`AdaptiveDetectionLoss`](#adaptivedetectionloss) - [`EfficientKeypointBBoxLoss`](#efficientkeypointbboxloss) - [`FOMOLocalizationLoss`](#fomolocalizationLoss) +- [`PrecisionDFLDetectionLoss`](#precisiondfldetectionloss) +- [`PrecisionDFLSegmentationLoss`](#precisiondflsegmentationloss) + ## `CrossEntropyLoss` @@ -97,7 +99,7 @@ Keypoint Similarity Loss](https://arxiv.org/ftp/arxiv/papers/2204/2204.06806.pdf | `class_loss_weight` | `float` | `1.0` | Weight used for the classification sub-loss | | `iou_loss_weight` | `float` | `2.5` | Weight used for the `IoU` sub-loss | | `regr_kpts_loss_weight` | `float` | `1.5` | Weight used for the `OKS` sub-loss | -| `vis_kpts_loss_weight` | `float` | `1.0` | Weight used for the keypoint visibility sub-loss | +| `vis_kpts_loss_weight` | `float` | `2.0` | Weight used for the keypoint visibility sub-loss | | `sigmas` | `list[float] \ None` | `None` | Sigmas used in `KeypointLoss` for `OKS` metric. If `None` then use COCO ones if possible or default ones | | `area_factor` | `float \| None` | `None` | Factor by which we multiply bounding box area which is used in `KeypointLoss.` If `None` then use default one | @@ -120,4 +122,30 @@ Adapted from [here](https://arxiv.org/abs/2108.07610). | Key | Type | Default value | Description | | --------------- | ------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `object_weight` | `float` | `1000` | Weight for the objects in the loss calculation. Training with a larger `object_weight` in the loss parameters may result in more false positives (FP), but it will improve accuracy. | +| `object_weight` | `float` | `500` | Weight for the objects in the loss calculation. Training with a larger `object_weight` in the loss parameters may result in more false positives (FP), but it will improve accuracy. | + +## `PrecisionDFLDetectionLoss` + +Adapted from [here](https://arxiv.org/pdf/2207.02696.pdf) and [here](https://arxiv.org/pdf/2209.02976.pdf). + +**Parameters:** + +| Key | Type | Default value | Description | | ------------------- | ------- | ------------- | ------------------------------------------ | +| `tal_topk` | `int` | `10` | Number of anchors considered in selection. | +| `class_loss_weight` | `float` | `0.5` | Weight for classification loss. | +| `bbox_loss_weight` | `float` | `7.5` | Weight for bbox loss. | +| `dfl_loss_weight` | `float` | `1.5` | Weight for DFL loss. | + +## `PrecisionDFLSegmentationLoss` + +Adapted from [here](https://arxiv.org/pdf/2207.02696.pdf) and [here](https://arxiv.org/pdf/2209.02976.pdf). + +**Parameters:** + +| Key | Type | Default value | Description | | ------------------- | ------- | ------------- | ------------------------------------------ | +| `tal_topk` | `int` | `10` | Number of anchors considered in selection. | +| `class_loss_weight` | `float` | `0.5` | Weight for classification loss. | +| `bbox_loss_weight` | `float` | `7.5` | Weight for bbox and segmentation loss. | +| `dfl_loss_weight` | `float` | `1.5` | Weight for DFL loss.
| diff --git a/luxonis_train/attached_modules/losses/__init__.py b/luxonis_train/attached_modules/losses/__init__.py index ff0bafc8..32b33174 100644 --- a/luxonis_train/attached_modules/losses/__init__.py +++ b/luxonis_train/attached_modules/losses/__init__.py @@ -7,6 +7,8 @@ from .ohem_bce_with_logits import OHEMBCEWithLogitsLoss from .ohem_cross_entropy import OHEMCrossEntropyLoss from .ohem_loss import OHEMLoss +from .precision_dfl_detection_loss import PrecisionDFLDetectionLoss +from .precision_dlf_segmentation_loss import PrecisionDFLSegmentationLoss from .reconstruction_segmentation_loss import ReconstructionSegmentationLoss from .sigmoid_focal_loss import SigmoidFocalLoss from .smooth_bce_with_logits import SmoothBCEWithLogitsLoss @@ -26,4 +28,6 @@ "OHEMCrossEntropyLoss", "OHEMBCEWithLogitsLoss", "FOMOLocalizationLoss", + "PrecisionDFLDetectionLoss", + "PrecisionDFLSegmentationLoss", ] diff --git a/luxonis_train/attached_modules/losses/adaptive_detection_loss.py b/luxonis_train/attached_modules/losses/adaptive_detection_loss.py index 6a7f57f2..5e212249 100644 --- a/luxonis_train/attached_modules/losses/adaptive_detection_loss.py +++ b/luxonis_train/attached_modules/losses/adaptive_detection_loss.py @@ -56,9 +56,9 @@ def __init__( @type reduction: Literal["sum", "mean"] @param reduction: Reduction type for loss. @type class_loss_weight: float - @param class_loss_weight: Weight of classification loss. + @param class_loss_weight: Weight of classification loss. Defaults to 1.0. For optimal results, multiply with accumulate_grad_batches. @type iou_loss_weight: float - @param iou_loss_weight: Weight of IoU loss. + @param iou_loss_weight: Weight of IoU loss. Defaults to 2.5. For optimal results, multiply with accumulate_grad_batches. """ super().__init__(**kwargs) @@ -133,6 +133,11 @@ def forward( assigned_scores: Tensor, mask_positive: Tensor, ) -> tuple[Tensor, dict[str, Tensor]]: + assigned_labels = torch.where( + mask_positive > 0, + assigned_labels, + torch.full_like(assigned_labels, self.n_classes), + ) one_hot_label = F.one_hot(assigned_labels.long(), self.n_classes + 1)[ ..., :-1 ] diff --git a/luxonis_train/attached_modules/losses/efficient_keypoint_bbox_loss.py b/luxonis_train/attached_modules/losses/efficient_keypoint_bbox_loss.py index 98630742..09cf7124 100644 --- a/luxonis_train/attached_modules/losses/efficient_keypoint_bbox_loss.py +++ b/luxonis_train/attached_modules/losses/efficient_keypoint_bbox_loss.py @@ -38,7 +38,7 @@ def __init__( iou_loss_weight: float = 7.5, viz_pw: float = 1.0, regr_kpts_loss_weight: float = 12, - vis_kpts_loss_weight: float = 1.0, + vis_kpts_loss_weight: float = 2.0, sigmas: list[float] | None = None, area_factor: float | None = None, **kwargs: Any, @@ -57,11 +57,11 @@ def __init__( @type class_loss_weight: float @param class_loss_weight: Weight of classification loss for bounding boxes. @type regr_kpts_loss_weight: float - @param regr_kpts_loss_weight: Weight of regression loss for keypoints. + @param regr_kpts_loss_weight: Weight of regression loss for keypoints. Defaults to 12.0. For optimal results, multiply with accumulate_grad_batches. @type vis_kpts_loss_weight: float - @param vis_kpts_loss_weight: Weight of visibility loss for keypoints. + @param vis_kpts_loss_weight: Weight of visibility loss for keypoints. Defaults to 2.0. For optimal results, multiply with accumulate_grad_batches. @type iou_loss_weight: float - @param iou_loss_weight: Weight of IoU loss. + @param iou_loss_weight: Weight of IoU loss. Defaults to 7.5.
For optimal results, multiply with accumulate_grad_batches. @type sigmas: list[float] | None @param sigmas: Sigmas used in keypoint loss for OKS metric. If None then use COCO ones if possible or default ones. Defaults to C{None}. @type area_factor: float | None @@ -188,6 +188,11 @@ def forward( pred_kpts: Tensor, area: Tensor, ) -> tuple[Tensor, dict[str, Tensor]]: + assigned_labels = torch.where( + mask_positive > 0, + assigned_labels, + torch.full_like(assigned_labels, self.n_classes), + ) device = pred_bboxes.device sigmas = self.sigmas.to(device) d = (gt_kpts[..., 0] - pred_kpts[..., 0]).pow(2) + ( diff --git a/luxonis_train/attached_modules/losses/precision_dfl_detection_loss.py b/luxonis_train/attached_modules/losses/precision_dfl_detection_loss.py new file mode 100644 index 00000000..14817ce4 --- /dev/null +++ b/luxonis_train/attached_modules/losses/precision_dfl_detection_loss.py @@ -0,0 +1,292 @@ +import logging +from typing import Any, cast + +import torch +import torch.nn.functional as F +from torch import Tensor, nn +from torchvision.ops import box_convert + +from luxonis_train.assigners import TaskAlignedAssigner +from luxonis_train.enums import TaskType +from luxonis_train.nodes import PrecisionBBoxHead +from luxonis_train.utils import ( + Labels, + Packet, + anchors_for_fpn_features, + bbox2dist, + bbox_iou, + dist2bbox, +) + +from .base_loss import BaseLoss + +logger = logging.getLogger(__name__) + + +class PrecisionDFLDetectionLoss( + BaseLoss[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor] +): + node: PrecisionBBoxHead + supported_tasks: list[TaskType] = [TaskType.BOUNDINGBOX] + + def __init__( + self, + tal_topk: int = 10, + class_loss_weight: float = 0.5, + bbox_loss_weight: float = 7.5, + dfl_loss_weight: float = 1.5, + **kwargs: Any, + ): + """BBox loss adapted from U{Real-Time Flying Object Detection with YOLOv8 + } and from U{YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications + }. + Code is adapted from U{https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/models}. + + @type tal_topk: int + @param tal_topk: Number of anchors considered in selection. Defaults to 10. + @type class_loss_weight: float + @param class_loss_weight: Weight for classification loss. Defaults to 0.5. For optimal results, multiply with accumulate_grad_batches. + @type bbox_loss_weight: float + @param bbox_loss_weight: Weight for bbox loss. Defaults to 7.5. For optimal results, multiply with accumulate_grad_batches. + @type dfl_loss_weight: float + @param dfl_loss_weight: Weight for DFL loss. Defaults to 1.5. For optimal results, multiply with accumulate_grad_batches. 
+ """ + super().__init__(**kwargs) + self.stride = self.node.stride + self.grid_cell_size = self.node.grid_cell_size + self.grid_cell_offset = self.node.grid_cell_offset + self.original_img_size = self.original_in_shape[1:] + + self.class_loss_weight = class_loss_weight + self.bbox_loss_weight = bbox_loss_weight + self.dfl_loss_weight = dfl_loss_weight + + self.assigner = TaskAlignedAssigner( + n_classes=self.n_classes, topk=tal_topk, alpha=0.5, beta=6.0 + ) + self.bbox_loss = CustomBboxLoss(self.node.reg_max) + self.proj = torch.arange(self.node.reg_max, dtype=torch.float) + self.bce = nn.BCEWithLogitsLoss(reduction="none") + + def prepare( + self, inputs: Packet[Tensor], labels: Labels + ) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: + feats = self.get_input_tensors(inputs, "features") + self._init_parameters(feats) + batch_size = feats[0].shape[0] + pred_distri, pred_scores = torch.cat( + [xi.view(batch_size, self.node.no, -1) for xi in feats], 2 + ).split((self.node.reg_max * 4, self.n_classes), 1) + target = self.get_label(labels) + pred_distri = pred_distri.permute(0, 2, 1).contiguous() + pred_scores = pred_scores.permute(0, 2, 1).contiguous() + + target = self._preprocess_bbox_target(target, batch_size) + + pred_bboxes = self.decode_bbox(self.anchor_points_strided, pred_distri) + + gt_labels = target[:, :, :1] + gt_xyxy = target[:, :, 1:] + mask_gt = (gt_xyxy.sum(-1, keepdim=True) > 0).float() + + _, assigned_bboxes, assigned_scores, mask_positive, _ = self.assigner( + pred_scores.detach().sigmoid(), + (pred_bboxes.detach() * self.stride_tensor).type(gt_xyxy.dtype), + self.anchor_points, + gt_labels, + gt_xyxy, + mask_gt, + ) + + return ( + pred_distri, + pred_bboxes, + pred_scores, + assigned_bboxes / self.stride_tensor, + assigned_scores, + mask_positive, + ) + + def forward( + self, + pred_distri: Tensor, + pred_bboxes: Tensor, + pred_scores: Tensor, + assigned_bboxes: Tensor, + assigned_scores: Tensor, + mask_positive: Tensor, + ): + max_assigned_scores_sum = max(assigned_scores.sum().item(), 1) + loss_cls = ( + self.bce(pred_scores, assigned_scores) + ).sum() / max_assigned_scores_sum + if mask_positive.sum(): + loss_iou, loss_dfl = self.bbox_loss( + pred_distri, + pred_bboxes, + self.anchor_points_strided, + assigned_bboxes, + assigned_scores, + max_assigned_scores_sum, + mask_positive, + ) + else: + loss_iou = torch.tensor(0.0).to(pred_distri.device) + loss_dfl = torch.tensor(0.0).to(pred_distri.device) + + loss = ( + self.class_loss_weight * loss_cls + + self.bbox_loss_weight * loss_iou + + self.dfl_loss_weight * loss_dfl + ) + sub_losses = { + "class": loss_cls.detach(), + "iou": loss_iou.detach(), + "dfl": loss_dfl.detach(), + } + + return loss, sub_losses + + def _preprocess_bbox_target( + self, target: Tensor, batch_size: int + ) -> Tensor: + sample_ids, counts = cast( + tuple[Tensor, Tensor], + torch.unique(target[:, 0].int(), return_counts=True), + ) + c_max = int(counts.max()) if counts.numel() > 0 else 0 + out_target = torch.zeros(batch_size, c_max, 5, device=target.device) + out_target[:, :, 0] = -1 + for id, count in zip(sample_ids, counts): + out_target[id, :count] = target[target[:, 0] == id][:, 1:] + + scaled_target = out_target[:, :, 1:5] * self.gt_bboxes_scale + out_target[..., 1:] = box_convert(scaled_target, "xywh", "xyxy") + + return out_target + + def decode_bbox(self, anchor_points: Tensor, pred_dist: Tensor) -> Tensor: + """Decode predicted object bounding box coordinates from anchor + points and distribution. 
+ + @type anchor_points: Tensor + @param anchor_points: Anchor points tensor of shape [N, 4] where + N is the number of anchors. + @type pred_dist: Tensor + @param pred_dist: Predicted distribution tensor of shape + [batch_size, N, 4 * reg_max] where N is the number of + anchors. + @rtype: Tensor + """ + if self.node.dfl: + batch_size, num_anchors, num_channels = pred_dist.shape + dist_probs = pred_dist.view( + batch_size, num_anchors, 4, num_channels // 4 + ).softmax(dim=3) + dist_transformed = dist_probs.matmul( + self.proj.to(anchor_points.device).type(pred_dist.dtype) + ) + return dist2bbox(dist_transformed, anchor_points, out_format="xyxy") + + def _init_parameters(self, features: list[Tensor]): + if not hasattr(self, "gt_bboxes_scale"): + _, self.anchor_points, _, self.stride_tensor = ( + anchors_for_fpn_features( + features, + self.stride, + self.grid_cell_size, + self.grid_cell_offset, + multiply_with_stride=True, + ) + ) + self.gt_bboxes_scale = torch.tensor( + [ + self.original_img_size[1], + self.original_img_size[0], + self.original_img_size[1], + self.original_img_size[0], + ], + device=features[0].device, + ) + self.anchor_points_strided = ( + self.anchor_points / self.stride_tensor + ) + + +class CustomBboxLoss(nn.Module): + def __init__(self, reg_max: int = 16): + """BBox loss that combines IoU and DFL losses. + + @type reg_max: int + @param reg_max: Maximum number of regression channels. Defaults + to 16. + """ + super().__init__() + self.dist_loss = CustomDFLoss(reg_max) if reg_max > 1 else None + + def forward( + self, + pred_dist: Tensor, + pred_bboxes: Tensor, + anchors: Tensor, + targets: Tensor, + scores: Tensor, + total_score: Tensor, + fg_mask: Tensor, + ) -> tuple[Tensor, Tensor]: + score_weights = scores.sum(dim=-1)[fg_mask].unsqueeze(dim=-1) + + iou_vals = bbox_iou( + pred_bboxes[fg_mask], + targets[fg_mask], + iou_type="ciou", + element_wise=True, + ).unsqueeze(dim=-1) + iou_loss_val = ((1.0 - iou_vals) * score_weights).sum() / total_score + + if self.dist_loss is not None: + offset_targets = bbox2dist( + targets, anchors, self.dist_loss.reg_max - 1 + ) + dfl_loss_val = ( + self.dist_loss( + pred_dist[fg_mask].view(-1, self.dist_loss.reg_max), + offset_targets[fg_mask], + ) + * score_weights + ) + dfl_loss_val = dfl_loss_val.sum() / total_score + else: + dfl_loss_val = torch.zeros(1, device=pred_dist.device) + + return iou_loss_val, dfl_loss_val + + +class CustomDFLoss(nn.Module): + def __init__(self, reg_max: int = 16): + """DFL loss that combines classification and regression losses. + + @type reg_max: int + @param reg_max: Maximum number of regression channels. Defaults + to 16. 
+ """ + super().__init__() + self.reg_max = reg_max + + def __call__(self, pred_dist: Tensor, targets: Tensor) -> Tensor: + targets = targets.clamp(0, self.reg_max - 1 - 0.01) + left_target = targets.floor().long() + right_target = left_target + 1 + weight_left = right_target - targets + weight_right = 1.0 - weight_left + + left_val = F.cross_entropy( + pred_dist, left_target.view(-1), reduction="none" + ).view(left_target.shape) + right_val = F.cross_entropy( + pred_dist, right_target.view(-1), reduction="none" + ).view(left_target.shape) + + return (left_val * weight_left + right_val * weight_right).mean( + dim=-1, keepdim=True + ) diff --git a/luxonis_train/attached_modules/losses/precision_dlf_segmentation_loss.py b/luxonis_train/attached_modules/losses/precision_dlf_segmentation_loss.py new file mode 100644 index 00000000..54f960a9 --- /dev/null +++ b/luxonis_train/attached_modules/losses/precision_dlf_segmentation_loss.py @@ -0,0 +1,248 @@ +import logging +from typing import Any + +import torch +import torch.nn.functional as F +from torch import Tensor +from torchvision.ops import box_convert + +from luxonis_train.attached_modules.losses.precision_dfl_detection_loss import ( + PrecisionDFLDetectionLoss, +) +from luxonis_train.enums import TaskType +from luxonis_train.nodes import PrecisionSegmentBBoxHead +from luxonis_train.utils import ( + Labels, + Packet, + apply_bounding_box_to_masks, +) + +logger = logging.getLogger(__name__) + + +class PrecisionDFLSegmentationLoss(PrecisionDFLDetectionLoss): + node: PrecisionSegmentBBoxHead + supported_tasks: list[TaskType] = [ + TaskType.BOUNDINGBOX, + TaskType.INSTANCE_SEGMENTATION, + ] + + def __init__( + self, + tal_topk: int = 10, + class_loss_weight: float = 0.5, + bbox_loss_weight: float = 7.5, + dfl_loss_weight: float = 1.5, + **kwargs: Any, + ): + """Instance Segmentation and BBox loss adapted from U{Real-Time Flying Object Detection with YOLOv8 + } and from U{YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications + }. + Code is adapted from U{https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/models}. + + @type tal_topk: int + @param tal_topk: Number of anchors considered in selection. Defaults to 10. + @type class_loss_weight: float + @param class_loss_weight: Weight for classification loss. Defaults to 0.5. For optimal results, multiply with accumulate_grad_batches. + @type bbox_loss_weight: float + @param bbox_loss_weight: Weight for bbox loss. Defaults to 7.5. For optimal results, multiply with accumulate_grad_batches. + @type dfl_loss_weight: float + @param dfl_loss_weight: Weight for DFL loss. Defaults to 1.5. For optimal results, multiply with accumulate_grad_batches. 
+ """ + super().__init__( + tal_topk=tal_topk, + class_loss_weight=class_loss_weight, + bbox_loss_weight=bbox_loss_weight, + dfl_loss_weight=dfl_loss_weight, + **kwargs, + ) + + def prepare( + self, inputs: Packet[Tensor], labels: Labels + ) -> tuple[ + Tensor, + Tensor, + Tensor, + Tensor, + Tensor, + Tensor, + Tensor, + Tensor, + Tensor, + Tensor, + Tensor, + ]: + det_feats = self.get_input_tensors(inputs, "features") + proto = self.get_input_tensors(inputs, "prototypes")[0] + pred_mask = self.get_input_tensors(inputs, "mask_coeficients")[0] + self._init_parameters(det_feats) + batch_size, _, mask_h, mask_w = proto.shape + pred_distri, pred_scores = torch.cat( + [xi.view(batch_size, self.node.no, -1) for xi in det_feats], 2 + ).split((self.node.reg_max * 4, self.n_classes), 1) + target_bbox = self.get_label(labels, TaskType.BOUNDINGBOX) + img_idx = target_bbox[:, 0].unsqueeze(-1) + target_masks = self.get_label(labels, TaskType.INSTANCE_SEGMENTATION) + if tuple(target_masks.shape[-2:]) != (mask_h, mask_w): + target_masks = F.interpolate( + target_masks.unsqueeze(0), (mask_h, mask_w), mode="nearest" + ).squeeze(0) + + pred_distri = pred_distri.permute(0, 2, 1).contiguous() + pred_scores = pred_scores.permute(0, 2, 1).contiguous() + pred_mask = pred_mask.permute(0, 2, 1).contiguous() + + target_bbox = self._preprocess_bbox_target(target_bbox, batch_size) + + pred_bboxes = self.decode_bbox(self.anchor_points_strided, pred_distri) + + gt_labels = target_bbox[:, :, :1] + gt_xyxy = target_bbox[:, :, 1:] + mask_gt = (gt_xyxy.sum(-1, keepdim=True) > 0).float() + + _, assigned_bboxes, assigned_scores, mask_positive, assigned_gt_idx = ( + self.assigner( + pred_scores.detach().sigmoid(), + (pred_bboxes.detach() * self.stride_tensor).type( + gt_xyxy.dtype + ), + self.anchor_points, + gt_labels, + gt_xyxy, + mask_gt, + ) + ) + + return ( + pred_distri, + pred_bboxes, + pred_scores, + assigned_bboxes, + assigned_scores, + mask_positive, + assigned_gt_idx, + pred_mask, + proto, + target_masks, + img_idx, + ) + + def forward( + self, + pred_distri: Tensor, + pred_bboxes: Tensor, + pred_scores: Tensor, + assigned_bboxes: Tensor, + assigned_scores: Tensor, + mask_positive: Tensor, + assigned_gt_idx: Tensor, + pred_masks: Tensor, + proto: Tensor, + target_masks: Tensor, + img_idx: Tensor, + ): + max_assigned_scores_sum = max(assigned_scores.sum().item(), 1) + loss_cls = ( + self.bce(pred_scores, assigned_scores) + ).sum() / max_assigned_scores_sum + if mask_positive.sum(): + loss_iou, loss_dfl = self.bbox_loss( + pred_distri, + pred_bboxes, + self.anchor_points_strided, + assigned_bboxes / self.stride_tensor, + assigned_scores, + max_assigned_scores_sum, + mask_positive, + ) + else: + loss_iou = torch.tensor(0.0).to(pred_distri.device) + loss_dfl = torch.tensor(0.0).to(pred_distri.device) + + loss_seg = self.compute_segmentation_loss( + mask_positive, + target_masks, + assigned_gt_idx, + assigned_bboxes, + img_idx, + proto, + pred_masks, + ) + + loss = ( + self.class_loss_weight * loss_cls + + self.bbox_loss_weight * loss_iou + + self.dfl_loss_weight * loss_dfl + + self.bbox_loss_weight * loss_seg + ) + sub_losses = { + "class": loss_cls.detach(), + "iou": loss_iou.detach(), + "dfl": loss_dfl.detach(), + "seg": loss_seg.detach(), + } + + return loss, sub_losses + + def compute_segmentation_loss( + self, + fg_mask: torch.Tensor, + gt_masks: torch.Tensor, + gt_idx: torch.Tensor, + bboxes: torch.Tensor, + batch_ids: torch.Tensor, + proto: torch.Tensor, + pred_masks: torch.Tensor, + ) -> torch.Tensor: + 
"""Compute the segmentation loss for the entire batch. + + @type fg_mask: torch.Tensor + @param fg_mask: Foreground mask. Shape: (B, N_anchor). + @type gt_masks: torch.Tensor + @param gt_masks: Ground truth masks. Shape: (n, H, W). + @type gt_idx: torch.Tensor + @param gt_idx: Ground truth mask indices. Shape: (B, N_anchor). + @type bboxes: torch.Tensor + @param bboxes: Ground truth bounding boxes in xyxy format. + Shape: (B, N_anchor, 4). + @type batch_ids: torch.Tensor + @param batch_ids: Batch indices. Shape: (n, 1). + @type proto: torch.Tensor + @param proto: Prototype masks. Shape: (B, 32, H, W). + @type pred_masks: torch.Tensor + @param pred_masks: Predicted mask coefficients. Shape: (B, + N_anchor, 32). + """ + _, _, h, w = proto.shape + total_loss = 0 + bboxes_norm = bboxes / self.gt_bboxes_scale + bbox_area = box_convert(bboxes_norm, in_fmt="xyxy", out_fmt="xywh")[ + ..., 2: + ].prod(2) + bboxes_scaled = bboxes_norm * torch.tensor( + [w, h, w, h], device=proto.device + ) + + for img_idx, data in enumerate( + zip(fg_mask, gt_idx, pred_masks, proto, bboxes_scaled, bbox_area) + ): + fg, gt, pred, pr, bbox, area = data + if fg.any(): + mask_ids = gt[fg] + gt_mask = gt_masks[batch_ids.view(-1) == img_idx][mask_ids] + + # Compute individual image mask loss + pred_mask = torch.einsum("in,nhw->ihw", pred[fg], pr) + loss = F.binary_cross_entropy_with_logits( + pred_mask, gt_mask, reduction="none" + ) + total_loss += ( + apply_bounding_box_to_masks(loss, bbox[fg]).mean( + dim=(1, 2) + ) + / area[fg] + ).sum() + else: + total_loss += (proto * 0).sum() + (pred_masks * 0).sum() + + return total_loss / fg_mask.sum() diff --git a/luxonis_train/attached_modules/metrics/mean_average_precision.py b/luxonis_train/attached_modules/metrics/mean_average_precision.py index c082ee39..53d1a7f9 100644 --- a/luxonis_train/attached_modules/metrics/mean_average_precision.py +++ b/luxonis_train/attached_modules/metrics/mean_average_precision.py @@ -1,5 +1,6 @@ from typing import Any +import torch import torchmetrics.detection as detection from torch import Tensor from torchvision.ops import box_convert @@ -14,18 +15,30 @@ class MeanAveragePrecision( BaseMetric[list[dict[str, Tensor]], list[dict[str, Tensor]]] ): """Compute the Mean-Average-Precision (mAP) and Mean-Average-Recall - (mAR) for object detection predictions. + (mAR) for object detection predictions and instance segmentation. Adapted from U{Mean-Average-Precision (mAP) and Mean-Average-Recall (mAR) }. 
""" - supported_tasks: list[TaskType] = [TaskType.BOUNDINGBOX] + supported_tasks: list[TaskType] = [ + TaskType.BOUNDINGBOX, + TaskType.INSTANCE_SEGMENTATION, + ] def __init__(self, **kwargs: Any): super().__init__(**kwargs) - self.metric = detection.MeanAveragePrecision() + self.is_segmentation = (self.node.tasks is not None) and ( + TaskType.INSTANCE_SEGMENTATION in self.node.tasks + ) + + if self.is_segmentation: + iou_type = ("bbox", "segm") + else: + iou_type = "bbox" + + self.metric = detection.MeanAveragePrecision(iou_type=iou_type) # type: ignore def update( self, @@ -37,29 +50,53 @@ def update( def prepare( self, inputs: Packet[Tensor], labels: Labels ) -> tuple[list[dict[str, Tensor]], list[dict[str, Tensor]]]: - box_label = self.get_label(labels) - output_nms = self.get_input_tensors(inputs) - + box_label = self.get_label(labels, TaskType.BOUNDINGBOX) + mask_label = ( + self.get_label(labels, TaskType.INSTANCE_SEGMENTATION) + if self.is_segmentation + else None + ) + + output_nms_bboxes = self.get_input_tensors( + inputs, TaskType.BOUNDINGBOX + ) + output_nms_masks = ( + self.get_input_tensors(inputs, TaskType.INSTANCE_SEGMENTATION) + if self.is_segmentation + else None + ) image_size = self.original_in_shape[1:] output_list: list[dict[str, Tensor]] = [] label_list: list[dict[str, Tensor]] = [] - for i in range(len(output_nms)): - output_list.append( - { - "boxes": output_nms[i][:, :4], - "scores": output_nms[i][:, 4], - "labels": output_nms[i][:, 5].int(), - } - ) - + for i in range(len(output_nms_bboxes)): + # Prepare predictions + pred = { + "boxes": output_nms_bboxes[i][:, :4], + "scores": output_nms_bboxes[i][:, 4], + "labels": output_nms_bboxes[i][:, 5].int(), + } + if self.is_segmentation: + pred["masks"] = output_nms_masks[i].to( # type: ignore + dtype=torch.bool + ) # Predicted masks (M, H, W) + output_list.append(pred) + + # Prepare ground truth curr_label = box_label[box_label[:, 0] == i] curr_bboxs = box_convert(curr_label[:, 2:], "xywh", "xyxy") curr_bboxs[:, 0::2] *= image_size[1] curr_bboxs[:, 1::2] *= image_size[0] - label_list.append( - {"boxes": curr_bboxs, "labels": curr_label[:, 1].int()} - ) + + gt = { + "boxes": curr_bboxs, + "labels": curr_label[:, 1].int(), + } + if self.is_segmentation: + gt["masks"] = mask_label[box_label[:, 0] == i].to( # type: ignore + dtype=torch.bool + ) + label_list.append(gt) return output_list, label_list @@ -69,21 +106,52 @@ def reset(self) -> None: def compute(self) -> tuple[Tensor, dict[str, Tensor]]: metric_dict: dict[str, Tensor] = self.metric.compute() - del metric_dict["classes"] - del metric_dict["map_per_class"] - del metric_dict["mar_100_per_class"] - for key in list(metric_dict.keys()): - if "map" in key: - map = metric_dict[key] - mar_key = key.replace("map", "mar") - if mar_key in metric_dict: - mar = metric_dict[mar_key] - metric_dict[key.replace("map", "f1")] = ( - 2 * (map * mar) / (map + mar) - ) - - map = metric_dict.pop("map") + if self.is_segmentation: + keys_to_remove = [ + "classes", + "bbox_map_per_class", + "bbox_mar_100_per_class", + "segm_map_per_class", + "segm_mar_100_per_class", + ] + for key in keys_to_remove: + if key in metric_dict: + del metric_dict[key] + + for key in list(metric_dict.keys()): + if "map" in key: + map_metric = metric_dict[key] + mar_key = key.replace("map", "mar") + if mar_key in metric_dict: + mar_metric = metric_dict[mar_key] + metric_dict[key.replace("map", "f1")] = ( + 2 + * (map_metric * mar_metric) + / (map_metric + mar_metric) + ) + + scalar = metric_dict.get("segm_map", 
torch.tensor(0.0)) + else: + del metric_dict["classes"] + del metric_dict["map_per_class"] + del metric_dict["mar_100_per_class"] + + for key in list(metric_dict.keys()): + if "map" in key: + map_metric = metric_dict[key] + mar_key = key.replace("map", "mar") + if mar_key in metric_dict: + mar_metric = metric_dict[mar_key] + metric_dict[key.replace("map", "f1")] = ( + 2 + * (map_metric * mar_metric) + / (map_metric + mar_metric) + ) + + scalar = metric_dict.pop("map", torch.tensor(0.0)) + # WARNING: fix DDP pl.log error - map = map.to(self.device) metric_dict = {k: v.to(self.device) for k, v in metric_dict.items()} - return map, metric_dict + scalar = scalar.to(self.device) + + return scalar, metric_dict diff --git a/luxonis_train/attached_modules/visualizers/__init__.py b/luxonis_train/attached_modules/visualizers/__init__.py index 50b90471..1bd65f50 100644 --- a/luxonis_train/attached_modules/visualizers/__init__.py +++ b/luxonis_train/attached_modules/visualizers/__init__.py @@ -1,6 +1,7 @@ from .base_visualizer import BaseVisualizer from .bbox_visualizer import BBoxVisualizer from .classification_visualizer import ClassificationVisualizer +from .instance_segmentation_visualizer import InstanceSegmentationVisualizer from .keypoint_visualizer import KeypointVisualizer from .multi_visualizer import MultiVisualizer from .segmentation_visualizer import SegmentationVisualizer @@ -23,6 +24,7 @@ "KeypointVisualizer", "MultiVisualizer", "SegmentationVisualizer", + "InstanceSegmentationVisualizer", "combine_visualizations", "draw_bounding_box_labels", "draw_keypoint_labels", diff --git a/luxonis_train/attached_modules/visualizers/instance_segmentation_visualizer.py b/luxonis_train/attached_modules/visualizers/instance_segmentation_visualizer.py new file mode 100644 index 00000000..829cfbb8 --- /dev/null +++ b/luxonis_train/attached_modules/visualizers/instance_segmentation_visualizer.py @@ -0,0 +1,262 @@ +import logging + +import torch +from torch import Tensor + +from luxonis_train.enums import TaskType +from luxonis_train.utils import Labels, Packet + +from .base_visualizer import BaseVisualizer +from .utils import ( + Color, + draw_bounding_box_labels, + draw_bounding_boxes, + draw_segmentation_labels, + get_color, +) + +logger = logging.getLogger(__name__) + + +class InstanceSegmentationVisualizer(BaseVisualizer[Tensor, Tensor]): + """Visualizer for instance segmentation tasks, supporting the + visualization of predicted and ground truth bounding boxes and + instance segmentation masks.""" + + supported_tasks: list[TaskType] = [ + TaskType.INSTANCE_SEGMENTATION, + TaskType.BOUNDINGBOX, + ] + + def __init__( + self, + labels: dict[int, str] | list[str] | None = None, + draw_labels: bool = True, + colors: dict[str, Color] | list[Color] | None = None, + fill: bool = False, + width: int | None = None, + font: str | None = None, + font_size: int | None = None, + alpha: float = 0.6, + **kwargs, + ): + """Visualizer for instance segmentation tasks. + + @type labels: dict[int, str] | list[str] | None + @param labels: Dictionary mapping class indices to class labels. + @type draw_labels: bool + @param draw_labels: Whether to draw class labels on the + visualizations. + @type colors: dict[str, L{Color}] | list[L{Color}] | None + @param colors: Dictionary mapping class labels to colors. + @type fill: bool | None + @param fill: Whether to fill the bounding box with color. + @type width: int | None + @param width: Width of the bounding box lines.
+ @type font: str | None + @param font: Font of the class labels. + @type font_size: int | None + @param font_size: Font size of the class labels. + @type alpha: float + @param alpha: Alpha value of the segmentation masks. Defaults to + C{0.6}. + """ + super().__init__(**kwargs) + + if isinstance(labels, list): + labels = {i: label for i, label in enumerate(labels)} + + self.bbox_labels = labels or { + i: label for i, label in enumerate(self.class_names) + } + + if colors is None: + colors = { + label: get_color(i) for i, label in self.bbox_labels.items() + } + if isinstance(colors, list): + colors = { + self.bbox_labels[i]: color for i, color in enumerate(colors) + } + + self.colors = colors + self.fill = fill + self.width = width + self.font = font + self.font_size = font_size + self.draw_labels = draw_labels + self.alpha = alpha + + def prepare( + self, inputs: Packet[Tensor], labels: Labels + ) -> tuple[Tensor, Tensor, list[Tensor], list[Tensor]]: + # Override the prepare base method + target_bboxes = self.get_label(labels, TaskType.BOUNDINGBOX) + target_masks = self.get_label(labels, TaskType.INSTANCE_SEGMENTATION) + predicted_bboxes = self.get_input_tensors(inputs, TaskType.BOUNDINGBOX) + predicted_masks = self.get_input_tensors( + inputs, TaskType.INSTANCE_SEGMENTATION + ) + + return target_bboxes, target_masks, predicted_bboxes, predicted_masks + + def draw_predictions( + self, + canvas: Tensor, + pred_bboxes: list[Tensor], + pred_masks: list[Tensor], + width: int | None, + label_dict: dict[int, str], + color_dict: dict[str, Color], + draw_labels: bool, + alpha: float, + ) -> Tensor: + viz = torch.zeros_like(canvas) + + for i in range(len(canvas)): + viz[i] = canvas[i].clone() + image_bboxes = pred_bboxes[i] + image_masks = pred_masks[i] + prediction_classes = image_bboxes[..., 5].int() + + cls_labels = ( + [label_dict[int(c)] for c in prediction_classes] + if draw_labels and label_dict is not None + else None + ) + cls_colors = ( + [color_dict[label_dict[int(c)]] for c in prediction_classes] + if color_dict is not None and label_dict is not None + else None + ) + + *_, H, W = canvas.shape + width = width or max(1, int(min(H, W) / 100)) + + try: + viz[i] = draw_segmentation_labels( + viz[i], + image_masks, + colors=cls_colors, + alpha=alpha, + ).to(canvas.device) + + viz[i] = draw_bounding_boxes( + viz[i], + image_bboxes[:, :4], + width=width, + labels=cls_labels, + colors=cls_colors, + ).to(canvas.device) + except ValueError as e: + logger.warning( + f"Failed to draw bounding boxes or masks: {e}. Skipping visualization."
+ ) + viz[i] = canvas[i] + + return viz + + @staticmethod + def draw_targets( + canvas: Tensor, + target_bboxes: Tensor, + target_masks: Tensor, + width: int | None, + label_dict: dict[int, str], + color_dict: dict[str, Color], + draw_labels: bool, + alpha: float, + ) -> Tensor: + viz = torch.zeros_like(canvas) + + for i in range(len(canvas)): + viz[i] = canvas[i].clone() + image_bboxes = target_bboxes[target_bboxes[:, 0] == i] + image_masks = target_masks[target_bboxes[:, 0] == i] + target_classes = image_bboxes[:, 1].int() + + cls_labels = ( + [label_dict[int(c)] for c in target_classes] + if draw_labels and label_dict is not None + else None + ) + cls_colors = ( + [color_dict[label_dict[int(c)]] for c in target_classes] + if color_dict is not None and label_dict is not None + else None + ) + + *_, H, W = canvas.shape + width = width or max(1, int(min(H, W) / 100)) + + viz[i] = draw_segmentation_labels( + viz[i], + image_masks, + alpha=alpha, + colors=cls_colors, + ).to(canvas.device) + viz[i] = draw_bounding_box_labels( + viz[i], + image_bboxes[:, 2:], + width=width, + labels=cls_labels if cls_labels else None, + colors=cls_colors, + ).to(canvas.device) + + return viz + + def forward( + self, + label_canvas: Tensor, + prediction_canvas: Tensor, + target_bboxes: Tensor | None, + target_masks: Tensor | None, + predicted_bboxes: list[Tensor], + predicted_masks: list[Tensor], + ) -> tuple[Tensor, Tensor] | Tensor: + """Creates visualizations of the predicted and target bounding + boxes and instance masks. + + @type label_canvas: Tensor + @param label_canvas: Tensor containing the target + visualizations. + @type prediction_canvas: Tensor + @param prediction_canvas: Tensor containing the predicted + visualizations. + @type target_bboxes: Tensor | None + @param target_bboxes: Tensor containing the target bounding + boxes. + @type target_masks: Tensor | None + @param target_masks: Tensor containing the target instance + masks. + @type predicted_bboxes: list[Tensor] + @param predicted_bboxes: List of tensors containing the + predicted bounding boxes. + @type predicted_masks: list[Tensor] + @param predicted_masks: List of tensors containing the predicted + instance masks. + """ + predictions_viz = self.draw_predictions( + prediction_canvas, + predicted_bboxes, + predicted_masks, + self.width, + self.bbox_labels, + self.colors, + self.draw_labels, + self.alpha, + ) + if target_bboxes is None or target_masks is None: + return predictions_viz + + targets_viz = self.draw_targets( + label_canvas, + target_bboxes, + target_masks, + self.width, + self.bbox_labels, + self.colors, + self.draw_labels, + self.alpha, + ) + return targets_viz, predictions_viz diff --git a/luxonis_train/config/predefined_models/README.md b/luxonis_train/config/predefined_models/README.md index f19a21da..35dfb198 100644 --- a/luxonis_train/config/predefined_models/README.md +++ b/luxonis_train/config/predefined_models/README.md @@ -10,6 +10,7 @@ models which can be used instead. - [`KeypointDetectionModel`](#keypointdetectionmodel) - [`ClassificationModel`](#classificationmodel) - [`FOMOModel`](#fomomodel) +- [`InstanceSegmentationModel`](#instancesegmentationmodel) - [`AnomalyDetectionModel`](#anomalydetectionmodel) **Parameters:** @@ -25,7 +26,7 @@ models which can be used instead. ## `SegmentationModel` -The `SegmentationModel` allows for both `"light"` and `"heavy"` variants, where the `"heavy"` variant is more accurate, and the `"light"` variant is faster. 
+The `SegmentationModel` supports `"light"` and `"heavy"` variants, with `"light"` optimized for speed and `"heavy"` for accuracy. See an example configuration file using this predefined model [here](../../../configs/segmentation_light_model.yaml) for the `"light"` variant, and [here](../../../configs/segmentation_heavy_model.yaml) for the `"heavy"` variant. @@ -66,7 +67,7 @@ FPS (frames per second) for `light` and `heavy` variants on different devices wi ## `DetectionModel` -The `DetectionModel` allows for `"light"`, `"medium"` and `"heavy"` variants, where the `"heavy"` variant is more accurate, and the `"light"` variant is faster. +The `DetectionModel` supports `"light"`, `"medium"`, and `"heavy"` variants, with `"light"` optimized for speed, `"heavy"` for accuracy, and `"medium"` offering a balance between the two. See an example configuration file using this predefined model [here](../../../configs/detection_light_model.yaml) for the `"light"` variant, and [here](../../../configs/detection_heavy_model.yaml) for the `"heavy"` variant. @@ -116,7 +117,7 @@ FPS (frames per second) for `light`, `medium` and `heavy` variants on different ## `KeypointDetectionModel` -The `KeypointDetectionModel` allows for `"light"`, `"medium"` and `"heavy"` variants, where the `"heavy"` variant is more accurate, and the `"light"` variant is faster. +The `KeypointDetectionModel` supports `"light"`, `"medium"`, and `"heavy"` variants, with `"light"` optimized for speed, `"heavy"` for accuracy, and `"medium"` offering a balance between the two. See an example configuration file using this predefined model [here](../../../configs/keypoint_bbox_light_model.yaml) for the `"light"` variant, and [here](../../../configs/keypoint_bbox_heavy_model.yaml) for the `"heavy"` variant. @@ -161,7 +162,7 @@ FPS (frames per second) for `light`, `medium` and `heavy` variants on different ## `ClassificationModel` -The `ClassificationModel` allows for both `"light"` and `"heavy"` variants, where the `"heavy"` variant is more accurate, and the `"light"` variant is faster. Can be used for multi-class and multi-label tasks. +The `ClassificationModel` supports `"light"` and `"heavy"` variants, with `"light"` optimized for speed and `"heavy"` for accuracy. See an example configuration file using this predefined model [here](../../../configs/classification_light_model.yaml) for the `"light"` variant, and [here](../../../configs/classification_heavy_model.yaml) for the `"heavy"` variant. @@ -200,7 +201,7 @@ FPS (frames per second) for `light` and `heavy` variants on different devices wi ## `FOMOModel` -The `FOMOModel` allows for both `"light"` and `"heavy"` variants, where the `"heavy"` variant is more accurate, and the `"light"` variant is faster. +The `FOMOModel` supports `"light"` and `"heavy"` variants, with `"light"` optimized for speed and `"heavy"` for accuracy. There is a trade-off in this simple model: training with a larger `object_weight` in the loss parameters may result in more false positives (FP), but it will improve accuracy. You can also use `use_nms: True` in the `head_params` to enable NMS which can reduce FP, but it will also reduce TP for close neighbors. @@ -240,9 +241,50 @@ For larger heatmaps and improved accuracy, you can adjust the `attach_index` in | `visualizer_params` | `dict` | `{}` | Additional parameters for the visualizer. | | `task_name` | `str \| None` | `None` | Custom task name for the model head. 
| +## `InstanceSegmentationModel` + +The `InstanceSegmentationModel` supports `"light"`, `"medium"`, and `"heavy"` variants, with `"light"` optimized for speed, `"heavy"` for accuracy, and `"medium"` offering a balance between the two. + +See an example configuration file using this predefined model [here](../../../configs/instance_segmentation_light_model.yaml) for the `"light"` variant, and [here](../../../configs/instance_segmentation_heavy_model.yaml) for the `"heavy"` variant. + +### Performance Metrics + +FPS (frames per second) for `light`, `medium` and `heavy` variants on different devices with image size 384x512: + +| Variant | RVC2 FPS | RVC4 FPS | +| ------------ | -------- | -------- | +| **`light`** | 15 | 131 | +| **`medium`** | 9 | 116 | +| **`heavy`** | 3 | 82 | + +**Components:** + +| Name | Alias | Function | +| --------------------------------------------------------------------------------------------------------------- | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- | +| [`EfficientRep`](../../nodes/README.md#efficientrep) | `"instance_segmentation_backbone"` | Backbone of the model. Available variants: `"light"` (`EfficientRep-N`), `"medium"` (`EfficientRep-S`), and `"heavy"` (`EfficientRep-L`) | +| [`RepPANNeck`](../../nodes/README.md#reppanneck) | `"instance_segmentation_neck"` | Neck of the model | +| [`PrecisionSegmentBBoxHead`](../../nodes/README.md#precisionsegmentbboxhead) | `"instance_segmentation_head"` | Head of the model for instance segmentation | +| [`PrecisionDFLSegmentationLoss`](../../attached_modules/losses/README.md#precisiondflsegmentationloss) | `"instance_segmentation_loss"` | Loss function for training instance segmentation models | +| [`MeanAveragePrecision`](../../attached_modules/metrics/README.md#meanaverageprecision) | `"instance_segmentation_map"` | Main metric of the model, measuring mean average precision | +| [`InstanceSegmentationVisualizer`](../../attached_modules/visualizers/README.md#instancesegmentationvisualizer) | `"instance_segmentation_visualizer"` | Visualizer for displaying instance segmentation results | + +**Parameters:** + +| Key | Type | Default value | Description | +| ------------------- | ------------------------------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------ | +| `variant` | `Literal["light", "medium", "heavy"]` | `"light"` | Defines the variant of the model. `"light"` uses `EfficientRep-N`, `"medium"` uses `EfficientRep-S`, `"heavy"` uses `EfficientRep-L` | +| `use_neck` | `bool` | `True` | Whether to include the neck in the model | +| `backbone` | `str` | `"EfficientRep"` | Name of the node to be used as a backbone | +| `backbone_params` | `dict` | `{}` | Additional parameters to the backbone | +| `neck_params` | `dict` | `{}` | Additional parameters to the neck | +| `head_params` | `dict` | `{}` | Additional parameters to the head | +| `loss_params` | `dict` | `{}` | Additional parameters to the loss function | +| `visualizer_params` | `dict` | `{}` | Additional parameters to the visualizer | +| `task_name` | `str \| None` | `None` | Custom task name for the head | + ## `AnomalyDetectionModel` -The `AnomalyDetectionModel` allows for both `"light"` and `"heavy"` variants, where the `"heavy"` variant is more accurate, and the `"light"` variant is faster. 
+The `AnomalyDetectionModel` supports `"light"` and `"heavy"` variants, with `"light"` optimized for speed and `"heavy"` for accuracy. ### Performance Metrics diff --git a/luxonis_train/config/predefined_models/__init__.py b/luxonis_train/config/predefined_models/__init__.py index a52db8bb..7bec15b0 100644 --- a/luxonis_train/config/predefined_models/__init__.py +++ b/luxonis_train/config/predefined_models/__init__.py @@ -3,6 +3,7 @@ from .classification_model import ClassificationModel from .detection_fomo_model import FOMOModel from .detection_model import DetectionModel +from .instance_segmentation_model import InstanceSegmentationModel from .keypoint_detection_model import KeypointDetectionModel from .segmentation_model import SegmentationModel @@ -14,4 +15,5 @@ "SegmentationModel", "AnomalyDetectionModel", "FOMOModel", + "InstanceSegmentationModel", ] diff --git a/luxonis_train/config/predefined_models/instance_segmentation_model.py b/luxonis_train/config/predefined_models/instance_segmentation_model.py new file mode 100644 index 00000000..25490590 --- /dev/null +++ b/luxonis_train/config/predefined_models/instance_segmentation_model.py @@ -0,0 +1,165 @@ +from typing import Literal, TypeAlias + +from pydantic import BaseModel + +from luxonis_train.config import ( + AttachedModuleConfig, + LossModuleConfig, + MetricModuleConfig, + ModelNodeConfig, + Params, +) + +from .base_predefined_model import BasePredefinedModel + +VariantLiteral: TypeAlias = Literal["light", "medium", "heavy"] + + +class InstanceSegmentationVariant(BaseModel): + backbone: str + backbone_params: Params + neck_params: Params + + +def get_variant(variant: VariantLiteral) -> InstanceSegmentationVariant: + """Returns the specific variant configuration for the + InstanceSegmentationModel.""" + variants = { + "light": InstanceSegmentationVariant( + backbone="EfficientRep", + backbone_params={"variant": "n"}, + neck_params={"variant": "n"}, + ), + "medium": InstanceSegmentationVariant( + backbone="EfficientRep", + backbone_params={"variant": "s"}, + neck_params={"variant": "s"}, + ), + "heavy": InstanceSegmentationVariant( + backbone="EfficientRep", + backbone_params={"variant": "l"}, + neck_params={"variant": "l"}, + ), + } + + if variant not in variants: + raise ValueError( + f"Instance segmentation variant should be one of {list(variants.keys())}, got '{variant}'." 
+ ) + + return variants[variant] + + +class InstanceSegmentationModel(BasePredefinedModel): + def __init__( + self, + variant: VariantLiteral = "light", + use_neck: bool = True, + backbone: str | None = None, + backbone_params: Params | None = None, + neck_params: Params | None = None, + head_params: Params | None = None, + loss_params: Params | None = None, + visualizer_params: Params | None = None, + task_name: str = "", + enable_confusion_matrix: bool = True, + confusion_matrix_params: Params | None = None, + ): + var_config = get_variant(variant) + + self.use_neck = use_neck + self.backbone_params = ( + backbone_params + if backbone is not None or backbone_params is not None + else var_config.backbone_params + ) or {} + self.backbone = backbone or var_config.backbone + self.neck_params = neck_params or var_config.neck_params + self.head_params = head_params or {} + self.loss_params = loss_params or {} + self.visualizer_params = visualizer_params or {} + self.task_name = task_name + self.enable_confusion_matrix = enable_confusion_matrix + self.confusion_matrix_params = confusion_matrix_params or {} + + @property + def nodes(self) -> list[ModelNodeConfig]: + """Defines the model nodes, including backbone, neck, and + head.""" + nodes = [ + ModelNodeConfig( + name=self.backbone, + alias=f"{self.task_name}/{self.backbone}", + freezing=self.backbone_params.pop("freezing", {}), + params=self.backbone_params, + ), + ] + if self.use_neck: + nodes.append( + ModelNodeConfig( + name="RepPANNeck", + alias=f"{self.task_name}/RepPANNeck", + inputs=[f"{self.task_name}/{self.backbone}"], + freezing=self.neck_params.pop("freezing", {}), + params=self.neck_params, + ) + ) + + nodes.append( + ModelNodeConfig( + name="PrecisionSegmentBBoxHead", + alias=f"{self.task_name}/PrecisionSegmentBBoxHead", + freezing=self.head_params.pop("freezing", {}), + inputs=[f"{self.task_name}/RepPANNeck"] + if self.use_neck + else [f"{self.backbone}-{self.task_name}"], + params=self.head_params, + ) + ) + return nodes + + @property + def losses(self) -> list[LossModuleConfig]: + """Defines the loss module for the instance segmentation + task.""" + return [ + LossModuleConfig( + name="PrecisionDFLSegmentationLoss", + attached_to=f"{self.task_name}/PrecisionSegmentBBoxHead", + params=self.loss_params, + weight=1.0, + ) + ] + + @property + def metrics(self) -> list[MetricModuleConfig]: + """Defines the metrics used for evaluation.""" + metrics = [ + MetricModuleConfig( + name="MeanAveragePrecision", + attached_to=f"{self.task_name}/PrecisionSegmentBBoxHead", + is_main_metric=True, + ), + ] + if self.enable_confusion_matrix: + metrics.append( + MetricModuleConfig( + name="ConfusionMatrix", + alias=f"{self.task_name}/ConfusionMatrix", + attached_to=f"{self.task_name}/PrecisionSegmentBBoxHead", + params={**self.confusion_matrix_params}, + ) + ) + return metrics + + @property + def visualizers(self) -> list[AttachedModuleConfig]: + """Defines the visualizer used for the instance segmentation + task.""" + return [ + AttachedModuleConfig( + name="InstanceSegmentationVisualizer", + attached_to=f"{self.task_name}/PrecisionSegmentBBoxHead", + params=self.visualizer_params, + ) + ] diff --git a/luxonis_train/loaders/base_loader.py b/luxonis_train/loaders/base_loader.py index 752a15d2..db97ac00 100644 --- a/luxonis_train/loaders/base_loader.py +++ b/luxonis_train/loaders/base_loader.py @@ -75,7 +75,7 @@ class ConfigItem: @type image_source: str @param image_source: Name of the image source. 
Only relevant for datasets with multiple image sources, e.g. C{"left"} and C{"right"}. This parameter defines which of these sources is used for - visualizations. + visualizations. @type keep_aspect_ratio: bool @param keep_aspect_ratio: Whether to keep the aspect ratio of the output image after resizing. @@ -226,7 +226,7 @@ def get(self, idx: int) -> tuple[Tensor | dict[str, Tensor], Labels]: @type idx: int @param idx: Sample index. @rtype: L{LuxonisLoaderTorchOutput} - @return: Sample's data in L{LuxonisLoaderTorchOutput} format + @return: Sample's data in L{LuxonisLoaderTorchOutput} format. """ ... diff --git a/luxonis_train/loaders/utils.py b/luxonis_train/loaders/utils.py index 9c9e1d45..aed4df94 100644 --- a/luxonis_train/loaders/utils.py +++ b/luxonis_train/loaders/utils.py @@ -44,8 +44,10 @@ def collate_fn( new_ann[:, 1:] = ann label_box.append(new_ann) out_labels[task] = torch.cat(label_box, 0) + elif task_type == "instance_segmentation": - out_labels[task] = torch.cat(annos, 0) + masks = [label[task] for label in labels] + out_labels[task] = torch.cat(masks, 0) else: out_labels[task] = torch.stack(annos, 0) diff --git a/luxonis_train/models/luxonis_lightning.py b/luxonis_train/models/luxonis_lightning.py index d2d3fe87..17aea732 100644 --- a/luxonis_train/models/luxonis_lightning.py +++ b/luxonis_train/models/luxonis_lightning.py @@ -594,9 +594,17 @@ def export_onnx(self, save_path: str, **kwargs) -> list[str]: idx += 1 else: output_names = [] + running_i = {} # for case where export_output_names should be used but output node's output is split into multiple subnodes for node_name, output_name, i in output_order: if node_name in export_output_names_dict: - output_names.append(export_output_names_dict[node_name][i]) + running_i[node_name] = ( + running_i.get(node_name, -1) + 1 + ) # if not present default to 0 otherwise add 1 + output_names.append( + export_output_names_dict[node_name][ + running_i[node_name] + ] + ) else: output_names.append(f"{node_name}/{output_name}/{i}") diff --git a/luxonis_train/nodes/README.md b/luxonis_train/nodes/README.md index 7e2540ee..ab139d04 100644 --- a/luxonis_train/nodes/README.md +++ b/luxonis_train/nodes/README.md @@ -29,6 +29,8 @@ arbitrarily as long as the two nodes are compatible with each other. We've group - [`DDRNetSegmentationHead`](#ddrnetsegmentationhead) - [`DiscSubNetHead`](#discsubnet) - [`FOMOHead`](#fomohead) + - [`PrecisionBBoxHead`](#precisionbboxhead) + - [`PrecisionSegmentBBoxHead`](#precisionsegmentbboxhead) Every node takes these parameters: | Key | Type | Default value | Description | @@ -239,7 +241,7 @@ Adapted from [here](https://arxiv.org/pdf/2209.02976.pdf). | Key | Type | Default value | Description | | -------------------- | ------- | ------------- | --------------------------------------------------------------------- | -| `n_heads` | `bool` | `3` | Number of output heads | +| `n_heads` | `int` | `3` | Number of output heads | | `conf_thres` | `float` | `0.25` | Confidence threshold for non-maxima-suppression (used for evaluation) | | `iou_thres` | `float` | `0.45` | `IoU` threshold for non-maxima-suppression (used for evaluation) | | `max_det` | `int` | `300` | Maximum number of detections retained after NMS | @@ -290,3 +292,33 @@ Adapted from [here](https://arxiv.org/abs/2108.07610). | `num_conv_layers` | `int` | `3` | Number of convolutional layers to use in the model. | | `conv_channels` | `int` | `16` | Number of output channels for each convolutional layer. 
| | `use_nms` | `bool` | `False` | If True, enable NMS. This can reduce FP, but it will also reduce TP for close neighbors. | + +## `PrecisionBBoxHead` + +Adapted from [here](https://arxiv.org/pdf/2207.02696.pdf) and [here](https://arxiv.org/pdf/2209.02976.pdf). + +**Parameters:** + +| Key | Type | Default value | Description | +| ------------ | ------- | ------------- | ------------------------------------------------------------------------- | +| `reg_max` | `int` | `16` | Maximum number of regression channels | +| `n_heads` | `int` | `3` | Number of output heads | +| `conf_thres` | `float` | `0.25` | Confidence threshold for non-maxima-suppression (used for evaluation) | +| `iou_thres` | `float` | `0.45` | IoU threshold for non-maxima-suppression (used for evaluation) | +| `max_det` | `int` | `300` | Max number of detections for non-maxima-suppression (used for evaluation) | + +## `PrecisionSegmentBBoxHead` + +Adapted from [here](https://arxiv.org/pdf/2207.02696.pdf) and [here](https://arxiv.org/pdf/2209.02976.pdf). + +**Parameters:** + +| Key | Type | Default value | Description | +| ------------ | ------- | ------------- | -------------------------------------------------------------------------- | +| `reg_max` | `int` | `16` | Maximum number of regression channels. | +| `n_heads` | `int` | `3` | Number of output heads. | +| `conf_thres` | `float` | `0.25` | Confidence threshold for non-maxima-suppression (used for evaluation). | +| `iou_thres` | `float` | `0.45` | IoU threshold for non-maxima-suppression (used for evaluation). | +| `max_det` | `int` | `300` | Max number of detections for non-maxima-suppression (used for evaluation). | +| `n_masks` | `int` | `32` | Number of output instance segmentation masks. | +| `n_proto` | `int` | `256` | Number of intermediate channels in the prototype generator. | diff --git a/luxonis_train/nodes/backbones/ddrnet/blocks.py b/luxonis_train/nodes/backbones/ddrnet/blocks.py index ce78503c..910969f2 100644 --- a/luxonis_train/nodes/backbones/ddrnet/blocks.py +++ b/luxonis_train/nodes/backbones/ddrnet/blocks.py @@ -135,9 +135,8 @@ def __init__( @type inter_mode: str @param inter_mode: Interpolation mode for upscaling. Defaults to "bilinear". - - @raises ValueError: If the lengths of `kernel_sizes` and `strides` - are not the same. + @raises ValueError: If the lengths of C{kernel_sizes} and + C{strides} are not the same. """ super().__init__() diff --git a/luxonis_train/nodes/blocks/__init__.py b/luxonis_train/nodes/blocks/__init__.py index ce0181c9..71228fbd 100644 --- a/luxonis_train/nodes/blocks/__init__.py +++ b/luxonis_train/nodes/blocks/__init__.py @@ -1,4 +1,5 @@ from .blocks import ( + DFL, AttentionRefinmentBlock, BasicResNetBlock, BlockRepeater, @@ -6,9 +7,11 @@ ConvModule, CSPStackRepBlock, DropPath, + DWConvModule, EfficientDecoupledBlock, FeatureFusionBlock, RepVGGBlock, + SegProto, SpatialPyramidPoolingBlock, SqueezeExciteBlock, UpBlock, @@ -32,4 +35,7 @@ "Bottleneck", "UpscaleOnline", "DropPath", + "SegProto", + "DWConvModule", + "DFL", ] diff --git a/luxonis_train/nodes/blocks/blocks.py b/luxonis_train/nodes/blocks/blocks.py index b32c1292..fa9912a8 100644 --- a/luxonis_train/nodes/blocks/blocks.py +++ b/luxonis_train/nodes/blocks/blocks.py @@ -81,6 +81,85 @@ def _initialize_weights_and_biases(self, prior_prob: float) -> None: module.weight = nn.Parameter(w, requires_grad=True) + +class SegProto(nn.Module): + def __init__(self, in_ch, mid_ch=256, out_ch=32): + """Initializes the segmentation prototype generator.
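+ Used by C{PrecisionSegmentBBoxHead} to turn its first (largest) input feature map into C{out_ch} prototype masks, which are later combined with per-detection mask coefficients (see C{refine_and_apply_masks}).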
+ + @type in_ch: int + @param in_ch: Number of input channels. + @type mid_ch: int + @param mid_ch: Number of intermediate channels. Defaults to 256. + @type out_ch: int + @param out_ch: Number of output channels. Defaults to 32. + """ + super().__init__() + self.conv1 = ConvModule( + in_channels=in_ch, + out_channels=mid_ch, + kernel_size=3, + stride=1, + padding=1, + activation=nn.SiLU(), + ) + self.upsample = nn.ConvTranspose2d( + in_channels=mid_ch, + out_channels=mid_ch, + kernel_size=2, + stride=2, + bias=True, + ) + self.conv2 = ConvModule( + in_channels=mid_ch, + out_channels=mid_ch, + kernel_size=3, + stride=1, + padding=1, + activation=nn.SiLU(), + ) + self.conv3 = ConvModule( + in_channels=mid_ch, + out_channels=out_ch, + kernel_size=1, + stride=1, + padding=0, + activation=nn.SiLU(), + ) + + def forward(self, x): + """Defines the forward pass of the segmentation prototype + generator. + + @type x: torch.Tensor + @param x: Input tensor. + @rtype: torch.Tensor + @return: Processed tensor. + """ + return self.conv3(self.conv2(self.upsample(self.conv1(x)))) + + +class DFL(nn.Module): + def __init__(self, reg_max: int = 16): + """The DFL (Distribution Focal Loss) module processes input + tensors by applying softmax over a specified dimension and + projecting the resulting tensor to produce output logits. + + @type reg_max: int + @param reg_max: Maximum number of regression outputs. Defaults + to 16. + """ + super().__init__() + self.proj_conv = nn.Conv2d(reg_max, 1, kernel_size=1, bias=False) + self.proj_conv.weight.data.copy_( + torch.arange(reg_max, dtype=torch.float32).view(1, reg_max, 1, 1) + ) + self.proj_conv.requires_grad_(False) + + def forward(self, x: Tensor) -> Tensor: + bs, _, h, w = x.size() + x = F.softmax(x.view(bs, 4, -1, h * w).permute(0, 2, 1, 3), dim=1) + return self.proj_conv(x)[:, 0].view(bs, 4, h, w) + + class ConvModule(nn.Sequential): def __init__( self, @@ -134,6 +213,51 @@ def __init__( ) +class DWConvModule(ConvModule): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + padding: int = 0, + dilation: int = 1, + bias: bool = False, + activation: nn.Module | None = None, + ): + """Depth-wise Conv2d + BN + Activation. + + @type in_channels: int + @param in_channels: Number of input channels. + @type out_channels: int + @param out_channels: Number of output channels. + @type kernel_size: int + @param kernel_size: Kernel size. + @type stride: int + @param stride: Stride. Defaults to 1. + @type padding: int + @param padding: Padding. Defaults to 0. + @type dilation: int + @param dilation: Dilation. Defaults to 1. + @type bias: bool + @param bias: Whether to use bias. Defaults to False. + @type activation: L{nn.Module} | None + @param activation: Activation function. If None then nn.Relu. 
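+ @note: The convolution is made depth-wise by setting C{groups} to C{math.gcd(in_channels, out_channels)} in the underlying C{ConvModule}.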
+ """ + + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=math.gcd(in_channels, out_channels), + bias=bias, + activation=activation, + ) + + class UpBlock(nn.Sequential): def __init__( self, diff --git a/luxonis_train/nodes/heads/__init__.py b/luxonis_train/nodes/heads/__init__.py index e5abd973..6ebcf816 100644 --- a/luxonis_train/nodes/heads/__init__.py +++ b/luxonis_train/nodes/heads/__init__.py @@ -6,6 +6,8 @@ from .efficient_bbox_head import EfficientBBoxHead from .efficient_keypoint_bbox_head import EfficientKeypointBBoxHead from .fomo_head import FOMOHead +from .precision_bbox_head import PrecisionBBoxHead +from .precision_seg_bbox_head import PrecisionSegmentBBoxHead from .segmentation_head import SegmentationHead __all__ = [ @@ -18,4 +20,6 @@ "DDRNetSegmentationHead", "DiscSubNetHead", "FOMOHead", + "PrecisionBBoxHead", + "PrecisionSegmentBBoxHead", ] diff --git a/luxonis_train/nodes/heads/efficient_bbox_head.py b/luxonis_train/nodes/heads/efficient_bbox_head.py index 95ebe1be..ea3ad988 100644 --- a/luxonis_train/nodes/heads/efficient_bbox_head.py +++ b/luxonis_train/nodes/heads/efficient_bbox_head.py @@ -82,27 +82,13 @@ def __init__( in_channels=self.in_channels[i], ) self.heads.append(curr_head) - if ( - self.export_output_names is None - or len(self.export_output_names) != self.n_heads - ): - if ( - self.export_output_names is not None - and len(self.export_output_names) != self.n_heads - ): - logger.warning( - f"Number of provided output names ({len(self.export_output_names)}) " - f"does not match number of heads ({self.n_heads}). " - f"Using default names." - ) - self._export_output_names = [ - f"output{i+1}_yolov6r2" for i in range(self.n_heads) - ] if initialize_weights: self.initialize_weights() - if download_weights: + if ( + download_weights and self.name == "EfficientBBoxHead" + ): # skip download on classes that inherit this one weights_path = self.get_variant_weights(initialize_weights) if weights_path: self.load_checkpoint(path=weights_path, strict=False) @@ -111,6 +97,8 @@ def __init__( f"No checkpoint available for {self.name}, skipping." ) + self.check_export_output_names() + def initialize_weights(self) -> None: for m in self.modules(): if isinstance(m, nn.Conv2d): @@ -142,6 +130,24 @@ def get_variant_weights(self, initialize_weights: bool) -> str | None: else: return None + def check_export_output_names(self): + if ( + self.export_output_names is None + or len(self.export_output_names) != self.n_heads + ): + if ( + self.export_output_names is not None + and len(self.export_output_names) != self.n_heads + ): + logger.warning( + f"Number of provided output names ({len(self.export_output_names)}) " + f"does not match number of heads ({self.n_heads}). " + f"Using default names." 
+ ) + self._export_output_names = [ + f"output{i + 1}_yolov6r2" for i in range(self.n_heads) + ] + def forward( self, inputs: list[Tensor] ) -> tuple[list[Tensor], list[Tensor], list[Tensor]]: diff --git a/luxonis_train/nodes/heads/efficient_keypoint_bbox_head.py b/luxonis_train/nodes/heads/efficient_keypoint_bbox_head.py index b027b939..f9506547 100644 --- a/luxonis_train/nodes/heads/efficient_keypoint_bbox_head.py +++ b/luxonis_train/nodes/heads/efficient_keypoint_bbox_head.py @@ -1,3 +1,4 @@ +import logging from typing import Any, Literal import torch @@ -14,6 +15,8 @@ from .efficient_bbox_head import EfficientBBoxHead +logger = logging.getLogger(__name__) + class EfficientKeypointBBoxHead(EfficientBBoxHead): tasks: list[TaskType] = [TaskType.KEYPOINTS, TaskType.BOUNDINGBOX] @@ -67,6 +70,28 @@ def __init__( self._export_output_names = None + self.check_export_output_names() + + def check_export_output_names(self): + if ( + self.export_output_names is None + or len(self.export_output_names) != self.n_heads + ): + if ( + self.export_output_names is not None + and len(self.export_output_names) != self.n_heads + ): + logger.warning( + f"Number of provided output names ({len(self.export_output_names)}) " + f"does not match number of heads ({self.n_heads}). " + f"Using default names." + ) + self._export_output_names = [ + f"output{i + 1}_yolov6" for i in range(self.n_heads) + ] + [ + f"kpt_output{i + 1}" for i in range(self.n_heads) + ] # export names are applied on sorter output names + def forward( self, inputs: list[Tensor] ) -> tuple[list[Tensor], list[Tensor], list[Tensor], list[Tensor]]: @@ -75,7 +100,7 @@ def forward( ( _, self.anchor_points, - _, + self.n_anchors_list, self.stride_tensor, ) = anchors_for_fpn_features( features, @@ -99,22 +124,18 @@ def wrap( features, cls_score_list, reg_distri_list, kpt_list = output bs = features[0].shape[0] if self.export: - outputs: list[Tensor] = [] - for out_cls, out_reg, out_kpts in zip( - cls_score_list, reg_distri_list, kpt_list, strict=True + det_outputs: list[Tensor] = [] + kpt_outputs: list[Tensor] = [] + for i, (out_cls, out_reg, out_kpt) in enumerate( + zip(cls_score_list, reg_distri_list, kpt_list, strict=True) ): - chunks = torch.split(out_kpts, 3, dim=1) - modified_chunks: list[Tensor] = [] - for chunk in chunks: - x = chunk[:, 0:1, :, :] - y = chunk[:, 1:2, :, :] - v = torch.sigmoid(chunk[:, 2:3, :, :]) - modified_chunk = torch.cat([x, y, v], dim=1) - modified_chunks.append(modified_chunk) - out_kpts_modified = torch.cat(modified_chunks, dim=1) - out = torch.cat([out_reg, out_cls, out_kpts_modified], dim=1) - outputs.append(out) - return {"outputs": outputs} + conf, _ = out_cls.max(1, keepdim=True) + out = torch.cat([out_reg, conf, out_cls], dim=1) + det_outputs.append(out) + kpt_outputs.append( + self._dist2kpts(out_kpt.view(bs, self.nk, -1), bs, i) + ) + return {"boundingbox": det_outputs, "keypoints": kpt_outputs} cls_tensor = torch.cat( [cls_score_list[i].flatten(2) for i in range(len(cls_score_list))], @@ -142,8 +163,13 @@ def wrap( "distributions": [reg_tensor], "keypoints_raw": [kpt_tensor], } - - pred_kpt = self._dist2kpts(kpt_tensor) + pred_kpt = torch.cat( + [ + self._dist2kpts(kpt_list[i].view(bs, self.nk, -1), bs, i) + for i in range(len(kpt_list)) + ], + dim=2, + ).permute(0, 2, 1) detections = self._process_to_bbox_and_kps( (features, cls_tensor, reg_tensor, pred_kpt) ) @@ -159,26 +185,18 @@ def wrap( "keypoints_raw": [kpt_tensor], } - def _dist2kpts(self, kpts: Tensor) -> Tensor: + def _dist2kpts(self, kpts: Tensor, 
batch_size: int, index: int) -> Tensor: """Decodes keypoints.""" - y = kpts.clone() - - anchor_points_transposed = self.anchor_points.transpose(0, 1) - stride_tensor = self.stride_tensor.squeeze(-1) - - stride_tensor = stride_tensor.view(1, -1, 1) - anchor_points_x = anchor_points_transposed[0].view(1, -1, 1) - anchor_points_y = anchor_points_transposed[1].view(1, -1, 1) - - y[:, :, 0::3] = ( - y[:, :, 0::3] * 2.0 + (anchor_points_x - 0.5) - ) * stride_tensor - y[:, :, 1::3] = ( - y[:, :, 1::3] * 2.0 + (anchor_points_y - 0.5) - ) * stride_tensor - y[:, :, 2::3] = y[:, :, 2::3].sigmoid() - - return y + anchors = self.anchor_points.split(self.n_anchors_list, dim=0) + kpt_predictions = kpts.view(batch_size, self.n_keypoints, 3, -1) + grid_coords = ( + kpt_predictions[:, :, :2] * 2.0 + + (anchors[index].transpose(1, 0) - 0.5) + ) * self.stride[index] + decoded_kpts = torch.cat( + (grid_coords, kpt_predictions[:, :, 2:3].sigmoid()), 2 + ) + return decoded_kpts.view(batch_size, self.nk, -1) def _process_to_bbox_and_kps( self, output: tuple[list[Tensor], Tensor, Tensor, Tensor] diff --git a/luxonis_train/nodes/heads/precision_bbox_head.py b/luxonis_train/nodes/heads/precision_bbox_head.py new file mode 100644 index 00000000..e42189db --- /dev/null +++ b/luxonis_train/nodes/heads/precision_bbox_head.py @@ -0,0 +1,314 @@ +import logging +import math +from typing import Any, Literal + +import torch +from torch import Tensor, nn + +from luxonis_train.enums import TaskType +from luxonis_train.nodes.blocks import DFL, ConvModule, DWConvModule +from luxonis_train.nodes.heads import BaseHead +from luxonis_train.utils import ( + Packet, + anchors_for_fpn_features, + dist2bbox, + non_max_suppression, +) + +logger = logging.getLogger(__name__) + + +class PrecisionBBoxHead(BaseHead[list[Tensor], list[Tensor]]): + in_channels: list[int] + tasks: list[TaskType] = [TaskType.BOUNDINGBOX] + parser = "YOLO" + + def __init__( + self, + reg_max: int = 16, + n_heads: Literal[2, 3, 4] = 3, + conf_thres: float = 0.25, + iou_thres: float = 0.45, + max_det: int = 300, + **kwargs: Any, + ): + """ + Adapted from U{Real-Time Flying Object Detection with YOLOv8 + } and from U{YOLOv6: A Single-Stage Object Detection Framework + for Industrial Applications + }. + + @type reg_max: int + @param reg_max: Maximum number of regression channels. + @type n_heads: Literal[2, 3, 4] + @param n_heads: Number of output heads. + @type conf_thres: float + @param conf_thres: Confidence threshold for NMS. + @type iou_thres: float + @param iou_thres: IoU threshold for NMS. + @type max_det: int + @param max_det: Maximum number of detections retained after NMS.
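+ @note: Each head predicts C{4 * reg_max} box-regression logits per cell (a distribution over C{reg_max} bins for each box side, decoded by the C{DFL} module) together with C{n_classes} class logits.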
+ """ + super().__init__(**kwargs) + self.reg_max = reg_max + self.no = self.n_classes + reg_max * 4 + self.n_heads = n_heads + self.conf_thres = conf_thres + self.iou_thres = iou_thres + self.grid_cell_offset = 0.5 + self.grid_cell_size = 5.0 + self.max_det = max_det + + reg_channels = max((16, self.in_channels[0] // 4, reg_max * 4)) + cls_channels = max(self.in_channels[0], min(self.n_classes, 100)) + + self.detection_heads = nn.ModuleList( + nn.Sequential( + # Regression branch + nn.Sequential( + ConvModule( + x, + reg_channels, + kernel_size=3, + padding=1, + activation=nn.SiLU(), + ), + ConvModule( + reg_channels, + reg_channels, + kernel_size=3, + padding=1, + activation=nn.SiLU(), + ), + nn.Conv2d(reg_channels, 4 * self.reg_max, kernel_size=1), + ), + # Classification branch + nn.Sequential( + nn.Sequential( + DWConvModule( + x, + x, + kernel_size=3, + padding=1, + activation=nn.SiLU(), + ), + ConvModule( + x, + cls_channels, + kernel_size=1, + activation=nn.SiLU(), + ), + ), + nn.Sequential( + DWConvModule( + cls_channels, + cls_channels, + kernel_size=3, + padding=1, + activation=nn.SiLU(), + ), + ConvModule( + cls_channels, + cls_channels, + kernel_size=1, + activation=nn.SiLU(), + ), + ), + nn.Conv2d(cls_channels, self.n_classes, kernel_size=1), + ), + ) + for x in self.in_channels + ) + + self.stride = self._fit_stride_to_n_heads() + self.dfl = DFL(reg_max) if reg_max > 1 else nn.Identity() + self.bias_init() + self.initialize_weights() + + self.check_export_output_names() + + def check_export_output_names(self): + if ( + self.export_output_names is None + or len(self.export_output_names) != self.n_heads + ): + if ( + self.export_output_names is not None + and len(self.export_output_names) != self.n_heads + ): + logger.warning( + f"Number of provided output names ({len(self.export_output_names)}) " + f"does not match number of heads ({self.n_heads}). " + f"Using default names." 
+ ) + self._export_output_names = [ + f"output{i + 1}_yolov8" for i in range(self.n_heads) + ] + + def forward(self, x: list[Tensor]) -> tuple[list[Tensor], list[Tensor]]: + cls_outputs = [] + reg_outputs = [] + for i in range(self.n_heads): + reg_output = self.detection_heads[i][0](x[i]) # type: ignore + cls_output = self.detection_heads[i][1](x[i]) # type: ignore + reg_outputs.append(reg_output) + cls_outputs.append(cls_output) + return reg_outputs, cls_outputs + + def wrap( + self, output: tuple[list[Tensor], list[Tensor]] + ) -> Packet[Tensor]: + reg_outputs, cls_outputs = ( + output # ([bs, 4*reg_max, h_f, w_f]), ([bs, n_classes, h_f, w_f]) + ) + features = [ + torch.cat((reg, cls), dim=1) + for reg, cls in zip(reg_outputs, cls_outputs) + ] + if self.training: + return { + "features": features, + } + + if self.export: + return { + "boundingbox": self._prepare_bbox_export( + reg_outputs, cls_outputs + ) + } + + boxes = non_max_suppression( + self._prepare_bbox_inference_output(reg_outputs, cls_outputs), + n_classes=self.n_classes, + conf_thres=self.conf_thres, + iou_thres=self.iou_thres, + bbox_format="xyxy", + max_det=self.max_det, + predicts_objectness=False, + ) + + return { + "features": features, + "boundingbox": boxes, + } + + def _fit_stride_to_n_heads(self): + """Returns correct stride for number of heads and attach + index.""" + stride = torch.tensor( + [ + self.original_in_shape[1] / x[2] # type: ignore + for x in self.in_sizes[: self.n_heads] + ], + dtype=torch.int, + ) + return stride + + def _prepare_bbox_and_cls( + self, reg_outputs: list[Tensor], cls_outputs: list[Tensor] + ) -> list[Tensor]: + """Extract classification and bounding box tensors.""" + output = [] + for i in range(self.n_heads): + box = self.dfl(reg_outputs[i]) + cls = cls_outputs[i].sigmoid() + conf = cls.max(1, keepdim=True)[0] + output.append( + torch.cat([box, conf, cls], dim=1) + ) # [bs, 4 + 1 + n_classes, h_f, w_f] + return output + + def _prepare_bbox_export( + self, reg_outputs: list[Tensor], cls_outputs: list[Tensor] + ) -> list[Tensor]: + """Prepare the output for export.""" + return self._prepare_bbox_and_cls(reg_outputs, cls_outputs) + + def _prepare_bbox_inference_output( + self, reg_outputs: list[Tensor], cls_outputs: list[Tensor] + ) -> Tensor: + """Perform inference on predicted bounding boxes and class + probabilities.""" + processed_outputs = self._prepare_bbox_and_cls( + reg_outputs, cls_outputs + ) + box_dists = [] + class_probs = [] + for feature in processed_outputs: + bs, _, h, w = feature.size() + reshaped = feature.view(bs, -1, h * w) + box_dist = reshaped[:, :4, :] + cls = reshaped[:, 5:, :] + box_dists.append(box_dist) + class_probs.append(cls) + + box_dists = torch.cat(box_dists, dim=2) + class_probs = torch.cat(class_probs, dim=2) + + _, anchor_points, _, strides = anchors_for_fpn_features( + processed_outputs, self.stride, 0.5 + ) + + pred_bboxes = dist2bbox( + box_dists, anchor_points.transpose(0, 1), out_format="xyxy", dim=1 + ) * strides.transpose(0, 1) + + base_output = [ + pred_bboxes.permute(0, 2, 1), # [BS, H*W, 4] + torch.ones( + (box_dists.shape[0], pred_bboxes.shape[2], 1), + dtype=pred_bboxes.dtype, + device=pred_bboxes.device, + ), + class_probs.permute(0, 2, 1), # [BS, H*W, n_classes] + ] + + output_merged = torch.cat( + base_output, dim=-1 + ) # [BS, H*W, 4 + 1 + n_classes] + return output_merged + + def bias_init(self): + """Initialize biases for the detection heads. + + Assumes detection_heads structure with separate regression and + classification branches. 
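+ Regression biases are set to C{1.0}; classification biases are set to C{log(5 / n_classes / (H / stride) ** 2)}, a YOLOv8-style prior that roughly corresponds to expecting around 5 objects per image at initialization.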
+ """ + for head, stride in zip(self.detection_heads, self.stride): + reg_branch = head[0] # type: ignore + cls_branch = head[1] # type: ignore + + reg_conv = reg_branch[-1] + reg_conv.bias.data[:] = 1.0 + + cls_conv = cls_branch[-1] + cls_conv.bias.data[: self.n_classes] = math.log( + 5 / self.n_classes / (self.original_in_shape[1] / stride) ** 2 + ) + + def initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + pass + elif isinstance(m, nn.BatchNorm2d): + m.eps = 0.001 + m.momentum = 0.03 + elif isinstance( + m, (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU) + ): + m.inplace = True + + def get_custom_head_config(self) -> dict: + """Returns custom head configuration. + + @rtype: dict + @return: Custom head configuration. + """ + return { + "subtype": "yolov8", + "iou_threshold": self.iou_thres, + "conf_threshold": self.conf_thres, + "max_det": self.max_det, + } diff --git a/luxonis_train/nodes/heads/precision_seg_bbox_head.py b/luxonis_train/nodes/heads/precision_seg_bbox_head.py new file mode 100644 index 00000000..3518b46f --- /dev/null +++ b/luxonis_train/nodes/heads/precision_seg_bbox_head.py @@ -0,0 +1,247 @@ +import logging +from typing import Any, Literal + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from luxonis_train.enums import TaskType +from luxonis_train.nodes.blocks import ConvModule, SegProto +from luxonis_train.utils import ( + Packet, + apply_bounding_box_to_masks, + non_max_suppression, +) + +from .precision_bbox_head import PrecisionBBoxHead + +logger = logging.getLogger(__name__) + + +class PrecisionSegmentBBoxHead(PrecisionBBoxHead): + tasks: list[TaskType] = [ + TaskType.INSTANCE_SEGMENTATION, + TaskType.BOUNDINGBOX, + ] + parser: str = "YOLOExtendedParser" + + def __init__( + self, + n_heads: Literal[2, 3, 4] = 3, + n_masks: int = 32, + n_proto: int = 256, + conf_thres: float = 0.25, + iou_thres: float = 0.45, + max_det: int = 300, + **kwargs: Any, + ): + """ + Head for instance segmentation and object detection. + Adapted from U{Real-Time Flying Object Detection with YOLOv8 + } and from U{YOLOv6: A Single-Stage Object Detection Framework + for Industrial Applications + }. + + @type n_heads: Literal[2, 3, 4] + @param n_heads: Number of output heads. Defaults to 3. + @type n_masks: int + @param n_masks: Number of masks. + @type n_proto: int + @param n_proto: Number of prototypes for segmentation. + @type conf_thres: flaot + @param conf_thres: Confidence threshold for NMS. + @type iou_thres: float + @param iou_thres: IoU threshold for NMS. + @type max_det: int + @param max_det: Maximum number of detections retained after NMS. 
+ """ + super().__init__( + n_heads=n_heads, + conf_thres=conf_thres, + iou_thres=iou_thres, + max_det=max_det, + **kwargs, + ) + + self.n_masks = n_masks + mid_ch = max(self.in_channels[0] // 4, self.n_masks) + self.mask_layers = nn.ModuleList( + nn.Sequential( + ConvModule(x, mid_ch, 3, 1, 1, activation=nn.SiLU()), + ConvModule(mid_ch, mid_ch, 3, 1, 1, activation=nn.SiLU()), + nn.Conv2d(mid_ch, self.n_masks, 1, 1), + ) + for x in self.in_channels + ) + + self.n_proto = n_proto + self.proto = SegProto(self.in_channels[0], self.n_proto, self.n_masks) + + self.check_export_output_names() + + def check_export_output_names(self): + if ( + self.export_output_names is None + or len(self.export_output_names) != self.n_heads + ): + if ( + self.export_output_names is not None + and len(self.export_output_names) != self.n_heads + ): + logger.warning( + f"Number of provided output names ({len(self.export_output_names)}) " + f"does not match number of heads ({self.n_heads}). " + f"Using default names." + ) + self._export_output_names = ( + [f"output{i + 1}_yolov8" for i in range(self.n_heads)] + + [f"output{i + 1}_masks" for i in range(self.n_heads)] + + ["protos_output"] + ) # export names are applied on sorter output names + + def forward( + self, inputs: list[Tensor] + ) -> tuple[tuple[list[Tensor], list[Tensor]], Tensor, list[Tensor]]: + prototypes = self.proto(inputs[0]) + mask_coefficients = [ + self.mask_layers[i](inputs[i]) for i in range(self.n_heads) + ] + + det_outs = super().forward(inputs) + + return det_outs, prototypes, mask_coefficients + + def wrap( + self, + output: tuple[tuple[list[Tensor], list[Tensor]], Tensor, list[Tensor]], + ) -> Packet[Tensor]: + det_feats, prototypes, mask_coefficients = output + + if self.export: + pred_bboxes = self._prepare_bbox_export(*det_feats) # type: ignore + return { + "boundingbox": pred_bboxes, + "masks": mask_coefficients, + "prototypes": [prototypes], + } + + det_feats_combined = [ + torch.cat((reg, cls), dim=1) for reg, cls in zip(*det_feats) + ] + mask_coefficients = torch.cat( + [ + coef.view(coef.size(0), self.n_masks, -1) + for coef in mask_coefficients + ], + dim=2, + ) + + if self.training: + return { + "features": det_feats_combined, + "prototypes": [prototypes], + "mask_coeficients": [mask_coefficients], + } + + pred_bboxes = self._prepare_bbox_inference_output(*det_feats) # type: ignore + preds_combined = torch.cat( + [pred_bboxes, mask_coefficients.permute(0, 2, 1)], dim=-1 + ) + preds = non_max_suppression( + preds_combined, + n_classes=self.n_classes, + conf_thres=self.conf_thres, + iou_thres=self.iou_thres, + bbox_format="xyxy", + max_det=self.max_det, + predicts_objectness=False, + ) + + results = { + "features": det_feats_combined, + "prototypes": [prototypes], + "mask_coeficients": [mask_coefficients], + "boundingbox": [], + "instance_segmentation": [], + } + + for i, pred in enumerate(preds): + results["instance_segmentation"].append( + refine_and_apply_masks( + prototypes[i], + pred[:, 6:], + pred[:, :4], + self.original_in_shape[-2:], + upsample=True, + ) + ) + results["boundingbox"].append(pred[:, :6]) + + return results + + def get_custom_head_config(self) -> dict: + """Returns custom head configuration. + + @rtype: dict + @return: Custom head configuration. 
+ """ + return { + "subtype": "yolov8", + "iou_threshold": self.iou_thres, + "conf_threshold": self.conf_thres, + "max_det": self.max_det, + } + + +def refine_and_apply_masks( + mask_prototypes, + predicted_masks, + bounding_boxes, + target_shape, + upsample=False, +): + """Refine and apply masks to bounding boxes based on the mask head + outputs. + + @type mask_prototypes: torch.Tensor + @param mask_prototypes: Tensor of shape [mask_dim, mask_height, + mask_width]. + @type predicted_masks: torch.Tensor + @param predicted_masks: Tensor of shape [num_masks, mask_dim], where + num_masks is the number of detected masks. + @type bounding_boxes: torch.Tensor + @param bounding_boxes: Tensor of shape [num_masks, 4], containing + bounding box coordinates. + @type target_shape: tuple + @param target_shape: Tuple (height, width) representing the + dimensions of the original image. + @type upsample: bool + @param upsample: If True, upsample the masks to the target image + dimensions. Default is False. + @rtype: torch.Tensor + @return: A binary mask tensor of shape [num_masks, height, width], + where the masks are cropped according to their respective + bounding boxes. + """ + if predicted_masks.size(0) == 0 or bounding_boxes.size(0) == 0: + img_h, img_w = target_shape + return torch.zeros(0, img_h, img_w, dtype=torch.uint8) + + channels, proto_h, proto_w = mask_prototypes.shape + img_h, img_w = target_shape + masks_combined = ( + predicted_masks @ mask_prototypes.float().view(channels, -1) + ).view(-1, proto_h, proto_w) + w_scale, h_scale = proto_w / img_w, proto_h / img_h + scaled_boxes = bounding_boxes.clone() + scaled_boxes[:, [0, 2]] *= w_scale + scaled_boxes[:, [1, 3]] *= h_scale + cropped_masks = apply_bounding_box_to_masks(masks_combined, scaled_boxes) + if upsample: + cropped_masks = F.interpolate( + cropped_masks.unsqueeze(0), + size=target_shape, + mode="bilinear", + align_corners=False, + ).squeeze(0) + return (cropped_masks > 0).to(cropped_masks.dtype) diff --git a/luxonis_train/utils/__init__.py b/luxonis_train/utils/__init__.py index 2f2b550a..8e12a214 100644 --- a/luxonis_train/utils/__init__.py +++ b/luxonis_train/utils/__init__.py @@ -1,5 +1,6 @@ from .boundingbox import ( anchors_for_fpn_features, + apply_bounding_box_to_masks, bbox2dist, bbox_iou, compute_iou_loss, @@ -44,4 +45,5 @@ "traverse_graph", "insert_class", "get_attribute_check_none", + "apply_bounding_box_to_masks", ] diff --git a/luxonis_train/utils/boundingbox.py b/luxonis_train/utils/boundingbox.py index e72360c3..ff2af2cf 100644 --- a/luxonis_train/utils/boundingbox.py +++ b/luxonis_train/utils/boundingbox.py @@ -19,6 +19,7 @@ def dist2bbox( distance: Tensor, anchor_points: Tensor, out_format: BBoxFormatType = "xyxy", + dim: int = -1, ) -> Tensor: """Transform distance (ltrb) to box ("xyxy", "xywh" or "cxcywh"). @@ -29,12 +30,14 @@ def dist2bbox( @type out_format: BBoxFormatType @param out_format: BBox output format. Defaults to "xyxy". @rtype: Tensor + @param dim: Dimension to split distance tensor. Defaults to -1. 
+ @rtype: Tensor @return: BBoxes in correct format """ - lt, rb = torch.split(distance, 2, -1) + lt, rb = torch.split(distance, 2, dim=dim) x1y1 = anchor_points - lt x2y2 = anchor_points + rb - bbox = torch.cat([x1y1, x2y2], -1) + bbox = torch.cat([x1y1, x2y2], dim=dim) if out_format in ["xyxy", "xywh", "cxcywh"]: bbox = box_convert(bbox, in_fmt="xyxy", out_fmt=out_format) else: @@ -401,6 +404,39 @@ def anchors_for_fpn_features( ) +def apply_bounding_box_to_masks( + masks: Tensor, bounding_boxes: Tensor +) -> Tensor: + """Crops the given masks to the regions specified by the + corresponding bounding boxes. + + @type masks: Tensor + @param masks: Masks tensor of shape [n, h, w]. + @type bounding_boxes: Tensor + @param bounding_boxes: Bounding boxes tensor of shape [n, 4]. + @rtype: Tensor + @return: Cropped masks tensor of shape [n, h, w]. + """ + _, mask_height, mask_width = masks.shape + left, top, right, bottom = torch.split( + bounding_boxes[:, :, None], 1, dim=1 + ) + width_indices = torch.arange( + mask_width, device=masks.device, dtype=left.dtype + )[None, None, :] + height_indices = torch.arange( + mask_height, device=masks.device, dtype=left.dtype + )[None, :, None] + + cropped_masks = masks * ( + (width_indices >= left) + & (width_indices < right) + & (height_indices >= top) + & (height_indices < bottom) + ) + return cropped_masks + + def compute_iou_loss( pred_bboxes: Tensor, target_bboxes: Tensor, diff --git a/tests/integration/parking_lot.json b/tests/integration/parking_lot.json index bf3e3835..b9dde963 100644 --- a/tests/integration/parking_lot.json +++ b/tests/integration/parking_lot.json @@ -11,24 +11,11 @@ "name": "image", "dtype": "float32", "input_type": "image", - "shape": [ - 1, - 3, - 256, - 320 - ], + "shape": [1, 3, 256, 320], "layout": "NCHW", "preprocessing": { - "mean": [ - 123.675, - 116.28, - 103.53 - ], - "scale": [ - 58.395, - 57.12, - 57.375 - ], + "mean": [123.675, 116.28, 103.53], + "scale": [58.395, 57.12, 57.375], "reverse_channels": null, "interleaved_to_planar": null, "dai_type": "RGB888p" @@ -39,90 +26,68 @@ { "name": "BiSeNetHead/brand/segmentation/0", "dtype": "float32", - "shape": [ - 1, - 23, - 256, - 320 - ], + "shape": [1, 23, 256, 320], "layout": "NCHW" }, { - "name": "EfficientKeypointBBoxHead/outputs/0", + "name": "SegmentationHead/color/segmentation/0", "dtype": "float32", - "shape": [ - 1, - 14, - 32, - 40 - ], + "shape": [1, 4, 256, 320], "layout": "NCHW" }, { - "name": "EfficientKeypointBBoxHead/outputs/1", + "name": "output1_yolov6r2", "dtype": "float32", - "shape": [ - 1, - 14, - 16, - 20 - ], + "shape": [1, 8, 32, 40], "layout": "NCHW" }, { - "name": "EfficientKeypointBBoxHead/outputs/2", + "name": "output2_yolov6r2", "dtype": "float32", - "shape": [ - 1, - 14, - 8, - 10 - ], + "shape": [1, 8, 16, 20], + "layout": "NCHW" + }, + { + "name": "output3_yolov6r2", + "dtype": "float32", + "shape": [1, 8, 8, 10], "layout": "NCDE" }, { - "name": "SegmentationHead/color/segmentation/0", + "name": "output1_yolov6", "dtype": "float32", - "shape": [ - 1, - 4, - 256, - 320 - ], + "shape": [1, 6, 32, 40], "layout": "NCHW" }, { - "name": "output1_yolov6r2", + "name": "output2_yolov6", "dtype": "float32", - "shape": [ - 1, - 8, - 32, - 40 - ], + "shape": [1, 6, 16, 20], "layout": "NCHW" }, { - "name": "output2_yolov6r2", + "name": "output3_yolov6", "dtype": "float32", - "shape": [ - 1, - 8, - 16, - 20 - ], + "shape": [1, 6, 8, 10], "layout": "NCHW" }, { - "name": "output3_yolov6r2", + "name": "kpt_output1", "dtype": "float32", - "shape": [ - 1, - 8, - 
8, - 10 - ], - "layout": "NCDE" + "shape": [1, 9, 1280], + "layout": "NCD" + }, + { + "name": "kpt_output2", + "dtype": "float32", + "shape": [1, 9, 320], + "layout": "NCD" + }, + { + "name": "kpt_output3", + "dtype": "float32", + "shape": [1, 9, 80], + "layout": "NCD" } ], "heads": [ @@ -132,47 +97,23 @@ "metadata": { "postprocessor_path": null, "classes": [ - "background", - "alfa-romeo", - "buick", - "ducati", - "harley", - "ferrari", - "infiniti", - "jeep", - "land-rover", - "roll-royce", - "yamaha", - "aprilia", - "bmw", - "dodge", - "honda", - "moto", - "piaggio", - "isuzu", - "Kawasaki", - "truimph", - "pontiac", - "saab", - "chrysler" + "background", "alfa-romeo", "buick", "ducati", "harley", + "ferrari", "infiniti", "jeep", "land-rover", "roll-royce", + "yamaha", "aprilia", "bmw", "dodge", "honda", "moto", + "piaggio", "isuzu", "Kawasaki", "truimph", "pontiac", + "saab", "chrysler" ], "n_classes": 23, "is_softmax": false }, - "outputs": [ - "BiSeNetHead/brand/segmentation/0" - ] + "outputs": ["BiSeNetHead/brand/segmentation/0"] }, { "name": "BiSeNetHead_0", "parser": "SegmentationParser", "metadata": { "postprocessor_path": null, - "classes": [ - "motorbike", - "car", - "background" - ], + "classes": ["motorbike", "car", "background"], "n_classes": 3, "is_softmax": false }, @@ -183,11 +124,7 @@ "parser": "YOLO", "metadata": { "postprocessor_path": null, - "classes": [ - "motorbike", - "car", - "background" - ], + "classes": ["motorbike", "car", "background"], "n_classes": 3, "iou_threshold": 0.45, "conf_threshold": 0.25, @@ -206,9 +143,7 @@ "parser": "YOLOExtendedParser", "metadata": { "postprocessor_path": null, - "classes": [ - "motorbike" - ], + "classes": ["motorbike"], "n_classes": 1, "iou_threshold": 0.45, "conf_threshold": 0.25, @@ -218,9 +153,12 @@ "n_keypoints": 3 }, "outputs": [ - "EfficientKeypointBBoxHead/outputs/0", - "EfficientKeypointBBoxHead/outputs/1", - "EfficientKeypointBBoxHead/outputs/2" + "output1_yolov6", + "output2_yolov6", + "output3_yolov6", + "kpt_output1", + "kpt_output2", + "kpt_output3" ] }, { @@ -228,19 +166,12 @@ "parser": "SegmentationParser", "metadata": { "postprocessor_path": null, - "classes": [ - "background", - "blue", - "green", - "red" - ], + "classes": ["background", "blue", "green", "red"], "n_classes": 4, "is_softmax": false }, - "outputs": [ - "SegmentationHead/color/segmentation/0" - ] + "outputs": ["SegmentationHead/color/segmentation/0"] } ] } -} +} \ No newline at end of file diff --git a/tests/integration/test_detection.py b/tests/integration/test_detection.py index 6360ec79..24e12e2b 100644 --- a/tests/integration/test_detection.py +++ b/tests/integration/test_detection.py @@ -110,6 +110,7 @@ def test_backbones( ): opts = get_opts_backbone(backbone) opts["loader.params.dataset_name"] = parking_lot_dataset.identifier + opts["trainer.epochs"] = 1 train_and_test(config, opts) @@ -121,4 +122,5 @@ def test_variants( ): opts = get_opts_variant(variant) opts["loader.params.dataset_name"] = parking_lot_dataset.identifier + opts["trainer.epochs"] = 1 train_and_test(config, opts) diff --git a/tests/integration/test_simple.py b/tests/integration/test_simple.py index 72331fcc..cef74ba3 100644 --- a/tests/integration/test_simple.py +++ b/tests/integration/test_simple.py @@ -58,6 +58,8 @@ def clear_files(): "detection_light_model", "keypoint_bbox_heavy_model", "keypoint_bbox_light_model", + "instance_segmentation_heavy_model", + "instance_segmentation_light_model", ], ) def test_predefined_models( diff --git 
a/tests/unittests/test_assigners/test_tal_assigner.py b/tests/unittests/test_assigners/test_tal_assigner.py index cb94b62d..ab64302b 100644 --- a/tests/unittests/test_assigners/test_tal_assigner.py +++ b/tests/unittests/test_assigners/test_tal_assigner.py @@ -34,6 +34,12 @@ def test_forward(): pred_scores, pred_bboxes, anchor_points, gt_labels, gt_bboxes, mask_gt ) + labels = torch.where( + mask, + labels, + torch.full_like(labels, n_classes), + ) + assert labels.shape == (batch_size, n_anchors) assert bboxes.shape == (batch_size, n_anchors, 4) assert scores.shape == (