Switch to SlimSAM
HonzaCuhel committed Oct 24, 2024
1 parent 7879220 commit 4fae718
Showing 13 changed files with 73 additions and 149 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -157,13 +157,13 @@ datadreamer --config <path-to-config>

### 🔧 Additional Parameters

-- `--task`: Choose between detection, classification and instance segmentation. Default is `detection`.
+- `--task`: Choose between detection, classification, instance segmentation and semantic segmentation. Default is `detection`.
- `--dataset_format`: Format of the dataset. Defaults to `raw`. Supported values: `raw`, `yolo`, `coco`, `luxonis-dataset`, `cls-single`.
- `--split_ratios`: Split ratios for train, validation, and test sets. Defaults to `[0.8, 0.1, 0.1]`.
- `--num_objects_range`: Range of objects in a prompt. Default is 1 to 3.
- `--prompt_generator`: Choose between `simple`, `lm` (Mistral-7B), `tiny` (tiny LM), and `qwen2` (Qwen2.5 LM). Default is `qwen2`.
- `--image_generator`: Choose image generator, e.g., `sdxl`, `sdxl-turbo` or `sdxl-lightning`. Default is `sdxl-turbo`.
-- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection or `clip` for image classification or `owlv2-fastsam` for instance segmentation. Default is `owlv2`.
+- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection or `clip` for image classification or `owlv2-slimsam` for instance segmentation. Default is `owlv2`.
- `--conf_threshold`: Confidence threshold for annotation. Default is `0.15`.
- `--annotation_iou_threshold`: Intersection over Union (IoU) threshold for annotation. Default is `0.2`.
- `--prompt_prefix`: Prefix to add to every image generation prompt. Default is `""`.
@@ -199,7 +199,7 @@ datadreamer --config <path-to-config>
| | [SDXL-Lightning](https://huggingface.co/ByteDance/SDXL-Lightning) | Fast and accurate (1024x1024 images) |
| Image Annotation | [OWLv2](https://huggingface.co/google/owlv2-base-patch16-ensemble) | Open-Vocabulary object detector |
| | [CLIP](https://huggingface.co/openai/clip-vit-base-patch32) | Zero-shot-image-classification |
-| | [FastSAM](https://docs.ultralytics.com/models/fast-sam) | Zero-shot-instance-segmentation |
+| | [SlimSAM](https://huggingface.co/Zigeng/SlimSAM-uniform-50) | Zero-shot-instance-segmentation |

<a name="example"></a>

4 changes: 2 additions & 2 deletions datadreamer/dataset_annotation/__init__.py
@@ -1,14 +1,14 @@
from __future__ import annotations

from .clip_annotator import CLIPAnnotator
-from .fastsam_annotator import FastSAMAnnotator
from .image_annotator import BaseAnnotator, TaskList
from .owlv2_annotator import OWLv2Annotator
+from .slimsam_annotator import SlimSAMAnnotator

__all__ = [
"BaseAnnotator",
"TaskList",
"OWLv2Annotator",
"CLIPAnnotator",
"FastSAMAnnotator",
"SlimSAMAnnotator",
]
99 changes: 0 additions & 99 deletions datadreamer/dataset_annotation/fastsam_annotator.py

This file was deleted.
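The replacement module, `slimsam_annotator.py`, is added by this commit but its contents are collapsed in this view. Below is a minimal sketch of what a SlimSAM-based annotator could look like, assuming the Hugging Face `SamModel`/`SamProcessor` interface; the class name, the `annotate_batch` signature, and the `size` argument are taken from the pipeline and tests in this diff, while the size-to-checkpoint mapping is a guess (the README only links `Zigeng/SlimSAM-uniform-50`).

```python
from __future__ import annotations

from typing import List

import numpy as np
import torch
from PIL import Image
from transformers import SamModel, SamProcessor

from datadreamer.dataset_annotation.utils import mask_to_polygon


class SlimSAMAnnotator:
    """Annotates images with instance masks using SlimSAM, a pruned SAM.

    Box prompts are expected to come from an upstream detector
    (OWLv2 in the `owlv2-slimsam` pipeline).
    """

    def __init__(self, device: str = "cuda", size: str = "base") -> None:
        # ASSUMPTION: the size-to-checkpoint mapping below is hypothetical;
        # the README only references the uniform-50 checkpoint.
        checkpoint = (
            "Zigeng/SlimSAM-uniform-50"
            if size == "base"
            else "Zigeng/SlimSAM-uniform-77"
        )
        self.device = device
        self.model = SamModel.from_pretrained(checkpoint).to(device)
        self.processor = SamProcessor.from_pretrained(checkpoint)

    def annotate_batch(
        self,
        images: List[Image.Image],
        boxes_batch: List[np.ndarray],
        iou_threshold: float = 0.2,  # kept for parity with the pipeline call
    ) -> List[List[List[List[int]]]]:
        masks_batch = []
        for image, boxes in zip(images, boxes_batch):
            inputs = self.processor(
                image, input_boxes=[boxes.tolist()], return_tensors="pt"
            ).to(self.device)
            with torch.no_grad():
                outputs = self.model(**inputs, multimask_output=False)
            # Upscale the low-res mask logits back to the original image size
            masks = self.processor.image_processor.post_process_masks(
                outputs.pred_masks.cpu(),
                inputs["original_sizes"].cpu(),
                inputs["reshaped_input_sizes"].cpu(),
            )[0]
            # One binary mask per box prompt -> one polygon per instance
            masks_batch.append(
                [
                    mask_to_polygon(m.squeeze().numpy()) if m.any() else []
                    for m in masks
                ]
            )
        return masks_batch
```

In the pipeline changes below, `owlv2-slimsam` pairs OWLv2 (box proposals) with this class (mask refinement), which is why `annotate_batch` takes `boxes_batch` rather than running its own detection.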

25 changes: 25 additions & 0 deletions datadreamer/dataset_annotation/utils.py
@@ -2,6 +2,8 @@

from typing import List

+import cv2
+import numpy as np
from torchvision import transforms


@@ -32,3 +34,26 @@ def apply_tta(image) -> List[transforms.Compose]:

augmented_images = [t(image) for t in tta_transforms]
return augmented_images


def mask_to_polygon(mask: np.ndarray) -> List[List[int]]:
    """Converts a binary mask to a polygon.

    Args:
        mask: The binary mask to be converted.

    Returns:
        List: A list of vertices of the polygon.
    """
    # Find contours in the binary mask
    contours, _ = cv2.findContours(
        mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    # Find the contour with the largest area
    largest_contour = max(contours, key=cv2.contourArea)

    # Extract the vertices of the contour
    polygon = largest_contour.reshape(-1, 2).tolist()

    return polygon
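For illustration, a small usage sketch with hypothetical values; note the helper assumes the mask has at least one foreground contour, since `max()` over an empty sequence raises:

```python
import numpy as np

from datadreamer.dataset_annotation.utils import mask_to_polygon

# A 100x100 binary mask with a single filled rectangle
mask = np.zeros((100, 100), dtype=np.uint8)
mask[30:80, 20:60] = 1

polygon = mask_to_polygon(mask)
# With cv2.CHAIN_APPROX_SIMPLE the rectangle collapses to its corners,
# e.g. [[20, 30], [20, 79], [59, 79], [59, 30]]
print(polygon)
```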
9 changes: 4 additions & 5 deletions datadreamer/pipelines/generate_dataset_from_scratch.py
@@ -18,8 +18,8 @@

from datadreamer.dataset_annotation import (
CLIPAnnotator,
-    FastSAMAnnotator,
OWLv2Annotator,
+    SlimSAMAnnotator,
)
from datadreamer.image_generation import (
StableDiffusionImageGenerator,
@@ -58,8 +58,8 @@

det_annotators = {"owlv2": OWLv2Annotator}
clf_annotators = {"clip": CLIPAnnotator}
inst_seg_annotators = {"owlv2-fastsam": FastSAMAnnotator}
inst_seg_to_det = {"owlv2-fastsam": OWLv2Annotator}
inst_seg_annotators = {"owlv2-slimsam": SlimSAMAnnotator}
inst_seg_to_det = {"owlv2-slimsam": OWLv2Annotator}

setup_logging(use_rich=True)

@@ -122,7 +122,7 @@ def parse_args():
parser.add_argument(
"--image_annotator",
type=str,
choices=["owlv2", "clip", "owlv2-fastsam"],
choices=["owlv2", "clip", "owlv2-slimsam"],
help="Image annotator to use",
)

@@ -637,7 +637,6 @@ def read_image_batch(image_batch, batch_num, batch_size):
masks_batch = inst_seg_annotator.annotate_batch(
images=images,
boxes_batch=boxes_batch,
-    conf_threshold=args.conf_threshold,
iou_threshold=args.annotation_iou_threshold,
)
segment_list.extend(masks_batch)
2 changes: 1 addition & 1 deletion datadreamer/utils/coco_converter.py
@@ -113,7 +113,7 @@ def process_data(
):
bbox = [box[0], box[1], box[2] - box[0], box[3] - box[1]]
segmentation = (
-    np.array(mask).reshape(-1).tolist()
+    np.array(mask).reshape(1, -1).tolist()
if mask is not None
else None
)
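For context, COCO stores `segmentation` as a list of polygons, each a flat `[x1, y1, x2, y2, ...]` list, which is what `reshape(1, -1)` produces from the `[[x, y], ...]` vertices; a sketch with hypothetical values:

```python
import numpy as np

# Vertices as returned by mask_to_polygon: [[x, y], ...]
mask = [[20, 30], [20, 79], [59, 79], [59, 30]]

# Old behaviour: one flat list -> [20, 30, 20, 79, 59, 79, 59, 30]
flat = np.array(mask).reshape(-1).tolist()

# New behaviour: a list containing one flat polygon, matching COCO's
# "segmentation": [[x1, y1, x2, y2, ...]] convention
coco_segmentation = np.array(mask).reshape(1, -1).tolist()
print(coco_segmentation)  # [[20, 30, 20, 79, 59, 79, 59, 30]]
```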
2 changes: 1 addition & 1 deletion datadreamer/utils/config.py
@@ -39,7 +39,7 @@ class Config(LuxonisConfig):
# Profanity filter arguments
disable_lm_filter: bool = False
# Annotation arguments
-image_annotator: Literal["owlv2", "clip", "owlv2-fastsam"] = "owlv2"
+image_annotator: Literal["owlv2", "clip", "owlv2-slimsam"] = "owlv2"
conf_threshold: float = 0.15
annotation_iou_threshold: float = 0.2
use_tta: bool = False
8 changes: 4 additions & 4 deletions datadreamer/utils/luxonis_dataset_converter.py
@@ -89,10 +89,10 @@ def dataset_generator():
masks = data[image_path]["masks"]
for mask, label in zip(masks, labels):
poly = []
-    for m in mask:
-        poly += [
-            (point[0] / width, point[1] / height) for point in m
-        ]
+    print(mask)
+    poly += [
+        (point[0] / width, point[1] / height) for point in mask
+    ]
yield {
"file": image_full_path,
"annotation": {
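The converter divides each vertex by the image dimensions so the polygon lands in normalized [0, 1] coordinates; with this change a mask is one polygon (a list of `[x, y]` points) rather than a list of polygons. A sketch with hypothetical values:

```python
# Pixel-space vertices (hypothetical) for an image of width 640, height 480
mask = [[20, 30], [20, 79], [59, 79], [59, 30]]
width, height = 640, 480

poly = [(point[0] / width, point[1] / height) for point in mask]
# [(0.03125, 0.0625), (0.03125, 0.164583...), ...]
```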
@@ -298,7 +298,7 @@
" --disable_lm_filter \\\n",
" --annotator_size base \\\n",
" --use_tta \\\n",
" --image_annotator owlv2-fastsam \\\n",
" --image_annotator owlv2-slimsam \\\n",
" --conf_threshold 0.2 \\\n",
" --seed 42"
]
7 changes: 3 additions & 4 deletions requirements.txt
@@ -1,6 +1,6 @@
torch>=2.0.0
torchvision>=0.16.0
-transformers>=4.37.0
+transformers>=4.45.2
diffusers>=0.24.0
compel>=2.0.0
tqdm>=4.0.0
@@ -12,7 +12,6 @@ accelerate>=0.25.0
scipy>=1.10.0
bitsandbytes>=0.42.0
nltk>=3.8.1
-luxonis-ml[all]>=0.3.0
+luxonis-ml[all]>=0.4.0
python-box>=7.1.1
-gcsfs>=2023.1.0
-ultralytics>=8.3.13
+gcsfs>=2023.1.0
4 changes: 2 additions & 2 deletions tests/core_tests/integration/test_pipeline.py
@@ -184,7 +184,7 @@ def test_cpu_simple_sdxl_turbo_config_instance_segmentation_pipeline():
f"datadreamer --task instance-segmentation "
f"--save_dir {target_folder} "
f"--num_objects_range 1 2 "
f"--image_annotator owlv2-fastsam "
f"--image_annotator owlv2-slimsam "
f"--config ./tests/core_tests/integration/sample_config.yaml "
f"--device cpu"
)
@@ -204,7 +204,7 @@ def test_cuda_simple_sdxl_turbo_config_instance_segmentation_pipeline():
f"datadreamer --task instance-segmentation "
f"--save_dir {target_folder} "
f"--num_objects_range 1 2 "
f"--image_annotator owlv2-fastsam "
f"--image_annotator owlv2-slimsam "
f"--config ./tests/core_tests/integration/sample_config.yaml "
f"--device cuda"
)
14 changes: 7 additions & 7 deletions tests/core_tests/unittests/test_annotators.py
@@ -8,7 +8,7 @@
from PIL import Image

from datadreamer.dataset_annotation.clip_annotator import CLIPAnnotator
-from datadreamer.dataset_annotation.fastsam_annotator import FastSAMAnnotator
+from datadreamer.dataset_annotation.slimsam_annotator import SlimSAMAnnotator
from datadreamer.dataset_annotation.owlv2_annotator import OWLv2Annotator

# Get the total disk space in GB
@@ -99,10 +99,10 @@ def test_cpu_clip_large_annotator():
_check_clip_annotator("cpu", size="large")


-def _check_fastsam_annotator(device: str, size: str = "base"):
+def _check_slimsam_annotator(device: str, size: str = "base"):
url = "https://ultralytics.com/images/bus.jpg"
im = Image.open(requests.get(url, stream=True).raw)
-    annotator = FastSAMAnnotator(device=device, size=size)
+    annotator = SlimSAMAnnotator(device=device, size=size)
masks = annotator.annotate_batch([im], [np.array([[3, 229, 559, 650]])])
w, h = im.width, im.height
# Check that the masks are lists
@@ -124,28 +124,28 @@ def _check_fastsam_annotator(device: str, size: str = "base"):
reason="Test requires GPU and 16GB of HDD",
)
def test_cuda_fastsam_base_annotator():
_check_fastsam_annotator("cuda")
_check_slimsam_annotator("cuda")


@pytest.mark.skipif(
total_disk_space < 16,
reason="Test requires at least 16GB of HDD",
)
def test_cpu_fastsam_base_annotator():
_check_fastsam_annotator("cpu")
_check_slimsam_annotator("cpu")


@pytest.mark.skipif(
not torch.cuda.is_available() or total_disk_space < 16,
reason="Test requires GPU and 16GB of HDD",
)
def test_cuda_fastsam_large_annotator():
_check_fastsam_annotator("cuda", size="large")
_check_slimsam_annotator("cuda", size="large")


@pytest.mark.skipif(
total_disk_space < 16,
reason="Test requires at least 16GB of HDD",
)
def test_cpu_fastsam_large_annotator():
_check_fastsam_annotator("cpu", size="large")
_check_slimsam_annotator("cpu", size="large")