Rename 'keep_empty_images' to 'keep_unlabeled_images'

luxonis · Nov 12, 2024 · e6653d9 · e6653d9
1 parent ec6c167
commit e6653d9
Show file tree

Hide file tree

Showing 11 changed files with 47 additions and 37 deletions.
diff --git a/README.md b/README.md
@@ -176,7 +176,7 @@ datadreamer --config <path-to-config>
 - `--lm_quantization`: Quantization to use for Mistral language model. Choose between `none` and `4bit`. Default is `none`.
 - `--annotator_size`: Size of the annotator model to use. Choose between `base` and `large`. Default is `base`.
 - `--disable_lm_filter`: Use only a bad word list for profanity filtering. Default is `False`.
-- `--keep_empty_images`: Whether to keep images without any annotations. Default if `False`.
+- `--keep_unlabeled_images`: Whether to keep images without any annotations. Default is `False`.
 - `--batch_size_prompt`: Batch size for prompt generation. Default is 64.
 - `--batch_size_annotation`: Batch size for annotation. Default is `1`.
 - `--batch_size_image`: Batch size for image generation. Default is `1`.

diff --git a/datadreamer/pipelines/generate_dataset_from_scratch.py b/datadreamer/pipelines/generate_dataset_from_scratch.py
@@ -218,7 +218,7 @@ def parse_args():
     )
 
     parser.add_argument(
-        "--keep_empty_images",
+        "--keep_unlabeled_images",
         default=None,
         action="store_true",
         help="Whether to keep images without any annotations",
@@ -725,7 +725,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
                 args.split_ratios,
                 copy_files=False,
                 is_instance_segmentation=args.task == "instance-segmentation",
-                keep_empty_images=args.keep_empty_images,
+                keep_unlabeled_images=args.keep_unlabeled_images,
                 seed=args.seed,
             )
         # Convert annotations to COCO format
@@ -736,7 +736,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
                 "coco",
                 args.split_ratios,
                 is_instance_segmentation=args.task == "instance-segmentation",
-                keep_empty_images=args.keep_empty_images,
+                keep_unlabeled_images=args.keep_unlabeled_images,
                 copy_files=False,
                 seed=args.seed,
             )
@@ -751,7 +751,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
             dataset_plugin=args.dataset_plugin,
             dataset_name=args.dataset_name,
             is_instance_segmentation=args.task == "instance-segmentation",
-            keep_empty_images=args.keep_empty_images,
+            keep_unlabeled_images=args.keep_unlabeled_images,
             copy_files=False,
             seed=args.seed,
         )

diff --git a/datadreamer/utils/base_converter.py b/datadreamer/utils/base_converter.py
@@ -19,7 +19,7 @@ def convert(
         dataset_dir: str,
         output_dir: str,
         split_ratios: List[float],
-        keep_empty_images: bool = False,
+        keep_unlabeled_images: bool = False,
         copy_files: bool = True,
     ) -> None:
         """Converts a dataset into another format.
@@ -28,7 +28,7 @@ def convert(
             dataset_dir (str): The directory where the source dataset is located.
             output_dir (str): The directory where the processed dataset should be saved.
             split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-            keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.

diff --git a/datadreamer/utils/coco_converter.py b/datadreamer/utils/coco_converter.py
@@ -42,7 +42,7 @@ def convert(
         dataset_dir: str,
         output_dir: str,
         split_ratios: List[float],
-        keep_empty_images: bool = False,
+        keep_unlabeled_images: bool = False,
         copy_files: bool = True,
     ) -> None:
         """Converts a dataset into a COCO format.
@@ -51,15 +51,20 @@ def convert(
             dataset_dir (str): The directory where the source dataset is located.
             output_dir (str): The directory where the processed dataset should be saved.
             split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-            keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
         """
         annotation_path = os.path.join(dataset_dir, "annotations.json")
         data = BaseConverter.read_annotations(annotation_path)
         self.process_data(
-            data, dataset_dir, output_dir, split_ratios, keep_empty_images, copy_files
+            data,
+            dataset_dir,
+            output_dir,
+            split_ratios,
+            keep_unlabeled_images,
+            copy_files,
         )
 
     def process_data(
@@ -68,7 +73,7 @@ def process_data(
         image_dir: str,
         output_dir: str,
         split_ratios: List[float],
-        keep_empty_images: bool = False,
+        keep_unlabeled_images: bool = False,
         copy_files: bool = True,
     ) -> None:
         """Processes the data by dividing it into training and validation sets, and
@@ -79,7 +84,7 @@ def process_data(
             image_dir (str): The directory where the source images are located.
             output_dir (str): The base directory where the processed data will be saved.
             split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-            keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
@@ -88,11 +93,11 @@ def process_data(
         images.remove("class_names")
 
         empty_images = list(filter(lambda x: len(data[x]["labels"]) == 0, images))
-        if keep_empty_images and len(empty_images) > 0:
+        if keep_unlabeled_images and len(empty_images) > 0:
             logger.warning(
                 f"{len(empty_images)} images with no annotations will be included in the dataset."
             )
-        elif not keep_empty_images and len(empty_images) > 0:
+        elif not keep_unlabeled_images and len(empty_images) > 0:
             logger.info(
                 f"{len(empty_images)} images with no annotations will be excluded from the dataset."
             )

diff --git a/datadreamer/utils/config.py b/datadreamer/utils/config.py
@@ -50,4 +50,4 @@ class Config(LuxonisConfig):
     dataset_name: str = ""
     dataset_id: str = ""
     # Dataset arguments
-    keep_empty_images: bool = False
+    keep_unlabeled_images: bool = False
diff --git a/datadreamer/utils/convert_dataset.py b/datadreamer/utils/convert_dataset.py
@@ -19,7 +19,7 @@ def convert_dataset(
     dataset_plugin: Optional[str] = None,
     dataset_name: Optional[str] = None,
     is_instance_segmentation: bool = False,
-    keep_empty_images: bool = False,
+    keep_unlabeled_images: bool = False,
     copy_files: bool = True,
     seed: int = 42,
 ) -> None:
@@ -33,7 +33,7 @@ def convert_dataset(
         dataset_plugin (str, optional): Plugin for Luxonis dataset. Defaults to None.
         dataset_name (str, optional): Name of the Luxonis dataset. Defaults to None.
         is_instance_segmentation (bool, optional): Whether the dataset is for instance segmentation. Defaults to False.
-        keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
+        keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
         copy_files (bool, optional): Whether to copy the files to the output directory. Defaults to True.
         seed (int, optional): Random seed. Defaults to 42.
 
@@ -61,7 +61,7 @@ def convert_dataset(
         raise ValueError(f"Invalid dataset format: {dataset_format}")
 
     converter.convert(
-        input_dir, output_dir, split_ratios, keep_empty_images, copy_files
+        input_dir, output_dir, split_ratios, keep_unlabeled_images, copy_files
     )
 
 
@@ -108,7 +108,7 @@ def main():
         help="Whether the dataset is for instance segmentation.",
     )
     parser.add_argument(
-        "--keep_empty_images",
+        "--keep_unlabeled_images",
         default=None,
         action="store_true",
         help="Whether to keep images without any annotations",
@@ -136,7 +136,7 @@ def main():
         dataset_plugin=args.dataset_plugin,
         dataset_name=args.dataset_name,
         is_instance_segmentation=args.is_instance_segmentation,
-        keep_empty_images=args.keep_empty_images,
+        keep_unlabeled_images=args.keep_unlabeled_images,
         copy_files=args.copy_files,
         seed=args.seed,
     )

diff --git a/datadreamer/utils/luxonis_dataset_converter.py b/datadreamer/utils/luxonis_dataset_converter.py
@@ -38,7 +38,7 @@ def convert(
         dataset_dir: str,
         output_dir: str,
         split_ratios: List[float],
-        keep_empty_images: bool = False,
+        keep_unlabeled_images: bool = False,
         copy_files: bool = True,
     ) -> None:
         """Converts a dataset into a LuxonisDataset format.
@@ -47,15 +47,15 @@ def convert(
             dataset_dir (str): The directory where the source dataset is located.
             output_dir (str): The directory where the processed dataset should be saved.
             split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-            keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
         """
         annotation_path = os.path.join(dataset_dir, "annotations.json")
         data = BaseConverter.read_annotations(annotation_path)
         self.process_data(
-            data, dataset_dir, output_dir, split_ratios, keep_empty_images
+            data, dataset_dir, output_dir, split_ratios, keep_unlabeled_images
         )
 
     def process_data(
@@ -64,7 +64,7 @@ def process_data(
         dataset_dir: str,
         output_dir: str,
         split_ratios: List[float],
-        keep_empty_images: bool = False,
+        keep_unlabeled_images: bool = False,
     ) -> None:
         """Processes the data into LuxonisDataset format.
 
@@ -90,7 +90,7 @@ def dataset_generator():
                 width, height = Image.open(image_full_path).size
                 labels = data[image_path]["labels"]
 
-                if len(labels) == 0 and keep_empty_images:
+                if len(labels) == 0 and keep_unlabeled_images:
                     logger.warning(
                         f"Image {image_path} has no annotations. Training on empty images with `luxonis-train` will result in an error."
                     )
@@ -174,7 +174,7 @@ def dataset_generator():
 
         dataset.add(dataset_generator())
 
-        if not keep_empty_images:
+        if not keep_unlabeled_images:
             n_empty_images = len(
                 list(filter(lambda x: len(data[x]["labels"]) == 0, image_paths))
             )

diff --git a/datadreamer/utils/single_label_cls_converter.py b/datadreamer/utils/single_label_cls_converter.py
@@ -41,7 +41,7 @@ def convert(
         dataset_dir: str,
         output_dir: str,
         split_ratios: List[float],
-        keep_empty_images: bool = False,
+        keep_unlabeled_images: bool = False,
         copy_files: bool = True,
     ) -> None:
         """Converts a dataset into a format suitable for single-label classification.
@@ -50,7 +50,7 @@ def convert(
             dataset_dir (str): The directory where the source dataset is located.
             output_dir (str): The directory where the processed dataset should be saved.
             split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-            keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.

diff --git a/datadreamer/utils/yolo_converter.py b/datadreamer/utils/yolo_converter.py
@@ -43,7 +43,7 @@ def convert(
         dataset_dir: str,
         output_dir: str,
         split_ratios: List[float],
-        keep_empty_images: bool = False,
+        keep_unlabeled_images: bool = False,
         copy_files: bool = True,
     ):
         """Converts a dataset into a format suitable for training with YOLO, including
@@ -53,15 +53,20 @@ def convert(
             dataset_dir (str): The directory where the source dataset is located.
             output_dir (str): The directory where the processed dataset should be saved.
             split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-            keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
         """
         annotation_path = os.path.join(dataset_dir, "annotations.json")
         data = BaseConverter.read_annotations(annotation_path)
         self.process_data(
-            data, dataset_dir, output_dir, split_ratios, keep_empty_images, copy_files
+            data,
+            dataset_dir,
+            output_dir,
+            split_ratios,
+            keep_unlabeled_images,
+            copy_files,
         )
 
     def convert_to_yolo_format(
@@ -104,7 +109,7 @@ def process_data(
         image_dir: str,
         output_dir: str,
         split_ratios: List[float],
-        keep_empty_images: bool = False,
+        keep_unlabeled_images: bool = False,
         copy_files: bool = True,
     ) -> None:
         """Processes the data by dividing it into training and validation sets, and
@@ -115,7 +120,7 @@ def process_data(
             image_dir (str): The directory where the source images are located.
             output_dir (str): The base directory where the processed data will be saved.
             split_ratios (float): The ratio to split the data into training, validation, and test sets.
-            keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
@@ -124,11 +129,11 @@ def process_data(
         images.remove("class_names")
 
         empty_images = list(filter(lambda x: len(data[x]["labels"]) == 0, images))
-        if keep_empty_images and len(empty_images) > 0:
+        if keep_unlabeled_images and len(empty_images) > 0:
             logger.warning(
                 f"{len(empty_images)} images with no annotations will be included in the dataset."
             )
-        elif not keep_empty_images and len(empty_images) > 0:
+        elif not keep_unlabeled_images and len(empty_images) > 0:
             logger.info(
                 f"{len(empty_images)} images with no annotations will be excluded from the dataset."
             )

diff --git a/examples/generate_dataset_and_train_yolo.ipynb b/examples/generate_dataset_and_train_yolo.ipynb
@@ -97,7 +97,7 @@
     "- `--lm_quantization`: Quantization to use for Mistral language model. Choose between `none` and `4bit`. Default is `none`.\n",
     "- `--annotator_size`: Size of the annotator model to use. Choose between `base` and `large`. Default is `base`.\n",
     "- `--disable_lm_filter`: Use only a bad word list for profanity filtering. Default is `False`.\n",
-    "- `--keep_empty_images`: Whether to keep images without any annotations. Default if `False`.\n",
+    "- `--keep_unlabeled_images`: Whether to keep images without any annotations. Default is `False`.\n",
     "- `--batch_size_prompt`: Batch size for prompt generation. Default is 64.\n",
     "- `--batch_size_annotation`: Batch size for annotation. Default is `1`.\n",
     "- `--batch_size_image`: Batch size for image generation. Default is `1`.\n",

diff --git a/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb b/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb
@@ -112,7 +112,7 @@
     "- `--lm_quantization`: Quantization to use for Mistral language model. Choose between `none` and `4bit`. Default is `none`.\n",
     "- `--annotator_size`: Size of the annotator model to use. Choose between `base` and `large`. Default is `base`.\n",
     "- `--disable_lm_filter`: Use only a bad word list for profanity filtering. Default is `False`.\n",
-    "- `--keep_empty_images`: Whether to keep images without any annotations. Default if `False`.\n",
+    "- `--keep_unlabeled_images`: Whether to keep images without any annotations. Default is `False`.\n",
     "- `--batch_size_prompt`: Batch size for prompt generation. Default is 64.\n",
     "- `--batch_size_annotation`: Batch size for annotation. Default is `1`.\n",
     "- `--batch_size_image`: Batch size for image generation. Default is `1`.\n",