From e6653d9e4b7add286d4c0e3210a2cc0d1f29a224 Mon Sep 17 00:00:00 2001
From: HonzaCuhel <jan.cuhel@protonmail.com>
Date: Tue, 12 Nov 2024 10:50:52 +0100
Subject: [PATCH] Rename 'keep_empty_images' to 'keep_unlabeled_images'

---
 README.md                                     |  2 +-
 .../generate_dataset_from_scratch.py          |  8 ++++----
 datadreamer/utils/base_converter.py           |  4 ++--
 datadreamer/utils/coco_converter.py           | 19 ++++++++++++-------
 datadreamer/utils/config.py                   |  2 +-
 datadreamer/utils/convert_dataset.py          | 10 +++++-----
 .../utils/luxonis_dataset_converter.py        | 12 ++++++------
 .../utils/single_label_cls_converter.py       |  4 ++--
 datadreamer/utils/yolo_converter.py           | 19 ++++++++++++-------
 .../generate_dataset_and_train_yolo.ipynb     |  2 +-
 ..._segmentation_dataset_and_train_yolo.ipynb |  2 +-
 11 files changed, 47 insertions(+), 37 deletions(-)
diff --git a/README.md b/README.md
index e34a5e3..17921df 100644
--- a/README.md
+++ b/README.md
@@ -176,7 +176,7 @@ datadreamer --config <path-to-config>
 - `--lm_quantization`: Quantization to use for Mistral language model. Choose between `none` and `4bit`. Default is `none`.
 - `--annotator_size`: Size of the annotator model to use. Choose between `base` and `large`. Default is `base`.
 - `--disable_lm_filter`: Use only a bad word list for profanity filtering. Default is `False`.
-- `--keep_empty_images`: Whether to keep images without any annotations. Default if `False`.
+- `--keep_unlabeled_images`: Whether to keep images without any annotations. Default is `False`.
 - `--batch_size_prompt`: Batch size for prompt generation. Default is 64.
 - `--batch_size_annotation`: Batch size for annotation. Default is `1`.
 - `--batch_size_image`: Batch size for image generation. Default is `1`.
diff --git a/datadreamer/pipelines/generate_dataset_from_scratch.py b/datadreamer/pipelines/generate_dataset_from_scratch.py
index 404999d..4d52acb 100644
--- a/datadreamer/pipelines/generate_dataset_from_scratch.py
+++ b/datadreamer/pipelines/generate_dataset_from_scratch.py
@@ -218,7 +218,7 @@ def parse_args():
     )
 
     parser.add_argument(
-        "--keep_empty_images",
+        "--keep_unlabeled_images",
         default=None,
         action="store_true",
         help="Whether to keep images without any annotations",
@@ -725,7 +725,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
                 args.split_ratios,
                 copy_files=False,
                 is_instance_segmentation=args.task == "instance-segmentation",
-                keep_empty_images=args.keep_empty_images,
+                keep_unlabeled_images=args.keep_unlabeled_images,
                 seed=args.seed,
             )
         # Convert annotations to COCO format
@@ -736,7 +736,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
                 "coco",
                 args.split_ratios,
                 is_instance_segmentation=args.task == "instance-segmentation",
-                keep_empty_images=args.keep_empty_images,
+                keep_unlabeled_images=args.keep_unlabeled_images,
                 copy_files=False,
                 seed=args.seed,
             )
@@ -751,7 +751,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
             dataset_plugin=args.dataset_plugin,
             dataset_name=args.dataset_name,
             is_instance_segmentation=args.task == "instance-segmentation",
-            keep_empty_images=args.keep_empty_images,
+            keep_unlabeled_images=args.keep_unlabeled_images,
             copy_files=False,
             seed=args.seed,
         )
diff --git a/datadreamer/utils/base_converter.py b/datadreamer/utils/base_converter.py
index 1885575..40003ed 100644
--- a/datadreamer/utils/base_converter.py
+++ b/datadreamer/utils/base_converter.py
@@ -19,7 +19,7 @@ def convert(
         dataset_dir: str,
         output_dir: str,
         split_ratios: List[float],
-        keep_empty_images: bool = False,
+        keep_unlabeled_images: bool = False,
         copy_files: bool = True,
     ) -> None:
         """Converts a dataset into another format.
@@ -28,7 +28,7 @@ def convert(
             dataset_dir (str): The directory where the source dataset is located.
             output_dir (str): The directory where the processed dataset should be saved.
             split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-            keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
diff --git a/datadreamer/utils/coco_converter.py b/datadreamer/utils/coco_converter.py
index 1b61b8b..40d599a 100644
--- a/datadreamer/utils/coco_converter.py
+++ b/datadreamer/utils/coco_converter.py
@@ -42,7 +42,7 @@ def convert(
         dataset_dir: str,
         output_dir: str,
         split_ratios: List[float],
-        keep_empty_images: bool = False,
+        keep_unlabeled_images: bool = False,
         copy_files: bool = True,
     ) -> None:
         """Converts a dataset into a COCO format.
@@ -51,7 +51,7 @@ def convert(
             dataset_dir (str): The directory where the source dataset is located.
             output_dir (str): The directory where the processed dataset should be saved.
             split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-            keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
@@ -59,7 +59,12 @@ def convert(
         annotation_path = os.path.join(dataset_dir, "annotations.json")
         data = BaseConverter.read_annotations(annotation_path)
         self.process_data(
-            data, dataset_dir, output_dir, split_ratios, keep_empty_images, copy_files
+            data,
+            dataset_dir,
+            output_dir,
+            split_ratios,
+            keep_unlabeled_images,
+            copy_files,
         )
 
     def process_data(
@@ -68,7 +73,7 @@ def process_data(
         image_dir: str,
         output_dir: str,
         split_ratios: List[float],
-        keep_empty_images: bool = False,
+        keep_unlabeled_images: bool = False,
         copy_files: bool = True,
     ) -> None:
         """Processes the data by dividing it into training and validation sets, and
@@ -79,7 +84,7 @@ def process_data(
             image_dir (str): The directory where the source images are located.
             output_dir (str): The base directory where the processed data will be saved.
             split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-            keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
@@ -88,11 +93,11 @@ def process_data(
         images.remove("class_names")
 
         empty_images = list(filter(lambda x: len(data[x]["labels"]) == 0, images))
-        if keep_empty_images and len(empty_images) > 0:
+        if keep_unlabeled_images and len(empty_images) > 0:
             logger.warning(
                 f"{len(empty_images)} images with no annotations will be included in the dataset."
             )
-        elif not keep_empty_images and len(empty_images) > 0:
+        elif not keep_unlabeled_images and len(empty_images) > 0:
             logger.info(
                 f"{len(empty_images)} images with no annotations will be excluded from the dataset."
             )
diff --git a/datadreamer/utils/config.py b/datadreamer/utils/config.py
index 59f3a9d..6227b61 100644
--- a/datadreamer/utils/config.py
+++ b/datadreamer/utils/config.py
@@ -50,4 +50,4 @@ class Config(LuxonisConfig):
     dataset_name: str = ""
     dataset_id: str = ""
     # Dataset arguments
-    keep_empty_images: bool = False
+    keep_unlabeled_images: bool = False
diff --git a/datadreamer/utils/convert_dataset.py b/datadreamer/utils/convert_dataset.py
index 1154417..1bcea34 100644
--- a/datadreamer/utils/convert_dataset.py
+++ b/datadreamer/utils/convert_dataset.py
@@ -19,7 +19,7 @@ def convert_dataset(
     dataset_plugin: Optional[str] = None,
     dataset_name: Optional[str] = None,
     is_instance_segmentation: bool = False,
-    keep_empty_images: bool = False,
+    keep_unlabeled_images: bool = False,
     copy_files: bool = True,
     seed: int = 42,
 ) -> None:
@@ -33,7 +33,7 @@ def convert_dataset(
         dataset_plugin (str, optional): Plugin for Luxonis dataset. Defaults to None.
         dataset_name (str, optional): Name of the Luxonis dataset. Defaults to None.
         is_instance_segmentation (bool, optional): Whether the dataset is for instance segmentation. Defaults to False.
-        keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
+        keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
         copy_files (bool, optional): Whether to copy the files to the output directory. Defaults to True.
         seed (int, optional): Random seed. Defaults to 42.
 
@@ -61,7 +61,7 @@ def convert_dataset(
         raise ValueError(f"Invalid dataset format: {dataset_format}")
 
     converter.convert(
-        input_dir, output_dir, split_ratios, keep_empty_images, copy_files
+        input_dir, output_dir, split_ratios, keep_unlabeled_images, copy_files
     )
 
 
@@ -108,7 +108,7 @@ def main():
         help="Whether the dataset is for instance segmentation.",
     )
     parser.add_argument(
-        "--keep_empty_images",
+        "--keep_unlabeled_images",
         default=None,
         action="store_true",
         help="Whether to keep images without any annotations",
@@ -136,7 +136,7 @@ def main():
         dataset_plugin=args.dataset_plugin,
         dataset_name=args.dataset_name,
         is_instance_segmentation=args.is_instance_segmentation,
-        keep_empty_images=args.keep_empty_images,
+        keep_unlabeled_images=args.keep_unlabeled_images,
         copy_files=args.copy_files,
         seed=args.seed,
     )
diff --git a/datadreamer/utils/luxonis_dataset_converter.py b/datadreamer/utils/luxonis_dataset_converter.py
index 6f728d0..fec3d36 100644
--- a/datadreamer/utils/luxonis_dataset_converter.py
+++ b/datadreamer/utils/luxonis_dataset_converter.py
@@ -38,7 +38,7 @@ def convert(
         dataset_dir: str,
         output_dir: str,
         split_ratios: List[float],
-        keep_empty_images: bool = False,
+        keep_unlabeled_images: bool = False,
         copy_files: bool = True,
     ) -> None:
         """Converts a dataset into a LuxonisDataset format.
@@ -47,7 +47,7 @@ def convert(
             dataset_dir (str): The directory where the source dataset is located.
             output_dir (str): The directory where the processed dataset should be saved.
             split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-            keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
@@ -55,7 +55,7 @@ def convert(
         annotation_path = os.path.join(dataset_dir, "annotations.json")
         data = BaseConverter.read_annotations(annotation_path)
         self.process_data(
-            data, dataset_dir, output_dir, split_ratios, keep_empty_images
+            data, dataset_dir, output_dir, split_ratios, keep_unlabeled_images
         )
 
     def process_data(
@@ -64,7 +64,7 @@ def process_data(
         dataset_dir: str,
         output_dir: str,
         split_ratios: List[float],
-        keep_empty_images: bool = False,
+        keep_unlabeled_images: bool = False,
     ) -> None:
         """Processes the data into LuxonisDataset format.
 
@@ -90,7 +90,7 @@ def dataset_generator():
                 width, height = Image.open(image_full_path).size
                 labels = data[image_path]["labels"]
 
-                if len(labels) == 0 and keep_empty_images:
+                if len(labels) == 0 and keep_unlabeled_images:
                     logger.warning(
                         f"Image {image_path} has no annotations. Training on empty images with `luxonis-train` will result in an error."
                     )
@@ -174,7 +174,7 @@ def dataset_generator():
 
         dataset.add(dataset_generator())
 
-        if not keep_empty_images:
+        if not keep_unlabeled_images:
             n_empty_images = len(
                 list(filter(lambda x: len(data[x]["labels"]) == 0, image_paths))
             )
diff --git a/datadreamer/utils/single_label_cls_converter.py b/datadreamer/utils/single_label_cls_converter.py
index 66a0817..daa3bd8 100644
--- a/datadreamer/utils/single_label_cls_converter.py
+++ b/datadreamer/utils/single_label_cls_converter.py
@@ -41,7 +41,7 @@ def convert(
         dataset_dir: str,
         output_dir: str,
         split_ratios: List[float],
-        keep_empty_images: bool = False,
+        keep_unlabeled_images: bool = False,
         copy_files: bool = True,
     ) -> None:
         """Converts a dataset into a format suitable for single-label classification.
@@ -50,7 +50,7 @@ def convert(
             dataset_dir (str): The directory where the source dataset is located.
             output_dir (str): The directory where the processed dataset should be saved.
             split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-            keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
diff --git a/datadreamer/utils/yolo_converter.py b/datadreamer/utils/yolo_converter.py
index 2fed968..e4ad15a 100644
--- a/datadreamer/utils/yolo_converter.py
+++ b/datadreamer/utils/yolo_converter.py
@@ -43,7 +43,7 @@ def convert(
         dataset_dir: str,
         output_dir: str,
         split_ratios: List[float],
-        keep_empty_images: bool = False,
+        keep_unlabeled_images: bool = False,
         copy_files: bool = True,
     ):
         """Converts a dataset into a format suitable for training with YOLO, including
@@ -53,7 +53,7 @@ def convert(
             dataset_dir (str): The directory where the source dataset is located.
             output_dir (str): The directory where the processed dataset should be saved.
             split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-            keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
@@ -61,7 +61,12 @@ def convert(
         annotation_path = os.path.join(dataset_dir, "annotations.json")
         data = BaseConverter.read_annotations(annotation_path)
         self.process_data(
-            data, dataset_dir, output_dir, split_ratios, keep_empty_images, copy_files
+            data,
+            dataset_dir,
+            output_dir,
+            split_ratios,
+            keep_unlabeled_images,
+            copy_files,
         )
 
     def convert_to_yolo_format(
@@ -104,7 +109,7 @@ def process_data(
         image_dir: str,
         output_dir: str,
         split_ratios: List[float],
-        keep_empty_images: bool = False,
+        keep_unlabeled_images: bool = False,
         copy_files: bool = True,
     ) -> None:
         """Processes the data by dividing it into training and validation sets, and
@@ -115,7 +120,7 @@ def process_data(
             image_dir (str): The directory where the source images are located.
             output_dir (str): The base directory where the processed data will be saved.
             split_ratios (float): The ratio to split the data into training, validation, and test sets.
-            keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
+            keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
             copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
@@ -124,11 +129,11 @@ def process_data(
         images.remove("class_names")
 
         empty_images = list(filter(lambda x: len(data[x]["labels"]) == 0, images))
-        if keep_empty_images and len(empty_images) > 0:
+        if keep_unlabeled_images and len(empty_images) > 0:
             logger.warning(
                 f"{len(empty_images)} images with no annotations will be included in the dataset."
             )
-        elif not keep_empty_images and len(empty_images) > 0:
+        elif not keep_unlabeled_images and len(empty_images) > 0:
             logger.info(
                 f"{len(empty_images)} images with no annotations will be excluded from the dataset."
             )
diff --git a/examples/generate_dataset_and_train_yolo.ipynb b/examples/generate_dataset_and_train_yolo.ipynb
index 4324214..2a08030 100644
--- a/examples/generate_dataset_and_train_yolo.ipynb
+++ b/examples/generate_dataset_and_train_yolo.ipynb
@@ -97,7 +97,7 @@
     "- `--lm_quantization`: Quantization to use for Mistral language model. Choose between `none` and `4bit`. Default is `none`.\n",
     "- `--annotator_size`: Size of the annotator model to use. Choose between `base` and `large`. Default is `base`.\n",
     "- `--disable_lm_filter`: Use only a bad word list for profanity filtering. Default is `False`.\n",
-    "- `--keep_empty_images`: Whether to keep images without any annotations. Default if `False`.\n",
+    "- `--keep_unlabeled_images`: Whether to keep images without any annotations. Default is `False`.\n",
     "- `--batch_size_prompt`: Batch size for prompt generation. Default is 64.\n",
     "- `--batch_size_annotation`: Batch size for annotation. Default is `1`.\n",
     "- `--batch_size_image`: Batch size for image generation. Default is `1`.\n",
diff --git a/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb b/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb
index 951e9c9..1588001 100644
--- a/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb
+++ b/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb
@@ -112,7 +112,7 @@
     "- `--lm_quantization`: Quantization to use for Mistral language model. Choose between `none` and `4bit`. Default is `none`.\n",
     "- `--annotator_size`: Size of the annotator model to use. Choose between `base` and `large`. Default is `base`.\n",
     "- `--disable_lm_filter`: Use only a bad word list for profanity filtering. Default is `False`.\n",
-    "- `--keep_empty_images`: Whether to keep images without any annotations. Default if `False`.\n",
+    "- `--keep_unlabeled_images`: Whether to keep images without any annotations. Default is `False`.\n",
     "- `--batch_size_prompt`: Batch size for prompt generation. Default is 64.\n",
     "- `--batch_size_annotation`: Batch size for annotation. Default is `1`.\n",
     "- `--batch_size_image`: Batch size for image generation. Default is `1`.\n",