From e6653d9e4b7add286d4c0e3210a2cc0d1f29a224 Mon Sep 17 00:00:00 2001 From: HonzaCuhel Date: Tue, 12 Nov 2024 10:50:52 +0100 Subject: [PATCH] Rename 'keep_empty_images' to 'keep_unlabeled_images' --- README.md | 2 +- .../generate_dataset_from_scratch.py | 8 ++++---- datadreamer/utils/base_converter.py | 4 ++-- datadreamer/utils/coco_converter.py | 19 ++++++++++++------- datadreamer/utils/config.py | 2 +- datadreamer/utils/convert_dataset.py | 10 +++++----- .../utils/luxonis_dataset_converter.py | 12 ++++++------ .../utils/single_label_cls_converter.py | 4 ++-- datadreamer/utils/yolo_converter.py | 19 ++++++++++++------- .../generate_dataset_and_train_yolo.ipynb | 2 +- ..._segmentation_dataset_and_train_yolo.ipynb | 2 +- 11 files changed, 47 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index e34a5e3..17921df 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,7 @@ datadreamer --config - `--lm_quantization`: Quantization to use for Mistral language model. Choose between `none` and `4bit`. Default is `none`. - `--annotator_size`: Size of the annotator model to use. Choose between `base` and `large`. Default is `base`. - `--disable_lm_filter`: Use only a bad word list for profanity filtering. Default is `False`. -- `--keep_empty_images`: Whether to keep images without any annotations. Default if `False`. +- `--keep_unlabeled_images`: Whether to keep images without any annotations. Default is `False`. - `--batch_size_prompt`: Batch size for prompt generation. Default is 64. - `--batch_size_annotation`: Batch size for annotation. Default is `1`. - `--batch_size_image`: Batch size for image generation. Default is `1`. 
diff --git a/datadreamer/pipelines/generate_dataset_from_scratch.py b/datadreamer/pipelines/generate_dataset_from_scratch.py index 404999d..4d52acb 100644 --- a/datadreamer/pipelines/generate_dataset_from_scratch.py +++ b/datadreamer/pipelines/generate_dataset_from_scratch.py @@ -218,7 +218,7 @@ def parse_args(): ) parser.add_argument( - "--keep_empty_images", + "--keep_unlabeled_images", default=None, action="store_true", help="Whether to keep images without any annotations", @@ -725,7 +725,7 @@ def read_image_batch(image_batch, batch_num, batch_size): args.split_ratios, copy_files=False, is_instance_segmentation=args.task == "instance-segmentation", - keep_empty_images=args.keep_empty_images, + keep_unlabeled_images=args.keep_unlabeled_images, seed=args.seed, ) # Convert annotations to COCO format @@ -736,7 +736,7 @@ def read_image_batch(image_batch, batch_num, batch_size): "coco", args.split_ratios, is_instance_segmentation=args.task == "instance-segmentation", - keep_empty_images=args.keep_empty_images, + keep_unlabeled_images=args.keep_unlabeled_images, copy_files=False, seed=args.seed, ) @@ -751,7 +751,7 @@ def read_image_batch(image_batch, batch_num, batch_size): dataset_plugin=args.dataset_plugin, dataset_name=args.dataset_name, is_instance_segmentation=args.task == "instance-segmentation", - keep_empty_images=args.keep_empty_images, + keep_unlabeled_images=args.keep_unlabeled_images, copy_files=False, seed=args.seed, ) diff --git a/datadreamer/utils/base_converter.py b/datadreamer/utils/base_converter.py index 1885575..40003ed 100644 --- a/datadreamer/utils/base_converter.py +++ b/datadreamer/utils/base_converter.py @@ -19,7 +19,7 @@ def convert( dataset_dir: str, output_dir: str, split_ratios: List[float], - keep_empty_images: bool = False, + keep_unlabeled_images: bool = False, copy_files: bool = True, ) -> None: """Converts a dataset into another format. @@ -28,7 +28,7 @@ def convert( dataset_dir (str): The directory where the source dataset is located. 
output_dir (str): The directory where the processed dataset should be saved. split_ratios (list of float): The ratios to split the data into training, validation, and test sets. - keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False. + keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False. copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. No return value. diff --git a/datadreamer/utils/coco_converter.py b/datadreamer/utils/coco_converter.py index 1b61b8b..40d599a 100644 --- a/datadreamer/utils/coco_converter.py +++ b/datadreamer/utils/coco_converter.py @@ -42,7 +42,7 @@ def convert( dataset_dir: str, output_dir: str, split_ratios: List[float], - keep_empty_images: bool = False, + keep_unlabeled_images: bool = False, copy_files: bool = True, ) -> None: """Converts a dataset into a COCO format. @@ -51,7 +51,7 @@ def convert( dataset_dir (str): The directory where the source dataset is located. output_dir (str): The directory where the processed dataset should be saved. split_ratios (list of float): The ratios to split the data into training, validation, and test sets. - keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False. + keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False. copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. No return value. 
@@ -59,7 +59,12 @@ def convert( annotation_path = os.path.join(dataset_dir, "annotations.json") data = BaseConverter.read_annotations(annotation_path) self.process_data( - data, dataset_dir, output_dir, split_ratios, keep_empty_images, copy_files + data, + dataset_dir, + output_dir, + split_ratios, + keep_unlabeled_images, + copy_files, ) def process_data( @@ -68,7 +73,7 @@ def process_data( image_dir: str, output_dir: str, split_ratios: List[float], - keep_empty_images: bool = False, + keep_unlabeled_images: bool = False, copy_files: bool = True, ) -> None: """Processes the data by dividing it into training and validation sets, and @@ -79,7 +84,7 @@ def process_data( image_dir (str): The directory where the source images are located. output_dir (str): The base directory where the processed data will be saved. split_ratios (list of float): The ratios to split the data into training, validation, and test sets. - keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False. + keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False. copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. No return value. @@ -88,11 +93,11 @@ def process_data( images.remove("class_names") empty_images = list(filter(lambda x: len(data[x]["labels"]) == 0, images)) - if keep_empty_images and len(empty_images) > 0: + if keep_unlabeled_images and len(empty_images) > 0: logger.warning( f"{len(empty_images)} images with no annotations will be included in the dataset." ) - elif not keep_empty_images and len(empty_images) > 0: + elif not keep_unlabeled_images and len(empty_images) > 0: logger.info( f"{len(empty_images)} images with no annotations will be excluded from the dataset." 
) diff --git a/datadreamer/utils/config.py b/datadreamer/utils/config.py index 59f3a9d..6227b61 100644 --- a/datadreamer/utils/config.py +++ b/datadreamer/utils/config.py @@ -50,4 +50,4 @@ class Config(LuxonisConfig): dataset_name: str = "" dataset_id: str = "" # Dataset arguments - keep_empty_images: bool = False + keep_unlabeled_images: bool = False diff --git a/datadreamer/utils/convert_dataset.py b/datadreamer/utils/convert_dataset.py index 1154417..1bcea34 100644 --- a/datadreamer/utils/convert_dataset.py +++ b/datadreamer/utils/convert_dataset.py @@ -19,7 +19,7 @@ def convert_dataset( dataset_plugin: Optional[str] = None, dataset_name: Optional[str] = None, is_instance_segmentation: bool = False, - keep_empty_images: bool = False, + keep_unlabeled_images: bool = False, copy_files: bool = True, seed: int = 42, ) -> None: @@ -33,7 +33,7 @@ def convert_dataset( dataset_plugin (str, optional): Plugin for Luxonis dataset. Defaults to None. dataset_name (str, optional): Name of the Luxonis dataset. Defaults to None. is_instance_segmentation (bool, optional): Whether the dataset is for instance segmentation. Defaults to False. - keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False. + keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False. copy_files (bool, optional): Whether to copy the files to the output directory. Defaults to True. seed (int, optional): Random seed. Defaults to 42. 
@@ -61,7 +61,7 @@ def convert_dataset( raise ValueError(f"Invalid dataset format: {dataset_format}") converter.convert( - input_dir, output_dir, split_ratios, keep_empty_images, copy_files + input_dir, output_dir, split_ratios, keep_unlabeled_images, copy_files ) @@ -108,7 +108,7 @@ def main(): help="Whether the dataset is for instance segmentation.", ) parser.add_argument( - "--keep_empty_images", + "--keep_unlabeled_images", default=None, action="store_true", help="Whether to keep images without any annotations", @@ -136,7 +136,7 @@ def main(): dataset_plugin=args.dataset_plugin, dataset_name=args.dataset_name, is_instance_segmentation=args.is_instance_segmentation, - keep_empty_images=args.keep_empty_images, + keep_unlabeled_images=args.keep_unlabeled_images, copy_files=args.copy_files, seed=args.seed, ) diff --git a/datadreamer/utils/luxonis_dataset_converter.py b/datadreamer/utils/luxonis_dataset_converter.py index 6f728d0..fec3d36 100644 --- a/datadreamer/utils/luxonis_dataset_converter.py +++ b/datadreamer/utils/luxonis_dataset_converter.py @@ -38,7 +38,7 @@ def convert( dataset_dir: str, output_dir: str, split_ratios: List[float], - keep_empty_images: bool = False, + keep_unlabeled_images: bool = False, copy_files: bool = True, ) -> None: """Converts a dataset into a LuxonisDataset format. @@ -47,7 +47,7 @@ def convert( dataset_dir (str): The directory where the source dataset is located. output_dir (str): The directory where the processed dataset should be saved. split_ratios (list of float): The ratios to split the data into training, validation, and test sets. - keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False. + keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False. copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. No return value. 
@@ -55,7 +55,7 @@ def convert( annotation_path = os.path.join(dataset_dir, "annotations.json") data = BaseConverter.read_annotations(annotation_path) self.process_data( - data, dataset_dir, output_dir, split_ratios, keep_empty_images + data, dataset_dir, output_dir, split_ratios, keep_unlabeled_images ) def process_data( @@ -64,7 +64,7 @@ def process_data( dataset_dir: str, output_dir: str, split_ratios: List[float], - keep_empty_images: bool = False, + keep_unlabeled_images: bool = False, ) -> None: """Processes the data into LuxonisDataset format. @@ -90,7 +90,7 @@ def dataset_generator(): width, height = Image.open(image_full_path).size labels = data[image_path]["labels"] - if len(labels) == 0 and keep_empty_images: + if len(labels) == 0 and keep_unlabeled_images: logger.warning( f"Image {image_path} has no annotations. Training on empty images with `luxonis-train` will result in an error." ) @@ -174,7 +174,7 @@ def dataset_generator(): dataset.add(dataset_generator()) - if not keep_empty_images: + if not keep_unlabeled_images: n_empty_images = len( list(filter(lambda x: len(data[x]["labels"]) == 0, image_paths)) ) diff --git a/datadreamer/utils/single_label_cls_converter.py b/datadreamer/utils/single_label_cls_converter.py index 66a0817..daa3bd8 100644 --- a/datadreamer/utils/single_label_cls_converter.py +++ b/datadreamer/utils/single_label_cls_converter.py @@ -41,7 +41,7 @@ def convert( dataset_dir: str, output_dir: str, split_ratios: List[float], - keep_empty_images: bool = False, + keep_unlabeled_images: bool = False, copy_files: bool = True, ) -> None: """Converts a dataset into a format suitable for single-label classification. @@ -50,7 +50,7 @@ def convert( dataset_dir (str): The directory where the source dataset is located. output_dir (str): The directory where the processed dataset should be saved. split_ratios (list of float): The ratios to split the data into training, validation, and test sets. 
- keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False. + keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False. copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. No return value. diff --git a/datadreamer/utils/yolo_converter.py b/datadreamer/utils/yolo_converter.py index 2fed968..e4ad15a 100644 --- a/datadreamer/utils/yolo_converter.py +++ b/datadreamer/utils/yolo_converter.py @@ -43,7 +43,7 @@ def convert( dataset_dir: str, output_dir: str, split_ratios: List[float], - keep_empty_images: bool = False, + keep_unlabeled_images: bool = False, copy_files: bool = True, ): """Converts a dataset into a format suitable for training with YOLO, including @@ -53,7 +53,7 @@ def convert( dataset_dir (str): The directory where the source dataset is located. output_dir (str): The directory where the processed dataset should be saved. split_ratios (list of float): The ratios to split the data into training, validation, and test sets. - keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False. + keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False. copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. No return value. 
@@ -61,7 +61,12 @@ def convert( annotation_path = os.path.join(dataset_dir, "annotations.json") data = BaseConverter.read_annotations(annotation_path) self.process_data( - data, dataset_dir, output_dir, split_ratios, keep_empty_images, copy_files + data, + dataset_dir, + output_dir, + split_ratios, + keep_unlabeled_images, + copy_files, ) def convert_to_yolo_format( @@ -104,7 +109,7 @@ def process_data( image_dir: str, output_dir: str, split_ratios: List[float], - keep_empty_images: bool = False, + keep_unlabeled_images: bool = False, copy_files: bool = True, ) -> None: """Processes the data by dividing it into training and validation sets, and @@ -115,7 +120,7 @@ def process_data( image_dir (str): The directory where the source images are located. output_dir (str): The base directory where the processed data will be saved. split_ratios (float): The ratio to split the data into training, validation, and test sets. - keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False. + keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False. copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. No return value. @@ -124,11 +129,11 @@ def process_data( images.remove("class_names") empty_images = list(filter(lambda x: len(data[x]["labels"]) == 0, images)) - if keep_empty_images and len(empty_images) > 0: + if keep_unlabeled_images and len(empty_images) > 0: logger.warning( f"{len(empty_images)} images with no annotations will be included in the dataset." ) - elif not keep_empty_images and len(empty_images) > 0: + elif not keep_unlabeled_images and len(empty_images) > 0: logger.info( f"{len(empty_images)} images with no annotations will be excluded from the dataset." 
) diff --git a/examples/generate_dataset_and_train_yolo.ipynb b/examples/generate_dataset_and_train_yolo.ipynb index 4324214..2a08030 100644 --- a/examples/generate_dataset_and_train_yolo.ipynb +++ b/examples/generate_dataset_and_train_yolo.ipynb @@ -97,7 +97,7 @@ "- `--lm_quantization`: Quantization to use for Mistral language model. Choose between `none` and `4bit`. Default is `none`.\n", "- `--annotator_size`: Size of the annotator model to use. Choose between `base` and `large`. Default is `base`.\n", "- `--disable_lm_filter`: Use only a bad word list for profanity filtering. Default is `False`.\n", - "- `--keep_empty_images`: Whether to keep images without any annotations. Default if `False`.\n", + "- `--keep_unlabeled_images`: Whether to keep images without any annotations. Default is `False`.\n", "- `--batch_size_prompt`: Batch size for prompt generation. Default is 64.\n", "- `--batch_size_annotation`: Batch size for annotation. Default is `1`.\n", "- `--batch_size_image`: Batch size for image generation. Default is `1`.\n", diff --git a/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb b/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb index 951e9c9..1588001 100644 --- a/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb +++ b/examples/generate_instance_segmentation_dataset_and_train_yolo.ipynb @@ -112,7 +112,7 @@ "- `--lm_quantization`: Quantization to use for Mistral language model. Choose between `none` and `4bit`. Default is `none`.\n", "- `--annotator_size`: Size of the annotator model to use. Choose between `base` and `large`. Default is `base`.\n", "- `--disable_lm_filter`: Use only a bad word list for profanity filtering. Default is `False`.\n", - "- `--keep_empty_images`: Whether to keep images without any annotations. Default if `False`.\n", + "- `--keep_unlabeled_images`: Whether to keep images without any annotations. 
Default is `False`.\n", "- `--batch_size_prompt`: Batch size for prompt generation. Default is 64.\n", "- `--batch_size_annotation`: Batch size for annotation. Default is `1`.\n", "- `--batch_size_image`: Batch size for image generation. Default is `1`.\n",