Skip to content

Commit

Permalink
Rename 'keep_empty_images' to 'keep_unlabeled_images'
Browse files Browse the repository at this point in the history
  • Loading branch information
HonzaCuhel committed Nov 12, 2024
1 parent ec6c167 commit e6653d9
Show file tree
Hide file tree
Showing 11 changed files with 47 additions and 37 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ datadreamer --config <path-to-config>
- `--lm_quantization`: Quantization to use for Mistral language model. Choose between `none` and `4bit`. Default is `none`.
- `--annotator_size`: Size of the annotator model to use. Choose between `base` and `large`. Default is `base`.
- `--disable_lm_filter`: Use only a bad word list for profanity filtering. Default is `False`.
- `--keep_empty_images`: Whether to keep images without any annotations. Default if `False`.
- `--keep_unlabeled_images`: Whether to keep images without any annotations. Default is `False`.
- `--batch_size_prompt`: Batch size for prompt generation. Default is 64.
- `--batch_size_annotation`: Batch size for annotation. Default is `1`.
- `--batch_size_image`: Batch size for image generation. Default is `1`.
Expand Down
8 changes: 4 additions & 4 deletions datadreamer/pipelines/generate_dataset_from_scratch.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def parse_args():
)

parser.add_argument(
"--keep_empty_images",
"--keep_unlabeled_images",
default=None,
action="store_true",
help="Whether to keep images without any annotations",
Expand Down Expand Up @@ -725,7 +725,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
args.split_ratios,
copy_files=False,
is_instance_segmentation=args.task == "instance-segmentation",
keep_empty_images=args.keep_empty_images,
keep_unlabeled_images=args.keep_unlabeled_images,
seed=args.seed,
)
# Convert annotations to COCO format
Expand All @@ -736,7 +736,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
"coco",
args.split_ratios,
is_instance_segmentation=args.task == "instance-segmentation",
keep_empty_images=args.keep_empty_images,
keep_unlabeled_images=args.keep_unlabeled_images,
copy_files=False,
seed=args.seed,
)
Expand All @@ -751,7 +751,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
dataset_plugin=args.dataset_plugin,
dataset_name=args.dataset_name,
is_instance_segmentation=args.task == "instance-segmentation",
keep_empty_images=args.keep_empty_images,
keep_unlabeled_images=args.keep_unlabeled_images,
copy_files=False,
seed=args.seed,
)
Expand Down
4 changes: 2 additions & 2 deletions datadreamer/utils/base_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def convert(
dataset_dir: str,
output_dir: str,
split_ratios: List[float],
keep_empty_images: bool = False,
keep_unlabeled_images: bool = False,
copy_files: bool = True,
) -> None:
"""Converts a dataset into another format.
Expand All @@ -28,7 +28,7 @@ def convert(
dataset_dir (str): The directory where the source dataset is located.
output_dir (str): The directory where the processed dataset should be saved.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
No return value.
Expand Down
19 changes: 12 additions & 7 deletions datadreamer/utils/coco_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def convert(
dataset_dir: str,
output_dir: str,
split_ratios: List[float],
keep_empty_images: bool = False,
keep_unlabeled_images: bool = False,
copy_files: bool = True,
) -> None:
"""Converts a dataset into a COCO format.
Expand All @@ -51,15 +51,20 @@ def convert(
dataset_dir (str): The directory where the source dataset is located.
output_dir (str): The directory where the processed dataset should be saved.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
No return value.
"""
annotation_path = os.path.join(dataset_dir, "annotations.json")
data = BaseConverter.read_annotations(annotation_path)
self.process_data(
data, dataset_dir, output_dir, split_ratios, keep_empty_images, copy_files
data,
dataset_dir,
output_dir,
split_ratios,
keep_unlabeled_images,
copy_files,
)

def process_data(
Expand All @@ -68,7 +73,7 @@ def process_data(
image_dir: str,
output_dir: str,
split_ratios: List[float],
keep_empty_images: bool = False,
keep_unlabeled_images: bool = False,
copy_files: bool = True,
) -> None:
"""Processes the data by dividing it into training and validation sets, and
Expand All @@ -79,7 +84,7 @@ def process_data(
image_dir (str): The directory where the source images are located.
output_dir (str): The base directory where the processed data will be saved.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
No return value.
Expand All @@ -88,11 +93,11 @@ def process_data(
images.remove("class_names")

empty_images = list(filter(lambda x: len(data[x]["labels"]) == 0, images))
if keep_empty_images and len(empty_images) > 0:
if keep_unlabeled_images and len(empty_images) > 0:
logger.warning(
f"{len(empty_images)} images with no annotations will be included in the dataset."
)
elif not keep_empty_images and len(empty_images) > 0:
elif not keep_unlabeled_images and len(empty_images) > 0:
logger.info(
f"{len(empty_images)} images with no annotations will be excluded from the dataset."
)
Expand Down
2 changes: 1 addition & 1 deletion datadreamer/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,4 @@ class Config(LuxonisConfig):
dataset_name: str = ""
dataset_id: str = ""
# Dataset arguments
keep_empty_images: bool = False
keep_unlabeled_images: bool = False
10 changes: 5 additions & 5 deletions datadreamer/utils/convert_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def convert_dataset(
dataset_plugin: Optional[str] = None,
dataset_name: Optional[str] = None,
is_instance_segmentation: bool = False,
keep_empty_images: bool = False,
keep_unlabeled_images: bool = False,
copy_files: bool = True,
seed: int = 42,
) -> None:
Expand All @@ -33,7 +33,7 @@ def convert_dataset(
dataset_plugin (str, optional): Plugin for Luxonis dataset. Defaults to None.
dataset_name (str, optional): Name of the Luxonis dataset. Defaults to None.
is_instance_segmentation (bool, optional): Whether the dataset is for instance segmentation. Defaults to False.
keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the files to the output directory. Defaults to True.
seed (int, optional): Random seed. Defaults to 42.
Expand Down Expand Up @@ -61,7 +61,7 @@ def convert_dataset(
raise ValueError(f"Invalid dataset format: {dataset_format}")

converter.convert(
input_dir, output_dir, split_ratios, keep_empty_images, copy_files
input_dir, output_dir, split_ratios, keep_unlabeled_images, copy_files
)


Expand Down Expand Up @@ -108,7 +108,7 @@ def main():
help="Whether the dataset is for instance segmentation.",
)
parser.add_argument(
"--keep_empty_images",
"--keep_unlabeled_images",
default=None,
action="store_true",
help="Whether to keep images without any annotations",
Expand Down Expand Up @@ -136,7 +136,7 @@ def main():
dataset_plugin=args.dataset_plugin,
dataset_name=args.dataset_name,
is_instance_segmentation=args.is_instance_segmentation,
keep_empty_images=args.keep_empty_images,
keep_unlabeled_images=args.keep_unlabeled_images,
copy_files=args.copy_files,
seed=args.seed,
)
Expand Down
12 changes: 6 additions & 6 deletions datadreamer/utils/luxonis_dataset_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def convert(
dataset_dir: str,
output_dir: str,
split_ratios: List[float],
keep_empty_images: bool = False,
keep_unlabeled_images: bool = False,
copy_files: bool = True,
) -> None:
"""Converts a dataset into a LuxonisDataset format.
Expand All @@ -47,15 +47,15 @@ def convert(
dataset_dir (str): The directory where the source dataset is located.
output_dir (str): The directory where the processed dataset should be saved.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
No return value.
"""
annotation_path = os.path.join(dataset_dir, "annotations.json")
data = BaseConverter.read_annotations(annotation_path)
self.process_data(
data, dataset_dir, output_dir, split_ratios, keep_empty_images
data, dataset_dir, output_dir, split_ratios, keep_unlabeled_images
)

def process_data(
Expand All @@ -64,7 +64,7 @@ def process_data(
dataset_dir: str,
output_dir: str,
split_ratios: List[float],
keep_empty_images: bool = False,
keep_unlabeled_images: bool = False,
) -> None:
"""Processes the data into LuxonisDataset format.
Expand All @@ -90,7 +90,7 @@ def dataset_generator():
width, height = Image.open(image_full_path).size
labels = data[image_path]["labels"]

if len(labels) == 0 and keep_empty_images:
if len(labels) == 0 and keep_unlabeled_images:
logger.warning(
f"Image {image_path} has no annotations. Training on empty images with `luxonis-train` will result in an error."
)
Expand Down Expand Up @@ -174,7 +174,7 @@ def dataset_generator():

dataset.add(dataset_generator())

if not keep_empty_images:
if not keep_unlabeled_images:
n_empty_images = len(
list(filter(lambda x: len(data[x]["labels"]) == 0, image_paths))
)
Expand Down
4 changes: 2 additions & 2 deletions datadreamer/utils/single_label_cls_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def convert(
dataset_dir: str,
output_dir: str,
split_ratios: List[float],
keep_empty_images: bool = False,
keep_unlabeled_images: bool = False,
copy_files: bool = True,
) -> None:
"""Converts a dataset into a format suitable for single-label classification.
Expand All @@ -50,7 +50,7 @@ def convert(
dataset_dir (str): The directory where the source dataset is located.
output_dir (str): The directory where the processed dataset should be saved.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
No return value.
Expand Down
19 changes: 12 additions & 7 deletions datadreamer/utils/yolo_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def convert(
dataset_dir: str,
output_dir: str,
split_ratios: List[float],
keep_empty_images: bool = False,
keep_unlabeled_images: bool = False,
copy_files: bool = True,
):
"""Converts a dataset into a format suitable for training with YOLO, including
Expand All @@ -53,15 +53,20 @@ def convert(
dataset_dir (str): The directory where the source dataset is located.
output_dir (str): The directory where the processed dataset should be saved.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
No return value.
"""
annotation_path = os.path.join(dataset_dir, "annotations.json")
data = BaseConverter.read_annotations(annotation_path)
self.process_data(
data, dataset_dir, output_dir, split_ratios, keep_empty_images, copy_files
data,
dataset_dir,
output_dir,
split_ratios,
keep_unlabeled_images,
copy_files,
)

def convert_to_yolo_format(
Expand Down Expand Up @@ -104,7 +109,7 @@ def process_data(
image_dir: str,
output_dir: str,
split_ratios: List[float],
keep_empty_images: bool = False,
keep_unlabeled_images: bool = False,
copy_files: bool = True,
) -> None:
"""Processes the data by dividing it into training and validation sets, and
Expand All @@ -115,7 +120,7 @@ def process_data(
image_dir (str): The directory where the source images are located.
output_dir (str): The base directory where the processed data will be saved.
split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
keep_empty_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
keep_unlabeled_images (bool, optional): Whether to keep images with no annotations. Defaults to False.
copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
No return value.
Expand All @@ -124,11 +129,11 @@ def process_data(
images.remove("class_names")

empty_images = list(filter(lambda x: len(data[x]["labels"]) == 0, images))
if keep_empty_images and len(empty_images) > 0:
if keep_unlabeled_images and len(empty_images) > 0:
logger.warning(
f"{len(empty_images)} images with no annotations will be included in the dataset."
)
elif not keep_empty_images and len(empty_images) > 0:
elif not keep_unlabeled_images and len(empty_images) > 0:
logger.info(
f"{len(empty_images)} images with no annotations will be excluded from the dataset."
)
Expand Down
2 changes: 1 addition & 1 deletion examples/generate_dataset_and_train_yolo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@
"- `--lm_quantization`: Quantization to use for Mistral language model. Choose between `none` and `4bit`. Default is `none`.\n",
"- `--annotator_size`: Size of the annotator model to use. Choose between `base` and `large`. Default is `base`.\n",
"- `--disable_lm_filter`: Use only a bad word list for profanity filtering. Default is `False`.\n",
"- `--keep_empty_images`: Whether to keep images without any annotations. Default if `False`.\n",
"- `--keep_unlabeled_images`: Whether to keep images without any annotations. Default is `False`.\n",
"- `--batch_size_prompt`: Batch size for prompt generation. Default is 64.\n",
"- `--batch_size_annotation`: Batch size for annotation. Default is `1`.\n",
"- `--batch_size_image`: Batch size for image generation. Default is `1`.\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@
"- `--lm_quantization`: Quantization to use for Mistral language model. Choose between `none` and `4bit`. Default is `none`.\n",
"- `--annotator_size`: Size of the annotator model to use. Choose between `base` and `large`. Default is `base`.\n",
"- `--disable_lm_filter`: Use only a bad word list for profanity filtering. Default is `False`.\n",
"- `--keep_empty_images`: Whether to keep images without any annotations. Default if `False`.\n",
"- `--keep_unlabeled_images`: Whether to keep images without any annotations. Default is `False`.\n",
"- `--batch_size_prompt`: Batch size for prompt generation. Default is 64.\n",
"- `--batch_size_annotation`: Batch size for annotation. Default is `1`.\n",
"- `--batch_size_image`: Batch size for image generation. Default is `1`.\n",
Expand Down

0 comments on commit e6653d9

Please sign in to comment.