From 7b47511ddff71ca150f39447cd826bd11464ab8c Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Sun, 7 Apr 2024 22:19:04 +0000 Subject: [PATCH 01/23] feature: add dataset convertets (coco, yolo, ldf) --- datadreamer/utils/base_converter.py | 68 +++++++++++++ datadreamer/utils/coco_converter.py | 129 +++++++++++++++++++++++++ datadreamer/utils/convert_dataset.py | 51 ++++++++++ datadreamer/utils/ldf_convreter.py | 69 ++++++++++++++ datadreamer/utils/yolo_converter.py | 138 +++++++++++++++++++++++++++ 5 files changed, 455 insertions(+) create mode 100644 datadreamer/utils/base_converter.py create mode 100644 datadreamer/utils/coco_converter.py create mode 100644 datadreamer/utils/convert_dataset.py create mode 100644 datadreamer/utils/ldf_convreter.py create mode 100644 datadreamer/utils/yolo_converter.py diff --git a/datadreamer/utils/base_converter.py b/datadreamer/utils/base_converter.py new file mode 100644 index 0000000..945dda8 --- /dev/null +++ b/datadreamer/utils/base_converter.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +import json +import numpy as np + +from abc import ABC, abstractmethod + +class BaseConverter(ABC): + """Abstract base class for converter. + """ + + def __init__(self): + pass + + @abstractmethod + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): + """Converts a dataset into a format suitable for training with YOLO, including + creating training and validation splits. + + Args: + - dataset_dir (str): The directory where the source dataset is located. + - output_dir (str): The directory where the processed dataset should be saved. + - split_ratios (list of float): The ratios to split the data into training, validation, and test sets. + - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. + + + No return value. + """ + pass + + @staticmethod + def read_annotations(annotation_path): + """Reads annotations from a JSON file located at the specified path. + + Args: + - annotation_path (str): The path to the JSON file containing annotations. + + Returns: + - dict: A dictionary containing the data loaded from the JSON file. + """ + with open(annotation_path) as f: + data = json.load(f) + return data + + @staticmethod + def make_splits(images, split_ratios): + """Splits the list of images into training, validation, and test sets. + + Args: + - images (list of str): A list of image paths. + - split_ratios (list of float): The ratios to split the data into training, validation, and test sets. + + Returns: + - list of str: A list of image paths for the training set. + - list of str: A list of image paths for the validation set. + - list of str: A list of image paths for the test set. 
+ """ + np.random.shuffle(images) + + train_images = images[: int(len(images) * split_ratios[0])] + val_images = images[ + int(len(images) * split_ratios[0]) : int( + len(images) * (split_ratios[0] + split_ratios[1]) + ) + ] + test_images = images[int(len(images) * (split_ratios[0] + split_ratios[1])) :] + + return train_images, val_images, test_images diff --git a/datadreamer/utils/coco_converter.py b/datadreamer/utils/coco_converter.py new file mode 100644 index 0000000..5ab3b71 --- /dev/null +++ b/datadreamer/utils/coco_converter.py @@ -0,0 +1,129 @@ +from __future__ import annotations + + +import os +import shutil +import numpy as np +import json +from PIL import Image + +from datadreamer.utils.base_converter import BaseConverter + +class COCOConverter(BaseConverter): + """Class for converting a dataset to COCO format. + + Format: + + dataset_dir + ├── train + │ ├── data + │ │ ├── 0.jpg + │ │ ├── 1.jpg + │ ├── labels.json + ├── validation + │ ├── data + │ ├── labels.json + ├── test + │ ├── data + │ ├── labels.json + """ + + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): + """Converts a dataset into a format suitable for training with YOLO, including + creating training and validation splits. + + Args: + - dataset_dir (str): The directory where the source dataset is located. + - output_dir (str): The directory where the processed dataset should be saved. + - split_ratios (list of float): The ratios to split the data into training, validation, and test sets. + - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. + + No return value. + """ + annotation_path = os.path.join(dataset_dir, "annotations.json") + data = BaseConverter.read_annotations(annotation_path) + self.process_data(data, dataset_dir, output_dir, split_ratios, copy_files) + + def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=True): + """Processes the data by dividing it into training and validation sets, and saves + the images and labels in COCO format. + + Args: + - data (dict): The dictionary containing image annotations. + - image_dir (str): The directory where the source images are located. + - output_dir (str): The base directory where the processed data will be saved. + - split_ratios (float): The ratio to split the data into training, validation, and test sets. + - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. + + + No return value. 
+ """ + images = list(data.keys()) + images.remove("class_names") + + train_images, val_images, test_images = BaseConverter.make_splits(images, split_ratios) + + for dataset_type, image_set in [("train", train_images), ("validation", val_images), ("test", test_images)]: + dataset_output_dir = os.path.join(output_dir, dataset_type) + data_output_dir = os.path.join(dataset_output_dir, "data") + + if os.path.exists(data_output_dir): + shutil.rmtree(image_output_dir) + + os.makedirs(data_output_dir) + + images_info = [] + annotations = [] + annotation_id = 0 + + for image_name in image_set: + + image_full_path = os.path.join(image_dir, image_name) + annotation = data[image_name] + image = Image.open(image_full_path) + image_width, image_height = image.size + + images_info.append({ + "id": len(images_info) + 1, + "file_name": image_name, + "width": image_width, + "height": image_height + }) + + for box, label in zip(annotation["boxes"], annotation["labels"]): + annotations.append({ + "id": annotation_id, + "image_id": len(images_info), + "category_id": label, + "bbox": [box[0], box[1], box[2] - box[0], box[3] - box[1]], + "segmentation": None, #[[box[0], box[1], box[2], box[1], box[2], box[3], box[0], box[3]]], # bbox mask + "area": (box[2] - box[0]) * (box[3] - box[1]), + "iscrowd": 0 + }) + annotation_id += 1 + + if copy_files: + shutil.copy(image_full_path, os.path.join(data_output_dir, image_name)) + else: + shutil.move(image_full_path, os.path.join(data_output_dir, image_name)) + + self.save_labels(dataset_output_dir, images_info, annotations, data["class_names"]) + + def save_labels(self, dataset_output_dir, images_info, annotations, class_names): + """Saves the labels to a JSON file. + + Args: + - dataset_output_dir (str): The directory where the labels should be saved. + - images_info (list of dict): A list of dictionaries containing image information. + - annotations (list of dict): A list of dictionaries containing annotation information. + - class_names (list of str): A list of class names. + + No return value. + """ + + with open(os.path.join(dataset_output_dir, "labels.json"), "w") as f: + json.dump({ + "images": images_info, + "annotations": annotations, + "categories": [{"id": i, "name": name} for i, name in enumerate(class_names)] + }, f) \ No newline at end of file diff --git a/datadreamer/utils/convert_dataset.py b/datadreamer/utils/convert_dataset.py new file mode 100644 index 0000000..ed0182f --- /dev/null +++ b/datadreamer/utils/convert_dataset.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import argparse + +from datadreamer.utils import YOLOConverter, COCOConverter, LDFConverter + +def convert_dataset(input_dir, output_dir, dataset_format, split_ratios, copy_files=True): + if dataset_format == "yolo": + converter = YOLOConverter() + elif dataset_format == "coco": + converter = COCOConverter() + elif dataset_format == "ldf": + converter = LDFConverter() + else: + raise ValueError(f"Invalid dataset format: {dataset_format}") + + converter.convert(input_dir, output_dir, split_ratios, copy_files) + +def main(): + parser = argparse.ArgumentParser( + description="Convert raw dataset to another format with train-val-test split." + ) + parser.add_argument( + "--input_dir", type=str, help="Directory containing the images and annotations." 
+ ) + parser.add_argument( + "--output_dir", + type=str, + help="Directory where the processed dataset will be saved.", + ) + parser.add_argument( + "--dataset_format", type=str, default="yolo", choices=["yolo", "coco", "ldf" ], + ) + parser.add_argument( + "--split_ratios", + type=float, + nargs="+", + default=[0.8, 0.1, 0.1], + help="Train-validation-test split ratios (default: 0.8, 0.1, 0.1).", + ) + parser.add_argument( + "--copy_files", type=bool, default=True, help="Copy files to output directory, otherwise move them." + ) + + args = parser.parse_args() + + convert_dataset(args.input_dir, args.output_dir, args.dataset_format, args.split_ratios, args.copy_files) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/datadreamer/utils/ldf_convreter.py b/datadreamer/utils/ldf_convreter.py new file mode 100644 index 0000000..32df801 --- /dev/null +++ b/datadreamer/utils/ldf_convreter.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +import json +import os +from PIL import Image +from luxonis_ml.data import LuxonisDataset + +from datadreamer.utils import BaseConverter + + +class LDFConverter(BaseConverter): + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): + """Converts a dataset into a format suitable for training with YOLO, including + creating training and validation splits. + + Args: + - dataset_dir (str): The directory where the source dataset is located. + - output_dir (str): The directory where the processed dataset should be saved. + - split_ratios (list of float): The ratios to split the data into training, validation, and test sets. + - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. + + No return value. 
+ """ + annotation_path = os.path.join(dataset_dir, "annotations.json") + data = BaseConverter.read_annotations(annotation_path) + self.process_data(data, dataset_dir, output_dir, split_ratios) + + def process_data(self, data, dataset_dir, output_dir, split_ratios): + class_names = data["class_names"] + image_paths = list(data.keys()) + image_paths.remove("class_names") + + def dataset_generator(): + # find image paths and load COCO annotations + + for image_path in image_paths: + image_full_path = os.path.join(dataset_dir, image_path) + width, height = Image.open(image_full_path).size + labels = data[image_path]["labels"] + for label in labels: + yield { + "file": image_full_path, + "class": class_names[label], + "type": "classification", + "value": True, + } + + if "boxes" in data[image_path]: + boxes = data[image_path]["boxes"] + for box in boxes: + x, y, w, h = box[0], box[1], box[2] - box[0], box[3] - box[1] + yield { + "file": image_full_path, + "class": class_names[label], + "type": "box", + "value": (x / width, y / height, w / width, h / height), + } + + dataset_name = os.path.basename(output_dir) + if LuxonisDataset.exists(dataset_name): + dataset = LuxonisDataset(dataset_name) + dataset.delete_dataset() + + dataset = LuxonisDataset(dataset_name) + dataset.set_classes(class_names) + + dataset.add(dataset_generator) + + dataset.make_splits(split_ratios) \ No newline at end of file diff --git a/datadreamer/utils/yolo_converter.py b/datadreamer/utils/yolo_converter.py new file mode 100644 index 0000000..4724b3e --- /dev/null +++ b/datadreamer/utils/yolo_converter.py @@ -0,0 +1,138 @@ +from __future__ import annotations + +import os +import shutil +import json +from PIL import Image + +from datadreamer.utils import BaseConverter + +class YOLOConverter(BaseConverter): + """Class for converting a dataset to YOLO format. + + Format: + + dataset_dir + ├── train + │ ├── images + │ │ ├── 0.jpg + │ │ ├── 1.jpg + │ ├── labels + │ │ ├── 0.txt + │ │ ├── 1.txt + ├── val + │ ├── images + │ ├── labels + ├── test + │ ├── images + │ ├── labels + """ + + + + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): + """Converts a dataset into a format suitable for training with YOLO, including + creating training and validation splits. + + Args: + - dataset_dir (str): The directory where the source dataset is located. + - output_dir (str): The directory where the processed dataset should be saved. + - split_ratios (list of float): The ratios to split the data into training, validation, and test sets. + - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. + + No return value. + """ + annotation_path = os.path.join(dataset_dir, "annotations.json") + data = BaseConverter.read_annotations(annotation_path) + self.process_data(data, dataset_dir, output_dir, split_ratios, copy_files) + + def convert_to_yolo_format(self, box, image_width, image_height): + """Converts bounding box coordinates to YOLO format. + + Args: + - box (list of float): A list containing the bounding box coordinates [x_min, y_min, x_max, y_max]. + - image_width (int): The width of the image. + - image_height (int): The height of the image. + + Returns: + - list of float: A list containing the bounding box in YOLO format [x_center, y_center, width, height]. 
+ """ + x_center = (box[0] + box[2]) / 2 / image_width + y_center = (box[1] + box[3]) / 2 / image_height + width = (box[2] - box[0]) / image_width + height = (box[3] - box[1]) / image_height + return [x_center, y_center, width, height] + + def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=True): + """Processes the data by dividing it into training and validation sets, and saves + the images and labels in YOLO format. + + Args: + - data (dict): The dictionary containing image annotations. + - image_dir (str): The directory where the source images are located. + - output_dir (str): The base directory where the processed data will be saved. + - split_ratios (float): The ratio to split the data into training, validation, and test sets. + - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. + + + No return value. + """ + images = list(data.keys()) + images.remove("class_names") + + train_images, val_images, test_images = BaseConverter.make_splits(images, split_ratios) + + for dataset_type, image_set in [("train", train_images), ("val", val_images), ("test", test_images)]: + image_output_dir = os.path.join(output_dir, dataset_type, "images") + label_output_dir = os.path.join(output_dir, dataset_type, "labels") + + # If the output directories already exist, replace them + if os.path.exists(image_output_dir): + shutil.rmtree(image_output_dir) + if os.path.exists(label_output_dir): + shutil.rmtree(label_output_dir) + + os.makedirs(image_output_dir) + os.makedirs(label_output_dir) + + for image_name in image_set: + # extract image name from image path + image_full_path = os.path.join(image_dir, image_name) + annotation = data[image_name] + image = Image.open(image_full_path) + image_width, image_height = image.size + + label_file = os.path.join( + label_output_dir, os.path.splitext(image_name)[0] + ".txt" + ) + with open(label_file, "w") as f: + for box, label in zip(annotation["boxes"], annotation["labels"]): + yolo_box = self.convert_to_yolo_format(box, image_width, image_height) + f.write(f"{label} {' '.join(map(str, yolo_box))}\n") + + if copy_files: + shutil.copy(image_full_path, os.path.join(image_output_dir, image_name)) + else: + shutil.move(image_full_path, os.path.join(image_output_dir, image_name)) + + self.create_data_yaml(output_dir, data["class_names"]) + + + def create_data_yaml(self, root_dir, class_names): + """Creates a YAML file for dataset configuration, specifying paths and class names. + + Args: + - root_dir (str): The root directory where the dataset is located. + - class_names (list of str): A list of class names. + + No return value. 
+ """ + yaml_content = ( + f"train: {os.path.abspath(os.path.join(root_dir, 'train'))}\n" + f"val: {os.path.abspath(os.path.join(root_dir, 'val'))}\n" + f"test: {os.path.abspath(os.path.join(root_dir, 'test'))}\n" + f"nc: {len(class_names)}\n" + f"names: {class_names}" + ) + with open(os.path.join(root_dir, "data.yaml"), "w") as f: + f.write(yaml_content) \ No newline at end of file From ec191ba4e95e77772b0d58d209ebb786a30ba6dc Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Sun, 7 Apr 2024 22:20:30 +0000 Subject: [PATCH 02/23] feature: add dataset utils --- datadreamer/utils/dataset_utils.py | 74 ++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 datadreamer/utils/dataset_utils.py diff --git a/datadreamer/utils/dataset_utils.py b/datadreamer/utils/dataset_utils.py new file mode 100644 index 0000000..7d1963b --- /dev/null +++ b/datadreamer/utils/dataset_utils.py @@ -0,0 +1,74 @@ +import json +import os +from PIL import Image + + +from luxonis_ml.data import LuxonisDataset + +def save_annotations_to_json( + image_paths, + labels_list, + boxes_list=None, + class_names=None, + save_dir=None, + file_name="annotations.json", +): + annotations = {} + for i in range(len(image_paths)): + #for image_path, bboxes, labels in zip(image_paths, boxes_list, labels_list): + image_name = os.path.basename(image_paths[i]) + #image_name = os.path.basename(image_path) + labels = labels_list[i] + annotations[image_name] = { + "labels": labels.tolist(), + } + if boxes_list is not None: + bboxes = boxes_list[i] + annotations[image_name]["boxes"] = bboxes.tolist() + + annotations["class_names"] = class_names + + # Save to JSON file + with open(os.path.join(save_dir, file_name), "w") as f: + json.dump(annotations, f, indent=4) + +def convert_to_ldf(image_paths, labels_list, boxes_list, save_dir, class_names, split_ratios): + width, height = Image.open(image_paths[0]).size + def dataset_generator(): + # find image paths and load COCO annotations + + for i in range(len(image_paths)): + image_path = image_paths[i] + labels = labels_list[i] + for label in labels: + yield { + "file": image_path, + "class": class_names[label], + "type": "classification", + "value": True, + } + + if boxes_list: + boxes = boxes_list[i] + for box in boxes: + x, y, w, h = box[0], box[1], box[2] - box[0], box[3] - box[1] + yield { + "file": image_path, + "class": class_names[label], + "type": "box", + "value": (x / width, y / height, w / width, h / height), + } + + dataset_name = os.path.basename(save_dir) + if LuxonisDataset.exists(dataset_name): + dataset = LuxonisDataset(dataset_name) + dataset.delete_dataset() + + dataset = LuxonisDataset(dataset_name) + dataset.set_classes(class_names) + + dataset.add(dataset_generator) + + dataset.make_splits(split_ratios) + + From e50f250325a622628606514ac980b954d24c6e7c Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Sun, 7 Apr 2024 22:21:09 +0000 Subject: [PATCH 03/23] feature: add raw dataset merge --- datadreamer/utils/merge_raw_datasets.py | 85 +++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 datadreamer/utils/merge_raw_datasets.py diff --git a/datadreamer/utils/merge_raw_datasets.py b/datadreamer/utils/merge_raw_datasets.py new file mode 100644 index 0000000..57935b0 --- /dev/null +++ b/datadreamer/utils/merge_raw_datasets.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import argparse +import os +import json +import shutil + +def merge_datasets(input_dirs, output_dir, copy_files=True): + + config_tasks = [] + 
config_classes = [] + random_seeds = [] + for input_dir in input_dirs: + with open(os.path.join(input_dir, "generation_args.json")) as f: + generation_args = json.load(f) + config_tasks.append(generation_args["task"]) + config_classes.append(generation_args["class_names"]) + random_seeds.append(generation_args["seed"]) + + # Check if all tasks are the same + if len(set(config_tasks)) != 1: + raise ValueError("All datasets must have the same task") + # Check if all list of classes are the same + if len(set(tuple(sorted(classes)) for classes in config_classes)) != 1: + raise ValueError("All datasets must have the same list of classes") + + # Check if all datasets have different random seeds + if len(set(random_seeds)) != len(input_dirs): + raise ValueError("All datasets must have different random seeds") + + # Create output directory + print(f"Output directory: {output_dir}") + if os.path.exists(output_dir): + shutil.rmtree(output_dir) + os.makedirs(output_dir) + + annotations_merged = {} + for i, input_dir in enumerate(input_dirs): + + with open(os.path.join(input_dir, "annotations.json")) as f: + annotations = json.load(f) + class_names = annotations.pop("class_names") + annotations_merged = {**annotations_merged, **annotations} + + # Copy or move generation_args.json files + if copy_files: + shutil.copy(os.path.join(input_dir, "generation_args.json"), os.path.join(output_dir, f"generation_args_{i}.json")) + else: + shutil.move(os.path.join(input_dir, "generation_args.json"), os.path.join(output_dir, f"generation_args_{i}.json")) + + # Copy or move images + for image_path in annotations: + if copy_files: + shutil.copy(os.path.join(input_dir, image_path), os.path.join(output_dir, image_path)) + else: + shutil.move(os.path.join(input_dir, image_path), os.path.join(output_dir, image_path)) + + annotations_merged["class_names"] = class_names + with open(os.path.join(output_dir, "annotations.json"), "w") as f: + json.dump(annotations_merged, f, indent=4) + + +def main(): + parser = argparse.ArgumentParser( + description="Merge raw datasets" + ) + parser.add_argument( + "--input_dirs", type=str, nargs="+", help="Directories containing the images and annotations." + ) + parser.add_argument( + "--output_dir", + type=str, + help="Directory where the merged dataset will be saved.", + ) + parser.add_argument( + "--copy_files", type=bool, default=True, help="Copy files to output directory, otherwise move them." + ) + + args = parser.parse_args() + + merge_datasets(args.input_dirs, args.output_dir, args.copy_files) + + +if __name__ == "__main__": + main() \ No newline at end of file From 56969440768c63176b2da81c3b8ef3d60133646c Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Sun, 7 Apr 2024 22:22:57 +0000 Subject: [PATCH 04/23] docs: update examples --- examples/generate_dataset_and_train_yolo.ipynb | 12 +++++++----- examples/helmet_detection.ipynb | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/generate_dataset_and_train_yolo.ipynb b/examples/generate_dataset_and_train_yolo.ipynb index b113b7e..44bc495 100644 --- a/examples/generate_dataset_and_train_yolo.ipynb +++ b/examples/generate_dataset_and_train_yolo.ipynb @@ -82,6 +82,8 @@ "- `--prompt_generator`: Choose between `simple`, `lm` (language model) and `tiny` (tiny LM). Default is `simple`.\n", "- `--image_generator`: Choose image generator, e.g., `sdxl`, `sdxl-turbo` or `sdxl-lightning`. 
Default is `sdxl-turbo`.\n", "- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection or `clip` for image classification. Default is `owlv2`.\n", + "- `--dataset_format`: Format of the dataset. Defaults to `raw`. Supported values: `raw`, `yolo`, `coco`, `ldf`.\n", + "- `--split_ratios`: Split ratios for train, validation, and test sets. Defaults to `[0.8, 0.1, 0.1]`.\n", "- `--conf_threshold`: Confidence threshold for annotation. Default is `0.15`.\n", "- `--annotation_iou_threshold`: Intersection over Union (IoU) threshold for annotation. Default is `0.2`.\n", "- `--prompt_prefix`: Prefix to add to every image generation prompt. Default is `\"\"`.\n", @@ -144,26 +146,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "3dd01a6a", "metadata": { "id": "3dd01a6a" }, "outputs": [], "source": [ - "from datadreamer.utils.convert_dataset_to_yolo import convert" + "from datadreamer.utils.convert_dataset import convert_dataset" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "9b9bb74d", "metadata": { "id": "9b9bb74d" }, "outputs": [], "source": [ - "convert(dataset_dir=\"generated_dataset\", output_dir=\"generated_dataset_yolo\", train_val_split_ratio=0.8)" + "convert_dataset(input_dir=\"generated_dataset\", output_dir=\"generated_dataset_yolo\", dataset_format=\"yolo\", split_ratios=[0.8, 0.1, 0.1], copy_files=True)" ] }, { @@ -425,7 +427,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/examples/helmet_detection.ipynb b/examples/helmet_detection.ipynb index 7e48cdb..89406b2 100644 --- a/examples/helmet_detection.ipynb +++ b/examples/helmet_detection.ipynb @@ -70,9 +70,9 @@ "metadata": {}, "outputs": [], "source": [ - "from datadreamer.utils.convert_dataset_to_yolo import convert\n", + "from datadreamer.utils.convert_dataset import convert_dataset\n", "# Conversion to YOLO format\n", - "convert(dataset_dir=\"gen_dataset_helmet_10000_turbo_tiny\", output_dir=\"gen_dataset_helmet_10000_turbo_tiny_yolo\", train_val_split_ratio=0.95)" + "convert_dataset(input_dir=\"gen_dataset_helmet_10000_turbo_tiny\", output_dir=\"gen_dataset_helmet_10000_turbo_tiny_yolo\", dataset_format=\"yolo\", split_ratios=[0.95, 0.05, 0.0], copy_files=True)" ] }, { From 3f3883d8e6d51ac3aed65d9fc3b8d047baa486c2 Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Sun, 7 Apr 2024 22:26:17 +0000 Subject: [PATCH 05/23] feature: add LuxonisDataset, COCO, YOLO formats --- README.md | 2 + .../generate_dataset_from_scratch.py | 133 +++++++++++------- requirements.txt | 1 + 3 files changed, 84 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index 821f7ae..465d938 100644 --- a/README.md +++ b/README.md @@ -152,6 +152,8 @@ datadreamer --save_dir --class_names --prompts_number =0.25.0 scipy>=1.10.0 bitsandbytes>=0.42.0 nltk>=3.8.1 +luxonis-ml[all] From c7ec6ef716440a51df443b308a0f84b9640ff00a Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Sun, 7 Apr 2024 22:28:24 +0000 Subject: [PATCH 06/23] fix: remove old yolo conversion script --- datadreamer/utils/convert_dataset_to_yolo.py | 157 ------------------- 1 file changed, 157 deletions(-) delete mode 100644 datadreamer/utils/convert_dataset_to_yolo.py diff --git a/datadreamer/utils/convert_dataset_to_yolo.py b/datadreamer/utils/convert_dataset_to_yolo.py deleted file mode 100644 index 83a985e..0000000 --- a/datadreamer/utils/convert_dataset_to_yolo.py +++ 
/dev/null @@ -1,157 +0,0 @@ -from __future__ import annotations - -import argparse -import json -import os -import shutil - -import numpy as np -from PIL import Image - - -def read_annotations(annotation_path): - """Reads annotations from a JSON file located at the specified path. - - Args: - - annotation_path (str): The path to the JSON file containing annotations. - - Returns: - - dict: A dictionary containing the data loaded from the JSON file. - """ - with open(annotation_path) as f: - data = json.load(f) - return data - - -def convert_to_yolo_format(box, image_width, image_height): - """Converts bounding box coordinates to YOLO format. - - Args: - - box (list of float): A list containing the bounding box coordinates [x_min, y_min, x_max, y_max]. - - image_width (int): The width of the image. - - image_height (int): The height of the image. - - Returns: - - list of float: A list containing the bounding box in YOLO format [x_center, y_center, width, height]. - """ - x_center = (box[0] + box[2]) / 2 / image_width - y_center = (box[1] + box[3]) / 2 / image_height - width = (box[2] - box[0]) / image_width - height = (box[3] - box[1]) / image_height - return [x_center, y_center, width, height] - - -def process_data(data, image_dir, output_dir, split_ratio): - """Processes the data by dividing it into training and validation sets, and saves - the images and labels in YOLO format. - - Args: - - data (dict): The dictionary containing image annotations. - - image_dir (str): The directory where the source images are located. - - output_dir (str): The base directory where the processed data will be saved. - - split_ratio (float): The ratio to split the data into training and validation sets. - - No return value. - """ - images = list(data.keys()) - np.random.shuffle(images) - - split_index = int(len(images) * split_ratio) - train_images = images[:split_index] - val_images = images[split_index:] - - for dataset_type, image_set in [("train", train_images), ("val", val_images)]: - image_output_dir = os.path.join(output_dir, dataset_type, "images") - label_output_dir = os.path.join(output_dir, dataset_type, "labels") - - # If the output directories already exist, replace them - if os.path.exists(image_output_dir): - shutil.rmtree(image_output_dir) - if os.path.exists(label_output_dir): - shutil.rmtree(label_output_dir) - - os.makedirs(image_output_dir) - os.makedirs(label_output_dir) - - for image_name in image_set: - if image_name == "class_names": - continue - # extract image name from image path - image_full_path = os.path.join(image_dir, image_name) - annotation = data[image_name] - image = Image.open(image_full_path) - image_width, image_height = image.size - - label_file = os.path.join( - label_output_dir, os.path.splitext(image_name)[0] + ".txt" - ) - with open(label_file, "w") as f: - for box, label in zip(annotation["boxes"], annotation["labels"]): - yolo_box = convert_to_yolo_format(box, image_width, image_height) - f.write(f"{label} {' '.join(map(str, yolo_box))}\n") - - shutil.copy(image_full_path, os.path.join(image_output_dir, image_name)) - - -def create_data_yaml(root_dir, class_names): - """Creates a YAML file for dataset configuration, specifying paths and class names. - - Args: - - root_dir (str): The root directory where the dataset is located. - - class_names (list of str): A list of class names. - - No return value. 
- """ - yaml_content = ( - f"train: {os.path.abspath(os.path.join(root_dir, 'train'))}\n" - f"val: {os.path.abspath(os.path.join(root_dir, 'val'))}\n" - f"nc: {len(class_names)}\n" - f"names: {class_names}" - ) - with open(os.path.join(root_dir, "data.yaml"), "w") as f: - f.write(yaml_content) - - -def convert(dataset_dir, output_dir, train_val_split_ratio): - """Converts a dataset into a format suitable for training with YOLO, including - creating training and validation splits. - - Args: - - dataset_dir (str): The directory where the source dataset is located. - - output_dir (str): The directory where the processed dataset should be saved. - - train_val_split_ratio (float): The ratio to split the dataset into training and validation sets. - - No return value. - """ - annotation_path = os.path.join(dataset_dir, "annotations.json") - data = read_annotations(annotation_path) - process_data(data, dataset_dir, output_dir, train_val_split_ratio) - create_data_yaml(output_dir, data["class_names"]) - - -def main(): - parser = argparse.ArgumentParser( - description="Convert dataset to YOLO format with train-val split." - ) - parser.add_argument( - "--save_dir", type=str, help="Directory containing the images and annotations." - ) - parser.add_argument( - "--output_dir", - type=str, - help="Directory where the processed dataset will be saved.", - ) - parser.add_argument( - "--split_ratio", - type=float, - default=0.8, - help="Train-validation split ratio (default: 0.8)", - ) - - args = parser.parse_args() - - convert(args.save_dir, args.output_dir, args.split_ratio) - - -if __name__ == "__main__": - main() From 16c92efd929f4b506963784489d77a44c8020856 Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Sun, 7 Apr 2024 22:38:03 +0000 Subject: [PATCH 07/23] format: fix formatting --- datadreamer/utils/base_converter.py | 11 ++-- datadreamer/utils/coco_converter.py | 84 +++++++++++++++---------- datadreamer/utils/convert_dataset.py | 28 +++++++-- datadreamer/utils/dataset_utils.py | 20 +++--- datadreamer/utils/ldf_convreter.py | 10 +-- datadreamer/utils/merge_raw_datasets.py | 43 ++++++++----- datadreamer/utils/yolo_converter.py | 39 +++++++----- 7 files changed, 148 insertions(+), 87 deletions(-) diff --git a/datadreamer/utils/base_converter.py b/datadreamer/utils/base_converter.py index 945dda8..dfbf1d8 100644 --- a/datadreamer/utils/base_converter.py +++ b/datadreamer/utils/base_converter.py @@ -1,16 +1,13 @@ from __future__ import annotations import json +from abc import ABC, abstractmethod + import numpy as np -from abc import ABC, abstractmethod class BaseConverter(ABC): - """Abstract base class for converter. - """ - - def __init__(self): - pass + """Abstract base class for converter.""" @abstractmethod def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): @@ -27,7 +24,7 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): No return value. """ pass - + @staticmethod def read_annotations(annotation_path): """Reads annotations from a JSON file located at the specified path. 
diff --git a/datadreamer/utils/coco_converter.py b/datadreamer/utils/coco_converter.py index 5ab3b71..18897c7 100644 --- a/datadreamer/utils/coco_converter.py +++ b/datadreamer/utils/coco_converter.py @@ -1,14 +1,14 @@ from __future__ import annotations - +import json import os import shutil -import numpy as np -import json + from PIL import Image from datadreamer.utils.base_converter import BaseConverter + class COCOConverter(BaseConverter): """Class for converting a dataset to COCO format. @@ -45,8 +45,8 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): self.process_data(data, dataset_dir, output_dir, split_ratios, copy_files) def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=True): - """Processes the data by dividing it into training and validation sets, and saves - the images and labels in COCO format. + """Processes the data by dividing it into training and validation sets, and + saves the images and labels in COCO format. Args: - data (dict): The dictionary containing image annotations. @@ -61,14 +61,20 @@ def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=Tru images = list(data.keys()) images.remove("class_names") - train_images, val_images, test_images = BaseConverter.make_splits(images, split_ratios) + train_images, val_images, test_images = BaseConverter.make_splits( + images, split_ratios + ) - for dataset_type, image_set in [("train", train_images), ("validation", val_images), ("test", test_images)]: + for dataset_type, image_set in [ + ("train", train_images), + ("validation", val_images), + ("test", test_images), + ]: dataset_output_dir = os.path.join(output_dir, dataset_type) data_output_dir = os.path.join(dataset_output_dir, "data") if os.path.exists(data_output_dir): - shutil.rmtree(image_output_dir) + shutil.rmtree(data_output_dir) os.makedirs(data_output_dir) @@ -77,37 +83,46 @@ def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=Tru annotation_id = 0 for image_name in image_set: - image_full_path = os.path.join(image_dir, image_name) annotation = data[image_name] image = Image.open(image_full_path) image_width, image_height = image.size - images_info.append({ - "id": len(images_info) + 1, - "file_name": image_name, - "width": image_width, - "height": image_height - }) + images_info.append( + { + "id": len(images_info) + 1, + "file_name": image_name, + "width": image_width, + "height": image_height, + } + ) for box, label in zip(annotation["boxes"], annotation["labels"]): - annotations.append({ - "id": annotation_id, - "image_id": len(images_info), - "category_id": label, - "bbox": [box[0], box[1], box[2] - box[0], box[3] - box[1]], - "segmentation": None, #[[box[0], box[1], box[2], box[1], box[2], box[3], box[0], box[3]]], # bbox mask - "area": (box[2] - box[0]) * (box[3] - box[1]), - "iscrowd": 0 - }) + annotations.append( + { + "id": annotation_id, + "image_id": len(images_info), + "category_id": label, + "bbox": [box[0], box[1], box[2] - box[0], box[3] - box[1]], + "segmentation": None, # [[box[0], box[1], box[2], box[1], box[2], box[3], box[0], box[3]]], # bbox mask + "area": (box[2] - box[0]) * (box[3] - box[1]), + "iscrowd": 0, + } + ) annotation_id += 1 if copy_files: - shutil.copy(image_full_path, os.path.join(data_output_dir, image_name)) + shutil.copy( + image_full_path, os.path.join(data_output_dir, image_name) + ) else: - shutil.move(image_full_path, os.path.join(data_output_dir, image_name)) + shutil.move( + image_full_path, 
os.path.join(data_output_dir, image_name) + ) - self.save_labels(dataset_output_dir, images_info, annotations, data["class_names"]) + self.save_labels( + dataset_output_dir, images_info, annotations, data["class_names"] + ) def save_labels(self, dataset_output_dir, images_info, annotations, class_names): """Saves the labels to a JSON file. @@ -122,8 +137,13 @@ def save_labels(self, dataset_output_dir, images_info, annotations, class_names) """ with open(os.path.join(dataset_output_dir, "labels.json"), "w") as f: - json.dump({ - "images": images_info, - "annotations": annotations, - "categories": [{"id": i, "name": name} for i, name in enumerate(class_names)] - }, f) \ No newline at end of file + json.dump( + { + "images": images_info, + "annotations": annotations, + "categories": [ + {"id": i, "name": name} for i, name in enumerate(class_names) + ], + }, + f, + ) diff --git a/datadreamer/utils/convert_dataset.py b/datadreamer/utils/convert_dataset.py index ed0182f..800c5c3 100644 --- a/datadreamer/utils/convert_dataset.py +++ b/datadreamer/utils/convert_dataset.py @@ -2,9 +2,12 @@ import argparse -from datadreamer.utils import YOLOConverter, COCOConverter, LDFConverter +from datadreamer.utils import COCOConverter, LDFConverter, YOLOConverter -def convert_dataset(input_dir, output_dir, dataset_format, split_ratios, copy_files=True): + +def convert_dataset( + input_dir, output_dir, dataset_format, split_ratios, copy_files=True +): if dataset_format == "yolo": converter = YOLOConverter() elif dataset_format == "coco": @@ -16,6 +19,7 @@ def convert_dataset(input_dir, output_dir, dataset_format, split_ratios, copy_fi converter.convert(input_dir, output_dir, split_ratios, copy_files) + def main(): parser = argparse.ArgumentParser( description="Convert raw dataset to another format with train-val-test split." @@ -29,7 +33,10 @@ def main(): help="Directory where the processed dataset will be saved.", ) parser.add_argument( - "--dataset_format", type=str, default="yolo", choices=["yolo", "coco", "ldf" ], + "--dataset_format", + type=str, + default="yolo", + choices=["yolo", "coco", "ldf"], ) parser.add_argument( "--split_ratios", @@ -39,13 +46,22 @@ def main(): help="Train-validation-test split ratios (default: 0.8, 0.1, 0.1).", ) parser.add_argument( - "--copy_files", type=bool, default=True, help="Copy files to output directory, otherwise move them." 
+ "--copy_files", + type=bool, + default=True, + help="Copy files to output directory, otherwise move them.", ) args = parser.parse_args() - convert_dataset(args.input_dir, args.output_dir, args.dataset_format, args.split_ratios, args.copy_files) + convert_dataset( + args.input_dir, + args.output_dir, + args.dataset_format, + args.split_ratios, + args.copy_files, + ) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/datadreamer/utils/dataset_utils.py b/datadreamer/utils/dataset_utils.py index 7d1963b..e9a0a53 100644 --- a/datadreamer/utils/dataset_utils.py +++ b/datadreamer/utils/dataset_utils.py @@ -1,9 +1,9 @@ import json import os -from PIL import Image - from luxonis_ml.data import LuxonisDataset +from PIL import Image + def save_annotations_to_json( image_paths, @@ -15,9 +15,9 @@ def save_annotations_to_json( ): annotations = {} for i in range(len(image_paths)): - #for image_path, bboxes, labels in zip(image_paths, boxes_list, labels_list): + # for image_path, bboxes, labels in zip(image_paths, boxes_list, labels_list): image_name = os.path.basename(image_paths[i]) - #image_name = os.path.basename(image_path) + # image_name = os.path.basename(image_path) labels = labels_list[i] annotations[image_name] = { "labels": labels.tolist(), @@ -32,11 +32,15 @@ def save_annotations_to_json( with open(os.path.join(save_dir, file_name), "w") as f: json.dump(annotations, f, indent=4) -def convert_to_ldf(image_paths, labels_list, boxes_list, save_dir, class_names, split_ratios): + +def convert_to_ldf( + image_paths, labels_list, boxes_list, save_dir, class_names, split_ratios +): width, height = Image.open(image_paths[0]).size + def dataset_generator(): # find image paths and load COCO annotations - + for i in range(len(image_paths)): image_path = image_paths[i] labels = labels_list[i] @@ -58,7 +62,7 @@ def dataset_generator(): "type": "box", "value": (x / width, y / height, w / width, h / height), } - + dataset_name = os.path.basename(save_dir) if LuxonisDataset.exists(dataset_name): dataset = LuxonisDataset(dataset_name) @@ -70,5 +74,3 @@ def dataset_generator(): dataset.add(dataset_generator) dataset.make_splits(split_ratios) - - diff --git a/datadreamer/utils/ldf_convreter.py b/datadreamer/utils/ldf_convreter.py index 32df801..d596bc7 100644 --- a/datadreamer/utils/ldf_convreter.py +++ b/datadreamer/utils/ldf_convreter.py @@ -1,9 +1,9 @@ from __future__ import annotations -import json import os -from PIL import Image + from luxonis_ml.data import LuxonisDataset +from PIL import Image from datadreamer.utils import BaseConverter @@ -32,7 +32,7 @@ def process_data(self, data, dataset_dir, output_dir, split_ratios): def dataset_generator(): # find image paths and load COCO annotations - + for image_path in image_paths: image_full_path = os.path.join(dataset_dir, image_path) width, height = Image.open(image_full_path).size @@ -55,7 +55,7 @@ def dataset_generator(): "type": "box", "value": (x / width, y / height, w / width, h / height), } - + dataset_name = os.path.basename(output_dir) if LuxonisDataset.exists(dataset_name): dataset = LuxonisDataset(dataset_name) @@ -66,4 +66,4 @@ def dataset_generator(): dataset.add(dataset_generator) - dataset.make_splits(split_ratios) \ No newline at end of file + dataset.make_splits(split_ratios) diff --git a/datadreamer/utils/merge_raw_datasets.py b/datadreamer/utils/merge_raw_datasets.py index 57935b0..bcece2e 100644 --- a/datadreamer/utils/merge_raw_datasets.py +++ b/datadreamer/utils/merge_raw_datasets.py @@ -1,12 +1,12 @@ 
from __future__ import annotations import argparse -import os import json +import os import shutil + def merge_datasets(input_dirs, output_dir, copy_files=True): - config_tasks = [] config_classes = [] random_seeds = [] @@ -36,7 +36,6 @@ def merge_datasets(input_dirs, output_dir, copy_files=True): annotations_merged = {} for i, input_dir in enumerate(input_dirs): - with open(os.path.join(input_dir, "annotations.json")) as f: annotations = json.load(f) class_names = annotations.pop("class_names") @@ -44,28 +43,41 @@ def merge_datasets(input_dirs, output_dir, copy_files=True): # Copy or move generation_args.json files if copy_files: - shutil.copy(os.path.join(input_dir, "generation_args.json"), os.path.join(output_dir, f"generation_args_{i}.json")) + shutil.copy( + os.path.join(input_dir, "generation_args.json"), + os.path.join(output_dir, f"generation_args_{i}.json"), + ) else: - shutil.move(os.path.join(input_dir, "generation_args.json"), os.path.join(output_dir, f"generation_args_{i}.json")) + shutil.move( + os.path.join(input_dir, "generation_args.json"), + os.path.join(output_dir, f"generation_args_{i}.json"), + ) # Copy or move images for image_path in annotations: if copy_files: - shutil.copy(os.path.join(input_dir, image_path), os.path.join(output_dir, image_path)) + shutil.copy( + os.path.join(input_dir, image_path), + os.path.join(output_dir, image_path), + ) else: - shutil.move(os.path.join(input_dir, image_path), os.path.join(output_dir, image_path)) + shutil.move( + os.path.join(input_dir, image_path), + os.path.join(output_dir, image_path), + ) annotations_merged["class_names"] = class_names with open(os.path.join(output_dir, "annotations.json"), "w") as f: json.dump(annotations_merged, f, indent=4) - + def main(): - parser = argparse.ArgumentParser( - description="Merge raw datasets" - ) + parser = argparse.ArgumentParser(description="Merge raw datasets") parser.add_argument( - "--input_dirs", type=str, nargs="+", help="Directories containing the images and annotations." + "--input_dirs", + type=str, + nargs="+", + help="Directories containing the images and annotations.", ) parser.add_argument( "--output_dir", @@ -73,7 +85,10 @@ def main(): help="Directory where the merged dataset will be saved.", ) parser.add_argument( - "--copy_files", type=bool, default=True, help="Copy files to output directory, otherwise move them." + "--copy_files", + type=bool, + default=True, + help="Copy files to output directory, otherwise move them.", ) args = parser.parse_args() @@ -82,4 +97,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/datadreamer/utils/yolo_converter.py b/datadreamer/utils/yolo_converter.py index 4724b3e..9712a52 100644 --- a/datadreamer/utils/yolo_converter.py +++ b/datadreamer/utils/yolo_converter.py @@ -2,11 +2,12 @@ import os import shutil -import json + from PIL import Image from datadreamer.utils import BaseConverter + class YOLOConverter(BaseConverter): """Class for converting a dataset to YOLO format. @@ -28,8 +29,6 @@ class YOLOConverter(BaseConverter): │ ├── labels """ - - def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): """Converts a dataset into a format suitable for training with YOLO, including creating training and validation splits. 
@@ -64,8 +63,8 @@ def convert_to_yolo_format(self, box, image_width, image_height): return [x_center, y_center, width, height] def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=True): - """Processes the data by dividing it into training and validation sets, and saves - the images and labels in YOLO format. + """Processes the data by dividing it into training and validation sets, and + saves the images and labels in YOLO format. Args: - data (dict): The dictionary containing image annotations. @@ -79,10 +78,16 @@ def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=Tru """ images = list(data.keys()) images.remove("class_names") - - train_images, val_images, test_images = BaseConverter.make_splits(images, split_ratios) - for dataset_type, image_set in [("train", train_images), ("val", val_images), ("test", test_images)]: + train_images, val_images, test_images = BaseConverter.make_splits( + images, split_ratios + ) + + for dataset_type, image_set in [ + ("train", train_images), + ("val", val_images), + ("test", test_images), + ]: image_output_dir = os.path.join(output_dir, dataset_type, "images") label_output_dir = os.path.join(output_dir, dataset_type, "labels") @@ -107,19 +112,25 @@ def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=Tru ) with open(label_file, "w") as f: for box, label in zip(annotation["boxes"], annotation["labels"]): - yolo_box = self.convert_to_yolo_format(box, image_width, image_height) + yolo_box = self.convert_to_yolo_format( + box, image_width, image_height + ) f.write(f"{label} {' '.join(map(str, yolo_box))}\n") if copy_files: - shutil.copy(image_full_path, os.path.join(image_output_dir, image_name)) + shutil.copy( + image_full_path, os.path.join(image_output_dir, image_name) + ) else: - shutil.move(image_full_path, os.path.join(image_output_dir, image_name)) + shutil.move( + image_full_path, os.path.join(image_output_dir, image_name) + ) self.create_data_yaml(output_dir, data["class_names"]) - def create_data_yaml(self, root_dir, class_names): - """Creates a YAML file for dataset configuration, specifying paths and class names. + """Creates a YAML file for dataset configuration, specifying paths and class + names. Args: - root_dir (str): The root directory where the dataset is located. 
@@ -135,4 +146,4 @@ def create_data_yaml(self, root_dir, class_names): f"names: {class_names}" ) with open(os.path.join(root_dir, "data.yaml"), "w") as f: - f.write(yaml_content) \ No newline at end of file + f.write(yaml_content) From e3140b720f211387bd482f58cb23c874c24973b8 Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Sun, 7 Apr 2024 22:39:38 +0000 Subject: [PATCH 08/23] chore: update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 7759c34..dac496a 100644 --- a/.gitignore +++ b/.gitignore @@ -153,5 +153,6 @@ Thumbs.db # Others node_modules/ **generated_dataset*/ +**gen_dataset*/ **runs/ **wandb/ From 996a445b7973f060e939127d0abef6398516d9a9 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Sun, 7 Apr 2024 22:52:13 +0000 Subject: [PATCH 09/23] [Automated] Updated coverage badge --- media/coverage_badge.svg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg index 4f8c185..2d1c743 100644 --- a/media/coverage_badge.svg +++ b/media/coverage_badge.svg @@ -9,13 +9,13 @@ - + coverage coverage - 50% - 50% + 37% + 37% From a1f11d39c259d4d77fd438598e32db6b5df6942d Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Mon, 8 Apr 2024 12:24:19 +0000 Subject: [PATCH 10/23] fix: import from utils --- datadreamer/utils/__init__.py | 12 ++++++ datadreamer/utils/ldf_converter.py | 69 ++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 datadreamer/utils/ldf_converter.py diff --git a/datadreamer/utils/__init__.py b/datadreamer/utils/__init__.py index 9d48db4..f7cf8df 100644 --- a/datadreamer/utils/__init__.py +++ b/datadreamer/utils/__init__.py @@ -1 +1,13 @@ from __future__ import annotations + +from .base_converter import BaseConverter +from .coco_converter import COCOConverter +from .ldf_converter import LDFConverter +from .yolo_converter import YOLOConverter + +__all__ = [ + "BaseConverter", + "COCOConverter", + "LDFConverter", + "YOLOConverter", +] diff --git a/datadreamer/utils/ldf_converter.py b/datadreamer/utils/ldf_converter.py new file mode 100644 index 0000000..d596bc7 --- /dev/null +++ b/datadreamer/utils/ldf_converter.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +import os + +from luxonis_ml.data import LuxonisDataset +from PIL import Image + +from datadreamer.utils import BaseConverter + + +class LDFConverter(BaseConverter): + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): + """Converts a dataset into a format suitable for training with YOLO, including + creating training and validation splits. + + Args: + - dataset_dir (str): The directory where the source dataset is located. + - output_dir (str): The directory where the processed dataset should be saved. + - split_ratios (list of float): The ratios to split the data into training, validation, and test sets. + - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. + + No return value. 
+ """ + annotation_path = os.path.join(dataset_dir, "annotations.json") + data = BaseConverter.read_annotations(annotation_path) + self.process_data(data, dataset_dir, output_dir, split_ratios) + + def process_data(self, data, dataset_dir, output_dir, split_ratios): + class_names = data["class_names"] + image_paths = list(data.keys()) + image_paths.remove("class_names") + + def dataset_generator(): + # find image paths and load COCO annotations + + for image_path in image_paths: + image_full_path = os.path.join(dataset_dir, image_path) + width, height = Image.open(image_full_path).size + labels = data[image_path]["labels"] + for label in labels: + yield { + "file": image_full_path, + "class": class_names[label], + "type": "classification", + "value": True, + } + + if "boxes" in data[image_path]: + boxes = data[image_path]["boxes"] + for box in boxes: + x, y, w, h = box[0], box[1], box[2] - box[0], box[3] - box[1] + yield { + "file": image_full_path, + "class": class_names[label], + "type": "box", + "value": (x / width, y / height, w / width, h / height), + } + + dataset_name = os.path.basename(output_dir) + if LuxonisDataset.exists(dataset_name): + dataset = LuxonisDataset(dataset_name) + dataset.delete_dataset() + + dataset = LuxonisDataset(dataset_name) + dataset.set_classes(class_names) + + dataset.add(dataset_generator) + + dataset.make_splits(split_ratios) From 68e9f79dd51c960b740f9e8b88d4d7820795b9de Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Mon, 8 Apr 2024 12:25:25 +0000 Subject: [PATCH 11/23] fix: not removing dir when --annotate_only --- datadreamer/pipelines/generate_dataset_from_scratch.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/datadreamer/pipelines/generate_dataset_from_scratch.py b/datadreamer/pipelines/generate_dataset_from_scratch.py index 99c3f1a..d2a618e 100644 --- a/datadreamer/pipelines/generate_dataset_from_scratch.py +++ b/datadreamer/pipelines/generate_dataset_from_scratch.py @@ -361,11 +361,14 @@ def main(): # Directories for saving images and bboxes save_dir = args.save_dir - if os.path.exists(save_dir): - shutil.rmtree(save_dir) - os.makedirs(save_dir) + if not args.annotate_only: + if os.path.exists(save_dir): + shutil.rmtree(save_dir) + os.makedirs(save_dir) bbox_dir = os.path.join(save_dir, "bboxes_visualization") + if os.path.exists(bbox_dir): + shutil.rmtree(bbox_dir) os.makedirs(bbox_dir) # Save arguments From 4dc3bca51c097e7d6b000c307dc3c76d31ed37ae Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Mon, 8 Apr 2024 12:38:02 +0000 Subject: [PATCH 12/23] [Automated] Updated coverage badge --- media/coverage_badge.svg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg index 2d1c743..0cbe944 100644 --- a/media/coverage_badge.svg +++ b/media/coverage_badge.svg @@ -9,13 +9,13 @@ - + coverage coverage - 37% - 37% + 45% + 45% From d79968ed91438e0be81f1dff058bb9c245e53c88 Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Mon, 8 Apr 2024 17:21:47 +0000 Subject: [PATCH 13/23] docs: fix docstrings in converters --- datadreamer/utils/base_converter.py | 3 +-- datadreamer/utils/coco_converter.py | 3 +-- datadreamer/utils/ldf_converter.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/datadreamer/utils/base_converter.py b/datadreamer/utils/base_converter.py index dfbf1d8..61aa3aa 100644 --- a/datadreamer/utils/base_converter.py +++ b/datadreamer/utils/base_converter.py @@ -11,8 +11,7 @@ class BaseConverter(ABC): @abstractmethod 
def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): - """Converts a dataset into a format suitable for training with YOLO, including - creating training and validation splits. + """Converts a dataset into another format. Args: - dataset_dir (str): The directory where the source dataset is located. diff --git a/datadreamer/utils/coco_converter.py b/datadreamer/utils/coco_converter.py index 18897c7..d9b554c 100644 --- a/datadreamer/utils/coco_converter.py +++ b/datadreamer/utils/coco_converter.py @@ -29,8 +29,7 @@ class COCOConverter(BaseConverter): """ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): - """Converts a dataset into a format suitable for training with YOLO, including - creating training and validation splits. + """Converts a dataset into a COCO format. Args: - dataset_dir (str): The directory where the source dataset is located. diff --git a/datadreamer/utils/ldf_converter.py b/datadreamer/utils/ldf_converter.py index d596bc7..edde585 100644 --- a/datadreamer/utils/ldf_converter.py +++ b/datadreamer/utils/ldf_converter.py @@ -10,8 +10,7 @@ class LDFConverter(BaseConverter): def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): - """Converts a dataset into a format suitable for training with YOLO, including - creating training and validation splits. + """Converts a dataset into a LuxonisDataset format. Args: - dataset_dir (str): The directory where the source dataset is located. From bde5feccff9f29f4b059d9e6cadc68bf8b97b430 Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Mon, 8 Apr 2024 17:23:15 +0000 Subject: [PATCH 14/23] refactor: remove file with a typo in name --- datadreamer/utils/ldf_convreter.py | 69 ------------------------------ 1 file changed, 69 deletions(-) delete mode 100644 datadreamer/utils/ldf_convreter.py diff --git a/datadreamer/utils/ldf_convreter.py b/datadreamer/utils/ldf_convreter.py deleted file mode 100644 index d596bc7..0000000 --- a/datadreamer/utils/ldf_convreter.py +++ /dev/null @@ -1,69 +0,0 @@ -from __future__ import annotations - -import os - -from luxonis_ml.data import LuxonisDataset -from PIL import Image - -from datadreamer.utils import BaseConverter - - -class LDFConverter(BaseConverter): - def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): - """Converts a dataset into a format suitable for training with YOLO, including - creating training and validation splits. - - Args: - - dataset_dir (str): The directory where the source dataset is located. - - output_dir (str): The directory where the processed dataset should be saved. - - split_ratios (list of float): The ratios to split the data into training, validation, and test sets. - - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. - - No return value. 
- """ - annotation_path = os.path.join(dataset_dir, "annotations.json") - data = BaseConverter.read_annotations(annotation_path) - self.process_data(data, dataset_dir, output_dir, split_ratios) - - def process_data(self, data, dataset_dir, output_dir, split_ratios): - class_names = data["class_names"] - image_paths = list(data.keys()) - image_paths.remove("class_names") - - def dataset_generator(): - # find image paths and load COCO annotations - - for image_path in image_paths: - image_full_path = os.path.join(dataset_dir, image_path) - width, height = Image.open(image_full_path).size - labels = data[image_path]["labels"] - for label in labels: - yield { - "file": image_full_path, - "class": class_names[label], - "type": "classification", - "value": True, - } - - if "boxes" in data[image_path]: - boxes = data[image_path]["boxes"] - for box in boxes: - x, y, w, h = box[0], box[1], box[2] - box[0], box[3] - box[1] - yield { - "file": image_full_path, - "class": class_names[label], - "type": "box", - "value": (x / width, y / height, w / width, h / height), - } - - dataset_name = os.path.basename(output_dir) - if LuxonisDataset.exists(dataset_name): - dataset = LuxonisDataset(dataset_name) - dataset.delete_dataset() - - dataset = LuxonisDataset(dataset_name) - dataset.set_classes(class_names) - - dataset.add(dataset_generator) - - dataset.make_splits(split_ratios) From 9ee7ef2f60ee493252adb8fafd31aafac72eea0d Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Mon, 8 Apr 2024 17:24:55 +0000 Subject: [PATCH 15/23] chore: set the minimum required version of luxonis-ml --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d2bcfdb..5767fd8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,4 @@ accelerate>=0.25.0 scipy>=1.10.0 bitsandbytes>=0.42.0 nltk>=3.8.1 -luxonis-ml[all] +luxonis-ml[all]>=0.1.0 From e74f20599327aad9e438383bcd29949a3aac691e Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Mon, 8 Apr 2024 17:26:05 +0000 Subject: [PATCH 16/23] fix: remove redundant function --- datadreamer/utils/dataset_utils.py | 42 ------------------------------ 1 file changed, 42 deletions(-) diff --git a/datadreamer/utils/dataset_utils.py b/datadreamer/utils/dataset_utils.py index e9a0a53..0284f3f 100644 --- a/datadreamer/utils/dataset_utils.py +++ b/datadreamer/utils/dataset_utils.py @@ -32,45 +32,3 @@ def save_annotations_to_json( with open(os.path.join(save_dir, file_name), "w") as f: json.dump(annotations, f, indent=4) - -def convert_to_ldf( - image_paths, labels_list, boxes_list, save_dir, class_names, split_ratios -): - width, height = Image.open(image_paths[0]).size - - def dataset_generator(): - # find image paths and load COCO annotations - - for i in range(len(image_paths)): - image_path = image_paths[i] - labels = labels_list[i] - for label in labels: - yield { - "file": image_path, - "class": class_names[label], - "type": "classification", - "value": True, - } - - if boxes_list: - boxes = boxes_list[i] - for box in boxes: - x, y, w, h = box[0], box[1], box[2] - box[0], box[3] - box[1] - yield { - "file": image_path, - "class": class_names[label], - "type": "box", - "value": (x / width, y / height, w / width, h / height), - } - - dataset_name = os.path.basename(save_dir) - if LuxonisDataset.exists(dataset_name): - dataset = LuxonisDataset(dataset_name) - dataset.delete_dataset() - - dataset = LuxonisDataset(dataset_name) - dataset.set_classes(class_names) - - dataset.add(dataset_generator) - - 
dataset.make_splits(split_ratios) From b3d2f345dd4a8d7e66585892f9b95f399a15b025 Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Mon, 8 Apr 2024 17:37:02 +0000 Subject: [PATCH 17/23] refactor: rename ldf to luxonis-dataset --- README.md | 2 +- datadreamer/pipelines/generate_dataset_from_scratch.py | 8 ++++---- datadreamer/utils/__init__.py | 4 ++-- datadreamer/utils/convert_dataset.py | 8 ++++---- .../{ldf_converter.py => luxonis_dataset_converter.py} | 2 +- examples/generate_dataset_and_train_yolo.ipynb | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) rename datadreamer/utils/{ldf_converter.py => luxonis_dataset_converter.py} (98%) diff --git a/README.md b/README.md index 465d938..1b68498 100644 --- a/README.md +++ b/README.md @@ -152,7 +152,7 @@ datadreamer --save_dir --class_names --prompts_number Date: Mon, 8 Apr 2024 17:44:16 +0000 Subject: [PATCH 18/23] format: black and ruff --- datadreamer/pipelines/generate_dataset_from_scratch.py | 6 +++++- datadreamer/utils/dataset_utils.py | 4 ---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/datadreamer/pipelines/generate_dataset_from_scratch.py b/datadreamer/pipelines/generate_dataset_from_scratch.py index 6651928..253e322 100644 --- a/datadreamer/pipelines/generate_dataset_from_scratch.py +++ b/datadreamer/pipelines/generate_dataset_from_scratch.py @@ -579,7 +579,11 @@ def main(): # Convert annotations to LuxonisDataset format if args.dataset_format == "luxonis-dataset": convert_dataset.convert_dataset( - args.save_dir, args.save_dir, "luxonis-dataset", args.split_ratios, copy_files=False + args.save_dir, + args.save_dir, + "luxonis-dataset", + args.split_ratios, + copy_files=False, ) diff --git a/datadreamer/utils/dataset_utils.py b/datadreamer/utils/dataset_utils.py index 0284f3f..a396ae0 100644 --- a/datadreamer/utils/dataset_utils.py +++ b/datadreamer/utils/dataset_utils.py @@ -1,9 +1,6 @@ import json import os -from luxonis_ml.data import LuxonisDataset -from PIL import Image - def save_annotations_to_json( image_paths, @@ -31,4 +28,3 @@ def save_annotations_to_json( # Save to JSON file with open(os.path.join(save_dir, file_name), "w") as f: json.dump(annotations, f, indent=4) - From e376152d28dd6247fec2a51338ae80fa032ffa02 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Mon, 8 Apr 2024 17:56:46 +0000 Subject: [PATCH 19/23] [Automated] Updated coverage badge --- media/coverage_badge.svg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg index 0cbe944..d7667be 100644 --- a/media/coverage_badge.svg +++ b/media/coverage_badge.svg @@ -15,7 +15,7 @@ coverage coverage - 45% - 45% + 47% + 47% From 3adf29e96ba5b37c8ec82f53d0bd37701cf8d1b1 Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Tue, 9 Apr 2024 14:15:11 +0000 Subject: [PATCH 20/23] feature: add reproducibility with a random seed to converters --- .../pipelines/generate_dataset_from_scratch.py | 3 +++ datadreamer/utils/base_converter.py | 9 +++++++-- datadreamer/utils/coco_converter.py | 3 +++ datadreamer/utils/convert_dataset.py | 14 ++++++++++---- datadreamer/utils/luxonis_dataset_converter.py | 6 ++++++ datadreamer/utils/yolo_converter.py | 3 +++ 6 files changed, 32 insertions(+), 6 deletions(-) diff --git a/datadreamer/pipelines/generate_dataset_from_scratch.py b/datadreamer/pipelines/generate_dataset_from_scratch.py index 253e322..2038961 100644 --- a/datadreamer/pipelines/generate_dataset_from_scratch.py +++ b/datadreamer/pipelines/generate_dataset_from_scratch.py @@ -565,6 
+565,7 @@ def main(): "yolo", args.split_ratios, copy_files=False, + seed=args.seed, ) # Convert annotations to COCO format elif args.dataset_format == "coco": @@ -574,6 +575,7 @@ def main(): "coco", args.split_ratios, copy_files=False, + seed=args.seed, ) # Convert annotations to LuxonisDataset format @@ -584,6 +586,7 @@ def main(): "luxonis-dataset", args.split_ratios, copy_files=False, + seed=args.seed, ) diff --git a/datadreamer/utils/base_converter.py b/datadreamer/utils/base_converter.py index 61aa3aa..3d97199 100644 --- a/datadreamer/utils/base_converter.py +++ b/datadreamer/utils/base_converter.py @@ -9,6 +9,9 @@ class BaseConverter(ABC): """Abstract base class for converter.""" + def __init__(self, seed=42): + np.random.seed(seed) + @abstractmethod def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): """Converts a dataset into another format. @@ -39,19 +42,21 @@ def read_annotations(annotation_path): return data @staticmethod - def make_splits(images, split_ratios): + def make_splits(images, split_ratios, shuffle=True): """Splits the list of images into training, validation, and test sets. Args: - images (list of str): A list of image paths. - split_ratios (list of float): The ratios to split the data into training, validation, and test sets. + - shuffle (bool, optional): Whether to shuffle the list of images. Defaults to True. Returns: - list of str: A list of image paths for the training set. - list of str: A list of image paths for the validation set. - list of str: A list of image paths for the test set. """ - np.random.shuffle(images) + if shuffle: + np.random.shuffle(images) train_images = images[: int(len(images) * split_ratios[0])] val_images = images[ diff --git a/datadreamer/utils/coco_converter.py b/datadreamer/utils/coco_converter.py index d9b554c..ba02d97 100644 --- a/datadreamer/utils/coco_converter.py +++ b/datadreamer/utils/coco_converter.py @@ -28,6 +28,9 @@ class COCOConverter(BaseConverter): │ ├── labels.json """ + def __init__(self, seed=42): + super().__init__(seed) + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): """Converts a dataset into a COCO format. 
diff --git a/datadreamer/utils/convert_dataset.py b/datadreamer/utils/convert_dataset.py index 7e18e35..fa3fd05 100644 --- a/datadreamer/utils/convert_dataset.py +++ b/datadreamer/utils/convert_dataset.py @@ -6,14 +6,14 @@ def convert_dataset( - input_dir, output_dir, dataset_format, split_ratios, copy_files=True + input_dir, output_dir, dataset_format, split_ratios, copy_files=True, seed=42 ): if dataset_format == "yolo": - converter = YOLOConverter() + converter = YOLOConverter(seed=seed) elif dataset_format == "coco": - converter = COCOConverter() + converter = COCOConverter(seed=seed) elif dataset_format == "luxonis-dataset": - converter = LuxonisDatasetConverter() + converter = LuxonisDatasetConverter(seed=seed) else: raise ValueError(f"Invalid dataset format: {dataset_format}") @@ -51,6 +51,12 @@ def main(): default=True, help="Copy files to output directory, otherwise move them.", ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed for reproducibility.", + ) args = parser.parse_args() diff --git a/datadreamer/utils/luxonis_dataset_converter.py b/datadreamer/utils/luxonis_dataset_converter.py index e68be86..fdbdc79 100644 --- a/datadreamer/utils/luxonis_dataset_converter.py +++ b/datadreamer/utils/luxonis_dataset_converter.py @@ -9,6 +9,12 @@ class LuxonisDatasetConverter(BaseConverter): + """Class for converting a dataset to LuxonisDataset format. + """ + + def __init__(self, seed=42): + super().__init__(seed) + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): """Converts a dataset into a LuxonisDataset format. diff --git a/datadreamer/utils/yolo_converter.py b/datadreamer/utils/yolo_converter.py index 9712a52..36452da 100644 --- a/datadreamer/utils/yolo_converter.py +++ b/datadreamer/utils/yolo_converter.py @@ -29,6 +29,9 @@ class YOLOConverter(BaseConverter): │ ├── labels """ + def __init__(self, seed=42): + super().__init__(seed) + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): """Converts a dataset into a format suitable for training with YOLO, including creating training and validation splits. 
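Usage sketch (not part of the patches above): PATCH 20 threads a random seed through every converter so that the train/validation/test shuffle in BaseConverter.make_splits becomes reproducible. The snippet below is a minimal illustration of that behaviour; the import paths follow the test file added in PATCH 21, while the directory names and split ratios are made up for the example.

    from datadreamer.utils import BaseConverter, COCOConverter

    # Seeding the converter seeds numpy's RNG in BaseConverter.__init__,
    # so repeated runs produce the same train/validation/test split.
    converter = COCOConverter(seed=42)
    converter.convert(
        "generated_dataset",        # hypothetical dir containing annotations.json
        "generated_dataset_coco",   # hypothetical output dir
        split_ratios=[0.8, 0.1, 0.1],
        copy_files=True,
    )

    # make_splits can also skip shuffling entirely for a fixed order,
    # which is what the unit tests added in the next patch rely on.
    train, val, test = BaseConverter.make_splits(
        ["0.jpg", "1.jpg"], [0.5, 0.5, 0.0], shuffle=False
    )
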
From ce685a7503c12c73bf5160760c09e2d3a24265dd Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Tue, 9 Apr 2024 14:15:56 +0000 Subject: [PATCH 21/23] test: add converter tests --- tests/unittests/test_converters.py | 234 +++++++++++++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 tests/unittests/test_converters.py diff --git a/tests/unittests/test_converters.py b/tests/unittests/test_converters.py new file mode 100644 index 0000000..5e32a00 --- /dev/null +++ b/tests/unittests/test_converters.py @@ -0,0 +1,234 @@ +import os +import json +import shutil +import unittest +from unittest.mock import patch, MagicMock +from PIL import Image +from datadreamer.utils import BaseConverter, COCOConverter, YOLOConverter, LuxonisDatasetConverter +from luxonis_ml.data import LuxonisDataset + + +class TestBaseConverter(unittest.TestCase): + def setUp(self): + self.test_dir = "test_dataset" + os.makedirs(self.test_dir, exist_ok=True) + + # Create sample annotations + self.annotations = { + "class_names": ["cat", "dog"], + "0.jpg": {"boxes": [[10, 10, 50, 50]], "labels": [0]}, + "1.jpg": {"boxes": [[20, 20, 70, 70]], "labels": [1]}, + } + with open(os.path.join(self.test_dir, "annotations.json"), "w") as f: + json.dump(self.annotations, f) + + # Create sample images + open(os.path.join(self.test_dir, "0.jpg"), "a").close() + open(os.path.join(self.test_dir, "1.jpg"), "a").close() + + def tearDown(self): + os.remove(os.path.join(self.test_dir, "annotations.json")) + os.remove(os.path.join(self.test_dir, "0.jpg")) + os.remove(os.path.join(self.test_dir, "1.jpg")) + os.rmdir(self.test_dir) + + def test_read_annotations(self): + annotation_path = os.path.join(self.test_dir, "annotations.json") + data = BaseConverter.read_annotations(annotation_path) + self.assertEqual(data, self.annotations) + + def test_make_splits(self): + images = ["0.jpg", "1.jpg"] + split_ratios = [0.5, 0.5, 0.0] + train_images, val_images, test_images = BaseConverter.make_splits(images, split_ratios, shuffle=False) + + self.assertEqual(len(train_images), 1) + self.assertEqual(len(val_images), 1) + self.assertEqual(len(test_images), 0) + self.assertTrue("0.jpg" in train_images) + self.assertTrue("1.jpg" in val_images) + + +class TestCOCOConverter(unittest.TestCase): + def setUp(self): + self.test_dir = "test_dataset" + os.makedirs(self.test_dir, exist_ok=True) + + # Create sample images + self.image_size = (100, 100) + self.create_sample_image("0.jpg") + self.create_sample_image("1.jpg") + + # Create sample labels + self.labels = { + "class_names": ["cat", "dog"], + "0.jpg": {"boxes": [(10, 10, 50, 50)], "labels": [0]}, + "1.jpg": {"boxes": [(20, 20, 70, 70)], "labels": [1]}, + } + with open(os.path.join(self.test_dir, "annotations.json"), "w") as f: + json.dump(self.labels, f) + + def tearDown(self): + shutil.rmtree(self.test_dir) + if hasattr(self, 'output_dir') and os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + + def create_sample_image(self, filename): + image = Image.new("RGB", self.image_size, color="white") + image.save(os.path.join(self.test_dir, filename)) + + def test_convert(self): + self.output_dir = "output_dir" + split_ratios = [0.6, 0.2, 0.2] + converter = COCOConverter() + converter.convert(self.test_dir, self.output_dir, split_ratios, copy_files=True) + + self.assertTrue(os.path.exists(self.output_dir)) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "train"))) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "validation"))) + 
self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test"))) + + # Test whether labels.json files exist in all output directories + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "train", "labels.json"))) + self.assertTrue( + os.path.exists(os.path.join(self.output_dir, "validation", "labels.json")) + ) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test", "labels.json"))) + + def test_process_data(self): + self.output_dir = "output_dir" + split_ratios = [0.6, 0.2, 0.2] + converter = COCOConverter() + converter.process_data( + self.labels, self.test_dir, self.output_dir, split_ratios, copy_files=True + ) + + self.assertTrue(os.path.exists(self.output_dir)) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "train"))) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "validation"))) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test"))) + + # Test whether labels.json files exist in all output directories + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "train", "labels.json"))) + self.assertTrue( + os.path.exists(os.path.join(self.output_dir, "validation", "labels.json")) + ) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test", "labels.json"))) + + def test_save_labels(self): + self.output_dir = "output_dir" + converter = COCOConverter() + images_info = [ + {"id": 1, "file_name": "0.jpg", "width": 100, "height": 100}, + {"id": 2, "file_name": "1.jpg", "width": 100, "height": 100}, + ] + annotations = [ + { + "id": 1, + "image_id": 1, + "category_id": 0, + "bbox": [10, 10, 40, 40], + "segmentation": None, + "area": 1200, + "iscrowd": 0, + }, + { + "id": 2, + "image_id": 2, + "category_id": 1, + "bbox": [20, 20, 50, 50], + "segmentation": None, + "area": 1500, + "iscrowd": 0, + }, + ] + class_names = ["cat", "dog"] + + # Test whether labels.json file is saved correctly + os.makedirs(self.output_dir) + converter.save_labels(self.output_dir, images_info, annotations, class_names) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "labels.json"))) + + # Test whether the content of labels.json is correct + with open(os.path.join(self.output_dir, "labels.json"), "r") as f: + saved_labels = json.load(f) + + self.assertEqual(saved_labels["images"], images_info) + self.assertEqual(saved_labels["annotations"], annotations) + self.assertEqual( + saved_labels["categories"], [{"id": i, "name": name} for i, name in enumerate(class_names)] + ) + +class TestYOLOConverter(unittest.TestCase): + def setUp(self): + self.test_dir = "test_dataset" + os.makedirs(self.test_dir, exist_ok=True) + + # Create sample images + self.image_size = (100, 100) + self.create_sample_image("0.jpg") + self.create_sample_image("1.jpg") + + # Create sample labels + self.labels = { + "class_names": ["cat", "dog"], + "0.jpg": {"boxes": [(10, 10, 50, 50)], "labels": [0]}, + "1.jpg": {"boxes": [(20, 20, 70, 70)], "labels": [1]}, + } + with open(os.path.join(self.test_dir, "annotations.json"), "w") as f: + json.dump(self.labels, f) + + def tearDown(self): + shutil.rmtree(self.test_dir) + if hasattr(self, 'output_dir') and os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + + def create_sample_image(self, filename): + image = Image.new("RGB", self.image_size, color="white") + image.save(os.path.join(self.test_dir, filename)) + + def test_convert_to_yolo_format(self): + converter = YOLOConverter() + yolo_format = converter.convert_to_yolo_format([10, 10, 50, 50], 100, 100) + 
self.assertEqual(yolo_format, [0.3, 0.3, 0.4, 0.4]) + + def test_process_data(self): + self.output_dir = "output_dir" + split_ratios = [1, 0, 0] + converter = YOLOConverter() + converter.process_data( + self.labels, self.test_dir, self.output_dir, split_ratios, copy_files=True + ) + + self.assertTrue(os.path.exists(self.output_dir)) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "train"))) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "val"))) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test"))) + + # Test whether labels files exist in all output directories + train_label_file = os.path.join(self.output_dir, "train", "labels", "0.txt") + self.assertTrue(os.path.exists(train_label_file)) + with open(train_label_file, "r") as f: + content = f.read() + self.assertEqual(content.strip(), "0 0.3 0.3 0.4 0.4") + + def test_create_data_yaml(self): + self.output_dir = "output_dir" + converter = YOLOConverter() + class_names = ["cat", "dog"] + os.makedirs(self.output_dir, exist_ok=True) + converter.create_data_yaml(self.output_dir, class_names) + + yaml_file = os.path.join(self.output_dir, "data.yaml") + self.assertTrue(os.path.exists(yaml_file)) + + with open(yaml_file, "r") as f: + content = f.read() + self.assertIn("train:", content) + self.assertIn("val:", content) + self.assertIn("test:", content) + self.assertIn("nc: 2", content) + self.assertIn("names: ['cat', 'dog']", content) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From 896af05735c195ff71cdbe23b55d46eb66942a9b Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Tue, 9 Apr 2024 14:17:18 +0000 Subject: [PATCH 22/23] format: black --- .../utils/luxonis_dataset_converter.py | 5 +-- tests/unittests/test_converters.py | 43 +++++++++++++------ 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/datadreamer/utils/luxonis_dataset_converter.py b/datadreamer/utils/luxonis_dataset_converter.py index fdbdc79..2e7d2f4 100644 --- a/datadreamer/utils/luxonis_dataset_converter.py +++ b/datadreamer/utils/luxonis_dataset_converter.py @@ -9,12 +9,11 @@ class LuxonisDatasetConverter(BaseConverter): - """Class for converting a dataset to LuxonisDataset format. - """ + """Class for converting a dataset to LuxonisDataset format.""" def __init__(self, seed=42): super().__init__(seed) - + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): """Converts a dataset into a LuxonisDataset format. 
diff --git a/tests/unittests/test_converters.py b/tests/unittests/test_converters.py index 5e32a00..7e724da 100644 --- a/tests/unittests/test_converters.py +++ b/tests/unittests/test_converters.py @@ -1,11 +1,15 @@ -import os import json +import os import shutil import unittest -from unittest.mock import patch, MagicMock + from PIL import Image -from datadreamer.utils import BaseConverter, COCOConverter, YOLOConverter, LuxonisDatasetConverter -from luxonis_ml.data import LuxonisDataset + +from datadreamer.utils import ( + BaseConverter, + COCOConverter, + YOLOConverter, +) class TestBaseConverter(unittest.TestCase): @@ -40,7 +44,9 @@ def test_read_annotations(self): def test_make_splits(self): images = ["0.jpg", "1.jpg"] split_ratios = [0.5, 0.5, 0.0] - train_images, val_images, test_images = BaseConverter.make_splits(images, split_ratios, shuffle=False) + train_images, val_images, test_images = BaseConverter.make_splits( + images, split_ratios, shuffle=False + ) self.assertEqual(len(train_images), 1) self.assertEqual(len(val_images), 1) @@ -70,7 +76,7 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.test_dir) - if hasattr(self, 'output_dir') and os.path.exists(self.output_dir): + if hasattr(self, "output_dir") and os.path.exists(self.output_dir): shutil.rmtree(self.output_dir) def create_sample_image(self, filename): @@ -89,11 +95,15 @@ def test_convert(self): self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test"))) # Test whether labels.json files exist in all output directories - self.assertTrue(os.path.exists(os.path.join(self.output_dir, "train", "labels.json"))) + self.assertTrue( + os.path.exists(os.path.join(self.output_dir, "train", "labels.json")) + ) self.assertTrue( os.path.exists(os.path.join(self.output_dir, "validation", "labels.json")) ) - self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test", "labels.json"))) + self.assertTrue( + os.path.exists(os.path.join(self.output_dir, "test", "labels.json")) + ) def test_process_data(self): self.output_dir = "output_dir" @@ -109,11 +119,15 @@ def test_process_data(self): self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test"))) # Test whether labels.json files exist in all output directories - self.assertTrue(os.path.exists(os.path.join(self.output_dir, "train", "labels.json"))) + self.assertTrue( + os.path.exists(os.path.join(self.output_dir, "train", "labels.json")) + ) self.assertTrue( os.path.exists(os.path.join(self.output_dir, "validation", "labels.json")) ) - self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test", "labels.json"))) + self.assertTrue( + os.path.exists(os.path.join(self.output_dir, "test", "labels.json")) + ) def test_save_labels(self): self.output_dir = "output_dir" @@ -156,9 +170,11 @@ def test_save_labels(self): self.assertEqual(saved_labels["images"], images_info) self.assertEqual(saved_labels["annotations"], annotations) self.assertEqual( - saved_labels["categories"], [{"id": i, "name": name} for i, name in enumerate(class_names)] + saved_labels["categories"], + [{"id": i, "name": name} for i, name in enumerate(class_names)], ) + class TestYOLOConverter(unittest.TestCase): def setUp(self): self.test_dir = "test_dataset" @@ -180,7 +196,7 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.test_dir) - if hasattr(self, 'output_dir') and os.path.exists(self.output_dir): + if hasattr(self, "output_dir") and os.path.exists(self.output_dir): shutil.rmtree(self.output_dir) def create_sample_image(self, filename): @@ -230,5 +246,6 @@ def 
test_create_data_yaml(self): self.assertIn("nc: 2", content) self.assertIn("names: ['cat', 'dog']", content) + if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() From b147b6162b3acfd4351cf1d4e2893274d8476c1e Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Tue, 9 Apr 2024 14:29:56 +0000 Subject: [PATCH 23/23] [Automated] Updated coverage badge --- media/coverage_badge.svg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg index d7667be..d1f8965 100644 --- a/media/coverage_badge.svg +++ b/media/coverage_badge.svg @@ -15,7 +15,7 @@ coverage coverage - 47% - 47% + 53% + 53%
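
Note on running the new tests: the converter tests added in PATCH 21 are plain unittest cases and the module ends in unittest.main(), so they can be executed directly as a script (python tests/unittests/test_converters.py). The sketch below runs them via the standard library test loader; the actual CI entry point and the coverage tooling behind the badge commits are not shown in these patches, so the start directory and pattern are assumptions based on the path introduced in PATCH 21.

    import unittest

    # Discover and run only the converter tests; start directory and pattern
    # are assumed from the file added in PATCH 21.
    suite = unittest.defaultTestLoader.discover(
        "tests/unittests", pattern="test_converters.py"
    )
    unittest.TextTestRunner(verbosity=2).run(suite)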