From 7b47511ddff71ca150f39447cd826bd11464ab8c Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Sun, 7 Apr 2024 22:19:04 +0000 Subject: [PATCH 01/23] feature: add dataset convertets (coco, yolo, ldf) --- datadreamer/utils/base_converter.py | 68 +++++++++++++ datadreamer/utils/coco_converter.py | 129 +++++++++++++++++++++++++ datadreamer/utils/convert_dataset.py | 51 ++++++++++ datadreamer/utils/ldf_convreter.py | 69 ++++++++++++++ datadreamer/utils/yolo_converter.py | 138 +++++++++++++++++++++++++++ 5 files changed, 455 insertions(+) create mode 100644 datadreamer/utils/base_converter.py create mode 100644 datadreamer/utils/coco_converter.py create mode 100644 datadreamer/utils/convert_dataset.py create mode 100644 datadreamer/utils/ldf_convreter.py create mode 100644 datadreamer/utils/yolo_converter.py diff --git a/datadreamer/utils/base_converter.py b/datadreamer/utils/base_converter.py new file mode 100644 index 0000000..945dda8 --- /dev/null +++ b/datadreamer/utils/base_converter.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +import json +import numpy as np + +from abc import ABC, abstractmethod + +class BaseConverter(ABC): + """Abstract base class for converter. + """ + + def __init__(self): + pass + + @abstractmethod + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): + """Converts a dataset into a format suitable for training with YOLO, including + creating training and validation splits. + + Args: + - dataset_dir (str): The directory where the source dataset is located. + - output_dir (str): The directory where the processed dataset should be saved. + - split_ratios (list of float): The ratios to split the data into training, validation, and test sets. + - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. + + + No return value. + """ + pass + + @staticmethod + def read_annotations(annotation_path): + """Reads annotations from a JSON file located at the specified path. + + Args: + - annotation_path (str): The path to the JSON file containing annotations. + + Returns: + - dict: A dictionary containing the data loaded from the JSON file. + """ + with open(annotation_path) as f: + data = json.load(f) + return data + + @staticmethod + def make_splits(images, split_ratios): + """Splits the list of images into training, validation, and test sets. + + Args: + - images (list of str): A list of image paths. + - split_ratios (list of float): The ratios to split the data into training, validation, and test sets. + + Returns: + - list of str: A list of image paths for the training set. + - list of str: A list of image paths for the validation set. + - list of str: A list of image paths for the test set. 
+ """ + np.random.shuffle(images) + + train_images = images[: int(len(images) * split_ratios[0])] + val_images = images[ + int(len(images) * split_ratios[0]) : int( + len(images) * (split_ratios[0] + split_ratios[1]) + ) + ] + test_images = images[int(len(images) * (split_ratios[0] + split_ratios[1])) :] + + return train_images, val_images, test_images diff --git a/datadreamer/utils/coco_converter.py b/datadreamer/utils/coco_converter.py new file mode 100644 index 0000000..5ab3b71 --- /dev/null +++ b/datadreamer/utils/coco_converter.py @@ -0,0 +1,129 @@ +from __future__ import annotations + + +import os +import shutil +import numpy as np +import json +from PIL import Image + +from datadreamer.utils.base_converter import BaseConverter + +class COCOConverter(BaseConverter): + """Class for converting a dataset to COCO format. + + Format: + + dataset_dir + ├── train + │ ├── data + │ │ ├── 0.jpg + │ │ ├── 1.jpg + │ ├── labels.json + ├── validation + │ ├── data + │ ├── labels.json + ├── test + │ ├── data + │ ├── labels.json + """ + + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): + """Converts a dataset into a format suitable for training with YOLO, including + creating training and validation splits. + + Args: + - dataset_dir (str): The directory where the source dataset is located. + - output_dir (str): The directory where the processed dataset should be saved. + - split_ratios (list of float): The ratios to split the data into training, validation, and test sets. + - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. + + No return value. + """ + annotation_path = os.path.join(dataset_dir, "annotations.json") + data = BaseConverter.read_annotations(annotation_path) + self.process_data(data, dataset_dir, output_dir, split_ratios, copy_files) + + def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=True): + """Processes the data by dividing it into training and validation sets, and saves + the images and labels in COCO format. + + Args: + - data (dict): The dictionary containing image annotations. + - image_dir (str): The directory where the source images are located. + - output_dir (str): The base directory where the processed data will be saved. + - split_ratios (float): The ratio to split the data into training, validation, and test sets. + - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. + + + No return value. 
+ """ + images = list(data.keys()) + images.remove("class_names") + + train_images, val_images, test_images = BaseConverter.make_splits(images, split_ratios) + + for dataset_type, image_set in [("train", train_images), ("validation", val_images), ("test", test_images)]: + dataset_output_dir = os.path.join(output_dir, dataset_type) + data_output_dir = os.path.join(dataset_output_dir, "data") + + if os.path.exists(data_output_dir): + shutil.rmtree(image_output_dir) + + os.makedirs(data_output_dir) + + images_info = [] + annotations = [] + annotation_id = 0 + + for image_name in image_set: + + image_full_path = os.path.join(image_dir, image_name) + annotation = data[image_name] + image = Image.open(image_full_path) + image_width, image_height = image.size + + images_info.append({ + "id": len(images_info) + 1, + "file_name": image_name, + "width": image_width, + "height": image_height + }) + + for box, label in zip(annotation["boxes"], annotation["labels"]): + annotations.append({ + "id": annotation_id, + "image_id": len(images_info), + "category_id": label, + "bbox": [box[0], box[1], box[2] - box[0], box[3] - box[1]], + "segmentation": None, #[[box[0], box[1], box[2], box[1], box[2], box[3], box[0], box[3]]], # bbox mask + "area": (box[2] - box[0]) * (box[3] - box[1]), + "iscrowd": 0 + }) + annotation_id += 1 + + if copy_files: + shutil.copy(image_full_path, os.path.join(data_output_dir, image_name)) + else: + shutil.move(image_full_path, os.path.join(data_output_dir, image_name)) + + self.save_labels(dataset_output_dir, images_info, annotations, data["class_names"]) + + def save_labels(self, dataset_output_dir, images_info, annotations, class_names): + """Saves the labels to a JSON file. + + Args: + - dataset_output_dir (str): The directory where the labels should be saved. + - images_info (list of dict): A list of dictionaries containing image information. + - annotations (list of dict): A list of dictionaries containing annotation information. + - class_names (list of str): A list of class names. + + No return value. + """ + + with open(os.path.join(dataset_output_dir, "labels.json"), "w") as f: + json.dump({ + "images": images_info, + "annotations": annotations, + "categories": [{"id": i, "name": name} for i, name in enumerate(class_names)] + }, f) \ No newline at end of file diff --git a/datadreamer/utils/convert_dataset.py b/datadreamer/utils/convert_dataset.py new file mode 100644 index 0000000..ed0182f --- /dev/null +++ b/datadreamer/utils/convert_dataset.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import argparse + +from datadreamer.utils import YOLOConverter, COCOConverter, LDFConverter + +def convert_dataset(input_dir, output_dir, dataset_format, split_ratios, copy_files=True): + if dataset_format == "yolo": + converter = YOLOConverter() + elif dataset_format == "coco": + converter = COCOConverter() + elif dataset_format == "ldf": + converter = LDFConverter() + else: + raise ValueError(f"Invalid dataset format: {dataset_format}") + + converter.convert(input_dir, output_dir, split_ratios, copy_files) + +def main(): + parser = argparse.ArgumentParser( + description="Convert raw dataset to another format with train-val-test split." + ) + parser.add_argument( + "--input_dir", type=str, help="Directory containing the images and annotations." 
+ ) + parser.add_argument( + "--output_dir", + type=str, + help="Directory where the processed dataset will be saved.", + ) + parser.add_argument( + "--dataset_format", type=str, default="yolo", choices=["yolo", "coco", "ldf" ], + ) + parser.add_argument( + "--split_ratios", + type=float, + nargs="+", + default=[0.8, 0.1, 0.1], + help="Train-validation-test split ratios (default: 0.8, 0.1, 0.1).", + ) + parser.add_argument( + "--copy_files", type=bool, default=True, help="Copy files to output directory, otherwise move them." + ) + + args = parser.parse_args() + + convert_dataset(args.input_dir, args.output_dir, args.dataset_format, args.split_ratios, args.copy_files) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/datadreamer/utils/ldf_convreter.py b/datadreamer/utils/ldf_convreter.py new file mode 100644 index 0000000..32df801 --- /dev/null +++ b/datadreamer/utils/ldf_convreter.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +import json +import os +from PIL import Image +from luxonis_ml.data import LuxonisDataset + +from datadreamer.utils import BaseConverter + + +class LDFConverter(BaseConverter): + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): + """Converts a dataset into a format suitable for training with YOLO, including + creating training and validation splits. + + Args: + - dataset_dir (str): The directory where the source dataset is located. + - output_dir (str): The directory where the processed dataset should be saved. + - split_ratios (list of float): The ratios to split the data into training, validation, and test sets. + - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. + + No return value. 
+ """ + annotation_path = os.path.join(dataset_dir, "annotations.json") + data = BaseConverter.read_annotations(annotation_path) + self.process_data(data, dataset_dir, output_dir, split_ratios) + + def process_data(self, data, dataset_dir, output_dir, split_ratios): + class_names = data["class_names"] + image_paths = list(data.keys()) + image_paths.remove("class_names") + + def dataset_generator(): + # find image paths and load COCO annotations + + for image_path in image_paths: + image_full_path = os.path.join(dataset_dir, image_path) + width, height = Image.open(image_full_path).size + labels = data[image_path]["labels"] + for label in labels: + yield { + "file": image_full_path, + "class": class_names[label], + "type": "classification", + "value": True, + } + + if "boxes" in data[image_path]: + boxes = data[image_path]["boxes"] + for box in boxes: + x, y, w, h = box[0], box[1], box[2] - box[0], box[3] - box[1] + yield { + "file": image_full_path, + "class": class_names[label], + "type": "box", + "value": (x / width, y / height, w / width, h / height), + } + + dataset_name = os.path.basename(output_dir) + if LuxonisDataset.exists(dataset_name): + dataset = LuxonisDataset(dataset_name) + dataset.delete_dataset() + + dataset = LuxonisDataset(dataset_name) + dataset.set_classes(class_names) + + dataset.add(dataset_generator) + + dataset.make_splits(split_ratios) \ No newline at end of file diff --git a/datadreamer/utils/yolo_converter.py b/datadreamer/utils/yolo_converter.py new file mode 100644 index 0000000..4724b3e --- /dev/null +++ b/datadreamer/utils/yolo_converter.py @@ -0,0 +1,138 @@ +from __future__ import annotations + +import os +import shutil +import json +from PIL import Image + +from datadreamer.utils import BaseConverter + +class YOLOConverter(BaseConverter): + """Class for converting a dataset to YOLO format. + + Format: + + dataset_dir + ├── train + │ ├── images + │ │ ├── 0.jpg + │ │ ├── 1.jpg + │ ├── labels + │ │ ├── 0.txt + │ │ ├── 1.txt + ├── val + │ ├── images + │ ├── labels + ├── test + │ ├── images + │ ├── labels + """ + + + + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): + """Converts a dataset into a format suitable for training with YOLO, including + creating training and validation splits. + + Args: + - dataset_dir (str): The directory where the source dataset is located. + - output_dir (str): The directory where the processed dataset should be saved. + - split_ratios (list of float): The ratios to split the data into training, validation, and test sets. + - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. + + No return value. + """ + annotation_path = os.path.join(dataset_dir, "annotations.json") + data = BaseConverter.read_annotations(annotation_path) + self.process_data(data, dataset_dir, output_dir, split_ratios, copy_files) + + def convert_to_yolo_format(self, box, image_width, image_height): + """Converts bounding box coordinates to YOLO format. + + Args: + - box (list of float): A list containing the bounding box coordinates [x_min, y_min, x_max, y_max]. + - image_width (int): The width of the image. + - image_height (int): The height of the image. + + Returns: + - list of float: A list containing the bounding box in YOLO format [x_center, y_center, width, height]. 
+ """ + x_center = (box[0] + box[2]) / 2 / image_width + y_center = (box[1] + box[3]) / 2 / image_height + width = (box[2] - box[0]) / image_width + height = (box[3] - box[1]) / image_height + return [x_center, y_center, width, height] + + def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=True): + """Processes the data by dividing it into training and validation sets, and saves + the images and labels in YOLO format. + + Args: + - data (dict): The dictionary containing image annotations. + - image_dir (str): The directory where the source images are located. + - output_dir (str): The base directory where the processed data will be saved. + - split_ratios (float): The ratio to split the data into training, validation, and test sets. + - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. + + + No return value. + """ + images = list(data.keys()) + images.remove("class_names") + + train_images, val_images, test_images = BaseConverter.make_splits(images, split_ratios) + + for dataset_type, image_set in [("train", train_images), ("val", val_images), ("test", test_images)]: + image_output_dir = os.path.join(output_dir, dataset_type, "images") + label_output_dir = os.path.join(output_dir, dataset_type, "labels") + + # If the output directories already exist, replace them + if os.path.exists(image_output_dir): + shutil.rmtree(image_output_dir) + if os.path.exists(label_output_dir): + shutil.rmtree(label_output_dir) + + os.makedirs(image_output_dir) + os.makedirs(label_output_dir) + + for image_name in image_set: + # extract image name from image path + image_full_path = os.path.join(image_dir, image_name) + annotation = data[image_name] + image = Image.open(image_full_path) + image_width, image_height = image.size + + label_file = os.path.join( + label_output_dir, os.path.splitext(image_name)[0] + ".txt" + ) + with open(label_file, "w") as f: + for box, label in zip(annotation["boxes"], annotation["labels"]): + yolo_box = self.convert_to_yolo_format(box, image_width, image_height) + f.write(f"{label} {' '.join(map(str, yolo_box))}\n") + + if copy_files: + shutil.copy(image_full_path, os.path.join(image_output_dir, image_name)) + else: + shutil.move(image_full_path, os.path.join(image_output_dir, image_name)) + + self.create_data_yaml(output_dir, data["class_names"]) + + + def create_data_yaml(self, root_dir, class_names): + """Creates a YAML file for dataset configuration, specifying paths and class names. + + Args: + - root_dir (str): The root directory where the dataset is located. + - class_names (list of str): A list of class names. + + No return value. 
+ """ + yaml_content = ( + f"train: {os.path.abspath(os.path.join(root_dir, 'train'))}\n" + f"val: {os.path.abspath(os.path.join(root_dir, 'val'))}\n" + f"test: {os.path.abspath(os.path.join(root_dir, 'test'))}\n" + f"nc: {len(class_names)}\n" + f"names: {class_names}" + ) + with open(os.path.join(root_dir, "data.yaml"), "w") as f: + f.write(yaml_content) \ No newline at end of file From ec191ba4e95e77772b0d58d209ebb786a30ba6dc Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Sun, 7 Apr 2024 22:20:30 +0000 Subject: [PATCH 02/23] feature: add dataset utils --- datadreamer/utils/dataset_utils.py | 74 ++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 datadreamer/utils/dataset_utils.py diff --git a/datadreamer/utils/dataset_utils.py b/datadreamer/utils/dataset_utils.py new file mode 100644 index 0000000..7d1963b --- /dev/null +++ b/datadreamer/utils/dataset_utils.py @@ -0,0 +1,74 @@ +import json +import os +from PIL import Image + + +from luxonis_ml.data import LuxonisDataset + +def save_annotations_to_json( + image_paths, + labels_list, + boxes_list=None, + class_names=None, + save_dir=None, + file_name="annotations.json", +): + annotations = {} + for i in range(len(image_paths)): + #for image_path, bboxes, labels in zip(image_paths, boxes_list, labels_list): + image_name = os.path.basename(image_paths[i]) + #image_name = os.path.basename(image_path) + labels = labels_list[i] + annotations[image_name] = { + "labels": labels.tolist(), + } + if boxes_list is not None: + bboxes = boxes_list[i] + annotations[image_name]["boxes"] = bboxes.tolist() + + annotations["class_names"] = class_names + + # Save to JSON file + with open(os.path.join(save_dir, file_name), "w") as f: + json.dump(annotations, f, indent=4) + +def convert_to_ldf(image_paths, labels_list, boxes_list, save_dir, class_names, split_ratios): + width, height = Image.open(image_paths[0]).size + def dataset_generator(): + # find image paths and load COCO annotations + + for i in range(len(image_paths)): + image_path = image_paths[i] + labels = labels_list[i] + for label in labels: + yield { + "file": image_path, + "class": class_names[label], + "type": "classification", + "value": True, + } + + if boxes_list: + boxes = boxes_list[i] + for box in boxes: + x, y, w, h = box[0], box[1], box[2] - box[0], box[3] - box[1] + yield { + "file": image_path, + "class": class_names[label], + "type": "box", + "value": (x / width, y / height, w / width, h / height), + } + + dataset_name = os.path.basename(save_dir) + if LuxonisDataset.exists(dataset_name): + dataset = LuxonisDataset(dataset_name) + dataset.delete_dataset() + + dataset = LuxonisDataset(dataset_name) + dataset.set_classes(class_names) + + dataset.add(dataset_generator) + + dataset.make_splits(split_ratios) + + From e50f250325a622628606514ac980b954d24c6e7c Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Sun, 7 Apr 2024 22:21:09 +0000 Subject: [PATCH 03/23] feature: add raw dataset merge --- datadreamer/utils/merge_raw_datasets.py | 85 +++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 datadreamer/utils/merge_raw_datasets.py diff --git a/datadreamer/utils/merge_raw_datasets.py b/datadreamer/utils/merge_raw_datasets.py new file mode 100644 index 0000000..57935b0 --- /dev/null +++ b/datadreamer/utils/merge_raw_datasets.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import argparse +import os +import json +import shutil + +def merge_datasets(input_dirs, output_dir, copy_files=True): + + config_tasks = [] + 
config_classes = [] + random_seeds = [] + for input_dir in input_dirs: + with open(os.path.join(input_dir, "generation_args.json")) as f: + generation_args = json.load(f) + config_tasks.append(generation_args["task"]) + config_classes.append(generation_args["class_names"]) + random_seeds.append(generation_args["seed"]) + + # Check if all tasks are the same + if len(set(config_tasks)) != 1: + raise ValueError("All datasets must have the same task") + # Check if all list of classes are the same + if len(set(tuple(sorted(classes)) for classes in config_classes)) != 1: + raise ValueError("All datasets must have the same list of classes") + + # Check if all datasets have different random seeds + if len(set(random_seeds)) != len(input_dirs): + raise ValueError("All datasets must have different random seeds") + + # Create output directory + print(f"Output directory: {output_dir}") + if os.path.exists(output_dir): + shutil.rmtree(output_dir) + os.makedirs(output_dir) + + annotations_merged = {} + for i, input_dir in enumerate(input_dirs): + + with open(os.path.join(input_dir, "annotations.json")) as f: + annotations = json.load(f) + class_names = annotations.pop("class_names") + annotations_merged = {**annotations_merged, **annotations} + + # Copy or move generation_args.json files + if copy_files: + shutil.copy(os.path.join(input_dir, "generation_args.json"), os.path.join(output_dir, f"generation_args_{i}.json")) + else: + shutil.move(os.path.join(input_dir, "generation_args.json"), os.path.join(output_dir, f"generation_args_{i}.json")) + + # Copy or move images + for image_path in annotations: + if copy_files: + shutil.copy(os.path.join(input_dir, image_path), os.path.join(output_dir, image_path)) + else: + shutil.move(os.path.join(input_dir, image_path), os.path.join(output_dir, image_path)) + + annotations_merged["class_names"] = class_names + with open(os.path.join(output_dir, "annotations.json"), "w") as f: + json.dump(annotations_merged, f, indent=4) + + +def main(): + parser = argparse.ArgumentParser( + description="Merge raw datasets" + ) + parser.add_argument( + "--input_dirs", type=str, nargs="+", help="Directories containing the images and annotations." + ) + parser.add_argument( + "--output_dir", + type=str, + help="Directory where the merged dataset will be saved.", + ) + parser.add_argument( + "--copy_files", type=bool, default=True, help="Copy files to output directory, otherwise move them." + ) + + args = parser.parse_args() + + merge_datasets(args.input_dirs, args.output_dir, args.copy_files) + + +if __name__ == "__main__": + main() \ No newline at end of file From 56969440768c63176b2da81c3b8ef3d60133646c Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Sun, 7 Apr 2024 22:22:57 +0000 Subject: [PATCH 04/23] docs: update examples --- examples/generate_dataset_and_train_yolo.ipynb | 12 +++++++----- examples/helmet_detection.ipynb | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/generate_dataset_and_train_yolo.ipynb b/examples/generate_dataset_and_train_yolo.ipynb index b113b7e..44bc495 100644 --- a/examples/generate_dataset_and_train_yolo.ipynb +++ b/examples/generate_dataset_and_train_yolo.ipynb @@ -82,6 +82,8 @@ "- `--prompt_generator`: Choose between `simple`, `lm` (language model) and `tiny` (tiny LM). Default is `simple`.\n", "- `--image_generator`: Choose image generator, e.g., `sdxl`, `sdxl-turbo` or `sdxl-lightning`. 
Default is `sdxl-turbo`.\n", "- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection or `clip` for image classification. Default is `owlv2`.\n", + "- `--dataset_format`: Format of the dataset. Defaults to `raw`. Supported values: `raw`, `yolo`, `coco`, `ldf`.\n", + "- `--split_ratios`: Split ratios for train, validation, and test sets. Defaults to `[0.8, 0.1, 0.1]`.\n", "- `--conf_threshold`: Confidence threshold for annotation. Default is `0.15`.\n", "- `--annotation_iou_threshold`: Intersection over Union (IoU) threshold for annotation. Default is `0.2`.\n", "- `--prompt_prefix`: Prefix to add to every image generation prompt. Default is `\"\"`.\n", @@ -144,26 +146,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "3dd01a6a", "metadata": { "id": "3dd01a6a" }, "outputs": [], "source": [ - "from datadreamer.utils.convert_dataset_to_yolo import convert" + "from datadreamer.utils.convert_dataset import convert_dataset" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "9b9bb74d", "metadata": { "id": "9b9bb74d" }, "outputs": [], "source": [ - "convert(dataset_dir=\"generated_dataset\", output_dir=\"generated_dataset_yolo\", train_val_split_ratio=0.8)" + "convert_dataset(input_dir=\"generated_dataset\", output_dir=\"generated_dataset_yolo\", dataset_format=\"yolo\", split_ratios=[0.8, 0.1, 0.1], copy_files=True)" ] }, { @@ -425,7 +427,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/examples/helmet_detection.ipynb b/examples/helmet_detection.ipynb index 7e48cdb..89406b2 100644 --- a/examples/helmet_detection.ipynb +++ b/examples/helmet_detection.ipynb @@ -70,9 +70,9 @@ "metadata": {}, "outputs": [], "source": [ - "from datadreamer.utils.convert_dataset_to_yolo import convert\n", + "from datadreamer.utils.convert_dataset import convert_dataset\n", "# Conversion to YOLO format\n", - "convert(dataset_dir=\"gen_dataset_helmet_10000_turbo_tiny\", output_dir=\"gen_dataset_helmet_10000_turbo_tiny_yolo\", train_val_split_ratio=0.95)" + "convert_dataset(input_dir=\"gen_dataset_helmet_10000_turbo_tiny\", output_dir=\"gen_dataset_helmet_10000_turbo_tiny_yolo\", dataset_format=\"yolo\", split_ratios=[0.95, 0.05, 0.0], copy_files=True)" ] }, { From 3f3883d8e6d51ac3aed65d9fc3b8d047baa486c2 Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Sun, 7 Apr 2024 22:26:17 +0000 Subject: [PATCH 05/23] feature: add LuxonisDataset, COCO, YOLO formats --- README.md | 2 + .../generate_dataset_from_scratch.py | 133 +++++++++++------- requirements.txt | 1 + 3 files changed, 84 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index 821f7ae..465d938 100644 --- a/README.md +++ b/README.md @@ -152,6 +152,8 @@ datadreamer --save_dir --class_names --prompts_number =0.25.0 scipy>=1.10.0 bitsandbytes>=0.42.0 nltk>=3.8.1 +luxonis-ml[all] From c7ec6ef716440a51df443b308a0f84b9640ff00a Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Sun, 7 Apr 2024 22:28:24 +0000 Subject: [PATCH 06/23] fix: remove old yolo conversion script --- datadreamer/utils/convert_dataset_to_yolo.py | 157 ------------------- 1 file changed, 157 deletions(-) delete mode 100644 datadreamer/utils/convert_dataset_to_yolo.py diff --git a/datadreamer/utils/convert_dataset_to_yolo.py b/datadreamer/utils/convert_dataset_to_yolo.py deleted file mode 100644 index 83a985e..0000000 --- a/datadreamer/utils/convert_dataset_to_yolo.py +++ 
/dev/null @@ -1,157 +0,0 @@ -from __future__ import annotations - -import argparse -import json -import os -import shutil - -import numpy as np -from PIL import Image - - -def read_annotations(annotation_path): - """Reads annotations from a JSON file located at the specified path. - - Args: - - annotation_path (str): The path to the JSON file containing annotations. - - Returns: - - dict: A dictionary containing the data loaded from the JSON file. - """ - with open(annotation_path) as f: - data = json.load(f) - return data - - -def convert_to_yolo_format(box, image_width, image_height): - """Converts bounding box coordinates to YOLO format. - - Args: - - box (list of float): A list containing the bounding box coordinates [x_min, y_min, x_max, y_max]. - - image_width (int): The width of the image. - - image_height (int): The height of the image. - - Returns: - - list of float: A list containing the bounding box in YOLO format [x_center, y_center, width, height]. - """ - x_center = (box[0] + box[2]) / 2 / image_width - y_center = (box[1] + box[3]) / 2 / image_height - width = (box[2] - box[0]) / image_width - height = (box[3] - box[1]) / image_height - return [x_center, y_center, width, height] - - -def process_data(data, image_dir, output_dir, split_ratio): - """Processes the data by dividing it into training and validation sets, and saves - the images and labels in YOLO format. - - Args: - - data (dict): The dictionary containing image annotations. - - image_dir (str): The directory where the source images are located. - - output_dir (str): The base directory where the processed data will be saved. - - split_ratio (float): The ratio to split the data into training and validation sets. - - No return value. - """ - images = list(data.keys()) - np.random.shuffle(images) - - split_index = int(len(images) * split_ratio) - train_images = images[:split_index] - val_images = images[split_index:] - - for dataset_type, image_set in [("train", train_images), ("val", val_images)]: - image_output_dir = os.path.join(output_dir, dataset_type, "images") - label_output_dir = os.path.join(output_dir, dataset_type, "labels") - - # If the output directories already exist, replace them - if os.path.exists(image_output_dir): - shutil.rmtree(image_output_dir) - if os.path.exists(label_output_dir): - shutil.rmtree(label_output_dir) - - os.makedirs(image_output_dir) - os.makedirs(label_output_dir) - - for image_name in image_set: - if image_name == "class_names": - continue - # extract image name from image path - image_full_path = os.path.join(image_dir, image_name) - annotation = data[image_name] - image = Image.open(image_full_path) - image_width, image_height = image.size - - label_file = os.path.join( - label_output_dir, os.path.splitext(image_name)[0] + ".txt" - ) - with open(label_file, "w") as f: - for box, label in zip(annotation["boxes"], annotation["labels"]): - yolo_box = convert_to_yolo_format(box, image_width, image_height) - f.write(f"{label} {' '.join(map(str, yolo_box))}\n") - - shutil.copy(image_full_path, os.path.join(image_output_dir, image_name)) - - -def create_data_yaml(root_dir, class_names): - """Creates a YAML file for dataset configuration, specifying paths and class names. - - Args: - - root_dir (str): The root directory where the dataset is located. - - class_names (list of str): A list of class names. - - No return value. 
- """ - yaml_content = ( - f"train: {os.path.abspath(os.path.join(root_dir, 'train'))}\n" - f"val: {os.path.abspath(os.path.join(root_dir, 'val'))}\n" - f"nc: {len(class_names)}\n" - f"names: {class_names}" - ) - with open(os.path.join(root_dir, "data.yaml"), "w") as f: - f.write(yaml_content) - - -def convert(dataset_dir, output_dir, train_val_split_ratio): - """Converts a dataset into a format suitable for training with YOLO, including - creating training and validation splits. - - Args: - - dataset_dir (str): The directory where the source dataset is located. - - output_dir (str): The directory where the processed dataset should be saved. - - train_val_split_ratio (float): The ratio to split the dataset into training and validation sets. - - No return value. - """ - annotation_path = os.path.join(dataset_dir, "annotations.json") - data = read_annotations(annotation_path) - process_data(data, dataset_dir, output_dir, train_val_split_ratio) - create_data_yaml(output_dir, data["class_names"]) - - -def main(): - parser = argparse.ArgumentParser( - description="Convert dataset to YOLO format with train-val split." - ) - parser.add_argument( - "--save_dir", type=str, help="Directory containing the images and annotations." - ) - parser.add_argument( - "--output_dir", - type=str, - help="Directory where the processed dataset will be saved.", - ) - parser.add_argument( - "--split_ratio", - type=float, - default=0.8, - help="Train-validation split ratio (default: 0.8)", - ) - - args = parser.parse_args() - - convert(args.save_dir, args.output_dir, args.split_ratio) - - -if __name__ == "__main__": - main() From 16c92efd929f4b506963784489d77a44c8020856 Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Sun, 7 Apr 2024 22:38:03 +0000 Subject: [PATCH 07/23] format: fix formatting --- datadreamer/utils/base_converter.py | 11 ++-- datadreamer/utils/coco_converter.py | 84 +++++++++++++++---------- datadreamer/utils/convert_dataset.py | 28 +++++++-- datadreamer/utils/dataset_utils.py | 20 +++--- datadreamer/utils/ldf_convreter.py | 10 +-- datadreamer/utils/merge_raw_datasets.py | 43 ++++++++----- datadreamer/utils/yolo_converter.py | 39 +++++++----- 7 files changed, 148 insertions(+), 87 deletions(-) diff --git a/datadreamer/utils/base_converter.py b/datadreamer/utils/base_converter.py index 945dda8..dfbf1d8 100644 --- a/datadreamer/utils/base_converter.py +++ b/datadreamer/utils/base_converter.py @@ -1,16 +1,13 @@ from __future__ import annotations import json +from abc import ABC, abstractmethod + import numpy as np -from abc import ABC, abstractmethod class BaseConverter(ABC): - """Abstract base class for converter. - """ - - def __init__(self): - pass + """Abstract base class for converter.""" @abstractmethod def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): @@ -27,7 +24,7 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): No return value. """ pass - + @staticmethod def read_annotations(annotation_path): """Reads annotations from a JSON file located at the specified path. 
diff --git a/datadreamer/utils/coco_converter.py b/datadreamer/utils/coco_converter.py index 5ab3b71..18897c7 100644 --- a/datadreamer/utils/coco_converter.py +++ b/datadreamer/utils/coco_converter.py @@ -1,14 +1,14 @@ from __future__ import annotations - +import json import os import shutil -import numpy as np -import json + from PIL import Image from datadreamer.utils.base_converter import BaseConverter + class COCOConverter(BaseConverter): """Class for converting a dataset to COCO format. @@ -45,8 +45,8 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): self.process_data(data, dataset_dir, output_dir, split_ratios, copy_files) def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=True): - """Processes the data by dividing it into training and validation sets, and saves - the images and labels in COCO format. + """Processes the data by dividing it into training and validation sets, and + saves the images and labels in COCO format. Args: - data (dict): The dictionary containing image annotations. @@ -61,14 +61,20 @@ def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=Tru images = list(data.keys()) images.remove("class_names") - train_images, val_images, test_images = BaseConverter.make_splits(images, split_ratios) + train_images, val_images, test_images = BaseConverter.make_splits( + images, split_ratios + ) - for dataset_type, image_set in [("train", train_images), ("validation", val_images), ("test", test_images)]: + for dataset_type, image_set in [ + ("train", train_images), + ("validation", val_images), + ("test", test_images), + ]: dataset_output_dir = os.path.join(output_dir, dataset_type) data_output_dir = os.path.join(dataset_output_dir, "data") if os.path.exists(data_output_dir): - shutil.rmtree(image_output_dir) + shutil.rmtree(data_output_dir) os.makedirs(data_output_dir) @@ -77,37 +83,46 @@ def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=Tru annotation_id = 0 for image_name in image_set: - image_full_path = os.path.join(image_dir, image_name) annotation = data[image_name] image = Image.open(image_full_path) image_width, image_height = image.size - images_info.append({ - "id": len(images_info) + 1, - "file_name": image_name, - "width": image_width, - "height": image_height - }) + images_info.append( + { + "id": len(images_info) + 1, + "file_name": image_name, + "width": image_width, + "height": image_height, + } + ) for box, label in zip(annotation["boxes"], annotation["labels"]): - annotations.append({ - "id": annotation_id, - "image_id": len(images_info), - "category_id": label, - "bbox": [box[0], box[1], box[2] - box[0], box[3] - box[1]], - "segmentation": None, #[[box[0], box[1], box[2], box[1], box[2], box[3], box[0], box[3]]], # bbox mask - "area": (box[2] - box[0]) * (box[3] - box[1]), - "iscrowd": 0 - }) + annotations.append( + { + "id": annotation_id, + "image_id": len(images_info), + "category_id": label, + "bbox": [box[0], box[1], box[2] - box[0], box[3] - box[1]], + "segmentation": None, # [[box[0], box[1], box[2], box[1], box[2], box[3], box[0], box[3]]], # bbox mask + "area": (box[2] - box[0]) * (box[3] - box[1]), + "iscrowd": 0, + } + ) annotation_id += 1 if copy_files: - shutil.copy(image_full_path, os.path.join(data_output_dir, image_name)) + shutil.copy( + image_full_path, os.path.join(data_output_dir, image_name) + ) else: - shutil.move(image_full_path, os.path.join(data_output_dir, image_name)) + shutil.move( + image_full_path, 
os.path.join(data_output_dir, image_name) + ) - self.save_labels(dataset_output_dir, images_info, annotations, data["class_names"]) + self.save_labels( + dataset_output_dir, images_info, annotations, data["class_names"] + ) def save_labels(self, dataset_output_dir, images_info, annotations, class_names): """Saves the labels to a JSON file. @@ -122,8 +137,13 @@ def save_labels(self, dataset_output_dir, images_info, annotations, class_names) """ with open(os.path.join(dataset_output_dir, "labels.json"), "w") as f: - json.dump({ - "images": images_info, - "annotations": annotations, - "categories": [{"id": i, "name": name} for i, name in enumerate(class_names)] - }, f) \ No newline at end of file + json.dump( + { + "images": images_info, + "annotations": annotations, + "categories": [ + {"id": i, "name": name} for i, name in enumerate(class_names) + ], + }, + f, + ) diff --git a/datadreamer/utils/convert_dataset.py b/datadreamer/utils/convert_dataset.py index ed0182f..800c5c3 100644 --- a/datadreamer/utils/convert_dataset.py +++ b/datadreamer/utils/convert_dataset.py @@ -2,9 +2,12 @@ import argparse -from datadreamer.utils import YOLOConverter, COCOConverter, LDFConverter +from datadreamer.utils import COCOConverter, LDFConverter, YOLOConverter -def convert_dataset(input_dir, output_dir, dataset_format, split_ratios, copy_files=True): + +def convert_dataset( + input_dir, output_dir, dataset_format, split_ratios, copy_files=True +): if dataset_format == "yolo": converter = YOLOConverter() elif dataset_format == "coco": @@ -16,6 +19,7 @@ def convert_dataset(input_dir, output_dir, dataset_format, split_ratios, copy_fi converter.convert(input_dir, output_dir, split_ratios, copy_files) + def main(): parser = argparse.ArgumentParser( description="Convert raw dataset to another format with train-val-test split." @@ -29,7 +33,10 @@ def main(): help="Directory where the processed dataset will be saved.", ) parser.add_argument( - "--dataset_format", type=str, default="yolo", choices=["yolo", "coco", "ldf" ], + "--dataset_format", + type=str, + default="yolo", + choices=["yolo", "coco", "ldf"], ) parser.add_argument( "--split_ratios", @@ -39,13 +46,22 @@ def main(): help="Train-validation-test split ratios (default: 0.8, 0.1, 0.1).", ) parser.add_argument( - "--copy_files", type=bool, default=True, help="Copy files to output directory, otherwise move them." 
+ "--copy_files", + type=bool, + default=True, + help="Copy files to output directory, otherwise move them.", ) args = parser.parse_args() - convert_dataset(args.input_dir, args.output_dir, args.dataset_format, args.split_ratios, args.copy_files) + convert_dataset( + args.input_dir, + args.output_dir, + args.dataset_format, + args.split_ratios, + args.copy_files, + ) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/datadreamer/utils/dataset_utils.py b/datadreamer/utils/dataset_utils.py index 7d1963b..e9a0a53 100644 --- a/datadreamer/utils/dataset_utils.py +++ b/datadreamer/utils/dataset_utils.py @@ -1,9 +1,9 @@ import json import os -from PIL import Image - from luxonis_ml.data import LuxonisDataset +from PIL import Image + def save_annotations_to_json( image_paths, @@ -15,9 +15,9 @@ def save_annotations_to_json( ): annotations = {} for i in range(len(image_paths)): - #for image_path, bboxes, labels in zip(image_paths, boxes_list, labels_list): + # for image_path, bboxes, labels in zip(image_paths, boxes_list, labels_list): image_name = os.path.basename(image_paths[i]) - #image_name = os.path.basename(image_path) + # image_name = os.path.basename(image_path) labels = labels_list[i] annotations[image_name] = { "labels": labels.tolist(), @@ -32,11 +32,15 @@ def save_annotations_to_json( with open(os.path.join(save_dir, file_name), "w") as f: json.dump(annotations, f, indent=4) -def convert_to_ldf(image_paths, labels_list, boxes_list, save_dir, class_names, split_ratios): + +def convert_to_ldf( + image_paths, labels_list, boxes_list, save_dir, class_names, split_ratios +): width, height = Image.open(image_paths[0]).size + def dataset_generator(): # find image paths and load COCO annotations - + for i in range(len(image_paths)): image_path = image_paths[i] labels = labels_list[i] @@ -58,7 +62,7 @@ def dataset_generator(): "type": "box", "value": (x / width, y / height, w / width, h / height), } - + dataset_name = os.path.basename(save_dir) if LuxonisDataset.exists(dataset_name): dataset = LuxonisDataset(dataset_name) @@ -70,5 +74,3 @@ def dataset_generator(): dataset.add(dataset_generator) dataset.make_splits(split_ratios) - - diff --git a/datadreamer/utils/ldf_convreter.py b/datadreamer/utils/ldf_convreter.py index 32df801..d596bc7 100644 --- a/datadreamer/utils/ldf_convreter.py +++ b/datadreamer/utils/ldf_convreter.py @@ -1,9 +1,9 @@ from __future__ import annotations -import json import os -from PIL import Image + from luxonis_ml.data import LuxonisDataset +from PIL import Image from datadreamer.utils import BaseConverter @@ -32,7 +32,7 @@ def process_data(self, data, dataset_dir, output_dir, split_ratios): def dataset_generator(): # find image paths and load COCO annotations - + for image_path in image_paths: image_full_path = os.path.join(dataset_dir, image_path) width, height = Image.open(image_full_path).size @@ -55,7 +55,7 @@ def dataset_generator(): "type": "box", "value": (x / width, y / height, w / width, h / height), } - + dataset_name = os.path.basename(output_dir) if LuxonisDataset.exists(dataset_name): dataset = LuxonisDataset(dataset_name) @@ -66,4 +66,4 @@ def dataset_generator(): dataset.add(dataset_generator) - dataset.make_splits(split_ratios) \ No newline at end of file + dataset.make_splits(split_ratios) diff --git a/datadreamer/utils/merge_raw_datasets.py b/datadreamer/utils/merge_raw_datasets.py index 57935b0..bcece2e 100644 --- a/datadreamer/utils/merge_raw_datasets.py +++ b/datadreamer/utils/merge_raw_datasets.py @@ -1,12 +1,12 @@ 
from __future__ import annotations import argparse -import os import json +import os import shutil + def merge_datasets(input_dirs, output_dir, copy_files=True): - config_tasks = [] config_classes = [] random_seeds = [] @@ -36,7 +36,6 @@ def merge_datasets(input_dirs, output_dir, copy_files=True): annotations_merged = {} for i, input_dir in enumerate(input_dirs): - with open(os.path.join(input_dir, "annotations.json")) as f: annotations = json.load(f) class_names = annotations.pop("class_names") @@ -44,28 +43,41 @@ def merge_datasets(input_dirs, output_dir, copy_files=True): # Copy or move generation_args.json files if copy_files: - shutil.copy(os.path.join(input_dir, "generation_args.json"), os.path.join(output_dir, f"generation_args_{i}.json")) + shutil.copy( + os.path.join(input_dir, "generation_args.json"), + os.path.join(output_dir, f"generation_args_{i}.json"), + ) else: - shutil.move(os.path.join(input_dir, "generation_args.json"), os.path.join(output_dir, f"generation_args_{i}.json")) + shutil.move( + os.path.join(input_dir, "generation_args.json"), + os.path.join(output_dir, f"generation_args_{i}.json"), + ) # Copy or move images for image_path in annotations: if copy_files: - shutil.copy(os.path.join(input_dir, image_path), os.path.join(output_dir, image_path)) + shutil.copy( + os.path.join(input_dir, image_path), + os.path.join(output_dir, image_path), + ) else: - shutil.move(os.path.join(input_dir, image_path), os.path.join(output_dir, image_path)) + shutil.move( + os.path.join(input_dir, image_path), + os.path.join(output_dir, image_path), + ) annotations_merged["class_names"] = class_names with open(os.path.join(output_dir, "annotations.json"), "w") as f: json.dump(annotations_merged, f, indent=4) - + def main(): - parser = argparse.ArgumentParser( - description="Merge raw datasets" - ) + parser = argparse.ArgumentParser(description="Merge raw datasets") parser.add_argument( - "--input_dirs", type=str, nargs="+", help="Directories containing the images and annotations." + "--input_dirs", + type=str, + nargs="+", + help="Directories containing the images and annotations.", ) parser.add_argument( "--output_dir", @@ -73,7 +85,10 @@ def main(): help="Directory where the merged dataset will be saved.", ) parser.add_argument( - "--copy_files", type=bool, default=True, help="Copy files to output directory, otherwise move them." + "--copy_files", + type=bool, + default=True, + help="Copy files to output directory, otherwise move them.", ) args = parser.parse_args() @@ -82,4 +97,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/datadreamer/utils/yolo_converter.py b/datadreamer/utils/yolo_converter.py index 4724b3e..9712a52 100644 --- a/datadreamer/utils/yolo_converter.py +++ b/datadreamer/utils/yolo_converter.py @@ -2,11 +2,12 @@ import os import shutil -import json + from PIL import Image from datadreamer.utils import BaseConverter + class YOLOConverter(BaseConverter): """Class for converting a dataset to YOLO format. @@ -28,8 +29,6 @@ class YOLOConverter(BaseConverter): │ ├── labels """ - - def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): """Converts a dataset into a format suitable for training with YOLO, including creating training and validation splits. 
@@ -64,8 +63,8 @@ def convert_to_yolo_format(self, box, image_width, image_height): return [x_center, y_center, width, height] def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=True): - """Processes the data by dividing it into training and validation sets, and saves - the images and labels in YOLO format. + """Processes the data by dividing it into training and validation sets, and + saves the images and labels in YOLO format. Args: - data (dict): The dictionary containing image annotations. @@ -79,10 +78,16 @@ def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=Tru """ images = list(data.keys()) images.remove("class_names") - - train_images, val_images, test_images = BaseConverter.make_splits(images, split_ratios) - for dataset_type, image_set in [("train", train_images), ("val", val_images), ("test", test_images)]: + train_images, val_images, test_images = BaseConverter.make_splits( + images, split_ratios + ) + + for dataset_type, image_set in [ + ("train", train_images), + ("val", val_images), + ("test", test_images), + ]: image_output_dir = os.path.join(output_dir, dataset_type, "images") label_output_dir = os.path.join(output_dir, dataset_type, "labels") @@ -107,19 +112,25 @@ def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=Tru ) with open(label_file, "w") as f: for box, label in zip(annotation["boxes"], annotation["labels"]): - yolo_box = self.convert_to_yolo_format(box, image_width, image_height) + yolo_box = self.convert_to_yolo_format( + box, image_width, image_height + ) f.write(f"{label} {' '.join(map(str, yolo_box))}\n") if copy_files: - shutil.copy(image_full_path, os.path.join(image_output_dir, image_name)) + shutil.copy( + image_full_path, os.path.join(image_output_dir, image_name) + ) else: - shutil.move(image_full_path, os.path.join(image_output_dir, image_name)) + shutil.move( + image_full_path, os.path.join(image_output_dir, image_name) + ) self.create_data_yaml(output_dir, data["class_names"]) - def create_data_yaml(self, root_dir, class_names): - """Creates a YAML file for dataset configuration, specifying paths and class names. + """Creates a YAML file for dataset configuration, specifying paths and class + names. Args: - root_dir (str): The root directory where the dataset is located. 
@@ -135,4 +146,4 @@ def create_data_yaml(self, root_dir, class_names): f"names: {class_names}" ) with open(os.path.join(root_dir, "data.yaml"), "w") as f: - f.write(yaml_content) \ No newline at end of file + f.write(yaml_content) From e3140b720f211387bd482f58cb23c874c24973b8 Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Sun, 7 Apr 2024 22:39:38 +0000 Subject: [PATCH 08/23] chore: update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 7759c34..dac496a 100644 --- a/.gitignore +++ b/.gitignore @@ -153,5 +153,6 @@ Thumbs.db # Others node_modules/ **generated_dataset*/ +**gen_dataset*/ **runs/ **wandb/ From 996a445b7973f060e939127d0abef6398516d9a9 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Sun, 7 Apr 2024 22:52:13 +0000 Subject: [PATCH 09/23] [Automated] Updated coverage badge --- media/coverage_badge.svg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg index 4f8c185..2d1c743 100644 --- a/media/coverage_badge.svg +++ b/media/coverage_badge.svg @@ -9,13 +9,13 @@ - + coverage coverage - 50% - 50% + 37% + 37% From a1f11d39c259d4d77fd438598e32db6b5df6942d Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Mon, 8 Apr 2024 12:24:19 +0000 Subject: [PATCH 10/23] fix: import from utils --- datadreamer/utils/__init__.py | 12 ++++++ datadreamer/utils/ldf_converter.py | 69 ++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 datadreamer/utils/ldf_converter.py diff --git a/datadreamer/utils/__init__.py b/datadreamer/utils/__init__.py index 9d48db4..f7cf8df 100644 --- a/datadreamer/utils/__init__.py +++ b/datadreamer/utils/__init__.py @@ -1 +1,13 @@ from __future__ import annotations + +from .base_converter import BaseConverter +from .coco_converter import COCOConverter +from .ldf_converter import LDFConverter +from .yolo_converter import YOLOConverter + +__all__ = [ + "BaseConverter", + "COCOConverter", + "LDFConverter", + "YOLOConverter", +] diff --git a/datadreamer/utils/ldf_converter.py b/datadreamer/utils/ldf_converter.py new file mode 100644 index 0000000..d596bc7 --- /dev/null +++ b/datadreamer/utils/ldf_converter.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +import os + +from luxonis_ml.data import LuxonisDataset +from PIL import Image + +from datadreamer.utils import BaseConverter + + +class LDFConverter(BaseConverter): + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): + """Converts a dataset into a format suitable for training with YOLO, including + creating training and validation splits. + + Args: + - dataset_dir (str): The directory where the source dataset is located. + - output_dir (str): The directory where the processed dataset should be saved. + - split_ratios (list of float): The ratios to split the data into training, validation, and test sets. + - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. + + No return value. 
+ """ + annotation_path = os.path.join(dataset_dir, "annotations.json") + data = BaseConverter.read_annotations(annotation_path) + self.process_data(data, dataset_dir, output_dir, split_ratios) + + def process_data(self, data, dataset_dir, output_dir, split_ratios): + class_names = data["class_names"] + image_paths = list(data.keys()) + image_paths.remove("class_names") + + def dataset_generator(): + # find image paths and load COCO annotations + + for image_path in image_paths: + image_full_path = os.path.join(dataset_dir, image_path) + width, height = Image.open(image_full_path).size + labels = data[image_path]["labels"] + for label in labels: + yield { + "file": image_full_path, + "class": class_names[label], + "type": "classification", + "value": True, + } + + if "boxes" in data[image_path]: + boxes = data[image_path]["boxes"] + for box in boxes: + x, y, w, h = box[0], box[1], box[2] - box[0], box[3] - box[1] + yield { + "file": image_full_path, + "class": class_names[label], + "type": "box", + "value": (x / width, y / height, w / width, h / height), + } + + dataset_name = os.path.basename(output_dir) + if LuxonisDataset.exists(dataset_name): + dataset = LuxonisDataset(dataset_name) + dataset.delete_dataset() + + dataset = LuxonisDataset(dataset_name) + dataset.set_classes(class_names) + + dataset.add(dataset_generator) + + dataset.make_splits(split_ratios) From 68e9f79dd51c960b740f9e8b88d4d7820795b9de Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Mon, 8 Apr 2024 12:25:25 +0000 Subject: [PATCH 11/23] fix: not removing dir when --annotate_only --- datadreamer/pipelines/generate_dataset_from_scratch.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/datadreamer/pipelines/generate_dataset_from_scratch.py b/datadreamer/pipelines/generate_dataset_from_scratch.py index 99c3f1a..d2a618e 100644 --- a/datadreamer/pipelines/generate_dataset_from_scratch.py +++ b/datadreamer/pipelines/generate_dataset_from_scratch.py @@ -361,11 +361,14 @@ def main(): # Directories for saving images and bboxes save_dir = args.save_dir - if os.path.exists(save_dir): - shutil.rmtree(save_dir) - os.makedirs(save_dir) + if not args.annotate_only: + if os.path.exists(save_dir): + shutil.rmtree(save_dir) + os.makedirs(save_dir) bbox_dir = os.path.join(save_dir, "bboxes_visualization") + if os.path.exists(bbox_dir): + shutil.rmtree(bbox_dir) os.makedirs(bbox_dir) # Save arguments From 4dc3bca51c097e7d6b000c307dc3c76d31ed37ae Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Mon, 8 Apr 2024 12:38:02 +0000 Subject: [PATCH 12/23] [Automated] Updated coverage badge --- media/coverage_badge.svg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg index 2d1c743..0cbe944 100644 --- a/media/coverage_badge.svg +++ b/media/coverage_badge.svg @@ -9,13 +9,13 @@ - + coverage coverage - 37% - 37% + 45% + 45% From d79968ed91438e0be81f1dff058bb9c245e53c88 Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Mon, 8 Apr 2024 17:21:47 +0000 Subject: [PATCH 13/23] docs: fix docstrings in converters --- datadreamer/utils/base_converter.py | 3 +-- datadreamer/utils/coco_converter.py | 3 +-- datadreamer/utils/ldf_converter.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/datadreamer/utils/base_converter.py b/datadreamer/utils/base_converter.py index dfbf1d8..61aa3aa 100644 --- a/datadreamer/utils/base_converter.py +++ b/datadreamer/utils/base_converter.py @@ -11,8 +11,7 @@ class BaseConverter(ABC): @abstractmethod 
def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): - """Converts a dataset into a format suitable for training with YOLO, including - creating training and validation splits. + """Converts a dataset into another format. Args: - dataset_dir (str): The directory where the source dataset is located. diff --git a/datadreamer/utils/coco_converter.py b/datadreamer/utils/coco_converter.py index 18897c7..d9b554c 100644 --- a/datadreamer/utils/coco_converter.py +++ b/datadreamer/utils/coco_converter.py @@ -29,8 +29,7 @@ class COCOConverter(BaseConverter): """ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): - """Converts a dataset into a format suitable for training with YOLO, including - creating training and validation splits. + """Converts a dataset into a COCO format. Args: - dataset_dir (str): The directory where the source dataset is located. diff --git a/datadreamer/utils/ldf_converter.py b/datadreamer/utils/ldf_converter.py index d596bc7..edde585 100644 --- a/datadreamer/utils/ldf_converter.py +++ b/datadreamer/utils/ldf_converter.py @@ -10,8 +10,7 @@ class LDFConverter(BaseConverter): def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): - """Converts a dataset into a format suitable for training with YOLO, including - creating training and validation splits. + """Converts a dataset into a LuxonisDataset format. Args: - dataset_dir (str): The directory where the source dataset is located. From bde5feccff9f29f4b059d9e6cadc68bf8b97b430 Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Mon, 8 Apr 2024 17:23:15 +0000 Subject: [PATCH 14/23] refactor: remove file with a typo in name --- datadreamer/utils/ldf_convreter.py | 69 ------------------------------ 1 file changed, 69 deletions(-) delete mode 100644 datadreamer/utils/ldf_convreter.py diff --git a/datadreamer/utils/ldf_convreter.py b/datadreamer/utils/ldf_convreter.py deleted file mode 100644 index d596bc7..0000000 --- a/datadreamer/utils/ldf_convreter.py +++ /dev/null @@ -1,69 +0,0 @@ -from __future__ import annotations - -import os - -from luxonis_ml.data import LuxonisDataset -from PIL import Image - -from datadreamer.utils import BaseConverter - - -class LDFConverter(BaseConverter): - def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): - """Converts a dataset into a format suitable for training with YOLO, including - creating training and validation splits. - - Args: - - dataset_dir (str): The directory where the source dataset is located. - - output_dir (str): The directory where the processed dataset should be saved. - - split_ratios (list of float): The ratios to split the data into training, validation, and test sets. - - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True. - - No return value. 
- """ - annotation_path = os.path.join(dataset_dir, "annotations.json") - data = BaseConverter.read_annotations(annotation_path) - self.process_data(data, dataset_dir, output_dir, split_ratios) - - def process_data(self, data, dataset_dir, output_dir, split_ratios): - class_names = data["class_names"] - image_paths = list(data.keys()) - image_paths.remove("class_names") - - def dataset_generator(): - # find image paths and load COCO annotations - - for image_path in image_paths: - image_full_path = os.path.join(dataset_dir, image_path) - width, height = Image.open(image_full_path).size - labels = data[image_path]["labels"] - for label in labels: - yield { - "file": image_full_path, - "class": class_names[label], - "type": "classification", - "value": True, - } - - if "boxes" in data[image_path]: - boxes = data[image_path]["boxes"] - for box in boxes: - x, y, w, h = box[0], box[1], box[2] - box[0], box[3] - box[1] - yield { - "file": image_full_path, - "class": class_names[label], - "type": "box", - "value": (x / width, y / height, w / width, h / height), - } - - dataset_name = os.path.basename(output_dir) - if LuxonisDataset.exists(dataset_name): - dataset = LuxonisDataset(dataset_name) - dataset.delete_dataset() - - dataset = LuxonisDataset(dataset_name) - dataset.set_classes(class_names) - - dataset.add(dataset_generator) - - dataset.make_splits(split_ratios) From 9ee7ef2f60ee493252adb8fafd31aafac72eea0d Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Mon, 8 Apr 2024 17:24:55 +0000 Subject: [PATCH 15/23] chore: set the minimum required version of luxonis-ml --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d2bcfdb..5767fd8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,4 @@ accelerate>=0.25.0 scipy>=1.10.0 bitsandbytes>=0.42.0 nltk>=3.8.1 -luxonis-ml[all] +luxonis-ml[all]>=0.1.0 From e74f20599327aad9e438383bcd29949a3aac691e Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Mon, 8 Apr 2024 17:26:05 +0000 Subject: [PATCH 16/23] fix: remove redundant function --- datadreamer/utils/dataset_utils.py | 42 ------------------------------ 1 file changed, 42 deletions(-) diff --git a/datadreamer/utils/dataset_utils.py b/datadreamer/utils/dataset_utils.py index e9a0a53..0284f3f 100644 --- a/datadreamer/utils/dataset_utils.py +++ b/datadreamer/utils/dataset_utils.py @@ -32,45 +32,3 @@ def save_annotations_to_json( with open(os.path.join(save_dir, file_name), "w") as f: json.dump(annotations, f, indent=4) - -def convert_to_ldf( - image_paths, labels_list, boxes_list, save_dir, class_names, split_ratios -): - width, height = Image.open(image_paths[0]).size - - def dataset_generator(): - # find image paths and load COCO annotations - - for i in range(len(image_paths)): - image_path = image_paths[i] - labels = labels_list[i] - for label in labels: - yield { - "file": image_path, - "class": class_names[label], - "type": "classification", - "value": True, - } - - if boxes_list: - boxes = boxes_list[i] - for box in boxes: - x, y, w, h = box[0], box[1], box[2] - box[0], box[3] - box[1] - yield { - "file": image_path, - "class": class_names[label], - "type": "box", - "value": (x / width, y / height, w / width, h / height), - } - - dataset_name = os.path.basename(save_dir) - if LuxonisDataset.exists(dataset_name): - dataset = LuxonisDataset(dataset_name) - dataset.delete_dataset() - - dataset = LuxonisDataset(dataset_name) - dataset.set_classes(class_names) - - dataset.add(dataset_generator) - - 
dataset.make_splits(split_ratios) From b3d2f345dd4a8d7e66585892f9b95f399a15b025 Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Mon, 8 Apr 2024 17:37:02 +0000 Subject: [PATCH 17/23] refactor: rename ldf to luxonis-dataset --- README.md | 2 +- datadreamer/pipelines/generate_dataset_from_scratch.py | 8 ++++---- datadreamer/utils/__init__.py | 4 ++-- datadreamer/utils/convert_dataset.py | 8 ++++---- .../{ldf_converter.py => luxonis_dataset_converter.py} | 2 +- examples/generate_dataset_and_train_yolo.ipynb | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) rename datadreamer/utils/{ldf_converter.py => luxonis_dataset_converter.py} (98%) diff --git a/README.md b/README.md index 465d938..1b68498 100644 --- a/README.md +++ b/README.md @@ -152,7 +152,7 @@ datadreamer --save_dir --class_names --prompts_number Date: Mon, 8 Apr 2024 17:44:16 +0000 Subject: [PATCH 18/23] format: black and ruff --- datadreamer/pipelines/generate_dataset_from_scratch.py | 6 +++++- datadreamer/utils/dataset_utils.py | 4 ---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/datadreamer/pipelines/generate_dataset_from_scratch.py b/datadreamer/pipelines/generate_dataset_from_scratch.py index 6651928..253e322 100644 --- a/datadreamer/pipelines/generate_dataset_from_scratch.py +++ b/datadreamer/pipelines/generate_dataset_from_scratch.py @@ -579,7 +579,11 @@ def main(): # Convert annotations to LuxonisDataset format if args.dataset_format == "luxonis-dataset": convert_dataset.convert_dataset( - args.save_dir, args.save_dir, "luxonis-dataset", args.split_ratios, copy_files=False + args.save_dir, + args.save_dir, + "luxonis-dataset", + args.split_ratios, + copy_files=False, ) diff --git a/datadreamer/utils/dataset_utils.py b/datadreamer/utils/dataset_utils.py index 0284f3f..a396ae0 100644 --- a/datadreamer/utils/dataset_utils.py +++ b/datadreamer/utils/dataset_utils.py @@ -1,9 +1,6 @@ import json import os -from luxonis_ml.data import LuxonisDataset -from PIL import Image - def save_annotations_to_json( image_paths, @@ -31,4 +28,3 @@ def save_annotations_to_json( # Save to JSON file with open(os.path.join(save_dir, file_name), "w") as f: json.dump(annotations, f, indent=4) - From e376152d28dd6247fec2a51338ae80fa032ffa02 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Mon, 8 Apr 2024 17:56:46 +0000 Subject: [PATCH 19/23] [Automated] Updated coverage badge --- media/coverage_badge.svg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg index 0cbe944..d7667be 100644 --- a/media/coverage_badge.svg +++ b/media/coverage_badge.svg @@ -15,7 +15,7 @@ coverage coverage - 45% - 45% + 47% + 47% From 3adf29e96ba5b37c8ec82f53d0bd37701cf8d1b1 Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Tue, 9 Apr 2024 14:15:11 +0000 Subject: [PATCH 20/23] feature: add reproducibility with a random seed to converters --- .../pipelines/generate_dataset_from_scratch.py | 3 +++ datadreamer/utils/base_converter.py | 9 +++++++-- datadreamer/utils/coco_converter.py | 3 +++ datadreamer/utils/convert_dataset.py | 14 ++++++++++---- datadreamer/utils/luxonis_dataset_converter.py | 6 ++++++ datadreamer/utils/yolo_converter.py | 3 +++ 6 files changed, 32 insertions(+), 6 deletions(-) diff --git a/datadreamer/pipelines/generate_dataset_from_scratch.py b/datadreamer/pipelines/generate_dataset_from_scratch.py index 253e322..2038961 100644 --- a/datadreamer/pipelines/generate_dataset_from_scratch.py +++ b/datadreamer/pipelines/generate_dataset_from_scratch.py @@ -565,6 
+565,7 @@ def main(): "yolo", args.split_ratios, copy_files=False, + seed=args.seed, ) # Convert annotations to COCO format elif args.dataset_format == "coco": @@ -574,6 +575,7 @@ def main(): "coco", args.split_ratios, copy_files=False, + seed=args.seed, ) # Convert annotations to LuxonisDataset format @@ -584,6 +586,7 @@ def main(): "luxonis-dataset", args.split_ratios, copy_files=False, + seed=args.seed, ) diff --git a/datadreamer/utils/base_converter.py b/datadreamer/utils/base_converter.py index 61aa3aa..3d97199 100644 --- a/datadreamer/utils/base_converter.py +++ b/datadreamer/utils/base_converter.py @@ -9,6 +9,9 @@ class BaseConverter(ABC): """Abstract base class for converter.""" + def __init__(self, seed=42): + np.random.seed(seed) + @abstractmethod def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): """Converts a dataset into another format. @@ -39,19 +42,21 @@ def read_annotations(annotation_path): return data @staticmethod - def make_splits(images, split_ratios): + def make_splits(images, split_ratios, shuffle=True): """Splits the list of images into training, validation, and test sets. Args: - images (list of str): A list of image paths. - split_ratios (list of float): The ratios to split the data into training, validation, and test sets. + - shuffle (bool, optional): Whether to shuffle the list of images. Defaults to True. Returns: - list of str: A list of image paths for the training set. - list of str: A list of image paths for the validation set. - list of str: A list of image paths for the test set. """ - np.random.shuffle(images) + if shuffle: + np.random.shuffle(images) train_images = images[: int(len(images) * split_ratios[0])] val_images = images[ diff --git a/datadreamer/utils/coco_converter.py b/datadreamer/utils/coco_converter.py index d9b554c..ba02d97 100644 --- a/datadreamer/utils/coco_converter.py +++ b/datadreamer/utils/coco_converter.py @@ -28,6 +28,9 @@ class COCOConverter(BaseConverter): │ ├── labels.json """ + def __init__(self, seed=42): + super().__init__(seed) + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): """Converts a dataset into a COCO format. 
diff --git a/datadreamer/utils/convert_dataset.py b/datadreamer/utils/convert_dataset.py index 7e18e35..fa3fd05 100644 --- a/datadreamer/utils/convert_dataset.py +++ b/datadreamer/utils/convert_dataset.py @@ -6,14 +6,14 @@ def convert_dataset( - input_dir, output_dir, dataset_format, split_ratios, copy_files=True + input_dir, output_dir, dataset_format, split_ratios, copy_files=True, seed=42 ): if dataset_format == "yolo": - converter = YOLOConverter() + converter = YOLOConverter(seed=seed) elif dataset_format == "coco": - converter = COCOConverter() + converter = COCOConverter(seed=seed) elif dataset_format == "luxonis-dataset": - converter = LuxonisDatasetConverter() + converter = LuxonisDatasetConverter(seed=seed) else: raise ValueError(f"Invalid dataset format: {dataset_format}") @@ -51,6 +51,12 @@ def main(): default=True, help="Copy files to output directory, otherwise move them.", ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed for reproducibility.", + ) args = parser.parse_args() diff --git a/datadreamer/utils/luxonis_dataset_converter.py b/datadreamer/utils/luxonis_dataset_converter.py index e68be86..fdbdc79 100644 --- a/datadreamer/utils/luxonis_dataset_converter.py +++ b/datadreamer/utils/luxonis_dataset_converter.py @@ -9,6 +9,12 @@ class LuxonisDatasetConverter(BaseConverter): + """Class for converting a dataset to LuxonisDataset format. + """ + + def __init__(self, seed=42): + super().__init__(seed) + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): """Converts a dataset into a LuxonisDataset format. diff --git a/datadreamer/utils/yolo_converter.py b/datadreamer/utils/yolo_converter.py index 9712a52..36452da 100644 --- a/datadreamer/utils/yolo_converter.py +++ b/datadreamer/utils/yolo_converter.py @@ -29,6 +29,9 @@ class YOLOConverter(BaseConverter): │ ├── labels """ + def __init__(self, seed=42): + super().__init__(seed) + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): """Converts a dataset into a format suitable for training with YOLO, including creating training and validation splits. 
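Usage sketch (not part of the patches above): PATCH 20 threads a random seed through every converter so that the train/validation/test shuffle in BaseConverter.make_splits becomes reproducible. The snippet below is a minimal illustration of that behaviour; the import paths follow the test file added in PATCH 21, while the directory names and split ratios are made up for the example.

    from datadreamer.utils import BaseConverter, COCOConverter

    # Seeding the converter seeds numpy's RNG in BaseConverter.__init__,
    # so repeated runs produce the same train/validation/test split.
    converter = COCOConverter(seed=42)
    converter.convert(
        "generated_dataset",        # hypothetical dir containing annotations.json
        "generated_dataset_coco",   # hypothetical output dir
        split_ratios=[0.8, 0.1, 0.1],
        copy_files=True,
    )

    # make_splits can also skip shuffling entirely for a fixed order,
    # which is what the unit tests added in the next patch rely on.
    train, val, test = BaseConverter.make_splits(
        ["0.jpg", "1.jpg"], [0.5, 0.5, 0.0], shuffle=False
    )
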
From ce685a7503c12c73bf5160760c09e2d3a24265dd Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Tue, 9 Apr 2024 14:15:56 +0000 Subject: [PATCH 21/23] test: add converter tests --- tests/unittests/test_converters.py | 234 +++++++++++++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 tests/unittests/test_converters.py diff --git a/tests/unittests/test_converters.py b/tests/unittests/test_converters.py new file mode 100644 index 0000000..5e32a00 --- /dev/null +++ b/tests/unittests/test_converters.py @@ -0,0 +1,234 @@ +import os +import json +import shutil +import unittest +from unittest.mock import patch, MagicMock +from PIL import Image +from datadreamer.utils import BaseConverter, COCOConverter, YOLOConverter, LuxonisDatasetConverter +from luxonis_ml.data import LuxonisDataset + + +class TestBaseConverter(unittest.TestCase): + def setUp(self): + self.test_dir = "test_dataset" + os.makedirs(self.test_dir, exist_ok=True) + + # Create sample annotations + self.annotations = { + "class_names": ["cat", "dog"], + "0.jpg": {"boxes": [[10, 10, 50, 50]], "labels": [0]}, + "1.jpg": {"boxes": [[20, 20, 70, 70]], "labels": [1]}, + } + with open(os.path.join(self.test_dir, "annotations.json"), "w") as f: + json.dump(self.annotations, f) + + # Create sample images + open(os.path.join(self.test_dir, "0.jpg"), "a").close() + open(os.path.join(self.test_dir, "1.jpg"), "a").close() + + def tearDown(self): + os.remove(os.path.join(self.test_dir, "annotations.json")) + os.remove(os.path.join(self.test_dir, "0.jpg")) + os.remove(os.path.join(self.test_dir, "1.jpg")) + os.rmdir(self.test_dir) + + def test_read_annotations(self): + annotation_path = os.path.join(self.test_dir, "annotations.json") + data = BaseConverter.read_annotations(annotation_path) + self.assertEqual(data, self.annotations) + + def test_make_splits(self): + images = ["0.jpg", "1.jpg"] + split_ratios = [0.5, 0.5, 0.0] + train_images, val_images, test_images = BaseConverter.make_splits(images, split_ratios, shuffle=False) + + self.assertEqual(len(train_images), 1) + self.assertEqual(len(val_images), 1) + self.assertEqual(len(test_images), 0) + self.assertTrue("0.jpg" in train_images) + self.assertTrue("1.jpg" in val_images) + + +class TestCOCOConverter(unittest.TestCase): + def setUp(self): + self.test_dir = "test_dataset" + os.makedirs(self.test_dir, exist_ok=True) + + # Create sample images + self.image_size = (100, 100) + self.create_sample_image("0.jpg") + self.create_sample_image("1.jpg") + + # Create sample labels + self.labels = { + "class_names": ["cat", "dog"], + "0.jpg": {"boxes": [(10, 10, 50, 50)], "labels": [0]}, + "1.jpg": {"boxes": [(20, 20, 70, 70)], "labels": [1]}, + } + with open(os.path.join(self.test_dir, "annotations.json"), "w") as f: + json.dump(self.labels, f) + + def tearDown(self): + shutil.rmtree(self.test_dir) + if hasattr(self, 'output_dir') and os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + + def create_sample_image(self, filename): + image = Image.new("RGB", self.image_size, color="white") + image.save(os.path.join(self.test_dir, filename)) + + def test_convert(self): + self.output_dir = "output_dir" + split_ratios = [0.6, 0.2, 0.2] + converter = COCOConverter() + converter.convert(self.test_dir, self.output_dir, split_ratios, copy_files=True) + + self.assertTrue(os.path.exists(self.output_dir)) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "train"))) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "validation"))) + 
self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test"))) + + # Test whether labels.json files exist in all output directories + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "train", "labels.json"))) + self.assertTrue( + os.path.exists(os.path.join(self.output_dir, "validation", "labels.json")) + ) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test", "labels.json"))) + + def test_process_data(self): + self.output_dir = "output_dir" + split_ratios = [0.6, 0.2, 0.2] + converter = COCOConverter() + converter.process_data( + self.labels, self.test_dir, self.output_dir, split_ratios, copy_files=True + ) + + self.assertTrue(os.path.exists(self.output_dir)) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "train"))) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "validation"))) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test"))) + + # Test whether labels.json files exist in all output directories + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "train", "labels.json"))) + self.assertTrue( + os.path.exists(os.path.join(self.output_dir, "validation", "labels.json")) + ) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test", "labels.json"))) + + def test_save_labels(self): + self.output_dir = "output_dir" + converter = COCOConverter() + images_info = [ + {"id": 1, "file_name": "0.jpg", "width": 100, "height": 100}, + {"id": 2, "file_name": "1.jpg", "width": 100, "height": 100}, + ] + annotations = [ + { + "id": 1, + "image_id": 1, + "category_id": 0, + "bbox": [10, 10, 40, 40], + "segmentation": None, + "area": 1200, + "iscrowd": 0, + }, + { + "id": 2, + "image_id": 2, + "category_id": 1, + "bbox": [20, 20, 50, 50], + "segmentation": None, + "area": 1500, + "iscrowd": 0, + }, + ] + class_names = ["cat", "dog"] + + # Test whether labels.json file is saved correctly + os.makedirs(self.output_dir) + converter.save_labels(self.output_dir, images_info, annotations, class_names) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "labels.json"))) + + # Test whether the content of labels.json is correct + with open(os.path.join(self.output_dir, "labels.json"), "r") as f: + saved_labels = json.load(f) + + self.assertEqual(saved_labels["images"], images_info) + self.assertEqual(saved_labels["annotations"], annotations) + self.assertEqual( + saved_labels["categories"], [{"id": i, "name": name} for i, name in enumerate(class_names)] + ) + +class TestYOLOConverter(unittest.TestCase): + def setUp(self): + self.test_dir = "test_dataset" + os.makedirs(self.test_dir, exist_ok=True) + + # Create sample images + self.image_size = (100, 100) + self.create_sample_image("0.jpg") + self.create_sample_image("1.jpg") + + # Create sample labels + self.labels = { + "class_names": ["cat", "dog"], + "0.jpg": {"boxes": [(10, 10, 50, 50)], "labels": [0]}, + "1.jpg": {"boxes": [(20, 20, 70, 70)], "labels": [1]}, + } + with open(os.path.join(self.test_dir, "annotations.json"), "w") as f: + json.dump(self.labels, f) + + def tearDown(self): + shutil.rmtree(self.test_dir) + if hasattr(self, 'output_dir') and os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + + def create_sample_image(self, filename): + image = Image.new("RGB", self.image_size, color="white") + image.save(os.path.join(self.test_dir, filename)) + + def test_convert_to_yolo_format(self): + converter = YOLOConverter() + yolo_format = converter.convert_to_yolo_format([10, 10, 50, 50], 100, 100) + 
self.assertEqual(yolo_format, [0.3, 0.3, 0.4, 0.4]) + + def test_process_data(self): + self.output_dir = "output_dir" + split_ratios = [1, 0, 0] + converter = YOLOConverter() + converter.process_data( + self.labels, self.test_dir, self.output_dir, split_ratios, copy_files=True + ) + + self.assertTrue(os.path.exists(self.output_dir)) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "train"))) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "val"))) + self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test"))) + + # Test whether labels files exist in all output directories + train_label_file = os.path.join(self.output_dir, "train", "labels", "0.txt") + self.assertTrue(os.path.exists(train_label_file)) + with open(train_label_file, "r") as f: + content = f.read() + self.assertEqual(content.strip(), "0 0.3 0.3 0.4 0.4") + + def test_create_data_yaml(self): + self.output_dir = "output_dir" + converter = YOLOConverter() + class_names = ["cat", "dog"] + os.makedirs(self.output_dir, exist_ok=True) + converter.create_data_yaml(self.output_dir, class_names) + + yaml_file = os.path.join(self.output_dir, "data.yaml") + self.assertTrue(os.path.exists(yaml_file)) + + with open(yaml_file, "r") as f: + content = f.read() + self.assertIn("train:", content) + self.assertIn("val:", content) + self.assertIn("test:", content) + self.assertIn("nc: 2", content) + self.assertIn("names: ['cat', 'dog']", content) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From 896af05735c195ff71cdbe23b55d46eb66942a9b Mon Sep 17 00:00:00 2001 From: Nikita Sokovnin Date: Tue, 9 Apr 2024 14:17:18 +0000 Subject: [PATCH 22/23] format: black --- .../utils/luxonis_dataset_converter.py | 5 +-- tests/unittests/test_converters.py | 43 +++++++++++++------ 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/datadreamer/utils/luxonis_dataset_converter.py b/datadreamer/utils/luxonis_dataset_converter.py index fdbdc79..2e7d2f4 100644 --- a/datadreamer/utils/luxonis_dataset_converter.py +++ b/datadreamer/utils/luxonis_dataset_converter.py @@ -9,12 +9,11 @@ class LuxonisDatasetConverter(BaseConverter): - """Class for converting a dataset to LuxonisDataset format. - """ + """Class for converting a dataset to LuxonisDataset format.""" def __init__(self, seed=42): super().__init__(seed) - + def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True): """Converts a dataset into a LuxonisDataset format. 
diff --git a/tests/unittests/test_converters.py b/tests/unittests/test_converters.py index 5e32a00..7e724da 100644 --- a/tests/unittests/test_converters.py +++ b/tests/unittests/test_converters.py @@ -1,11 +1,15 @@ -import os import json +import os import shutil import unittest -from unittest.mock import patch, MagicMock + from PIL import Image -from datadreamer.utils import BaseConverter, COCOConverter, YOLOConverter, LuxonisDatasetConverter -from luxonis_ml.data import LuxonisDataset + +from datadreamer.utils import ( + BaseConverter, + COCOConverter, + YOLOConverter, +) class TestBaseConverter(unittest.TestCase): @@ -40,7 +44,9 @@ def test_read_annotations(self): def test_make_splits(self): images = ["0.jpg", "1.jpg"] split_ratios = [0.5, 0.5, 0.0] - train_images, val_images, test_images = BaseConverter.make_splits(images, split_ratios, shuffle=False) + train_images, val_images, test_images = BaseConverter.make_splits( + images, split_ratios, shuffle=False + ) self.assertEqual(len(train_images), 1) self.assertEqual(len(val_images), 1) @@ -70,7 +76,7 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.test_dir) - if hasattr(self, 'output_dir') and os.path.exists(self.output_dir): + if hasattr(self, "output_dir") and os.path.exists(self.output_dir): shutil.rmtree(self.output_dir) def create_sample_image(self, filename): @@ -89,11 +95,15 @@ def test_convert(self): self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test"))) # Test whether labels.json files exist in all output directories - self.assertTrue(os.path.exists(os.path.join(self.output_dir, "train", "labels.json"))) + self.assertTrue( + os.path.exists(os.path.join(self.output_dir, "train", "labels.json")) + ) self.assertTrue( os.path.exists(os.path.join(self.output_dir, "validation", "labels.json")) ) - self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test", "labels.json"))) + self.assertTrue( + os.path.exists(os.path.join(self.output_dir, "test", "labels.json")) + ) def test_process_data(self): self.output_dir = "output_dir" @@ -109,11 +119,15 @@ def test_process_data(self): self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test"))) # Test whether labels.json files exist in all output directories - self.assertTrue(os.path.exists(os.path.join(self.output_dir, "train", "labels.json"))) + self.assertTrue( + os.path.exists(os.path.join(self.output_dir, "train", "labels.json")) + ) self.assertTrue( os.path.exists(os.path.join(self.output_dir, "validation", "labels.json")) ) - self.assertTrue(os.path.exists(os.path.join(self.output_dir, "test", "labels.json"))) + self.assertTrue( + os.path.exists(os.path.join(self.output_dir, "test", "labels.json")) + ) def test_save_labels(self): self.output_dir = "output_dir" @@ -156,9 +170,11 @@ def test_save_labels(self): self.assertEqual(saved_labels["images"], images_info) self.assertEqual(saved_labels["annotations"], annotations) self.assertEqual( - saved_labels["categories"], [{"id": i, "name": name} for i, name in enumerate(class_names)] + saved_labels["categories"], + [{"id": i, "name": name} for i, name in enumerate(class_names)], ) + class TestYOLOConverter(unittest.TestCase): def setUp(self): self.test_dir = "test_dataset" @@ -180,7 +196,7 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.test_dir) - if hasattr(self, 'output_dir') and os.path.exists(self.output_dir): + if hasattr(self, "output_dir") and os.path.exists(self.output_dir): shutil.rmtree(self.output_dir) def create_sample_image(self, filename): @@ -230,5 +246,6 @@ def 
test_create_data_yaml(self): self.assertIn("nc: 2", content) self.assertIn("names: ['cat', 'dog']", content) + if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() From b147b6162b3acfd4351cf1d4e2893274d8476c1e Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Tue, 9 Apr 2024 14:29:56 +0000 Subject: [PATCH 23/23] [Automated] Updated coverage badge --- media/coverage_badge.svg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg index d7667be..d1f8965 100644 --- a/media/coverage_badge.svg +++ b/media/coverage_badge.svg @@ -15,7 +15,7 @@ coverage coverage - 47% - 47% + 53% + 53%
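
Note on running the new tests: the converter tests added in PATCH 21 are plain unittest cases and the module ends in unittest.main(), so they can be executed directly as a script (python tests/unittests/test_converters.py). The sketch below runs them via the standard library test loader; the actual CI entry point and the coverage tooling behind the badge commits are not shown in these patches, so the start directory and pattern are assumptions based on the path introduced in PATCH 21.

    import unittest

    # Discover and run only the converter tests; start directory and pattern
    # are assumed from the file added in PATCH 21.
    suite = unittest.defaultTestLoader.discover(
        "tests/unittests", pattern="test_converters.py"
    )
    unittest.TextTestRunner(verbosity=2).run(suite)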