From e9412559bcf5e01b31ffc386e755cd016e10700b Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Tue, 24 Sep 2024 13:49:51 +0000
Subject: [PATCH 01/31] fix: unet from config warning

---
 .../image_generation/sdxl_lightning_image_generator.py       | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/datadreamer/image_generation/sdxl_lightning_image_generator.py b/datadreamer/image_generation/sdxl_lightning_image_generator.py
index 33c5141..f5bf977 100644
--- a/datadreamer/image_generation/sdxl_lightning_image_generator.py
+++ b/datadreamer/image_generation/sdxl_lightning_image_generator.py
@@ -46,16 +46,17 @@ def _init_gen_model(self):
         base = "stabilityai/stable-diffusion-xl-base-1.0"
         repo = "ByteDance/SDXL-Lightning"
         ckpt = "sdxl_lightning_4step_unet.safetensors"  # Use the correct ckpt for your step setting!
+        config = UNet2DConditionModel.load_config(base, subfolder="unet")
 
         # Load model.
         if self.device == "cpu":
             print("Loading SDXL Lightning on CPU...")
-            unet = UNet2DConditionModel.from_config(base, subfolder="unet")
+            unet = UNet2DConditionModel.from_config(config)
             unet.load_state_dict(load_file(hf_hub_download(repo, ckpt)))
             pipe = StableDiffusionXLPipeline.from_pretrained(base, unet=unet)
         else:
             print("Loading SDXL Lightning on GPU...")
-            unet = UNet2DConditionModel.from_config(base, subfolder="unet").to(
+            unet = UNet2DConditionModel.from_config(config).to(
                 self.device, torch.float16
             )
             unet.load_state_dict(

From 6481f04d70ba3763447d0ae8593d69981dd5a202 Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Tue, 24 Sep 2024 18:02:26 +0000
Subject: [PATCH 02/31] feat: add logger

---
 .../dataset_annotation/clip_annotator.py       |  4 ++++
 .../dataset_annotation/image_annotator.py      |  9 ++++++---
 .../dataset_annotation/owlv2_annotator.py      |  5 ++++-
 .../image_generation/clip_image_tester.py      |  4 ++++
 .../image_generation/image_generator.py        | 18 +++++++++---------
 .../image_generation/sdxl_image_generator.py   |  9 ++++++---
 .../sdxl_lightning_image_generator.py          |  9 ++++-----
 .../sdxl_turbo_image_generator.py              |  6 ++++--
 .../pipelines/generate_dataset_from_scratch.py |  3 +++
 .../prompt_generation/lm_prompt_generator.py   | 12 +++++++-----
 .../prompt_generation/lm_synonym_generator.py  |  9 ++++++---
 .../prompt_generation/synonym_generator.py     |  4 +++-
 .../tinyllama_lm_prompt_generator.py           |  7 ++++---
 datadreamer/utils/luxonis_dataset_converter.py |  9 ++++++---
 datadreamer/utils/merge_raw_datasets.py        |  5 ++++-
 datadreamer/utils/nms.py                       |  5 ++++-
 .../utils/single_label_cls_converter.py        |  7 +++++--
 17 files changed, 83 insertions(+), 42 deletions(-)

diff --git a/datadreamer/dataset_annotation/clip_annotator.py b/datadreamer/dataset_annotation/clip_annotator.py
index ff7b9aa..3c4e6cd 100644
--- a/datadreamer/dataset_annotation/clip_annotator.py
+++ b/datadreamer/dataset_annotation/clip_annotator.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import logging
 from typing import List
 
 import numpy as np
@@ -10,6 +11,8 @@
 
 from datadreamer.dataset_annotation.image_annotator import BaseAnnotator, TaskList
 
+logger = logging.getLogger(__name__)
+
 
 class CLIPAnnotator(BaseAnnotator):
     """A class for image annotation using the CLIP model, specializing in image
@@ -63,6 +66,7 @@ def _init_model(self):
         Returns:
             CLIPModel: The initialized CLIP model.
         """
+        logger.info(f"Initializing CLIP {self.size} model...")
         if self.size == "large":
             return CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
         return CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
diff --git a/datadreamer/dataset_annotation/image_annotator.py b/datadreamer/dataset_annotation/image_annotator.py
index 4479ffe..757baab 100644
--- a/datadreamer/dataset_annotation/image_annotator.py
+++ b/datadreamer/dataset_annotation/image_annotator.py
@@ -4,15 +4,12 @@
 from abc import ABC, abstractmethod
 
 
-# Enum for different labeling tasks
 class TaskList(enum.Enum):
     CLASSIFICATION = "classification"
     OBJECT_DETECTION = "object_detection"
     SEGMENTATION = "segmentation"
-    # Add more tasks as needed
 
 
-# Abstract base class for data labeling
 class BaseAnnotator(ABC):
     """Abstract base class for creating annotators.
 
@@ -24,6 +21,8 @@ class BaseAnnotator(ABC):
     Methods:
         annotate_batch(): Abstract method to be implemented by subclasses. It should contain
                     the logic for performing annotation based on the task definition.
+        release(): Abstract method to be implemented by subclasses. It should contain
+                    the logic for releasing the resources used by the annotator.
     """
 
     def __init__(
@@ -35,3 +34,7 @@ def __init__(
     @abstractmethod
     def annotate_batch(self):
         pass
+
+    @abstractmethod
+    def release(self, empty_cuda_cache=False) -> None:
+        pass
diff --git a/datadreamer/dataset_annotation/owlv2_annotator.py b/datadreamer/dataset_annotation/owlv2_annotator.py
index 25f247f..231558b 100644
--- a/datadreamer/dataset_annotation/owlv2_annotator.py
+++ b/datadreamer/dataset_annotation/owlv2_annotator.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import logging
 from typing import List, Tuple
 
 import numpy as np
@@ -11,6 +12,8 @@
 from datadreamer.dataset_annotation.utils import apply_tta
 from datadreamer.utils.nms import non_max_suppression
 
+logger = logging.getLogger(__name__)
+
 
 class OWLv2Annotator(BaseAnnotator):
     """A class for image annotation using the OWLv2 model, specializing in object
@@ -54,6 +57,7 @@ def _init_model(self):
         Returns:
             Owlv2ForObjectDetection: The initialized OWLv2 model.
         """
+        logger.info(f"Initializing OWLv2 {self.size} model...")
         if self.size == "large":
             return Owlv2ForObjectDetection.from_pretrained(
                 "google/owlv2-large-patch14-ensemble"
@@ -107,7 +111,6 @@ def _generate_annotations(
         ).to(self.device)
         with torch.no_grad():
             outputs = self.model(**inputs)
-        # print(outputs)
         preds = self.processor.post_process_object_detection(
             outputs=outputs, target_sizes=target_sizes, threshold=conf_threshold
         )
diff --git a/datadreamer/image_generation/clip_image_tester.py b/datadreamer/image_generation/clip_image_tester.py
index 2c67965..c1bf3b6 100644
--- a/datadreamer/image_generation/clip_image_tester.py
+++ b/datadreamer/image_generation/clip_image_tester.py
@@ -1,11 +1,14 @@
 from __future__ import annotations
 
+import logging
 from typing import List
 
 import torch
 from PIL import Image
 from transformers import CLIPModel, CLIPProcessor
 
+logger = logging.getLogger(__name__)
+
 
 class ClipImageTester:
     """A class for testing images against a set of textual objects using the CLIP model.
@@ -22,6 +25,7 @@ class ClipImageTester:
 
     def __init__(self, device: str = "cuda") -> None:
         """Initializes the ClipImageTester with the CLIP model and processor."""
+        logger.info("Initializing CLIP image tester...")
         self.clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
         self.clip_processor = CLIPProcessor.from_pretrained(
             "openai/clip-vit-base-patch32"
diff --git a/datadreamer/image_generation/image_generator.py b/datadreamer/image_generation/image_generator.py
index 4b01f81..3e2aff7 100644
--- a/datadreamer/image_generation/image_generator.py
+++ b/datadreamer/image_generation/image_generator.py
@@ -30,7 +30,7 @@ class ImageGenerator:
         set_seed(seed): Sets the seed for random number generators.
         generate_images(prompts, prompt_objects): Generates images based on provided prompts and optional object prompts.
         release(empty_cuda_cache): Releases resources and optionally empties the CUDA cache. (Abstract method)
-        generate_image(prompt, negative_prompt, prompt_objects): Generates a single image based on the provided prompt. (Abstract method)
+        generate_images_batch(prompts, negative_prompt, prompt_objects): Generates a batch of images based on the provided prompts. (Abstract method)
 
     Note:
         The actual model for image generation needs to be defined in the subclass.
@@ -151,20 +151,20 @@ def release(self, empty_cuda_cache=False) -> None:
         pass
 
     @abstractmethod
-    def generate_image(
+    def generate_images_batch(
         self,
-        prompt: str,
+        prompts: List[str],
         negative_prompt: str,
-        prompt_objects: Optional[List[str]] = None,
-    ) -> Image.Image:
-        """Generates a single image based on the provided prompt.
+        prompt_objects: Optional[List[List[str]]] = None,
+    ) -> List[Image.Image]:
+        """Generates a batch of images based on the provided prompts.
 
         Args:
-            prompt (str): The positive prompt to guide image generation.
+            prompts (List[str]): A list of positive prompts to guide image generation.
             negative_prompt (str): The negative prompt to avoid certain features in the image.
-            prompt_objects (Optional[List[str]]): Optional list of objects to be used in CLIP model testing.
+            prompt_objects (Optional[List[List[str]]]): Optional list of objects for each prompt to be used in CLIP model testing.
 
         Returns:
-            Image.Image: The generated image.
+            List[Image.Image]: A list of generated images.
         """
         pass
diff --git a/datadreamer/image_generation/sdxl_image_generator.py b/datadreamer/image_generation/sdxl_image_generator.py
index 1882f4a..6dc583e 100644
--- a/datadreamer/image_generation/sdxl_image_generator.py
+++ b/datadreamer/image_generation/sdxl_image_generator.py
@@ -1,13 +1,17 @@
 from __future__ import annotations
 
+import logging
 from typing import List, Optional
 
 import torch
 from compel import Compel, ReturnedEmbeddingsType
 from diffusers import DiffusionPipeline
+from PIL import Image
 
 from datadreamer.image_generation.image_generator import ImageGenerator
 
+logger = logging.getLogger(__name__)
+
 
 class StableDiffusionImageGenerator(ImageGenerator):
     """A subclass of ImageGenerator that uses the Stable Diffusion model for image
@@ -38,8 +42,8 @@ def _init_gen_model(self):
         Returns:
             tuple: The base and refiner models.
         """
+        logger.info(f"Initializing SDXL on {self.device}...")
         if self.device == "cpu":
-            print("Loading SDXL on CPU...")
             base = DiffusionPipeline.from_pretrained(
                 "stabilityai/stable-diffusion-xl-base-1.0",
                 # variant="fp16",
@@ -57,7 +61,6 @@ def _init_gen_model(self):
             )
             refiner.to("cpu")
         else:
-            print("Loading SDXL on GPU...")
             base = DiffusionPipeline.from_pretrained(
                 "stabilityai/stable-diffusion-xl-base-1.0",
                 torch_dtype=torch.float16,
@@ -102,7 +105,7 @@ def generate_images_batch(
         prompts: List[str],
         negative_prompt: str,
         prompt_objects: Optional[List[List[str]]] = None,
-    ):
+    ) -> List[Image.Image]:
         """Generates a batch of images based on the provided prompts.
 
         Args:
diff --git a/datadreamer/image_generation/sdxl_lightning_image_generator.py b/datadreamer/image_generation/sdxl_lightning_image_generator.py
index f5bf977..cc43dfe 100644
--- a/datadreamer/image_generation/sdxl_lightning_image_generator.py
+++ b/datadreamer/image_generation/sdxl_lightning_image_generator.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import logging
 from typing import List, Optional
 
 import torch
@@ -15,6 +16,8 @@
 
 from datadreamer.image_generation.image_generator import ImageGenerator
 
+logger = logging.getLogger(__name__)
+
 
 class StableDiffusionLightningImageGenerator(ImageGenerator):
     """A subclass of ImageGenerator specifically designed to use the Stable Diffusion
@@ -48,14 +51,12 @@ def _init_gen_model(self):
         ckpt = "sdxl_lightning_4step_unet.safetensors"  # Use the correct ckpt for your step setting!
         config = UNet2DConditionModel.load_config(base, subfolder="unet")
 
-        # Load model.
+        logger.info(f"Initializing SDXL Lightning on {self.device}...")
         if self.device == "cpu":
-            print("Loading SDXL Lightning on CPU...")
             unet = UNet2DConditionModel.from_config(config)
             unet.load_state_dict(load_file(hf_hub_download(repo, ckpt)))
             pipe = StableDiffusionXLPipeline.from_pretrained(base, unet=unet)
         else:
-            print("Loading SDXL Lightning on GPU...")
             unet = UNet2DConditionModel.from_config(config).to(
                 self.device, torch.float16
             )
@@ -93,7 +94,6 @@ def generate_images_batch(
         prompts: List[str],
         negative_prompt: str,
         prompt_objects: Optional[List[List[str]]] = None,
-        batch_size: int = 1,
     ) -> List[Image.Image]:
         """Generates a batch of images using the Stable Diffusion Lightning model based
         on the provided prompts.
@@ -102,7 +102,6 @@ def generate_images_batch(
             prompts (List[str]): A list of positive prompts to guide image generation.
             negative_prompt (str): The negative prompt to avoid certain features in the image.
             prompt_objects (Optional[List[List[str]]]): Optional list of objects for each prompt for CLIP model testing.
-            batch_size (int): The number of images to generate in each batch.
 
         Returns:
             List[Image.Image]: A list of generated images.
diff --git a/datadreamer/image_generation/sdxl_turbo_image_generator.py b/datadreamer/image_generation/sdxl_turbo_image_generator.py
index e78fa17..54d9dd7 100644
--- a/datadreamer/image_generation/sdxl_turbo_image_generator.py
+++ b/datadreamer/image_generation/sdxl_turbo_image_generator.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import logging
 from typing import List, Optional
 
 import torch
@@ -8,6 +9,8 @@
 
 from datadreamer.image_generation.image_generator import ImageGenerator
 
+logger = logging.getLogger(__name__)
+
 
 class StableDiffusionTurboImageGenerator(ImageGenerator):
     """A subclass of ImageGenerator specifically designed to use the Stable Diffusion
@@ -34,8 +37,8 @@ def _init_gen_model(self):
         Returns:
             AutoPipelineForText2Image: The initialized Stable Diffusion Turbo model.
         """
+        logger.info(f"Initializing SDXL Turbo on {self.device}...")
         if self.device == "cpu":
-            print("Loading SDXL Turbo on CPU...")
             base = AutoPipelineForText2Image.from_pretrained(
                 "stabilityai/sdxl-turbo",
                 # variant="fp16",
@@ -44,7 +47,6 @@ def _init_gen_model(self):
             )
             base.to("cpu")
         else:
-            print("Loading SDXL Turbo on GPU...")
             base = AutoPipelineForText2Image.from_pretrained(
                 "stabilityai/sdxl-turbo",
                 torch_dtype=torch.float16,
diff --git a/datadreamer/pipelines/generate_dataset_from_scratch.py b/datadreamer/pipelines/generate_dataset_from_scratch.py
index b1aef52..03e3a7d 100644
--- a/datadreamer/pipelines/generate_dataset_from_scratch.py
+++ b/datadreamer/pipelines/generate_dataset_from_scratch.py
@@ -11,6 +11,7 @@
 import torch
 from box import Box
 from luxonis_ml.data import DATASETS_REGISTRY, LOADERS_REGISTRY
+from luxonis_ml.utils import setup_logging
 from PIL import Image
 from tqdm import tqdm
 
@@ -50,6 +51,8 @@
 det_annotators = {"owlv2": OWLv2Annotator}
 clf_annotators = {"clip": CLIPAnnotator}
 
+setup_logging(use_rich=True)
+
 
 def parse_args():
     # Argument parsing
diff --git a/datadreamer/prompt_generation/lm_prompt_generator.py b/datadreamer/prompt_generation/lm_prompt_generator.py
index 10ca96e..adbd44c 100644
--- a/datadreamer/prompt_generation/lm_prompt_generator.py
+++ b/datadreamer/prompt_generation/lm_prompt_generator.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import logging
 import random
 import re
 from typing import List, Literal, Optional
@@ -16,6 +17,8 @@
 
 from datadreamer.prompt_generation.prompt_generator import PromptGenerator
 
+logger = logging.getLogger(__name__)
+
 
 class LMPromptGenerator(PromptGenerator):
     """A language model-based prompt generator class, extending PromptGenerator.
@@ -69,8 +72,8 @@ def _init_lang_model(self) -> tuple[AutoModelForCausalLM, AutoTokenizer, Pipelin
             tuple: The initialized language model, tokenizer and pipeline.
         """
         selected_dtype = "auto"
+        logger.info(f"Initializing Mistral-7B language model on {self.device}...")
         if self.device == "cpu":
-            print("Loading language model on CPU...")
             model = AutoModelForCausalLM.from_pretrained(
                 "mistralai/Mistral-7B-Instruct-v0.1",
                 torch_dtype="auto",
@@ -79,7 +82,7 @@ def _init_lang_model(self) -> tuple[AutoModelForCausalLM, AutoTokenizer, Pipelin
             )
         else:
             if self.quantization == "none":
-                print("Loading FP16 language model on GPU...")
+                logger.info("Loading FP16 language model...")
                 selected_dtype = torch.float16
                 model = AutoModelForCausalLM.from_pretrained(
                     "mistralai/Mistral-7B-Instruct-v0.1",
@@ -88,7 +91,7 @@ def _init_lang_model(self) -> tuple[AutoModelForCausalLM, AutoTokenizer, Pipelin
                     device_map=self.device,
                 )
             else:
-                print("Loading INT4 language model on GPU...")
+                logger.info("Loading INT4 language model...")
                 # Create the BitsAndBytesConfig object with the dynamically constructed arguments
                 bnb_config = BitsAndBytesConfig(
                     load_in_4bit=True,
@@ -115,7 +118,6 @@ def _init_lang_model(self) -> tuple[AutoModelForCausalLM, AutoTokenizer, Pipelin
             device_map=self.device,
             batch_size=self.batch_size,
         )
-        print("Done!")
         return model, tokenizer, pipe
 
     def _remove_incomplete_sentence(self, text: str) -> str:
@@ -219,7 +221,7 @@ def generate_prompts(self) -> List[str]:
         """
         prompts = []
         progress_bar = tqdm(
-            desc="Generating prompts...", position=0, total=self.prompts_number
+            desc="Generating prompts", position=0, total=self.prompts_number
         )
         while len(prompts) < self.prompts_number:
             selected_objects_batch = [
diff --git a/datadreamer/prompt_generation/lm_synonym_generator.py b/datadreamer/prompt_generation/lm_synonym_generator.py
index fc86db8..a971655 100644
--- a/datadreamer/prompt_generation/lm_synonym_generator.py
+++ b/datadreamer/prompt_generation/lm_synonym_generator.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import logging
 import re
 from typing import List, Optional
 
@@ -13,6 +14,8 @@
 
 from datadreamer.prompt_generation.synonym_generator import SynonymGenerator
 
+logger = logging.getLogger(__name__)
+
 
 class LMSynonymGenerator(SynonymGenerator):
     """Synonym generator that generates synonyms for a list of words using a language
@@ -48,8 +51,8 @@ def _init_lang_model(self) -> tuple[AutoModelForCausalLM, AutoTokenizer, Pipelin
         Returns:
             tuple: The initialized language model, tokenizer and pipeline.
         """
+        logger.info(f"Initializing Mistral-7B language model on {self.device}...")
         if self.device == "cpu":
-            print("Loading language model on CPU...")
             model = AutoModelForCausalLM.from_pretrained(
                 "mistralai/Mistral-7B-Instruct-v0.1",
                 torch_dtype="auto",
@@ -57,7 +60,7 @@ def _init_lang_model(self) -> tuple[AutoModelForCausalLM, AutoTokenizer, Pipelin
                 low_cpu_mem_usage=True,
             )
         else:
-            print("Loading FP16 language model on GPU...")
+            logger.info("Loading FP16 language model...")
             model = AutoModelForCausalLM.from_pretrained(
                 "mistralai/Mistral-7B-Instruct-v0.1",
                 torch_dtype=torch.float16,
@@ -73,7 +76,7 @@ def _init_lang_model(self) -> tuple[AutoModelForCausalLM, AutoTokenizer, Pipelin
             torch_dtype=torch.float16 if self.device == "cuda" else "auto",
             device_map=self.device,
         )
-        print("Done!")
+        logger.info("Done!")
         return model, tokenizer, pipe
 
     def _generate_synonyms(self, prompt_text: str) -> List[str]:
diff --git a/datadreamer/prompt_generation/synonym_generator.py b/datadreamer/prompt_generation/synonym_generator.py
index ec3f306..bb4240e 100644
--- a/datadreamer/prompt_generation/synonym_generator.py
+++ b/datadreamer/prompt_generation/synonym_generator.py
@@ -1,11 +1,14 @@
 from __future__ import annotations
 
 import json
+import logging
 from abc import ABC, abstractmethod
 from typing import List, Optional
 
 from tqdm import tqdm
 
+logger = logging.getLogger(__name__)
+
 
 # Abstract base class for synonym generation
 class SynonymGenerator(ABC):
@@ -51,7 +54,6 @@ def generate_synonyms_for_list(self, words: List[str]) -> dict:
         for word in tqdm(words, desc="Generating synonyms"):
             synonyms = self.generate_synonyms(word)
             synonyms_dict[word] = synonyms
-        print("Synonyms generated")
         return synonyms_dict
 
     def save_synonyms(self, synonyms, save_path: str) -> None:
diff --git a/datadreamer/prompt_generation/tinyllama_lm_prompt_generator.py b/datadreamer/prompt_generation/tinyllama_lm_prompt_generator.py
index 78238e7..ed5fdcf 100644
--- a/datadreamer/prompt_generation/tinyllama_lm_prompt_generator.py
+++ b/datadreamer/prompt_generation/tinyllama_lm_prompt_generator.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import logging
 import re
 from typing import List, Literal, Optional
 
@@ -8,6 +9,8 @@
 
 from datadreamer.prompt_generation.lm_prompt_generator import LMPromptGenerator
 
+logger = logging.getLogger(__name__)
+
 
 class TinyLlamaLMPromptGenerator(LMPromptGenerator):
     """A language model-based prompt generator class, extending PromptGenerator.
@@ -53,8 +56,8 @@ def _init_lang_model(self) -> tuple[AutoModelForCausalLM, AutoTokenizer, Pipelin
         Returns:
             tuple: The initialized language model, tokenizer and pipeline.
         """
+        logger.info(f"Initializing TinyLlama-1.1B language model on {self.device}...")
         if self.device == "cpu":
-            print("Loading language model on CPU...")
             model = AutoModelForCausalLM.from_pretrained(
                 "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                 torch_dtype="auto",
@@ -62,7 +65,6 @@ def _init_lang_model(self) -> tuple[AutoModelForCausalLM, AutoTokenizer, Pipelin
                 low_cpu_mem_usage=True,
             )
         else:
-            print("Loading language model on GPU...")
             model = AutoModelForCausalLM.from_pretrained(
                 "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                 torch_dtype=torch.float16,
@@ -82,7 +84,6 @@ def _init_lang_model(self) -> tuple[AutoModelForCausalLM, AutoTokenizer, Pipelin
             device_map=self.device,
             batch_size=self.batch_size,
         )
-        print("Done!")
         return model, tokenizer, pipe
 
     def _remove_caption_sentences(self, text: str) -> str:
diff --git a/datadreamer/utils/luxonis_dataset_converter.py b/datadreamer/utils/luxonis_dataset_converter.py
index 72dc06d..9b55f79 100644
--- a/datadreamer/utils/luxonis_dataset_converter.py
+++ b/datadreamer/utils/luxonis_dataset_converter.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import logging
 import os
 
 from luxonis_ml.data import DATASETS_REGISTRY, LuxonisDataset
@@ -8,6 +9,8 @@
 
 from datadreamer.utils import BaseConverter
 
+logger = logging.getLogger(__name__)
+
 
 class LuxonisDatasetConverter(BaseConverter):
     """Class for converting a dataset to LuxonisDataset format."""
@@ -84,7 +87,7 @@ def dataset_generator():
         # if dataset_plugin is set, use that
         if self.dataset_plugin:
             if "GOOGLE_APPLICATION_CREDENTIALS" in os.environ:
-                print(f"Using {self.dataset_plugin} dataset")
+                logger.info(f"Using {self.dataset_plugin} dataset")
                 dataset_constructor = DATASETS_REGISTRY.get(self.dataset_plugin)
                 dataset = dataset_constructor(dataset_name)
             else:
@@ -96,10 +99,10 @@ def dataset_generator():
             "LUXONISML_BUCKET" in os.environ
             and "GOOGLE_APPLICATION_CREDENTIALS" in os.environ
         ):
-            print("Using GCS bucket")
+            logger.info("Using GCS bucket")
             dataset = LuxonisDataset(dataset_name, bucket_storage=BucketStorage.GCS)
         else:
-            print("Using local dataset")
+            logger.info("Using local dataset")
             dataset = LuxonisDataset(dataset_name)
 
         dataset.add(dataset_generator())
diff --git a/datadreamer/utils/merge_raw_datasets.py b/datadreamer/utils/merge_raw_datasets.py
index 47c1dc0..77f7359 100644
--- a/datadreamer/utils/merge_raw_datasets.py
+++ b/datadreamer/utils/merge_raw_datasets.py
@@ -2,9 +2,12 @@
 
 import argparse
 import json
+import logging
 import os
 import shutil
 
+logger = logging.getLogger(__name__)
+
 
 def merge_datasets(input_dirs, output_dir, copy_files=True):
     config_tasks = []
@@ -29,7 +32,7 @@ def merge_datasets(input_dirs, output_dir, copy_files=True):
         raise ValueError("All datasets must have different random seeds")
 
     # Create output directory
-    print(f"Output directory: {output_dir}")
+    logger.info(f"Output directory: {output_dir}")
     if os.path.exists(output_dir):
         shutil.rmtree(output_dir)
     os.makedirs(output_dir)
diff --git a/datadreamer/utils/nms.py b/datadreamer/utils/nms.py
index 530707c..1373858 100644
--- a/datadreamer/utils/nms.py
+++ b/datadreamer/utils/nms.py
@@ -4,6 +4,7 @@
 # https://github.com/ultralytics/yolov5/blob/master/utils/general.py
 from __future__ import annotations
 
+import logging
 import os
 import time
 
@@ -22,6 +23,8 @@
 )  # prevent OpenCV from multithreading (incompatible with PyTorch DataLoader)
 os.environ["NUMEXPR_MAX_THREADS"] = str(min(os.cpu_count(), 8))  # NumExpr max threads
 
+logger = logging.getLogger(__name__)
+
 
 def xywh2xyxy(x):
     """Convert boxes with shape [n, 4] from [x, y, w, h] to [x1, y1, x2, y2] where x1y1
@@ -131,7 +134,7 @@ def non_max_suppression(
 
         output[img_idx] = x[keep_box_idx]
         if (time.time() - tik) > time_limit:
-            print(f"WARNING: NMS cost time exceed the limited {time_limit}s.")
+            logger.warning(f"WARNING: NMS cost time exceed the limited {time_limit}s.")
             break  # time limit exceeded
 
     return output
diff --git a/datadreamer/utils/single_label_cls_converter.py b/datadreamer/utils/single_label_cls_converter.py
index e5515d5..e447c40 100644
--- a/datadreamer/utils/single_label_cls_converter.py
+++ b/datadreamer/utils/single_label_cls_converter.py
@@ -1,10 +1,13 @@
 from __future__ import annotations
 
+import logging
 import os
 import shutil
 
 from datadreamer.utils import BaseConverter
 
+logger = logging.getLogger(__name__)
+
 
 class SingleLabelClsConverter(BaseConverter):
     """Class for converting a dataset for single-label classification task.
@@ -64,12 +67,12 @@ def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=Tru
         class_names = data["class_names"]
         images.remove("class_names")
 
-        print(f"Number of images: {len(images)}")
+        logger.info(f"Number of images: {len(images)}")
 
         # Remove images with multiple labels
         single_label_images = [img for img in images if len(data[img]["labels"]) == 1]
 
-        print(f"Number of images with single label: {len(single_label_images)}")
+        logger.info(f"Number of images with single label: {len(single_label_images)}")
 
         # Split the data into training, validation, and test sets
         train_images, val_images, test_images = BaseConverter.make_splits(

From 8d47a8ab249549d58b922dfa856f6d8a24946b24 Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Tue, 24 Sep 2024 21:31:38 +0000
Subject: [PATCH 03/31] test: add utils tests

---
 tests/unittests/test_utils.py | 184 ++++++++++++++++++++++++++++++++++
 1 file changed, 184 insertions(+)
 create mode 100644 tests/unittests/test_utils.py

diff --git a/tests/unittests/test_utils.py b/tests/unittests/test_utils.py
new file mode 100644
index 0000000..02faa19
--- /dev/null
+++ b/tests/unittests/test_utils.py
@@ -0,0 +1,184 @@
+import json
+import os
+import shutil
+import unittest
+
+from luxonis_ml.data import LuxonisDataset
+from PIL import Image
+import numpy as np
+import torch
+
+from datadreamer.utils import (
+    dataset_utils,
+    merge_raw_datasets,
+)
+
+def create_sample_image(image_name, image_size=(100, 100), color=(255, 0, 0), save_dir="test_images"):
+    """
+    Create and save a simple image with a solid color.
+    
+    Args:
+        image_name (str): The name of the image file.
+        image_size (tuple): The size of the image (width, height).
+        color (tuple): The RGB color of the image.
+        save_dir (str): The directory to save the images.
+    """
+    # Create the directory if it doesn't exist
+    os.makedirs(save_dir, exist_ok=True)
+    
+    # Create a blank image with the given color
+    img = Image.new('RGB', image_size, color)
+    
+    # Save the image to the specified directory
+    img.save(os.path.join(save_dir, image_name))
+
+
+class TestSaveAnnotationsToJson(unittest.TestCase):
+
+    def setUp(self):
+        # Create a temporary directory for saving images and JSON file
+        self.test_dir = "test_dir"
+        self.image_dir = "test_images"
+        os.makedirs(self.test_dir, exist_ok=True)
+        os.makedirs(self.image_dir, exist_ok=True)
+
+        # Create sample images
+        create_sample_image("image1.jpg", save_dir=self.image_dir)
+        create_sample_image("image2.jpg", save_dir=self.image_dir)
+
+        self.file_name = "annotations.json"
+        self.image_paths = [
+            os.path.join(self.image_dir, "image1.jpg"),
+            os.path.join(self.image_dir, "image2.jpg"),
+        ]
+        self.labels_list = [
+            [0],  # Labels for image1
+            [1],  # Labels for image2
+        ]
+        self.labels_list = np.array(self.labels_list)
+        self.boxes_list = [
+            [[10, 10, 50, 50]],  # Bounding boxes for image1
+            [[20, 20, 40, 40]],  # Bounding boxes for image2
+        ]
+        self.boxes_list = np.array(self.boxes_list)
+        self.class_names = ["class_1", "class_2"]
+
+    def tearDown(self):
+        # Clean up the test directory after each test
+        for file in os.listdir(self.test_dir):
+            os.remove(os.path.join(self.test_dir, file))
+        for file in os.listdir(self.image_dir):
+            os.remove(os.path.join(self.image_dir, file))
+        os.rmdir(self.test_dir)
+        os.rmdir(self.image_dir)
+
+    def test_save_annotations_to_json(self):
+        # Test saving annotations to JSON
+        dataset_utils.save_annotations_to_json(
+            self.image_paths,
+            self.labels_list,
+            boxes_list=self.boxes_list,
+            class_names=self.class_names,
+            save_dir=self.test_dir,
+            file_name=self.file_name,
+        )
+
+        # Load the saved JSON file and check contents
+        with open(os.path.join(self.test_dir, self.file_name), "r") as f:
+            annotations = json.load(f)
+
+        # Check if annotations are correct
+        self.assertEqual(len(annotations), 3)  # 2 images + class_names
+        self.assertIn("image1.jpg", annotations)
+        self.assertIn("image2.jpg", annotations)
+        self.assertEqual(annotations["image1.jpg"]["labels"], [0])
+        self.assertEqual(annotations["image2.jpg"]["labels"], [1])
+        self.assertEqual(annotations["class_names"], self.class_names)
+
+class TestMergeDatasets(unittest.TestCase):
+
+    def setUp(self):
+        # Create temporary directories for test datasets
+        self.input_dir_1 = "input_dir_1"
+        self.input_dir_2 = "input_dir_2"
+        self.input_dir_3 = "input_dir_3"
+        self.output_dir = "output_dir"
+        os.makedirs(self.input_dir_1, exist_ok=True)
+        os.makedirs(self.input_dir_2, exist_ok=True)
+        os.makedirs(self.input_dir_3, exist_ok=True)
+
+        # Create generation_args.json files
+        self.generation_args_1 = {
+            "task": "object_detection",
+            "class_names": ["class_1", "class_2"],
+            "seed": 1,
+        }
+        self.generation_args_2 = {
+            "task": "object_detection",
+            "class_names": ["class_1", "class_2"],
+            "seed": 2,
+        }
+        with open(os.path.join(self.input_dir_1, "generation_args.yaml"), "w") as f:
+            json.dump(self.generation_args_1, f)
+        with open(os.path.join(self.input_dir_2, "generation_args.yaml"), "w") as f:
+            json.dump(self.generation_args_2, f)
+
+        # Create annotations.json files
+        self.annotations_1 = {
+            "image1.jpg": {"labels": [0]},
+            "image2.jpg": {"labels": [1]},
+            "class_names": ["class_1", "class_2"],
+        }
+        self.annotations_2 = {
+            "image3.jpg": {"labels": [0]},
+            "image4.jpg": {"labels": [1]},
+            "class_names": ["class_1", "class_2"],
+        }
+        with open(os.path.join(self.input_dir_1, "annotations.json"), "w") as f:
+            json.dump(self.annotations_1, f)
+        with open(os.path.join(self.input_dir_2, "annotations.json"), "w") as f:
+            json.dump(self.annotations_2, f)
+
+        # Create image files
+        with open(os.path.join(self.input_dir_1, "image1.jpg"), "wb") as f:
+            f.write(os.urandom(1024))  # Dummy image content
+        with open(os.path.join(self.input_dir_1, "image2.jpg"), "wb") as f:
+            f.write(os.urandom(1024))  # Dummy image content
+        with open(os.path.join(self.input_dir_2, "image3.jpg"), "wb") as f:
+            f.write(os.urandom(1024))  # Dummy image content
+        with open(os.path.join(self.input_dir_2, "image4.jpg"), "wb") as f:
+            f.write(os.urandom(1024))  # Dummy image content
+
+    def tearDown(self):
+        # Clean up the test directories after each test
+        shutil.rmtree(self.input_dir_1)
+        shutil.rmtree(self.input_dir_2)
+        if os.path.exists(self.output_dir):
+            shutil.rmtree(self.output_dir)
+
+    def test_merge_datasets(self):
+        # Test merging datasets
+        merge_raw_datasets.merge_datasets([self.input_dir_1, self.input_dir_2], self.output_dir, copy_files=True)
+
+        # Check if output directory is created
+        self.assertTrue(os.path.exists(self.output_dir))
+
+        # Check if annotations.json is merged correctly
+        with open(os.path.join(self.output_dir, "annotations.json"), "r") as f:
+            merged_annotations = json.load(f)
+
+        print(merged_annotations)
+
+        self.assertEqual(len(merged_annotations), 5)  # 4 images in total + class_names
+        self.assertIn("image1.jpg", merged_annotations)
+        self.assertIn("image2.jpg", merged_annotations)
+        self.assertIn("image3.jpg", merged_annotations)
+        self.assertIn("image4.jpg", merged_annotations)
+        self.assertEqual(merged_annotations["class_names"], ["class_1", "class_2"])
+
+        # Check if images are copied correctly
+        for image_name in ["image1.jpg", "image2.jpg", "image3.jpg", "image4.jpg"]:
+            self.assertTrue(os.path.exists(os.path.join(self.output_dir, image_name)))
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file

From 32d16ad11b65d1cb8887977925c53a73fd84694d Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Tue, 24 Sep 2024 21:37:14 +0000
Subject: [PATCH 04/31] style: utils tests formatting

---
 tests/unittests/test_utils.py | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/tests/unittests/test_utils.py b/tests/unittests/test_utils.py
index 02faa19..bac169b 100644
--- a/tests/unittests/test_utils.py
+++ b/tests/unittests/test_utils.py
@@ -3,20 +3,20 @@
 import shutil
 import unittest
 
-from luxonis_ml.data import LuxonisDataset
-from PIL import Image
 import numpy as np
-import torch
+from PIL import Image
 
 from datadreamer.utils import (
     dataset_utils,
     merge_raw_datasets,
 )
 
-def create_sample_image(image_name, image_size=(100, 100), color=(255, 0, 0), save_dir="test_images"):
-    """
-    Create and save a simple image with a solid color.
-    
+
+def create_sample_image(
+    image_name, image_size=(100, 100), color=(255, 0, 0), save_dir="test_images"
+):
+    """Create and save a simple image with a solid color.
+
     Args:
         image_name (str): The name of the image file.
         image_size (tuple): The size of the image (width, height).
@@ -25,16 +25,15 @@ def create_sample_image(image_name, image_size=(100, 100), color=(255, 0, 0), sa
     """
     # Create the directory if it doesn't exist
     os.makedirs(save_dir, exist_ok=True)
-    
+
     # Create a blank image with the given color
-    img = Image.new('RGB', image_size, color)
-    
+    img = Image.new("RGB", image_size, color)
+
     # Save the image to the specified directory
     img.save(os.path.join(save_dir, image_name))
 
 
 class TestSaveAnnotationsToJson(unittest.TestCase):
-
     def setUp(self):
         # Create a temporary directory for saving images and JSON file
         self.test_dir = "test_dir"
@@ -95,8 +94,8 @@ def test_save_annotations_to_json(self):
         self.assertEqual(annotations["image2.jpg"]["labels"], [1])
         self.assertEqual(annotations["class_names"], self.class_names)
 
-class TestMergeDatasets(unittest.TestCase):
 
+class TestMergeDatasets(unittest.TestCase):
     def setUp(self):
         # Create temporary directories for test datasets
         self.input_dir_1 = "input_dir_1"
@@ -158,7 +157,9 @@ def tearDown(self):
 
     def test_merge_datasets(self):
         # Test merging datasets
-        merge_raw_datasets.merge_datasets([self.input_dir_1, self.input_dir_2], self.output_dir, copy_files=True)
+        merge_raw_datasets.merge_datasets(
+            [self.input_dir_1, self.input_dir_2], self.output_dir, copy_files=True
+        )
 
         # Check if output directory is created
         self.assertTrue(os.path.exists(self.output_dir))
@@ -180,5 +181,6 @@ def test_merge_datasets(self):
         for image_name in ["image1.jpg", "image2.jpg", "image3.jpg", "image4.jpg"]:
             self.assertTrue(os.path.exists(os.path.join(self.output_dir, image_name)))
 
+
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()

From 7a3e81eae2d98d33f1e4cc1f8c100fb1a6b9b6ca Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Tue, 24 Sep 2024 21:46:21 +0000
Subject: [PATCH 05/31] fix: args extension in merge dataset function

---
 datadreamer/utils/merge_raw_datasets.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/datadreamer/utils/merge_raw_datasets.py b/datadreamer/utils/merge_raw_datasets.py
index 77f7359..4cea85f 100644
--- a/datadreamer/utils/merge_raw_datasets.py
+++ b/datadreamer/utils/merge_raw_datasets.py
@@ -14,7 +14,7 @@ def merge_datasets(input_dirs, output_dir, copy_files=True):
     config_classes = []
     random_seeds = []
     for input_dir in input_dirs:
-        with open(os.path.join(input_dir, "generation_args.json")) as f:
+        with open(os.path.join(input_dir, "generation_args.yaml")) as f:
             generation_args = json.load(f)
         config_tasks.append(generation_args["task"])
         config_classes.append(generation_args["class_names"])
@@ -48,12 +48,12 @@ def merge_datasets(input_dirs, output_dir, copy_files=True):
         if copy_files:
             shutil.copy(
                 os.path.join(input_dir, "generation_args.yaml"),
-                os.path.join(output_dir, f"generation_args_{i}.json"),
+                os.path.join(output_dir, f"generation_args_{i}.yaml"),
             )
         else:
             shutil.move(
                 os.path.join(input_dir, "generation_args.yaml"),
-                os.path.join(output_dir, f"generation_args_{i}.json"),
+                os.path.join(output_dir, f"generation_args_{i}.yaml"),
             )
 
         # Copy or move images

From c5dea80dcbf60088a5213dfa929f5d4b093de322 Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Tue, 24 Sep 2024 22:12:41 +0000
Subject: [PATCH 06/31] docs: docstrings and return types

---
 .../dataset_annotation/clip_annotator.py        |  6 +++---
 .../dataset_annotation/owlv2_annotator.py       |  4 ++--
 datadreamer/dataset_annotation/utils.py         |  2 +-
 .../image_generation/clip_image_tester.py       |  4 +++-
 datadreamer/image_generation/image_generator.py |  4 ++--
 .../image_generation/sdxl_image_generator.py    |  4 ++--
 .../sdxl_lightning_image_generator.py           |  4 ++--
 .../sdxl_turbo_image_generator.py               |  2 +-
 datadreamer/utils/base_converter.py             |  6 +++---
 datadreamer/utils/coco_converter.py             | 10 +++++++---
 datadreamer/utils/convert_dataset.py            | 17 ++++++++++++++++-
 datadreamer/utils/dataset_utils.py              | 17 ++++++++++++++++-
 datadreamer/utils/luxonis_dataset_converter.py  | 16 ++++++++++++++--
 datadreamer/utils/merge_raw_datasets.py         | 13 ++++++++++++-
 datadreamer/utils/nms.py                        |  2 +-
 datadreamer/utils/single_label_cls_converter.py |  6 ++++--
 datadreamer/utils/yolo_converter.py             | 10 ++++++----
 17 files changed, 95 insertions(+), 32 deletions(-)

diff --git a/datadreamer/dataset_annotation/clip_annotator.py b/datadreamer/dataset_annotation/clip_annotator.py
index 3c4e6cd..1da779d 100644
--- a/datadreamer/dataset_annotation/clip_annotator.py
+++ b/datadreamer/dataset_annotation/clip_annotator.py
@@ -50,7 +50,7 @@ def __init__(
         self.device = device
         self.model.to(self.device)
 
-    def _init_processor(self):
+    def _init_processor(self) -> CLIPProcessor:
         """Initializes the CLIP processor.
 
         Returns:
@@ -60,7 +60,7 @@ def _init_processor(self):
             return CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
         return CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
-    def _init_model(self):
+    def _init_model(self) -> CLIPModel:
         """Initializes the CLIP model.
 
         Returns:
@@ -87,7 +87,7 @@ def annotate_batch(
             synonym_dict (dict, optional): Dictionary for handling synonyms in labels. Defaults to None.
 
         Returns:
-            List[List[int]]: A list of lists of labels for each image.
+            List[np.ndarray]: A list of the annotations for each image.
         """
         if synonym_dict is not None:
             objs_syn = set()
diff --git a/datadreamer/dataset_annotation/owlv2_annotator.py b/datadreamer/dataset_annotation/owlv2_annotator.py
index 231558b..3537fdc 100644
--- a/datadreamer/dataset_annotation/owlv2_annotator.py
+++ b/datadreamer/dataset_annotation/owlv2_annotator.py
@@ -51,7 +51,7 @@ def __init__(
         self.device = device
         self.model.to(self.device)
 
-    def _init_model(self):
+    def _init_model(self) -> Owlv2ForObjectDetection:
         """Initializes the OWLv2 model for object detection.
 
         Returns:
@@ -66,7 +66,7 @@ def _init_model(self):
             "google/owlv2-base-patch16-ensemble"
         )
 
-    def _init_processor(self):
+    def _init_processor(self) -> Owlv2Processor:
         """Initializes the processor for the OWLv2 model.
 
         Returns:
diff --git a/datadreamer/dataset_annotation/utils.py b/datadreamer/dataset_annotation/utils.py
index 942d1a4..e0edc20 100644
--- a/datadreamer/dataset_annotation/utils.py
+++ b/datadreamer/dataset_annotation/utils.py
@@ -3,7 +3,7 @@
 from torchvision import transforms
 
 
-def apply_tta(image):
+def apply_tta(image) -> list:
     """Apply test-time augmentation (TTA) to the given image.
 
     Args:
diff --git a/datadreamer/image_generation/clip_image_tester.py b/datadreamer/image_generation/clip_image_tester.py
index c1bf3b6..d2aeccb 100644
--- a/datadreamer/image_generation/clip_image_tester.py
+++ b/datadreamer/image_generation/clip_image_tester.py
@@ -33,7 +33,9 @@ def __init__(self, device: str = "cuda") -> None:
         self.device = device
         self.clip.to(self.device)
 
-    def test_image(self, image: Image.Image, objects: List[str], conf_threshold=0.05):
+    def test_image(
+        self, image: Image.Image, objects: List[str], conf_threshold=0.05
+    ) -> tuple:
         """Tests the generated image against a set of objects using the CLIP model.
 
         Args:
diff --git a/datadreamer/image_generation/image_generator.py b/datadreamer/image_generation/image_generator.py
index 3e2aff7..bfbc53d 100644
--- a/datadreamer/image_generation/image_generator.py
+++ b/datadreamer/image_generation/image_generator.py
@@ -64,7 +64,7 @@ def __init__(
             self.set_seed(seed)
 
     @staticmethod
-    def set_seed(seed: int):
+    def set_seed(seed: int) -> None:
         """Sets the seed for random number generators in Python and PyTorch.
 
         Args:
@@ -78,7 +78,7 @@ def generate_images(
         self,
         prompts: Union[str, List[str]],
         prompt_objects: Optional[List[List[str]]] = None,
-    ):
+    ) -> List[Image.Image]:
         """Generates images based on the provided prompts and optional object prompts.
 
         Args:
diff --git a/datadreamer/image_generation/sdxl_image_generator.py b/datadreamer/image_generation/sdxl_image_generator.py
index 6dc583e..7ccd908 100644
--- a/datadreamer/image_generation/sdxl_image_generator.py
+++ b/datadreamer/image_generation/sdxl_image_generator.py
@@ -36,7 +36,7 @@ def __init__(self, *args, **kwargs):
         self.base, self.refiner = self._init_gen_model()
         self.base_processor, self.refiner_processor = self._init_processor()
 
-    def _init_gen_model(self):
+    def _init_gen_model(self) -> tuple:
         """Initializes the base and refiner models of Stable Diffusion.
 
         Returns:
@@ -80,7 +80,7 @@ def _init_gen_model(self):
 
         return base, refiner
 
-    def _init_processor(self):
+    def _init_processor(self) -> tuple:
         """Initializes the processors for the base and refiner models.
 
         Returns:
diff --git a/datadreamer/image_generation/sdxl_lightning_image_generator.py b/datadreamer/image_generation/sdxl_lightning_image_generator.py
index cc43dfe..f4520e4 100644
--- a/datadreamer/image_generation/sdxl_lightning_image_generator.py
+++ b/datadreamer/image_generation/sdxl_lightning_image_generator.py
@@ -40,7 +40,7 @@ def __init__(self, *args, **kwargs):
         self.pipe = self._init_gen_model()
         self.compel = self._init_compel()
 
-    def _init_gen_model(self):
+    def _init_gen_model(self) -> StableDiffusionXLPipeline:
         """Initializes the Stable Diffusion Lightning model for image generation.
 
         Returns:
@@ -75,7 +75,7 @@ def _init_gen_model(self):
 
         return pipe
 
-    def _init_compel(self):
+    def _init_compel(self) -> Compel:
         """Initializes the Compel model for text prompt weighting.
 
         Returns:
diff --git a/datadreamer/image_generation/sdxl_turbo_image_generator.py b/datadreamer/image_generation/sdxl_turbo_image_generator.py
index 54d9dd7..abd20a0 100644
--- a/datadreamer/image_generation/sdxl_turbo_image_generator.py
+++ b/datadreamer/image_generation/sdxl_turbo_image_generator.py
@@ -31,7 +31,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.base = self._init_gen_model()
 
-    def _init_gen_model(self):
+    def _init_gen_model(self) -> AutoPipelineForText2Image:
         """Initializes the Stable Diffusion Turbo model for image generation.
 
         Returns:
diff --git a/datadreamer/utils/base_converter.py b/datadreamer/utils/base_converter.py
index 3d97199..2de019d 100644
--- a/datadreamer/utils/base_converter.py
+++ b/datadreamer/utils/base_converter.py
@@ -13,7 +13,7 @@ def __init__(self, seed=42):
         np.random.seed(seed)
 
     @abstractmethod
-    def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True):
+    def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> None:
         """Converts a dataset into another format.
 
         Args:
@@ -28,7 +28,7 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True):
         pass
 
     @staticmethod
-    def read_annotations(annotation_path):
+    def read_annotations(annotation_path) -> dict:
         """Reads annotations from a JSON file located at the specified path.
 
         Args:
@@ -42,7 +42,7 @@ def read_annotations(annotation_path):
         return data
 
     @staticmethod
-    def make_splits(images, split_ratios, shuffle=True):
+    def make_splits(images, split_ratios, shuffle=True) -> tuple:
         """Splits the list of images into training, validation, and test sets.
 
         Args:
diff --git a/datadreamer/utils/coco_converter.py b/datadreamer/utils/coco_converter.py
index ba02d97..9e94469 100644
--- a/datadreamer/utils/coco_converter.py
+++ b/datadreamer/utils/coco_converter.py
@@ -31,7 +31,7 @@ class COCOConverter(BaseConverter):
     def __init__(self, seed=42):
         super().__init__(seed)
 
-    def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True):
+    def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> None:
         """Converts a dataset into a COCO format.
 
         Args:
@@ -46,7 +46,9 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True):
         data = BaseConverter.read_annotations(annotation_path)
         self.process_data(data, dataset_dir, output_dir, split_ratios, copy_files)
 
-    def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=True):
+    def process_data(
+        self, data, image_dir, output_dir, split_ratios, copy_files=True
+    ) -> None:
         """Processes the data by dividing it into training and validation sets, and
         saves the images and labels in COCO format.
 
@@ -126,7 +128,9 @@ def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=Tru
                 dataset_output_dir, images_info, annotations, data["class_names"]
             )
 
-    def save_labels(self, dataset_output_dir, images_info, annotations, class_names):
+    def save_labels(
+        self, dataset_output_dir, images_info, annotations, class_names
+    ) -> None:
         """Saves the labels to a JSON file.
 
         Args:
diff --git a/datadreamer/utils/convert_dataset.py b/datadreamer/utils/convert_dataset.py
index 21f1159..874878b 100644
--- a/datadreamer/utils/convert_dataset.py
+++ b/datadreamer/utils/convert_dataset.py
@@ -19,7 +19,22 @@ def convert_dataset(
     dataset_name=None,
     copy_files=True,
     seed=42,
-):
+) -> None:
+    """Converts a dataset from one format to another.
+
+    Args:
+        input_dir (str): Directory containing the images and annotations.
+        output_dir (str): Directory where the processed dataset will be saved.
+        dataset_format (str): Format of the dataset. Can be 'yolo', 'coco', 'luxonis-dataset', or 'cls-single'.
+        split_ratios (list): List of ratios for train, val, and test splits.
+        dataset_plugin (str, optional): Plugin for Luxonis dataset. Defaults to None.
+        dataset_name (str, optional): Name of the Luxonis dataset. Defaults to None.
+        copy_files (bool, optional): Whether to copy the files to the output directory. Defaults to True.
+        seed (int, optional): Random seed. Defaults to 42.
+
+    No return value.
+    """
+
     if dataset_format == "yolo":
         converter = YOLOConverter(seed=seed)
     elif dataset_format == "coco":
diff --git a/datadreamer/utils/dataset_utils.py b/datadreamer/utils/dataset_utils.py
index a396ae0..33fe003 100644
--- a/datadreamer/utils/dataset_utils.py
+++ b/datadreamer/utils/dataset_utils.py
@@ -9,7 +9,22 @@ def save_annotations_to_json(
     class_names=None,
     save_dir=None,
     file_name="annotations.json",
-):
+) -> None:
+    """Saves annotations to a JSON file.
+
+    Args:
+        image_paths (list): List of image paths.
+        labels_list (list): List of labels.
+        boxes_list (list, optional): List of bounding boxes. Defaults to None.
+        class_names (list, optional): List of class names. Defaults to None.
+        save_dir (str, optional): Directory to save the JSON file. Defaults to None.
+        file_name (str, optional): Name of the JSON file. Defaults to 'annotations.json'.
+
+    No return value.
+    """
+    if save_dir is None:
+        save_dir = os.getcwd()
+
     annotations = {}
     for i in range(len(image_paths)):
         # for image_path, bboxes, labels in zip(image_paths, boxes_list, labels_list):
diff --git a/datadreamer/utils/luxonis_dataset_converter.py b/datadreamer/utils/luxonis_dataset_converter.py
index 9b55f79..d78acf8 100644
--- a/datadreamer/utils/luxonis_dataset_converter.py
+++ b/datadreamer/utils/luxonis_dataset_converter.py
@@ -20,7 +20,7 @@ def __init__(self, dataset_plugin=None, dataset_name=None, seed=42):
         self.dataset_plugin = dataset_plugin
         self.dataset_name = dataset_name
 
-    def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True):
+    def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> None:
         """Converts a dataset into a LuxonisDataset format.
 
         Args:
@@ -35,7 +35,19 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True):
         data = BaseConverter.read_annotations(annotation_path)
         self.process_data(data, dataset_dir, output_dir, split_ratios)
 
-    def process_data(self, data, dataset_dir, output_dir, split_ratios):
+    def process_data(self, data, dataset_dir, output_dir, split_ratios) -> None:
+        """Processes the data into LuxonisDataset format.
+
+        Args:
+        - data (dict): The data to process.
+        - dataset_dir (str): The directory where the source dataset is located.
+        - output_dir (str): The directory where the processed dataset should be saved.
+        - split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
+
+        No return value.
+        """
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
         class_names = data["class_names"]
         image_paths = list(data.keys())
         image_paths.remove("class_names")
diff --git a/datadreamer/utils/merge_raw_datasets.py b/datadreamer/utils/merge_raw_datasets.py
index 4cea85f..e2639fb 100644
--- a/datadreamer/utils/merge_raw_datasets.py
+++ b/datadreamer/utils/merge_raw_datasets.py
@@ -9,7 +9,18 @@
 logger = logging.getLogger(__name__)
 
 
-def merge_datasets(input_dirs, output_dir, copy_files=True):
+def merge_datasets(input_dirs, output_dir, copy_files=True) -> None:
+    """Merges multiple raw datasets into a single dataset.
+
+    Args:
+        input_dirs (List[str]): A list of input directories containing raw datasets.
+        output_dir (str): The output directory where the merged dataset will be saved.
+        copy_files (bool, optional): Whether to copy the files from the input directories
+            to the output directory. Defaults to True.
+
+    No return value.
+    """
+    # Check if all input directories exist
     config_tasks = []
     config_classes = []
     random_seeds = []
diff --git a/datadreamer/utils/nms.py b/datadreamer/utils/nms.py
index 1373858..f277ab2 100644
--- a/datadreamer/utils/nms.py
+++ b/datadreamer/utils/nms.py
@@ -45,7 +45,7 @@ def non_max_suppression(
     agnostic=False,
     multi_label=False,
     max_det=300,
-):
+) -> list:
     """Runs Non-Maximum Suppression (NMS) on inference results.
     This code is borrowed from: https://github.com/ultralytics/yolov5/blob/47233e1698b89fc437a4fb9463c815e9171be955/utils/general.py#L775
     Args:
diff --git a/datadreamer/utils/single_label_cls_converter.py b/datadreamer/utils/single_label_cls_converter.py
index e447c40..523373d 100644
--- a/datadreamer/utils/single_label_cls_converter.py
+++ b/datadreamer/utils/single_label_cls_converter.py
@@ -35,7 +35,7 @@ class SingleLabelClsConverter(BaseConverter):
     def __init__(self, seed=42):
         super().__init__(seed)
 
-    def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True):
+    def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> None:
         """Converts a dataset into a format suitable for single-label classification.
 
         Args:
@@ -50,7 +50,9 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True):
         data = BaseConverter.read_annotations(annotation_path)
         self.process_data(data, dataset_dir, output_dir, split_ratios, copy_files)
 
-    def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=True):
+    def process_data(
+        self, data, image_dir, output_dir, split_ratios, copy_files=True
+    ) -> None:
         """Processes the data by removing images with multiple labels, then dividing it
         into training and validation sets, and saves the images with single labels.
 
diff --git a/datadreamer/utils/yolo_converter.py b/datadreamer/utils/yolo_converter.py
index 36452da..dd00a4a 100644
--- a/datadreamer/utils/yolo_converter.py
+++ b/datadreamer/utils/yolo_converter.py
@@ -32,7 +32,7 @@ class YOLOConverter(BaseConverter):
     def __init__(self, seed=42):
         super().__init__(seed)
 
-    def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True):
+    def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> None:
         """Converts a dataset into a format suitable for training with YOLO, including
         creating training and validation splits.
 
@@ -48,7 +48,7 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True):
         data = BaseConverter.read_annotations(annotation_path)
         self.process_data(data, dataset_dir, output_dir, split_ratios, copy_files)
 
-    def convert_to_yolo_format(self, box, image_width, image_height):
+    def convert_to_yolo_format(self, box, image_width, image_height) -> list:
         """Converts bounding box coordinates to YOLO format.
 
         Args:
@@ -65,7 +65,9 @@ def convert_to_yolo_format(self, box, image_width, image_height):
         height = (box[3] - box[1]) / image_height
         return [x_center, y_center, width, height]
 
-    def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=True):
+    def process_data(
+        self, data, image_dir, output_dir, split_ratios, copy_files=True
+    ) -> None:
         """Processes the data by dividing it into training and validation sets, and
         saves the images and labels in YOLO format.
 
@@ -131,7 +133,7 @@ def process_data(self, data, image_dir, output_dir, split_ratios, copy_files=Tru
 
         self.create_data_yaml(output_dir, data["class_names"])
 
-    def create_data_yaml(self, root_dir, class_names):
+    def create_data_yaml(self, root_dir, class_names) -> None:
         """Creates a YAML file for dataset configuration, specifying paths and class
         names.
 

From 02c2d14d042815260a68b6af0d428eb0e77aef64 Mon Sep 17 00:00:00 2001
From: GitHub Actions <actions@github.com>
Date: Tue, 24 Sep 2024 22:31:57 +0000
Subject: [PATCH 07/31] [Automated] Updated coverage badge

---
 media/coverage_badge.svg | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg
index b4a82e6..dd6df12 100644
--- a/media/coverage_badge.svg
+++ b/media/coverage_badge.svg
@@ -9,13 +9,13 @@
     </mask>
     <g mask="url(#a)">
         <path fill="#555" d="M0 0h63v20H0z"/>
-        <path fill="#fe7d37" d="M63 0h36v20H63z"/>
+        <path fill="#dfb317" d="M63 0h36v20H63z"/>
         <path fill="url(#b)" d="M0 0h99v20H0z"/>
     </g>
     <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
         <text x="31.5" y="15" fill="#010101" fill-opacity=".3">coverage</text>
         <text x="31.5" y="14">coverage</text>
-        <text x="80" y="15" fill="#010101" fill-opacity=".3">56%</text>
-        <text x="80" y="14">56%</text>
+        <text x="80" y="15" fill="#010101" fill-opacity=".3">61%</text>
+        <text x="80" y="14">61%</text>
     </g>
 </svg>

From 8625376e0b525d16c114caf7bae411f2cbcb3567 Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Tue, 24 Sep 2024 22:57:40 +0000
Subject: [PATCH 08/31] fix: remove axes in bbox visualization

---
 datadreamer/pipelines/generate_dataset_from_scratch.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/datadreamer/pipelines/generate_dataset_from_scratch.py b/datadreamer/pipelines/generate_dataset_from_scratch.py
index 03e3a7d..d3ee3bf 100644
--- a/datadreamer/pipelines/generate_dataset_from_scratch.py
+++ b/datadreamer/pipelines/generate_dataset_from_scratch.py
@@ -623,11 +623,13 @@ def read_image_batch(image_batch, batch_num, batch_size):
 
                 labels_list.append(np.array(labels))
 
+                plt.axis("off")
                 plt.savefig(
                     os.path.join(
                         bbox_dir, f"bbox_{i * args.batch_size_annotation + j}.jpg"
                     )
                 )
+
                 plt.close()
 
         # Save annotations as JSON files

From 73686c05859734f24ec3a27a5cce62c3c24b92de Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Tue, 24 Sep 2024 23:27:57 +0000
Subject: [PATCH 09/31] tests: improve image generation tests

---
 tests/unittests/test_image_generation.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/tests/unittests/test_image_generation.py b/tests/unittests/test_image_generation.py
index f91fcc1..506e42c 100644
--- a/tests/unittests/test_image_generation.py
+++ b/tests/unittests/test_image_generation.py
@@ -25,6 +25,8 @@ def _check_clip_image_tester(device: str):
     url = "https://ultralytics.com/images/bus.jpg"
     im = Image.open(requests.get(url, stream=True).raw)
     tester = ClipImageTester(device=device)
+    # Check that the tester is not None
+    assert tester is not None
     passed, probs, num_passed = tester.test_image(im, ["bus"])
     # Check that the image passed the test
     assert passed is True
@@ -34,7 +36,15 @@ def _check_clip_image_tester(device: str):
     assert probs.shape == (1, 1)
     # Check that the probability is not zero
     assert probs[0, 0] > 0
-    # Release the tester
+    passed_list, probs_list, num_passed_list = tester.test_images_batch([im], [["bus"]])
+    # Check that the image passed the test
+    assert passed_list[0] is True
+    # Check that the number of objects passed is correct
+    assert num_passed_list[0] == 1
+    # Check that one probability entry is returned per input image
+    assert len(probs_list) == 1
+    # Check that the probability is not zero
+    assert probs_list[0][0] > 0
     tester.release(empty_cuda_cache=True if device != "cpu" else False)
 
 
@@ -65,6 +75,8 @@ def _check_image_generator(
     device: str,
 ):
     image_generator = image_generator_class(device=device)
+    # Check that the image generator is not None
+    assert image_generator is not None
     # Generate images and check each of them
     for generated_images_batch in image_generator.generate_images(
         ["A photo of a cat, dog"], [["cat", "dog"]]
@@ -72,6 +84,15 @@ def _check_image_generator(
         generated_image = generated_images_batch[0]
         assert generated_image is not None
         assert isinstance(generated_image, Image.Image)
+
+    images = image_generator.generate_images_batch(
+        ["A photo of a cat, dog"],
+        "blurry, bad quality",
+    )
+    assert len(images) == 1
+    assert images[0] is not None
+    assert isinstance(images[0], Image.Image)
+
     # Release the generator
     image_generator.release(empty_cuda_cache=True if device != "cpu" else False)
 

From 4882a50b412c278a093a60027a2ffcd205b3a95c Mon Sep 17 00:00:00 2001
From: GitHub Actions <actions@github.com>
Date: Tue, 24 Sep 2024 23:41:29 +0000
Subject: [PATCH 10/31] [Automated] Updated coverage badge

---
 media/coverage_badge.svg | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg
index dd6df12..2fad913 100644
--- a/media/coverage_badge.svg
+++ b/media/coverage_badge.svg
@@ -15,7 +15,7 @@
     <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
         <text x="31.5" y="15" fill="#010101" fill-opacity=".3">coverage</text>
         <text x="31.5" y="14">coverage</text>
-        <text x="80" y="15" fill="#010101" fill-opacity=".3">61%</text>
-        <text x="80" y="14">61%</text>
+        <text x="80" y="15" fill="#010101" fill-opacity=".3">62%</text>
+        <text x="80" y="14">62%</text>
     </g>
 </svg>

From 4efa3162519eb7da2f62a53441129255627bbf79 Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Wed, 25 Sep 2024 13:57:07 +0000
Subject: [PATCH 11/31] docs: fix docstrings formatting

---
 datadreamer/utils/base_converter.py           | 25 ++++++++--------
 datadreamer/utils/coco_converter.py           | 27 ++++++++---------
 .../utils/luxonis_dataset_converter.py        | 16 +++++-----
 .../utils/single_label_cls_converter.py       | 18 +++++------
 datadreamer/utils/yolo_converter.py           | 30 +++++++++----------
 5 files changed, 57 insertions(+), 59 deletions(-)

diff --git a/datadreamer/utils/base_converter.py b/datadreamer/utils/base_converter.py
index 2de019d..60a3f51 100644
--- a/datadreamer/utils/base_converter.py
+++ b/datadreamer/utils/base_converter.py
@@ -17,11 +17,10 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> Non
         """Converts a dataset into another format.
 
         Args:
-        - dataset_dir (str): The directory where the source dataset is located.
-        - output_dir (str): The directory where the processed dataset should be saved.
-        - split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-        - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
-
+            dataset_dir (str): The directory where the source dataset is located.
+            output_dir (str): The directory where the processed dataset should be saved.
+            split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
+            copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
         """
@@ -32,10 +31,10 @@ def read_annotations(annotation_path) -> dict:
         """Reads annotations from a JSON file located at the specified path.
 
         Args:
-        - annotation_path (str): The path to the JSON file containing annotations.
+            annotation_path (str): The path to the JSON file containing annotations.
 
         Returns:
-        - dict: A dictionary containing the data loaded from the JSON file.
+            dict: A dictionary containing the data loaded from the JSON file.
         """
         with open(annotation_path) as f:
             data = json.load(f)
@@ -46,14 +45,14 @@ def make_splits(images, split_ratios, shuffle=True) -> tuple:
         """Splits the list of images into training, validation, and test sets.
 
         Args:
-        - images (list of str): A list of image paths.
-        - split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-        - shuffle (bool, optional): Whether to shuffle the list of images. Defaults to True.
+            images (list of str): A list of image paths.
+            split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
+            shuffle (bool, optional): Whether to shuffle the list of images. Defaults to True.
 
         Returns:
-        - list of str: A list of image paths for the training set.
-        - list of str: A list of image paths for the validation set.
-        - list of str: A list of image paths for the test set.
+            list of str: A list of image paths for the training set.
+            list of str: A list of image paths for the validation set.
+            list of str: A list of image paths for the test set.
         """
         if shuffle:
             np.random.shuffle(images)
diff --git a/datadreamer/utils/coco_converter.py b/datadreamer/utils/coco_converter.py
index 9e94469..bcd3546 100644
--- a/datadreamer/utils/coco_converter.py
+++ b/datadreamer/utils/coco_converter.py
@@ -35,10 +35,10 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> Non
         """Converts a dataset into a COCO format.
 
         Args:
-        - dataset_dir (str): The directory where the source dataset is located.
-        - output_dir (str): The directory where the processed dataset should be saved.
-        - split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-        - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
+            dataset_dir (str): The directory where the source dataset is located.
+            output_dir (str): The directory where the processed dataset should be saved.
+            split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
+            copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
         """
@@ -53,12 +53,11 @@ def process_data(
         saves the images and labels in COCO format.
 
         Args:
-        - data (dict): The dictionary containing image annotations.
-        - image_dir (str): The directory where the source images are located.
-        - output_dir (str): The base directory where the processed data will be saved.
-        - split_ratios (float): The ratio to split the data into training, validation, and test sets.
-        - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
-
+            data (dict): The dictionary containing image annotations.
+            image_dir (str): The directory where the source images are located.
+            output_dir (str): The base directory where the processed data will be saved.
+            split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
+            copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
         """
@@ -134,10 +133,10 @@ def save_labels(
         """Saves the labels to a JSON file.
 
         Args:
-        - dataset_output_dir (str): The directory where the labels should be saved.
-        - images_info (list of dict): A list of dictionaries containing image information.
-        - annotations (list of dict): A list of dictionaries containing annotation information.
-        - class_names (list of str): A list of class names.
+            dataset_output_dir (str): The directory where the labels should be saved.
+            images_info (list of dict): A list of dictionaries containing image information.
+            annotations (list of dict): A list of dictionaries containing annotation information.
+            class_names (list of str): A list of class names.
 
         No return value.
         """
diff --git a/datadreamer/utils/luxonis_dataset_converter.py b/datadreamer/utils/luxonis_dataset_converter.py
index d78acf8..07b157e 100644
--- a/datadreamer/utils/luxonis_dataset_converter.py
+++ b/datadreamer/utils/luxonis_dataset_converter.py
@@ -24,10 +24,10 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> Non
         """Converts a dataset into a LuxonisDataset format.
 
         Args:
-        - dataset_dir (str): The directory where the source dataset is located.
-        - output_dir (str): The directory where the processed dataset should be saved.
-        - split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-        - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
+            dataset_dir (str): The directory where the source dataset is located.
+            output_dir (str): The directory where the processed dataset should be saved.
+            split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
+            copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
         """
@@ -39,10 +39,10 @@ def process_data(self, data, dataset_dir, output_dir, split_ratios) -> None:
         """Processes the data into LuxonisDataset format.
 
         Args:
-        - data (dict): The data to process.
-        - dataset_dir (str): The directory where the source dataset is located.
-        - output_dir (str): The directory where the processed dataset should be saved.
-        - split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
+            data (dict): The data to process.
+            dataset_dir (str): The directory where the source dataset is located.
+            output_dir (str): The directory where the processed dataset should be saved.
+            split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
 
         No return value.
         """
diff --git a/datadreamer/utils/single_label_cls_converter.py b/datadreamer/utils/single_label_cls_converter.py
index 523373d..8d56ad1 100644
--- a/datadreamer/utils/single_label_cls_converter.py
+++ b/datadreamer/utils/single_label_cls_converter.py
@@ -39,10 +39,10 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> Non
         """Converts a dataset into a format suitable for single-label classification.
 
         Args:
-        - dataset_dir (str): The directory where the source dataset is located.
-        - output_dir (str): The directory where the processed dataset should be saved.
-        - split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-        - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
+            dataset_dir (str): The directory where the source dataset is located.
+            output_dir (str): The directory where the processed dataset should be saved.
+            split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
+            copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
         """
@@ -57,11 +57,11 @@ def process_data(
         into training and validation sets, and saves the images with single labels.
 
         Args:
-        - data (dict): The dictionary containing image annotations.
-        - image_dir (str): The directory where the source images are located.
-        - output_dir (str): The base directory where the processed data will be saved.
-        - split_ratios (float): The ratio to split the data into training, validation, and test sets.
-        - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
+            data (dict): The dictionary containing image annotations.
+            image_dir (str): The directory where the source images are located.
+            output_dir (str): The base directory where the processed data will be saved.
+            split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
+            copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
         """
diff --git a/datadreamer/utils/yolo_converter.py b/datadreamer/utils/yolo_converter.py
index dd00a4a..08d3e85 100644
--- a/datadreamer/utils/yolo_converter.py
+++ b/datadreamer/utils/yolo_converter.py
@@ -37,10 +37,10 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> Non
         creating training and validation splits.
 
         Args:
-        - dataset_dir (str): The directory where the source dataset is located.
-        - output_dir (str): The directory where the processed dataset should be saved.
-        - split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
-        - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
+            dataset_dir (str): The directory where the source dataset is located.
+            output_dir (str): The directory where the processed dataset should be saved.
+            split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
+            copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
         No return value.
         """
@@ -52,12 +52,12 @@ def convert_to_yolo_format(self, box, image_width, image_height) -> list:
         """Converts bounding box coordinates to YOLO format.
 
         Args:
-        - box (list of float): A list containing the bounding box coordinates [x_min, y_min, x_max, y_max].
-        - image_width (int): The width of the image.
-        - image_height (int): The height of the image.
+            box (list of float): A list containing the bounding box coordinates [x_min, y_min, x_max, y_max].
+            image_width (int): The width of the image.
+            image_height (int): The height of the image.
 
         Returns:
-        - list of float: A list containing the bounding box in YOLO format [x_center, y_center, width, height].
+            list of float: A list containing the bounding box in YOLO format [x_center, y_center, width, height].
         """
         x_center = (box[0] + box[2]) / 2 / image_width
         y_center = (box[1] + box[3]) / 2 / image_height
@@ -72,11 +72,11 @@ def process_data(
         saves the images and labels in YOLO format.
 
         Args:
-        - data (dict): The dictionary containing image annotations.
-        - image_dir (str): The directory where the source images are located.
-        - output_dir (str): The base directory where the processed data will be saved.
-        - split_ratios (float): The ratio to split the data into training, validation, and test sets.
-        - copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
+            data (dict): The dictionary containing image annotations.
+            image_dir (str): The directory where the source images are located.
+            output_dir (str): The base directory where the processed data will be saved.
+            split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
+            copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.
 
 
         No return value.
@@ -138,8 +138,8 @@ def create_data_yaml(self, root_dir, class_names) -> None:
         names.
 
         Args:
-        - root_dir (str): The root directory where the dataset is located.
-        - class_names (list of str): A list of class names.
+            root_dir (str): The root directory where the dataset is located.
+            class_names (list of str): A list of class names.
 
         No return value.
         """

From 9a70a976674bc5cd428e9c92541dc6bc5fb8464d Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Wed, 25 Sep 2024 15:00:47 +0000
Subject: [PATCH 12/31] fix: type hints

---
 .../dataset_annotation/clip_annotator.py      |  4 ++--
 .../dataset_annotation/owlv2_annotator.py     | 14 ++++++------
 datadreamer/dataset_annotation/utils.py       |  4 +++-
 .../image_generation/clip_image_tester.py     | 12 +++++-----
 .../image_generation/sdxl_image_generator.py  |  6 ++---
 .../prompt_generation/lm_prompt_generator.py  |  4 ++--
 .../prompt_generation/lm_synonym_generator.py |  4 ++--
 .../prompt_generation/prompt_generator.py     |  2 +-
 .../prompt_generation/synonym_generator.py    |  6 ++---
 .../tinyllama_lm_prompt_generator.py          |  4 ++--
 datadreamer/utils/base_converter.py           |  5 +++--
 .../utils/luxonis_dataset_converter.py        | 17 +++++++++++---
 datadreamer/utils/merge_raw_datasets.py       |  5 ++++-
 datadreamer/utils/nms.py                      |  3 ++-
 .../utils/single_label_cls_converter.py       | 18 ++++++++++++---
 datadreamer/utils/yolo_converter.py           | 22 +++++++++++++++----
 16 files changed, 87 insertions(+), 43 deletions(-)

diff --git a/datadreamer/dataset_annotation/clip_annotator.py b/datadreamer/dataset_annotation/clip_annotator.py
index 1da779d..a39d1c6 100644
--- a/datadreamer/dataset_annotation/clip_annotator.py
+++ b/datadreamer/dataset_annotation/clip_annotator.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import logging
-from typing import List
+from typing import Dict, List
 
 import numpy as np
 import PIL
@@ -76,7 +76,7 @@ def annotate_batch(
         images: List[PIL.Image.Image],
         objects: List[str],
         conf_threshold: float = 0.1,
-        synonym_dict: dict[str, List[str]] | None = None,
+        synonym_dict: Dict[str, List[str]] | None = None,
     ) -> List[np.ndarray]:
         """Annotates images using the OWLv2 model.
 
diff --git a/datadreamer/dataset_annotation/owlv2_annotator.py b/datadreamer/dataset_annotation/owlv2_annotator.py
index 3537fdc..89d4023 100644
--- a/datadreamer/dataset_annotation/owlv2_annotator.py
+++ b/datadreamer/dataset_annotation/owlv2_annotator.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import logging
-from typing import List, Tuple
+from typing import Dict, List, Tuple
 
 import numpy as np
 import PIL
@@ -85,7 +85,7 @@ def _generate_annotations(
         images: List[PIL.Image.Image],
         prompts: List[str],
         conf_threshold: float = 0.1,
-    ) -> List[dict[str, torch.Tensor]]:
+    ) -> List[Dict[str, torch.Tensor]]:
         """Generates annotations for the given images and prompts.
 
         Args:
@@ -94,7 +94,7 @@ def _generate_annotations(
             conf_threshold (float, optional): Confidence threshold for the annotations. Defaults to 0.1.
 
         Returns:
-            dict: A dictionary containing the annotations for the images.
+            List[Dict[str, torch.Tensor]]: The annotations for the given images and prompts.
         """
         n = len(images)
         batched_prompts = [prompts] * n
@@ -119,11 +119,11 @@ def _generate_annotations(
 
     def _get_annotations(
         self,
-        pred: dict[str, torch.Tensor],
+        pred: Dict[str, torch.Tensor],
         use_tta: bool,
         img_dim: int,
-        synonym_dict: dict[str, List[str]] | None,
-        synonym_dict_rev: dict[int, int] | None,
+        synonym_dict: Dict[str, List[str]] | None,
+        synonym_dict_rev: Dict[int, int] | None,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """Extracts the annotations from the predictions.
 
@@ -161,7 +161,7 @@ def annotate_batch(
         conf_threshold: float = 0.1,
         iou_threshold: float = 0.2,
         use_tta: bool = False,
-        synonym_dict: dict[str, List[str]] | None = None,
+        synonym_dict: Dict[str, List[str]] | None = None,
     ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
         """Annotates images using the OWLv2 model.
 
diff --git a/datadreamer/dataset_annotation/utils.py b/datadreamer/dataset_annotation/utils.py
index e0edc20..bfb13b7 100644
--- a/datadreamer/dataset_annotation/utils.py
+++ b/datadreamer/dataset_annotation/utils.py
@@ -1,9 +1,11 @@
 from __future__ import annotations
 
+from typing import List
+
 from torchvision import transforms
 
 
-def apply_tta(image) -> list:
+def apply_tta(image) -> List[transforms.Compose]:
     """Apply test-time augmentation (TTA) to the given image.
 
     Args:
diff --git a/datadreamer/image_generation/clip_image_tester.py b/datadreamer/image_generation/clip_image_tester.py
index d2aeccb..8f86a88 100644
--- a/datadreamer/image_generation/clip_image_tester.py
+++ b/datadreamer/image_generation/clip_image_tester.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import logging
-from typing import List
+from typing import List, Tuple
 
 import torch
 from PIL import Image
@@ -34,8 +34,8 @@ def __init__(self, device: str = "cuda") -> None:
         self.clip.to(self.device)
 
     def test_image(
-        self, image: Image.Image, objects: List[str], conf_threshold=0.05
-    ) -> tuple:
+        self, image: Image.Image, objects: List[str], conf_threshold: float = 0.05
+    ) -> Tuple[bool, torch.Tensor, int]:
         """Tests the generated image against a set of objects using the CLIP model.
 
         Args:
@@ -67,7 +67,7 @@ def test_images_batch(
         images: List[Image.Image],
         objects: List[List[str]],
         conf_threshold=0.05,
-    ) -> List[tuple]:
+    ) -> Tuple[List[bool], List[torch.Tensor], List[int]]:
         """Tests the generated images against a set of objects using the CLIP model.
 
         Args:
@@ -76,8 +76,8 @@ def test_images_batch(
             conf_threshold (float, optional): Confidence threshold for considering an object as present. Defaults to 0.05.
 
         Returns:
-            List[tuple]: A list of tuples containing a boolean indicating if the image passes the test,
-                        the probabilities of the objects, and the number of objects that passed the test.
+            Tuple[List[bool], List[torch.Tensor], List[int]]: A tuple containing a list of booleans indicating if the images pass the test,
+                   a list of probabilities of the objects, and a list of the number of objects that passed the test.
         """
         # Transform the inputs for the CLIP model
         objects_array = []
diff --git a/datadreamer/image_generation/sdxl_image_generator.py b/datadreamer/image_generation/sdxl_image_generator.py
index 7ccd908..3c090de 100644
--- a/datadreamer/image_generation/sdxl_image_generator.py
+++ b/datadreamer/image_generation/sdxl_image_generator.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import logging
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 import torch
 from compel import Compel, ReturnedEmbeddingsType
@@ -36,7 +36,7 @@ def __init__(self, *args, **kwargs):
         self.base, self.refiner = self._init_gen_model()
         self.base_processor, self.refiner_processor = self._init_processor()
 
-    def _init_gen_model(self) -> tuple:
+    def _init_gen_model(self) -> Tuple[DiffusionPipeline, DiffusionPipeline]:
         """Initializes the base and refiner models of Stable Diffusion.
 
         Returns:
@@ -80,7 +80,7 @@ def _init_gen_model(self) -> tuple:
 
         return base, refiner
 
-    def _init_processor(self) -> tuple:
+    def _init_processor(self) -> Tuple[Compel, Compel]:
         """Initializes the processors for the base and refiner models.
 
         Returns:
diff --git a/datadreamer/prompt_generation/lm_prompt_generator.py b/datadreamer/prompt_generation/lm_prompt_generator.py
index adbd44c..8a3e6e1 100644
--- a/datadreamer/prompt_generation/lm_prompt_generator.py
+++ b/datadreamer/prompt_generation/lm_prompt_generator.py
@@ -3,7 +3,7 @@
 import logging
 import random
 import re
-from typing import List, Literal, Optional
+from typing import List, Literal, Optional, Tuple
 
 import torch
 from tqdm import tqdm
@@ -65,7 +65,7 @@ def __init__(
         )
         self.model, self.tokenizer, self.pipeline = self._init_lang_model()
 
-    def _init_lang_model(self) -> tuple[AutoModelForCausalLM, AutoTokenizer, Pipeline]:
+    def _init_lang_model(self) -> Tuple[AutoModelForCausalLM, AutoTokenizer, Pipeline]:
         """Initializes the language model, tokenizer and pipeline for prompt generation.
 
         Returns:
diff --git a/datadreamer/prompt_generation/lm_synonym_generator.py b/datadreamer/prompt_generation/lm_synonym_generator.py
index a971655..850ccfb 100644
--- a/datadreamer/prompt_generation/lm_synonym_generator.py
+++ b/datadreamer/prompt_generation/lm_synonym_generator.py
@@ -2,7 +2,7 @@
 
 import logging
 import re
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 import torch
 from transformers import (
@@ -45,7 +45,7 @@ def __init__(
         super().__init__(synonyms_number, seed, device)
         self.model, self.tokenizer, self.pipeline = self._init_lang_model()
 
-    def _init_lang_model(self) -> tuple[AutoModelForCausalLM, AutoTokenizer, Pipeline]:
+    def _init_lang_model(self) -> Tuple[AutoModelForCausalLM, AutoTokenizer, Pipeline]:
         """Initializes the language model, tokenizer and pipeline for prompt generation.
 
         Returns:
diff --git a/datadreamer/prompt_generation/prompt_generator.py b/datadreamer/prompt_generation/prompt_generator.py
index 825243c..50662ac 100644
--- a/datadreamer/prompt_generation/prompt_generator.py
+++ b/datadreamer/prompt_generation/prompt_generator.py
@@ -49,7 +49,7 @@ def __init__(
         self.quantization = quantization if quantization is not None else "none"
 
     @staticmethod
-    def set_seed(seed: int):
+    def set_seed(seed: int) -> None:
         """Sets the random seed for consistent prompt generation.
 
         Args:
diff --git a/datadreamer/prompt_generation/synonym_generator.py b/datadreamer/prompt_generation/synonym_generator.py
index bb4240e..b5d338f 100644
--- a/datadreamer/prompt_generation/synonym_generator.py
+++ b/datadreamer/prompt_generation/synonym_generator.py
@@ -3,7 +3,7 @@
 import json
 import logging
 from abc import ABC, abstractmethod
-from typing import List, Optional
+from typing import Dict, List, Optional
 
 from tqdm import tqdm
 
@@ -41,7 +41,7 @@ def __init__(
         self.seed = seed
         self.device = device
 
-    def generate_synonyms_for_list(self, words: List[str]) -> dict:
+    def generate_synonyms_for_list(self, words: List[str]) -> Dict:
         """Generates synonyms for a list of words and returns them in a dictionary.
 
         Args:
@@ -56,7 +56,7 @@ def generate_synonyms_for_list(self, words: List[str]) -> dict:
             synonyms_dict[word] = synonyms
         return synonyms_dict
 
-    def save_synonyms(self, synonyms, save_path: str) -> None:
+    def save_synonyms(self, synonyms: Dict, save_path: str) -> None:
         """Saves the generated synonyms to a JSON file.
 
         Args:
diff --git a/datadreamer/prompt_generation/tinyllama_lm_prompt_generator.py b/datadreamer/prompt_generation/tinyllama_lm_prompt_generator.py
index ed5fdcf..9e939a7 100644
--- a/datadreamer/prompt_generation/tinyllama_lm_prompt_generator.py
+++ b/datadreamer/prompt_generation/tinyllama_lm_prompt_generator.py
@@ -2,7 +2,7 @@
 
 import logging
 import re
-from typing import List, Literal, Optional
+from typing import List, Literal, Optional, Tuple
 
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, Pipeline, pipeline
@@ -50,7 +50,7 @@ def __init__(
             quantization,
         )
 
-    def _init_lang_model(self) -> tuple[AutoModelForCausalLM, AutoTokenizer, Pipeline]:
+    def _init_lang_model(self) -> Tuple[AutoModelForCausalLM, AutoTokenizer, Pipeline]:
         """Initializes the language model, tokenizer and pipeline for prompt generation.
 
         Returns:
diff --git a/datadreamer/utils/base_converter.py b/datadreamer/utils/base_converter.py
index 60a3f51..5c8243e 100644
--- a/datadreamer/utils/base_converter.py
+++ b/datadreamer/utils/base_converter.py
@@ -2,6 +2,7 @@
 
 import json
 from abc import ABC, abstractmethod
+from typing import Dict, List, Tuple
 
 import numpy as np
 
@@ -27,7 +28,7 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> Non
         pass
 
     @staticmethod
-    def read_annotations(annotation_path) -> dict:
+    def read_annotations(annotation_path) -> Dict:
         """Reads annotations from a JSON file located at the specified path.
 
         Args:
@@ -41,7 +42,7 @@ def read_annotations(annotation_path) -> dict:
         return data
 
     @staticmethod
-    def make_splits(images, split_ratios, shuffle=True) -> tuple:
+    def make_splits(images, split_ratios, shuffle=True) -> Tuple[List, List, List]:
         """Splits the list of images into training, validation, and test sets.
 
         Args:
diff --git a/datadreamer/utils/luxonis_dataset_converter.py b/datadreamer/utils/luxonis_dataset_converter.py
index 07b157e..9a2e6f9 100644
--- a/datadreamer/utils/luxonis_dataset_converter.py
+++ b/datadreamer/utils/luxonis_dataset_converter.py
@@ -2,6 +2,7 @@
 
 import logging
 import os
+from typing import Dict, List
 
 from luxonis_ml.data import DATASETS_REGISTRY, LuxonisDataset
 from luxonis_ml.data.utils.enums import BucketStorage
@@ -15,12 +16,20 @@
 class LuxonisDatasetConverter(BaseConverter):
     """Class for converting a dataset to LuxonisDataset format."""
 
-    def __init__(self, dataset_plugin=None, dataset_name=None, seed=42):
+    def __init__(
+        self, dataset_plugin: str = None, dataset_name: str = None, seed: int = 42
+    ):
         super().__init__(seed)
         self.dataset_plugin = dataset_plugin
         self.dataset_name = dataset_name
 
-    def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> None:
+    def convert(
+        self,
+        dataset_dir: str,
+        output_dir: str,
+        split_ratios: List[float],
+        copy_files: bool = True,
+    ) -> None:
         """Converts a dataset into a LuxonisDataset format.
 
         Args:
@@ -35,7 +44,9 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> Non
         data = BaseConverter.read_annotations(annotation_path)
         self.process_data(data, dataset_dir, output_dir, split_ratios)
 
-    def process_data(self, data, dataset_dir, output_dir, split_ratios) -> None:
+    def process_data(
+        self, data: Dict, dataset_dir: str, output_dir: str, split_ratios: List[float]
+    ) -> None:
         """Processes the data into LuxonisDataset format.
 
         Args:
diff --git a/datadreamer/utils/merge_raw_datasets.py b/datadreamer/utils/merge_raw_datasets.py
index e2639fb..c6eb64e 100644
--- a/datadreamer/utils/merge_raw_datasets.py
+++ b/datadreamer/utils/merge_raw_datasets.py
@@ -5,11 +5,14 @@
 import logging
 import os
 import shutil
+from typing import List
 
 logger = logging.getLogger(__name__)
 
 
-def merge_datasets(input_dirs, output_dir, copy_files=True) -> None:
+def merge_datasets(
+    input_dirs: List[str], output_dir: str, copy_files: bool = True
+) -> None:
     """Merges multiple raw datasets into a single dataset.
 
     Args:
diff --git a/datadreamer/utils/nms.py b/datadreamer/utils/nms.py
index f277ab2..f0f29e4 100644
--- a/datadreamer/utils/nms.py
+++ b/datadreamer/utils/nms.py
@@ -7,6 +7,7 @@
 import logging
 import os
 import time
+from typing import List
 
 import cv2
 import numpy as np
@@ -45,7 +46,7 @@ def non_max_suppression(
     agnostic=False,
     multi_label=False,
     max_det=300,
-) -> list:
+) -> List[np.ndarray]:
     """Runs Non-Maximum Suppression (NMS) on inference results.
     This code is borrowed from: https://github.com/ultralytics/yolov5/blob/47233e1698b89fc437a4fb9463c815e9171be955/utils/general.py#L775
     Args:
diff --git a/datadreamer/utils/single_label_cls_converter.py b/datadreamer/utils/single_label_cls_converter.py
index 8d56ad1..c24bec7 100644
--- a/datadreamer/utils/single_label_cls_converter.py
+++ b/datadreamer/utils/single_label_cls_converter.py
@@ -3,6 +3,7 @@
 import logging
 import os
 import shutil
+from typing import Dict, List
 
 from datadreamer.utils import BaseConverter
 
@@ -32,10 +33,16 @@ class SingleLabelClsConverter(BaseConverter):
     │   ├── class_2
     """
 
-    def __init__(self, seed=42):
+    def __init__(self, seed: int = 42):
         super().__init__(seed)
 
-    def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> None:
+    def convert(
+        self,
+        dataset_dir: str,
+        output_dir: str,
+        split_ratios: List[float],
+        copy_files: bool = True,
+    ) -> None:
         """Converts a dataset into a format suitable for single-label classification.
 
         Args:
@@ -51,7 +58,12 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> Non
         self.process_data(data, dataset_dir, output_dir, split_ratios, copy_files)
 
     def process_data(
-        self, data, image_dir, output_dir, split_ratios, copy_files=True
+        self,
+        data: Dict,
+        image_dir: str,
+        output_dir: str,
+        split_ratios: List[float],
+        copy_files: bool = True,
     ) -> None:
         """Processes the data by removing images with multiple labels, then dividing it
         into training and validation sets, and saves the images with single labels.
diff --git a/datadreamer/utils/yolo_converter.py b/datadreamer/utils/yolo_converter.py
index 08d3e85..715e429 100644
--- a/datadreamer/utils/yolo_converter.py
+++ b/datadreamer/utils/yolo_converter.py
@@ -2,6 +2,7 @@
 
 import os
 import shutil
+from typing import Dict, List
 
 from PIL import Image
 
@@ -32,7 +33,13 @@ class YOLOConverter(BaseConverter):
     def __init__(self, seed=42):
         super().__init__(seed)
 
-    def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> None:
+    def convert(
+        self,
+        dataset_dir: str,
+        output_dir: str,
+        split_ratios: List[float],
+        copy_files: bool = True,
+    ) -> None:
         """Converts a dataset into a format suitable for training with YOLO, including
         creating training and validation splits.
 
@@ -48,7 +55,9 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> Non
         data = BaseConverter.read_annotations(annotation_path)
         self.process_data(data, dataset_dir, output_dir, split_ratios, copy_files)
 
-    def convert_to_yolo_format(self, box, image_width, image_height) -> list:
+    def convert_to_yolo_format(
+        self, box: List[float], image_width: int, image_height: int
+    ) -> List[float]:
         """Converts bounding box coordinates to YOLO format.
 
         Args:
@@ -66,7 +75,12 @@ def convert_to_yolo_format(self, box, image_width, image_height) -> list:
         return [x_center, y_center, width, height]
 
     def process_data(
-        self, data, image_dir, output_dir, split_ratios, copy_files=True
+        self,
+        data: Dict,
+        image_dir: str,
+        output_dir: str,
+        split_ratios: List[float],
+        copy_files: bool = True,
     ) -> None:
         """Processes the data by dividing it into training and validation sets, and
         saves the images and labels in YOLO format.
@@ -133,7 +147,7 @@ def process_data(
 
         self.create_data_yaml(output_dir, data["class_names"])
 
-    def create_data_yaml(self, root_dir, class_names) -> None:
+    def create_data_yaml(self, root_dir: str, class_names: List[str]) -> None:
         """Creates a YAML file for dataset configuration, specifying paths and class
         names.
 

From 46e944b5803a517c9a91133ef310dc4115db1e07 Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Wed, 25 Sep 2024 15:37:48 +0000
Subject: [PATCH 13/31] tests: replace default ubuntu runner with buildjet
 runner

---
 .github/workflows/tests.yaml | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 0b1aaf7..31a711d 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -13,7 +13,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-latest, windows-latest, macOS-latest]
+        os: [buildjet-4vcpu-ubuntu-2204, windows-latest, macOS-latest]
         version: ['3.10', '3.11']
 
     runs-on: ${{ matrix.os }}
@@ -31,46 +31,43 @@ jobs:
         cache: pip
 
     - name: Install dependencies [Ubuntu]
-      if: matrix.os == 'ubuntu-latest'
+      if: matrix.os == 'buildjet-4vcpu-ubuntu-2204'
       run: |
         sudo apt update
         sudo apt install -y pandoc
         pip install -e .[dev]
         pip install coverage-badge>=1.1.0 pytest-cov>=4.1.0
-
     - name: Install dependencies [Windows]
       if: matrix.os == 'windows-latest'
       run: |
         pip install -e .[dev]
         pip install coverage-badge>=1.1.0 pytest-cov>=4.1.0
-
     - name: Install dependencies [macOS]
       if: matrix.os == 'macOS-latest'
       run: |
         pip install -e .[dev]
         pip install coverage-badge>=1.1.0 pytest-cov>=4.1.0
-
     - name: Run tests with coverage [Ubuntu]
-      if: matrix.os == 'ubuntu-latest' && matrix.version == '3.10'
+      if: matrix.os == 'buildjet-4vcpu-ubuntu-2204' && matrix.version == '3.10'
       run: pytest tests --cov=datadreamer --cov-report xml --junit-xml pytest.xml
 
     - name: Run tests [Windows, macOS]
-      if: matrix.os != 'ubuntu-latest' || matrix.version != '3.10'
+      if: matrix.os != 'buildjet-4vcpu-ubuntu-2204' || matrix.version != '3.10'
       run: pytest tests --junit-xml pytest.xml
 
     - name: Generate coverage badge [Ubuntu]
-      if: matrix.os == 'ubuntu-latest' && matrix.version == '3.10'
+      if: matrix.os == 'buildjet-4vcpu-ubuntu-2204' && matrix.version == '3.10'
       run: coverage-badge -o media/coverage_badge.svg -f
 
     - name: Generate coverage report [Ubuntu]
-      if: matrix.os == 'ubuntu-latest' && matrix.version == '3.10'
+      if: matrix.os == 'buildjet-4vcpu-ubuntu-2204' && matrix.version == '3.10'
       uses: orgoro/coverage@v3.1
       with:
         coverageFile: coverage.xml
         token: ${{ secrets.GITHUB_TOKEN }}
 
     - name: Commit coverage badge [Ubuntu]
-      if: matrix.os == 'ubuntu-latest' && matrix.version == '3.10'
+      if: matrix.os == 'buildjet-4vcpu-ubuntu-2204' && matrix.version == '3.10'
       run: |
         git config --global user.name 'GitHub Actions'
         git config --global user.email 'actions@github.com'
@@ -78,9 +75,8 @@ jobs:
           git add media/coverage_badge.svg
           git commit -m "[Automated] Updated coverage badge"
         }
-
     - name: Push changes [Ubuntu]
-      if: matrix.os == 'ubuntu-latest' && matrix.version == '3.10'
+      if: matrix.os == 'buildjet-4vcpu-ubuntu-2204' && matrix.version == '3.10'
       uses: ad-m/github-push-action@master
       with:
         branch: ${{ github.head_ref }}

From 2639a9bb9cad26108704884039ee2485b8ea5ca7 Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Wed, 25 Sep 2024 16:08:14 +0000
Subject: [PATCH 14/31] fix: type hint

---
 datadreamer/image_generation/clip_image_tester.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datadreamer/image_generation/clip_image_tester.py b/datadreamer/image_generation/clip_image_tester.py
index 8f86a88..8147533 100644
--- a/datadreamer/image_generation/clip_image_tester.py
+++ b/datadreamer/image_generation/clip_image_tester.py
@@ -66,7 +66,7 @@ def test_images_batch(
         self,
         images: List[Image.Image],
         objects: List[List[str]],
-        conf_threshold=0.05,
+        conf_threshold: float = 0.05,
     ) -> Tuple[List[bool], List[torch.Tensor], List[int]]:
         """Tests the generated images against a set of objects using the CLIP model.
 

From 261e392f5acdf9e127c25f41e76b7e90a0978e88 Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Wed, 25 Sep 2024 16:26:50 +0000
Subject: [PATCH 15/31] test: modify memory computation

---
 tests/integration/test_pipeline.py        | 4 ++--
 tests/unittests/test_annotators.py        | 2 +-
 tests/unittests/test_image_generation.py  | 4 ++--
 tests/unittests/test_prompt_generation.py | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/integration/test_pipeline.py b/tests/integration/test_pipeline.py
index 293d3a7..f54244e 100644
--- a/tests/integration/test_pipeline.py
+++ b/tests/integration/test_pipeline.py
@@ -8,9 +8,9 @@
 import torch
 
 # Get the total memory in GB
-total_memory = psutil.virtual_memory().total / (1024**3)
+total_memory = psutil.virtual_memory().total / (1000**3)
 # Get the total disk space in GB
-total_disk_space = psutil.disk_usage("/").total / (1024**3)
+total_disk_space = psutil.disk_usage("/").total / (1000**3)
 
 
 def _check_detection_pipeline(cmd: str, target_folder: str):
diff --git a/tests/unittests/test_annotators.py b/tests/unittests/test_annotators.py
index 698ed3d..f69f83e 100644
--- a/tests/unittests/test_annotators.py
+++ b/tests/unittests/test_annotators.py
@@ -11,7 +11,7 @@
 from datadreamer.dataset_annotation.owlv2_annotator import OWLv2Annotator
 
 # Get the total disk space in GB
-total_disk_space = psutil.disk_usage("/").total / (1024**3)
+total_disk_space = psutil.disk_usage("/").total / (1000**3)
 
 
 def _check_owlv2_annotator(device: str, size: str = "base"):
diff --git a/tests/unittests/test_image_generation.py b/tests/unittests/test_image_generation.py
index 506e42c..4a73d0b 100644
--- a/tests/unittests/test_image_generation.py
+++ b/tests/unittests/test_image_generation.py
@@ -16,9 +16,9 @@
 from datadreamer.image_generation.clip_image_tester import ClipImageTester
 
 # Get the total memory in GB
-total_memory = psutil.virtual_memory().total / (1024**3)
+total_memory = psutil.virtual_memory().total / (1000**3)
 # Get the total disk space in GB
-total_disk_space = psutil.disk_usage("/").total / (1024**3)
+total_disk_space = psutil.disk_usage("/").total / (1000**3)
 
 
 def _check_clip_image_tester(device: str):
diff --git a/tests/unittests/test_prompt_generation.py b/tests/unittests/test_prompt_generation.py
index e77472d..4b906ac 100644
--- a/tests/unittests/test_prompt_generation.py
+++ b/tests/unittests/test_prompt_generation.py
@@ -15,9 +15,9 @@
 )
 
 # Get the total memory in GB
-total_memory = psutil.virtual_memory().total / (1024**3)
+total_memory = psutil.virtual_memory().total / (1000**3)
 # Get the total disk space in GB
-total_disk_space = psutil.disk_usage("/").total / (1024**3)
+total_disk_space = psutil.disk_usage("/").total / (1000**3)
 
 
 def test_simple_prompt_generator():

From 9e812e07210f799dff0e8d4dfee39b890aa58f71 Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Wed, 25 Sep 2024 16:56:26 +0000
Subject: [PATCH 16/31] test: round up ram computation

---
 tests/integration/test_pipeline.py        | 2 ++
 tests/unittests/test_image_generation.py  | 1 +
 tests/unittests/test_prompt_generation.py | 1 +
 3 files changed, 4 insertions(+)

diff --git a/tests/integration/test_pipeline.py b/tests/integration/test_pipeline.py
index f54244e..7df8676 100644
--- a/tests/integration/test_pipeline.py
+++ b/tests/integration/test_pipeline.py
@@ -9,6 +9,8 @@
 
 # Get the total memory in GB
 total_memory = psutil.virtual_memory().total / (1000**3)
+print(total_memory)
+total_memory = int(total_memory) + (total_memory > int(total_memory))
 # Get the total disk space in GB
 total_disk_space = psutil.disk_usage("/").total / (1000**3)
 
diff --git a/tests/unittests/test_image_generation.py b/tests/unittests/test_image_generation.py
index 4a73d0b..efbeb74 100644
--- a/tests/unittests/test_image_generation.py
+++ b/tests/unittests/test_image_generation.py
@@ -17,6 +17,7 @@
 
 # Get the total memory in GB
 total_memory = psutil.virtual_memory().total / (1000**3)
+total_memory = int(total_memory) + (total_memory > int(total_memory))
 # Get the total disk space in GB
 total_disk_space = psutil.disk_usage("/").total / (1000**3)
 
diff --git a/tests/unittests/test_prompt_generation.py b/tests/unittests/test_prompt_generation.py
index 4b906ac..7769a71 100644
--- a/tests/unittests/test_prompt_generation.py
+++ b/tests/unittests/test_prompt_generation.py
@@ -16,6 +16,7 @@
 
 # Get the total memory in GB
 total_memory = psutil.virtual_memory().total / (1000**3)
+total_memory = int(total_memory) + (total_memory > int(total_memory))
 # Get the total disk space in GB
 total_disk_space = psutil.disk_usage("/").total / (1000**3)
 

From 6a78e00e6f487a69fd8f9af180839a589db1b70a Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Wed, 25 Sep 2024 17:22:11 +0000
Subject: [PATCH 17/31] test: disable output capturing

---
 .github/workflows/tests.yaml       | 4 ++--
 tests/integration/test_pipeline.py | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 31a711d..a5e3a76 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -49,11 +49,11 @@ jobs:
         pip install coverage-badge>=1.1.0 pytest-cov>=4.1.0
     - name: Run tests with coverage [Ubuntu]
       if: matrix.os == 'buildjet-4vcpu-ubuntu-2204' && matrix.version == '3.10'
-      run: pytest tests --cov=datadreamer --cov-report xml --junit-xml pytest.xml
+      run: pytest -s tests --cov=datadreamer --cov-report xml --junit-xml pytest.xml
 
     - name: Run tests [Windows, macOS]
       if: matrix.os != 'buildjet-4vcpu-ubuntu-2204' || matrix.version != '3.10'
-      run: pytest tests --junit-xml pytest.xml
+      run: pytest -s tests --junit-xml pytest.xml
 
     - name: Generate coverage badge [Ubuntu]
       if: matrix.os == 'buildjet-4vcpu-ubuntu-2204' && matrix.version == '3.10'
diff --git a/tests/integration/test_pipeline.py b/tests/integration/test_pipeline.py
index 7df8676..fb6d3f5 100644
--- a/tests/integration/test_pipeline.py
+++ b/tests/integration/test_pipeline.py
@@ -9,10 +9,12 @@
 
 # Get the total memory in GB
 total_memory = psutil.virtual_memory().total / (1000**3)
-print(total_memory)
+print(f"Total memory: {total_memory}")
 total_memory = int(total_memory) + (total_memory > int(total_memory))
+print(f"Total memory rounded: {total_memory}")
 # Get the total disk space in GB
 total_disk_space = psutil.disk_usage("/").total / (1000**3)
+print(f"Total disk space: {total_disk_space}")
 
 
 def _check_detection_pipeline(cmd: str, target_folder: str):

From 3de930682f0c09834a6dfe0a8c3afd552951a510 Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Wed, 25 Sep 2024 17:54:28 +0000
Subject: [PATCH 18/31] test: decrease required ram for demanding tests

---
 .github/workflows/tests.yaml              |   4 +-
 tests/integration/test_pipeline.py        | 123 +++++++++++-----------
 tests/unittests/test_image_generation.py  |  24 ++---
 tests/unittests/test_prompt_generation.py |   8 +-
 4 files changed, 78 insertions(+), 81 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index a5e3a76..31a711d 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -49,11 +49,11 @@ jobs:
         pip install coverage-badge>=1.1.0 pytest-cov>=4.1.0
     - name: Run tests with coverage [Ubuntu]
       if: matrix.os == 'buildjet-4vcpu-ubuntu-2204' && matrix.version == '3.10'
-      run: pytest -s tests --cov=datadreamer --cov-report xml --junit-xml pytest.xml
+      run: pytest tests --cov=datadreamer --cov-report xml --junit-xml pytest.xml
 
     - name: Run tests [Windows, macOS]
       if: matrix.os != 'buildjet-4vcpu-ubuntu-2204' || matrix.version != '3.10'
-      run: pytest -s tests --junit-xml pytest.xml
+      run: pytest tests --junit-xml pytest.xml
 
     - name: Generate coverage badge [Ubuntu]
       if: matrix.os == 'buildjet-4vcpu-ubuntu-2204' && matrix.version == '3.10'
diff --git a/tests/integration/test_pipeline.py b/tests/integration/test_pipeline.py
index fb6d3f5..7cfb88a 100644
--- a/tests/integration/test_pipeline.py
+++ b/tests/integration/test_pipeline.py
@@ -9,12 +9,9 @@
 
 # Get the total memory in GB
 total_memory = psutil.virtual_memory().total / (1000**3)
-print(f"Total memory: {total_memory}")
 total_memory = int(total_memory) + (total_memory > int(total_memory))
-print(f"Total memory rounded: {total_memory}")
 # Get the total disk space in GB
 total_disk_space = psutil.disk_usage("/").total / (1000**3)
-print(f"Total disk space: {total_disk_space}")
 
 
 def _check_detection_pipeline(cmd: str, target_folder: str):
@@ -260,8 +257,8 @@ def test_negative_num_objects_range():
 # DETECTION - SIMPLE LM
 # =========================================================
 @pytest.mark.skipif(
-    total_memory < 16 or total_disk_space < 35,
-    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+    total_memory < 15 or total_disk_space < 35,
+    reason="Test requires at least 15GB of RAM and 35GB of HDD",
 )
 def test_cpu_simple_sdxl_turbo_detection_pipeline():
     # Define target folder
@@ -282,8 +279,8 @@ def test_cpu_simple_sdxl_turbo_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
-    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
+    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
 )
 def test_cuda_simple_sdxl_turbo_detection_pipeline():
     # Define target folder
@@ -304,8 +301,8 @@ def test_cuda_simple_sdxl_turbo_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 55,
-    reason="Test requires GPU, at least 16GB of RAM and 55GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 55,
+    reason="Test requires GPU, at least 15GB of RAM and 55GB of HDD",
 )
 def test_cuda_simple_llm_synonym_sdxl_turbo_detection_pipeline():
     # Define target folder
@@ -327,8 +324,8 @@ def test_cuda_simple_llm_synonym_sdxl_turbo_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
-    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
+    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
 )
 def test_cuda_simple_wordnet_synonym_sdxl_turbo_detection_pipeline():
     # Define target folder
@@ -350,8 +347,8 @@ def test_cuda_simple_wordnet_synonym_sdxl_turbo_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 16 or total_disk_space < 35,
-    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+    total_memory < 15 or total_disk_space < 35,
+    reason="Test requires at least 15GB of RAM and 35GB of HDD",
 )
 def test_cpu_simple_sdxl_detection_pipeline():
     # Define target folder
@@ -372,8 +369,8 @@ def test_cpu_simple_sdxl_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
-    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
+    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
 )
 def test_cuda_simple_sdxl_detection_pipeline():
     # Define target folder
@@ -394,8 +391,8 @@ def test_cuda_simple_sdxl_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 16 or total_disk_space < 35,
-    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+    total_memory < 15 or total_disk_space < 35,
+    reason="Test requires at least 15GB of RAM and 35GB of HDD",
 )
 def test_cpu_simple_sdxl_lightning_detection_pipeline():
     # Define target folder
@@ -416,8 +413,8 @@ def test_cpu_simple_sdxl_lightning_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
-    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
+    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
 )
 def test_cuda_simple_sdxl_lightning_detection_pipeline():
     # Define target folder
@@ -463,8 +460,8 @@ def test_cpu_lm_sdxl_turbo_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 16 or not torch.cuda.is_available() or total_disk_space < 55,
-    reason="Test requires at least 16GB of RAM, CUDA support and 55GB of HDD",
+    total_memory < 15 or not torch.cuda.is_available() or total_disk_space < 55,
+    reason="Test requires at least 15GB of RAM, CUDA support and 55GB of HDD",
 )
 def test_cuda_lm_sdxl_turbo_detection_pipeline():
     # Define target folder
@@ -530,8 +527,8 @@ def test_cpu_lm_sdxl_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 16 or not torch.cuda.is_available() or total_disk_space < 55,
-    reason="Test requires at least 16GB of RAM, CUDA support and 55GB of HDD",
+    total_memory < 15 or not torch.cuda.is_available() or total_disk_space < 55,
+    reason="Test requires at least 15GB of RAM, CUDA support and 55GB of HDD",
 )
 def test_cuda_lm_sdxl_detection_pipeline():
     # Define target folder
@@ -578,8 +575,8 @@ def test_cuda_4bit_lm_sdxl_detection_pipeline():
 # DETECTION - TinyLlama LLM
 # =========================================================
 @pytest.mark.skipif(
-    total_memory < 16 or total_disk_space < 35,
-    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+    total_memory < 15 or total_disk_space < 35,
+    reason="Test requires at least 15GB of RAM and 35GB of HDD",
 )
 def test_cpu_tiny_sdxl_turbo_detection_pipeline():
     # Define target folder
@@ -600,8 +597,8 @@ def test_cpu_tiny_sdxl_turbo_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
-    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
+    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
 )
 def test_cuda_tiny_sdxl_turbo_detection_pipeline():
     # Define target folder
@@ -622,8 +619,8 @@ def test_cuda_tiny_sdxl_turbo_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 16 or total_disk_space < 35,
-    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+    total_memory < 15 or total_disk_space < 35,
+    reason="Test requires at least 15GB of RAM and 35GB of HDD",
 )
 def test_cpu_tiny_sdxl_detection_pipeline():
     # Define target folder
@@ -644,8 +641,8 @@ def test_cpu_tiny_sdxl_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
-    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
+    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
 )
 def test_cuda_tiny_sdxl_detection_pipeline():
     # Define target folder
@@ -669,8 +666,8 @@ def test_cuda_tiny_sdxl_detection_pipeline():
 # CLASSIFICATION - SIMPLE LM
 # =========================================================
 @pytest.mark.skipif(
-    total_memory < 16 or total_disk_space < 35,
-    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+    total_memory < 15 or total_disk_space < 35,
+    reason="Test requires at least 15GB of RAM and 35GB of HDD",
 )
 def test_cpu_simple_sdxl_turbo_classification_pipeline():
     # Define target folder
@@ -693,8 +690,8 @@ def test_cpu_simple_sdxl_turbo_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
-    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
+    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
 )
 def test_cuda_simple_sdxl_turbo_classification_pipeline():
     # Define target folder
@@ -717,8 +714,8 @@ def test_cuda_simple_sdxl_turbo_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 55,
-    reason="Test requires GPU, at least 16GB of RAM and 55GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 55,
+    reason="Test requires GPU, at least 15GB of RAM and 55GB of HDD",
 )
 def test_cuda_simple_llm_synonym_sdxl_turbo_classification_pipeline():
     # Define target folder
@@ -742,8 +739,8 @@ def test_cuda_simple_llm_synonym_sdxl_turbo_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
-    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
+    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
 )
 def test_cuda_simple_wordnet_synonym_sdxl_turbo_classification_pipeline():
     # Define target folder
@@ -767,8 +764,8 @@ def test_cuda_simple_wordnet_synonym_sdxl_turbo_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 16 or total_disk_space < 35,
-    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+    total_memory < 15 or total_disk_space < 35,
+    reason="Test requires at least 15GB of RAM and 35GB of HDD",
 )
 def test_cpu_simple_sdxl_classification_pipeline():
     # Define target folder
@@ -791,8 +788,8 @@ def test_cpu_simple_sdxl_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
-    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
+    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
 )
 def test_cuda_simple_sdxl_classification_pipeline():
     # Define target folder
@@ -842,8 +839,8 @@ def test_cpu_lm_sdxl_turbo_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 16 or not torch.cuda.is_available() or total_disk_space < 55,
-    reason="Test requires at least 16GB of RAM, 55GB of HDD and CUDA support",
+    total_memory < 15 or not torch.cuda.is_available() or total_disk_space < 55,
+    reason="Test requires at least 15GB of RAM, 55GB of HDD and CUDA support",
 )
 def test_cuda_lm_sdxl_turbo_classification_pipeline():
     # Define target folder
@@ -915,8 +912,8 @@ def test_cpu_lm_sdxl_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 16 or not torch.cuda.is_available() or total_disk_space < 55,
-    reason="Test requires at least 16GB of RAM, CUDA support and 55GB of HDD",
+    total_memory < 15 or not torch.cuda.is_available() or total_disk_space < 55,
+    reason="Test requires at least 15GB of RAM, CUDA support and 55GB of HDD",
 )
 def test_cuda_lm_sdxl_classification_pipeline():
     # Define target folder
@@ -967,8 +964,8 @@ def test_cuda_4bit_lm_sdxl_classification_pipeline():
 # CLASSIFICATION - TinyLlama LLM
 # =========================================================
 @pytest.mark.skipif(
-    total_memory < 16 or total_disk_space < 35,
-    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+    total_memory < 15 or total_disk_space < 35,
+    reason="Test requires at least 15GB of RAM and 35GB of HDD",
 )
 def test_cpu_tiny_sdxl_turbo_classification_pipeline():
     # Define target folder
@@ -991,8 +988,8 @@ def test_cpu_tiny_sdxl_turbo_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
-    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
+    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
 )
 def test_cuda_tiny_sdxl_turbo_classification_pipeline():
     # Define target folder
@@ -1015,8 +1012,8 @@ def test_cuda_tiny_sdxl_turbo_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 16 or total_disk_space < 35,
-    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+    total_memory < 15 or total_disk_space < 35,
+    reason="Test requires at least 15GB of RAM and 35GB of HDD",
 )
 def test_cpu_tiny_sdxl_classification_pipeline():
     # Define target folder
@@ -1039,8 +1036,8 @@ def test_cpu_tiny_sdxl_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
-    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
+    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
 )
 def test_cuda_tiny_sdxl_classification_pipeline():
     # Define target folder
@@ -1066,8 +1063,8 @@ def test_cuda_tiny_sdxl_classification_pipeline():
 # TEST WITH CONFIG FILE
 # =========================================================
 @pytest.mark.skipif(
-    total_memory < 16 or total_disk_space < 35,
-    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+    total_memory < 15 or total_disk_space < 35,
+    reason="Test requires at least 15GB of RAM and 35GB of HDD",
 )
 def test_cpu_simple_sdxl_turbo_config_detection_pipeline():
     # Define target folder
@@ -1084,8 +1081,8 @@ def test_cpu_simple_sdxl_turbo_config_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
-    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
+    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
 )
 def test_cuda_simple_sdxl_turbo_config_detection_pipeline():
     # Define target folder
@@ -1102,8 +1099,8 @@ def test_cuda_simple_sdxl_turbo_config_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 16 or total_disk_space < 35,
-    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+    total_memory < 15 or total_disk_space < 35,
+    reason="Test requires at least 15GB of RAM and 35GB of HDD",
 )
 def test_cpu_simple_sdxl_turbo_config_classification_pipeline():
     # Define target folder
@@ -1122,8 +1119,8 @@ def test_cpu_simple_sdxl_turbo_config_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
-    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
+    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
 )
 def test_cuda_simple_sdxl_turbo_config_classification_pipeline():
     # Define target folder
diff --git a/tests/unittests/test_image_generation.py b/tests/unittests/test_image_generation.py
index efbeb74..0cb9105 100644
--- a/tests/unittests/test_image_generation.py
+++ b/tests/unittests/test_image_generation.py
@@ -99,48 +99,48 @@ def _check_image_generator(
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 25,
-    reason="Test requires GPU, at least 16GB of RAM and 25GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 25,
+    reason="Test requires GPU, at least 15GB of RAM and 25GB of HDD",
 )
 def test_cuda_sdxl_image_generator():
     _check_image_generator(StableDiffusionImageGenerator, "cuda")
 
 
 @pytest.mark.skipif(
-    total_memory < 16 or total_disk_space < 25,
-    reason="Test requires at least 16GB of RAM and 25GB of HDD",
+    total_memory < 15 or total_disk_space < 25,
+    reason="Test requires at least 15GB of RAM and 25GB of HDD",
 )
 def test_cpu_sdxl_image_generator():
     _check_image_generator(StableDiffusionImageGenerator, "cpu")
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 25,
-    reason="Test requires GPU, at least 16GB of RAM and 25GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 25,
+    reason="Test requires GPU, at least 15GB of RAM and 25GB of HDD",
 )
 def test_cuda_sdxl_turbo_image_generator():
     _check_image_generator(StableDiffusionTurboImageGenerator, "cuda")
 
 
 @pytest.mark.skipif(
-    total_memory < 16 or total_disk_space < 25,
-    reason="Test requires at least 16GB of RAM and 25GB of HDD",
+    total_memory < 15 or total_disk_space < 25,
+    reason="Test requires at least 15GB of RAM and 25GB of HDD",
 )
 def test_cpu_sdxl_turbo_image_generator():
     _check_image_generator(StableDiffusionTurboImageGenerator, "cpu")
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 25,
-    reason="Test requires GPU, at least 16GB of RAM and 25GB of HDD",
+    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 25,
+    reason="Test requires GPU, at least 15GB of RAM and 25GB of HDD",
 )
 def test_cuda_sdxl_lightning_image_generator():
     _check_image_generator(StableDiffusionLightningImageGenerator, "cuda")
 
 
 @pytest.mark.skipif(
-    total_memory < 16 or total_disk_space < 25,
-    reason="Test requires at least 16GB of RAM and 25GB of HDD",
+    total_memory < 15 or total_disk_space < 25,
+    reason="Test requires at least 15GB of RAM and 25GB of HDD",
 )
 def test_cpu_sdxl_lightning_image_generator():
     _check_image_generator(StableDiffusionLightningImageGenerator, "cpu")
diff --git a/tests/unittests/test_prompt_generation.py b/tests/unittests/test_prompt_generation.py
index 7769a71..0540611 100644
--- a/tests/unittests/test_prompt_generation.py
+++ b/tests/unittests/test_prompt_generation.py
@@ -70,8 +70,8 @@ def _check_lm_prompt_generator(
 
 
 @pytest.mark.skipif(
-    total_memory < 16 or not torch.cuda.is_available() or total_disk_space < 35,
-    reason="Test requires at least 16GB of RAM, 35GB of HDD and CUDA support",
+    total_memory < 15 or not torch.cuda.is_available() or total_disk_space < 35,
+    reason="Test requires at least 15GB of RAM, 35GB of HDD and CUDA support",
 )
 def test_cuda_lm_prompt_generator():
     _check_lm_prompt_generator("cuda")
@@ -128,8 +128,8 @@ def _check_synonym_generator(device: str, synonym_generator_class=LMSynonymGener
 
 
 @pytest.mark.skipif(
-    total_memory < 16 or not torch.cuda.is_available() or total_disk_space < 35,
-    reason="Test requires at least 16GB of RAM, 35GB of HDD and CUDA support",
+    total_memory < 15 or not torch.cuda.is_available() or total_disk_space < 35,
+    reason="Test requires at least 15GB of RAM, 35GB of HDD and CUDA support",
 )
 def test_cuda_synonym_generator():
     _check_synonym_generator("cuda")

From a839a4afd76017f6e4a4d336ab3d0955f7f5ba52 Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <49622375+sokovninn@users.noreply.github.com>
Date: Wed, 25 Sep 2024 23:41:15 +0200
Subject: [PATCH 19/31] test: 8vcpu buildjet runner

---
 .github/workflows/tests.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 31a711d..9d6d82d 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -13,8 +13,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [buildjet-4vcpu-ubuntu-2204, windows-latest, macOS-latest]
-        version: ['3.10', '3.11']
+        os: [buildjet-8vcpu-ubuntu-2204, windows-latest, macOS-latest]
+        version: ['3.10']
 
     runs-on: ${{ matrix.os }}
 
@@ -113,4 +113,4 @@ jobs:
       - name: Publish Test Results
         uses: EnricoMi/publish-unit-test-result-action@v2
         with:
-          files: "artifacts/**/*.xml"
\ No newline at end of file
+          files: "artifacts/**/*.xml"

From 6e43d3abcb31e75c4ad1e6b0e1011b039bb1351a Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <49622375+sokovninn@users.noreply.github.com>
Date: Wed, 25 Sep 2024 23:49:40 +0200
Subject: [PATCH 20/31] test: fix buildjet 8cpu runner

---
 .github/workflows/tests.yaml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 9d6d82d..2b57aec 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -31,7 +31,7 @@ jobs:
         cache: pip
 
     - name: Install dependencies [Ubuntu]
-      if: matrix.os == 'buildjet-4vcpu-ubuntu-2204'
+      if: matrix.os == 'buildjet-8vpcu-ubuntu-2204'
       run: |
         sudo apt update
         sudo apt install -y pandoc
@@ -48,26 +48,26 @@ jobs:
         pip install -e .[dev]
         pip install coverage-badge>=1.1.0 pytest-cov>=4.1.0
     - name: Run tests with coverage [Ubuntu]
-      if: matrix.os == 'buildjet-4vcpu-ubuntu-2204' && matrix.version == '3.10'
+      if: matrix.os == 'buildjet-8vpcu-ubuntu-2204' && matrix.version == '3.10'
       run: pytest tests --cov=datadreamer --cov-report xml --junit-xml pytest.xml
 
     - name: Run tests [Windows, macOS]
-      if: matrix.os != 'buildjet-4vcpu-ubuntu-2204' || matrix.version != '3.10'
+      if: matrix.os != 'buildjet-8vpcu-ubuntu-2204' || matrix.version != '3.10'
       run: pytest tests --junit-xml pytest.xml
 
     - name: Generate coverage badge [Ubuntu]
-      if: matrix.os == 'buildjet-4vcpu-ubuntu-2204' && matrix.version == '3.10'
+      if: matrix.os == 'buildjet-8vpcu-ubuntu-2204' && matrix.version == '3.10'
       run: coverage-badge -o media/coverage_badge.svg -f
 
     - name: Generate coverage report [Ubuntu]
-      if: matrix.os == 'buildjet-4vcpu-ubuntu-2204' && matrix.version == '3.10'
+      if: matrix.os == 'buildjet-8vpcu-ubuntu-2204' && matrix.version == '3.10'
       uses: orgoro/coverage@v3.1
       with:
         coverageFile: coverage.xml
         token: ${{ secrets.GITHUB_TOKEN }}
 
     - name: Commit coverage badge [Ubuntu]
-      if: matrix.os == 'buildjet-4vcpu-ubuntu-2204' && matrix.version == '3.10'
+      if: matrix.os == 'buildjet-8vpcu-ubuntu-2204' && matrix.version == '3.10'
       run: |
         git config --global user.name 'GitHub Actions'
         git config --global user.email 'actions@github.com'
@@ -76,7 +76,7 @@ jobs:
           git commit -m "[Automated] Updated coverage badge"
         }
     - name: Push changes [Ubuntu]
-      if: matrix.os == 'buildjet-4vcpu-ubuntu-2204' && matrix.version == '3.10'
+      if: matrix.os == 'buildjet-8vpcu-ubuntu-2204' && matrix.version == '3.10'
       uses: ad-m/github-push-action@master
       with:
         branch: ${{ github.head_ref }}

From 005e3b8ba9aaacecd64d2391a031e2bb96efa3eb Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <49622375+sokovninn@users.noreply.github.com>
Date: Thu, 26 Sep 2024 00:13:18 +0200
Subject: [PATCH 21/31] test: fix 8vcpu buildjet

---
 .github/workflows/tests.yaml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 2b57aec..bf56969 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -31,7 +31,7 @@ jobs:
         cache: pip
 
     - name: Install dependencies [Ubuntu]
-      if: matrix.os == 'buildjet-8vpcu-ubuntu-2204'
+      if: matrix.os == 'buildjet-8vcpu-ubuntu-2204'
       run: |
         sudo apt update
         sudo apt install -y pandoc
@@ -48,26 +48,26 @@ jobs:
         pip install -e .[dev]
         pip install coverage-badge>=1.1.0 pytest-cov>=4.1.0
     - name: Run tests with coverage [Ubuntu]
-      if: matrix.os == 'buildjet-8vpcu-ubuntu-2204' && matrix.version == '3.10'
+      if: matrix.os == 'buildjet-8vcpu-ubuntu-2204' && matrix.version == '3.10'
       run: pytest tests --cov=datadreamer --cov-report xml --junit-xml pytest.xml
 
     - name: Run tests [Windows, macOS]
-      if: matrix.os != 'buildjet-8vpcu-ubuntu-2204' || matrix.version != '3.10'
+      if: matrix.os != 'buildjet-8vcpu-ubuntu-2204' || matrix.version != '3.10'
       run: pytest tests --junit-xml pytest.xml
 
     - name: Generate coverage badge [Ubuntu]
-      if: matrix.os == 'buildjet-8vpcu-ubuntu-2204' && matrix.version == '3.10'
+      if: matrix.os == 'buildjet-8vcpu-ubuntu-2204' && matrix.version == '3.10'
       run: coverage-badge -o media/coverage_badge.svg -f
 
     - name: Generate coverage report [Ubuntu]
-      if: matrix.os == 'buildjet-8vpcu-ubuntu-2204' && matrix.version == '3.10'
+      if: matrix.os == 'buildjet-8vcpu-ubuntu-2204' && matrix.version == '3.10'
       uses: orgoro/coverage@v3.1
       with:
         coverageFile: coverage.xml
         token: ${{ secrets.GITHUB_TOKEN }}
 
     - name: Commit coverage badge [Ubuntu]
-      if: matrix.os == 'buildjet-8vpcu-ubuntu-2204' && matrix.version == '3.10'
+      if: matrix.os == 'buildjet-8vcpu-ubuntu-2204' && matrix.version == '3.10'
       run: |
         git config --global user.name 'GitHub Actions'
         git config --global user.email 'actions@github.com'
@@ -76,7 +76,7 @@ jobs:
           git commit -m "[Automated] Updated coverage badge"
         }
     - name: Push changes [Ubuntu]
-      if: matrix.os == 'buildjet-8vpcu-ubuntu-2204' && matrix.version == '3.10'
+      if: matrix.os == 'buildjet-8vcpu-ubuntu-2204' && matrix.version == '3.10'
       uses: ad-m/github-push-action@master
       with:
         branch: ${{ github.head_ref }}

From 9f3a538688e900ff3ca894454c6293643cef6830 Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Fri, 27 Sep 2024 00:28:30 +0000
Subject: [PATCH 22/31] test: divide tests into core and heavy

---
 .../workflows/{tests.yaml => core-tests.yaml} |  19 +-
 .github/workflows/unit-tests.yaml             | 116 +++++
 .../integration/sample_config.yaml            |   0
 tests/core_tests/integration/test_pipeline.py | 122 ++++++
 .../unittests/test_annotators.py              |  26 +-
 .../unittests/test_converters.py              |   0
 .../unittests/test_image_generation.py        |  50 +--
 .../unittests/test_pipeline_arguments.py      | 213 ++++++++++
 .../unittests/test_prompt_generation.py       |  52 +--
 .../{ => core_tests}/unittests/test_utils.py  |   0
 .../integration/test_pipeline.py              | 398 +++---------------
 .../unittests/test_image_generation.py        |  69 +++
 .../unittests/test_prompt_generation.py       |  95 +++++
 13 files changed, 716 insertions(+), 444 deletions(-)
 rename .github/workflows/{tests.yaml => core-tests.yaml} (88%)
 create mode 100644 .github/workflows/unit-tests.yaml
 rename tests/{ => core_tests}/integration/sample_config.yaml (100%)
 create mode 100644 tests/core_tests/integration/test_pipeline.py
 rename tests/{ => core_tests}/unittests/test_annotators.py (80%)
 rename tests/{ => core_tests}/unittests/test_converters.py (100%)
 rename tests/{ => core_tests}/unittests/test_image_generation.py (71%)
 create mode 100644 tests/core_tests/unittests/test_pipeline_arguments.py
 rename tests/{ => core_tests}/unittests/test_prompt_generation.py (78%)
 rename tests/{ => core_tests}/unittests/test_utils.py (100%)
 rename tests/{ => heavy_tests}/integration/test_pipeline.py (69%)
 create mode 100644 tests/heavy_tests/unittests/test_image_generation.py
 create mode 100644 tests/heavy_tests/unittests/test_prompt_generation.py

diff --git a/.github/workflows/tests.yaml b/.github/workflows/core-tests.yaml
similarity index 88%
rename from .github/workflows/tests.yaml
rename to .github/workflows/core-tests.yaml
index bf56969..41c96a9 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/core-tests.yaml
@@ -1,12 +1,13 @@
-name: Tests
+name: Core tests
 
 on:
   pull_request:
-    branches: [ dev, main ]
+    branches: [ main ]
     paths:
       - 'datadreamer/**/**.py'
-      - 'tests/**/**.py'
-      - .github/workflows/tests.yaml
+      - 'tests/core_tests/**/**.py'
+      - .github/workflows/core-tests.yaml
+  workflow_dispatch:
 
 jobs:
   run_tests:
@@ -14,7 +15,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [buildjet-8vcpu-ubuntu-2204, windows-latest, macOS-latest]
-        version: ['3.10']
+        version: ['3.10', '3.11']
 
     runs-on: ${{ matrix.os }}
 
@@ -49,11 +50,11 @@ jobs:
         pip install coverage-badge>=1.1.0 pytest-cov>=4.1.0
     - name: Run tests with coverage [Ubuntu]
       if: matrix.os == 'buildjet-8vcpu-ubuntu-2204' && matrix.version == '3.10'
-      run: pytest tests --cov=datadreamer --cov-report xml --junit-xml pytest.xml
+      run: pytest tests/core_tests --cov=datadreamer --cov-report xml --junit-xml pytest.xml
 
     - name: Run tests [Windows, macOS]
-      if: matrix.os != 'buildjet-8vcpu-ubuntu-2204' || matrix.version != '3.10'
-      run: pytest tests --junit-xml pytest.xml
+      if: matrix.os != 'buildjet-8vcpu-ubuntu-2204'
+      run: pytest tests/core_tests --junit-xml pytest.xml
 
     - name: Generate coverage badge [Ubuntu]
       if: matrix.os == 'buildjet-8vcpu-ubuntu-2204' && matrix.version == '3.10'
@@ -93,7 +94,7 @@ jobs:
   publish-test-results:
     name: "Publish Tests Results"
     needs: run_tests
-    runs-on: ubuntu-latest
+    runs-on: buildjet-8vcpu-ubuntu-2204
     permissions:
       checks: write
       pull-requests: write
diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml
new file mode 100644
index 0000000..59de92a
--- /dev/null
+++ b/.github/workflows/unit-tests.yaml
@@ -0,0 +1,116 @@
+name: Unit tests
+
+on:
+  pull_request:
+    branches: [ dev ]
+    paths:
+      - 'datadreamer/**/**.py'
+      - 'tests/core_tests/unittests/**.py'
+      - .github/workflows/unit-tests.yaml
+
+jobs:
+  run_tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, windows-latest, macOS-latest]
+        version: ['3.10', '3.11']
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+      with:
+        ref: ${{ github.head_ref }}
+
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.version }}
+        cache: pip
+
+    - name: Install dependencies [Ubuntu]
+      if: matrix.os == 'ubuntu-latest'
+      run: |
+        sudo apt update
+        sudo apt install -y pandoc
+        pip install -e .[dev]
+        pip install coverage-badge>=1.1.0 pytest-cov>=4.1.0
+    - name: Install dependencies [Windows]
+      if: matrix.os == 'windows-latest'
+      run: |
+        pip install -e .[dev]
+        pip install coverage-badge>=1.1.0 pytest-cov>=4.1.0
+    - name: Install dependencies [macOS]
+      if: matrix.os == 'macOS-latest'
+      run: |
+        pip install -e .[dev]
+        pip install coverage-badge>=1.1.0 pytest-cov>=4.1.0
+    - name: Run tests with coverage [Ubuntu]
+      if: matrix.os == 'ubuntu-latest' && matrix.version == '3.10'
+      run: pytest tests/core_tests/unittests --cov=datadreamer --cov-report xml --junit-xml pytest.xml
+
+    - name: Run tests [Windows, macOS]
+      if: matrix.os != 'ubuntu-latest' || matrix.version != '3.10'
+      run: pytest tests/core_tests/unittests --junit-xml pytest.xml
+
+    - name: Generate coverage badge [Ubuntu]
+      if: matrix.os == 'ubuntu-latest' && matrix.version == '3.10'
+      run: coverage-badge -o media/coverage_badge.svg -f
+
+    - name: Generate coverage report [Ubuntu]
+      if: matrix.os == 'ubuntu-latest' && matrix.version == '3.10'
+      uses: orgoro/coverage@v3.1
+      with:
+        coverageFile: coverage.xml
+        token: ${{ secrets.GITHUB_TOKEN }}
+
+    - name: Commit coverage badge [Ubuntu]
+      if: matrix.os == 'ubuntu-latest' && matrix.version == '3.10'
+      run: |
+        git config --global user.name 'GitHub Actions'
+        git config --global user.email 'actions@github.com'
+        git diff --quiet media/coverage_badge.svg || {
+          git add media/coverage_badge.svg
+          git commit -m "[Automated] Updated coverage badge"
+        }
+    - name: Push changes [Ubuntu]
+      if: matrix.os == 'ubuntu-latest' && matrix.version == '3.10'
+      uses: ad-m/github-push-action@master
+      with:
+        branch: ${{ github.head_ref }}
+
+    - name: Upload Test Results
+      if: always()
+      uses: actions/upload-artifact@v4
+      with:
+        name: Test Results [${{ matrix.os }}] (Python ${{ matrix.version }})
+        path: pytest.xml
+        retention-days: 10
+        if-no-files-found: error
+
+  publish-test-results:
+    name: "Publish Tests Results"
+    needs: run_tests
+    runs-on: ubuntu-latest
+    permissions:
+      checks: write
+      pull-requests: write
+    if: always()
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.head_ref }}
+
+      - name: Download Artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: artifacts
+
+      - name: Publish Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        with:
+          files: "artifacts/**/*.xml"
diff --git a/tests/integration/sample_config.yaml b/tests/core_tests/integration/sample_config.yaml
similarity index 100%
rename from tests/integration/sample_config.yaml
rename to tests/core_tests/integration/sample_config.yaml
diff --git a/tests/core_tests/integration/test_pipeline.py b/tests/core_tests/integration/test_pipeline.py
new file mode 100644
index 0000000..190c0ce
--- /dev/null
+++ b/tests/core_tests/integration/test_pipeline.py
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+import os
+import subprocess
+
+import psutil
+import pytest
+import torch
+
+# Get the total memory in GB
+total_memory = psutil.virtual_memory().total / (1024**3)
+# Get the total disk space in GB
+total_disk_space = psutil.disk_usage("/").total / (1024**3)
+
+
+def _check_detection_pipeline(cmd: str, target_folder: str):
+    # Run the command
+    result = subprocess.run(cmd, shell=True)
+    assert result.returncode == 0, "Command failed to run"
+    # Check that the target folder is a folder
+    assert os.path.isdir(target_folder), "Directory not created"
+    files = [
+        "annotations.json",
+        "generation_args.yaml",
+        "prompts.json",
+    ]
+    # Check that all the files were created
+    for file in files:
+        assert os.path.isfile(os.path.join(target_folder, file)), f"{file} not created"
+    # Check that at least one image_*.jpg file was created
+    assert (
+        len(
+            list(
+                filter(
+                    lambda x: "image_" in x and ".jpg" in x, os.listdir(target_folder)
+                )
+            )
+        )
+        > 0
+    ), "Images not created"
+    # Check that the "bboxes_visualization" folder was created
+    assert os.path.isdir(
+        os.path.join(target_folder, "bboxes_visualization")
+    ), "bboxes_visualization directory not created"
+
+# =========================================================
+# TEST WITH CONFIG FILE
+# =========================================================
+@pytest.mark.skipif(
+    total_memory < 16 or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+)
+def test_cpu_simple_sdxl_turbo_config_detection_pipeline():
+    # Define target folder
+    target_folder = "data/data-det-cpu-simple-sdxl-turbo-config/"
+    # Define the command to run the datadreamer
+    cmd = (
+        f"datadreamer --save_dir {target_folder} "
+        f"--num_objects_range 1 2 "
+        f"--config ./sample_config.yaml "
+        f"--device cpu"
+    )
+    # Check the run of the pipeline
+    _check_detection_pipeline(cmd, target_folder)
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+)
+def test_cuda_simple_sdxl_turbo_config_detection_pipeline():
+    # Define target folder
+    target_folder = "data/data-det-cuda-simple-sdxl-turbo-config/"
+    # Define the command to run the datadreamer
+    cmd = (
+        f"datadreamer --save_dir {target_folder} "
+        f"--num_objects_range 1 2 "
+        f"--config ./sample_config.yaml "
+        f"--device cuda"
+    )
+    # Check the run of the pipeline
+    _check_detection_pipeline(cmd, target_folder)
+
+
+@pytest.mark.skipif(
+    total_memory < 16 or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+)
+def test_cpu_simple_sdxl_turbo_config_classification_pipeline():
+    # Define target folder
+    target_folder = "data/data-cls-cpu-simple-sdxl-turbo-config/"
+    # Define the command to run the datadreamer
+    cmd = (
+        f"datadreamer --task classification "
+        f"--save_dir {target_folder} "
+        f"--num_objects_range 1 2 "
+        f"--image_annotator clip "
+        f"--config ./sample_config.yaml "
+        f"--device cpu"
+    )
+    # Check the run of the pipeline
+    _check_detection_pipeline(cmd, target_folder)
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+)
+def test_cuda_simple_sdxl_turbo_config_classification_pipeline():
+    # Define target folder
+    target_folder = "data/data-cls-cuda-simple-sdxl-turbo-config/"
+    # Define the command to run the datadreamer
+    cmd = (
+        f"datadreamer --task classification "
+        f"--save_dir {target_folder} "
+        f"--num_objects_range 1 2 "
+        f"--image_annotator clip "
+        f"--config ./sample_config.yaml "
+        f"--device cuda"
+    )
+    # Check the run of the pipeline
+    _check_detection_pipeline(cmd, target_folder)
diff --git a/tests/unittests/test_annotators.py b/tests/core_tests/unittests/test_annotators.py
similarity index 80%
rename from tests/unittests/test_annotators.py
rename to tests/core_tests/unittests/test_annotators.py
index f69f83e..794b898 100644
--- a/tests/unittests/test_annotators.py
+++ b/tests/core_tests/unittests/test_annotators.py
@@ -11,7 +11,7 @@
 from datadreamer.dataset_annotation.owlv2_annotator import OWLv2Annotator
 
 # Get the total disk space in GB
-total_disk_space = psutil.disk_usage("/").total / (1000**3)
+total_disk_space = psutil.disk_usage("/").total / (1024**3)
 
 
 def _check_owlv2_annotator(device: str, size: str = "base"):
@@ -40,16 +40,16 @@ def _check_owlv2_annotator(device: str, size: str = "base"):
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_disk_space < 15,
-    reason="Test requires GPU and 15GB of HDD",
+    not torch.cuda.is_available() or total_disk_space < 16,
+    reason="Test requires GPU and 16GB of HDD",
 )
 def test_cuda_owlv2_annotator():
     _check_owlv2_annotator("cuda")
 
 
 @pytest.mark.skipif(
-    total_disk_space < 15,
-    reason="Test requires at least 15GB of HDD",
+    total_disk_space < 16,
+    reason="Test requires at least 16GB of HDD",
 )
 def test_cpu_owlv2_annotator():
     _check_owlv2_annotator("cpu")
@@ -67,32 +67,32 @@ def _check_clip_annotator(device: str, size: str = "base"):
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_disk_space < 15,
-    reason="Test requires GPU and 15GB of HDD",
+    not torch.cuda.is_available() or total_disk_space < 16,
+    reason="Test requires GPU and 16GB of HDD",
 )
 def test_cuda_clip_base_annotator():
     _check_clip_annotator("cuda")
 
 
 @pytest.mark.skipif(
-    total_disk_space < 15,
-    reason="Test requires at least 15GB of HDD",
+    total_disk_space < 16,
+    reason="Test requires at least 16GB of HDD",
 )
 def test_cpu_clip_base_annotator():
     _check_clip_annotator("cpu")
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_disk_space < 15,
-    reason="Test requires GPU and 15GB of HDD",
+    not torch.cuda.is_available() or total_disk_space < 16,
+    reason="Test requires GPU and 16GB of HDD",
 )
 def test_cuda_clip_large_annotator():
     _check_clip_annotator("cuda")
 
 
 @pytest.mark.skipif(
-    total_disk_space < 15,
-    reason="Test requires at least 15GB of HDD",
+    total_disk_space < 16,
+    reason="Test requires at least 16GB of HDD",
 )
 def test_cpu_clip_large_annotator():
     _check_clip_annotator("cpu")
diff --git a/tests/unittests/test_converters.py b/tests/core_tests/unittests/test_converters.py
similarity index 100%
rename from tests/unittests/test_converters.py
rename to tests/core_tests/unittests/test_converters.py
diff --git a/tests/unittests/test_image_generation.py b/tests/core_tests/unittests/test_image_generation.py
similarity index 71%
rename from tests/unittests/test_image_generation.py
rename to tests/core_tests/unittests/test_image_generation.py
index 0cb9105..51fda42 100644
--- a/tests/unittests/test_image_generation.py
+++ b/tests/core_tests/unittests/test_image_generation.py
@@ -8,18 +8,18 @@
 import torch
 from PIL import Image
 
+from datadreamer.image_generation.clip_image_tester import ClipImageTester
 from datadreamer.image_generation import (
     StableDiffusionImageGenerator,
     StableDiffusionLightningImageGenerator,
     StableDiffusionTurboImageGenerator,
 )
-from datadreamer.image_generation.clip_image_tester import ClipImageTester
 
 # Get the total memory in GB
-total_memory = psutil.virtual_memory().total / (1000**3)
-total_memory = int(total_memory) + (total_memory > int(total_memory))
+total_memory = psutil.virtual_memory().total / (1024**3)
 # Get the total disk space in GB
-total_disk_space = psutil.disk_usage("/").total / (1000**3)
+total_disk_space = psutil.disk_usage("/").total / (1024**3)
+
 
 
 def _check_clip_image_tester(device: str):
@@ -50,21 +50,20 @@ def _check_clip_image_tester(device: str):
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_disk_space < 15,
-    reason="Test requires GPU and 15GB of HDD",
+    not torch.cuda.is_available() or total_disk_space < 16,
+    reason="Test requires GPU and 16GB of HDD",
 )
 def test_cuda_clip_image_tester():
     _check_clip_image_tester("cuda")
 
 
 @pytest.mark.skipif(
-    total_disk_space < 15,
-    reason="Test requires at least 15GB of HDD",
+    total_disk_space < 16,
+    reason="Test requires at least 16GB of HDD",
 )
 def test_cpu_clip_image_tester():
     _check_clip_image_tester("cpu")
 
-
 def _check_image_generator(
     image_generator_class: Type[
         Union[
@@ -97,50 +96,33 @@ def _check_image_generator(
     # Release the generator
     image_generator.release(empty_cuda_cache=True if device != "cpu" else False)
 
-
-@pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 25,
-    reason="Test requires GPU, at least 15GB of RAM and 25GB of HDD",
-)
-def test_cuda_sdxl_image_generator():
-    _check_image_generator(StableDiffusionImageGenerator, "cuda")
-
-
-@pytest.mark.skipif(
-    total_memory < 15 or total_disk_space < 25,
-    reason="Test requires at least 15GB of RAM and 25GB of HDD",
-)
-def test_cpu_sdxl_image_generator():
-    _check_image_generator(StableDiffusionImageGenerator, "cpu")
-
-
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 25,
-    reason="Test requires GPU, at least 15GB of RAM and 25GB of HDD",
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 25,
+    reason="Test requires GPU, at least 16GB of RAM and 25GB of HDD",
 )
 def test_cuda_sdxl_turbo_image_generator():
     _check_image_generator(StableDiffusionTurboImageGenerator, "cuda")
 
 
 @pytest.mark.skipif(
-    total_memory < 15 or total_disk_space < 25,
-    reason="Test requires at least 15GB of RAM and 25GB of HDD",
+    total_memory < 16 or total_disk_space < 25,
+    reason="Test requires at least 16GB of RAM and 25GB of HDD",
 )
 def test_cpu_sdxl_turbo_image_generator():
     _check_image_generator(StableDiffusionTurboImageGenerator, "cpu")
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 25,
-    reason="Test requires GPU, at least 15GB of RAM and 25GB of HDD",
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 25,
+    reason="Test requires GPU, at least 16GB of RAM and 25GB of HDD",
 )
 def test_cuda_sdxl_lightning_image_generator():
     _check_image_generator(StableDiffusionLightningImageGenerator, "cuda")
 
 
 @pytest.mark.skipif(
-    total_memory < 15 or total_disk_space < 25,
-    reason="Test requires at least 15GB of RAM and 25GB of HDD",
+    total_memory < 16 or total_disk_space < 25,
+    reason="Test requires at least 16GB of RAM and 25GB of HDD",
 )
 def test_cpu_sdxl_lightning_image_generator():
     _check_image_generator(StableDiffusionLightningImageGenerator, "cpu")
diff --git a/tests/core_tests/unittests/test_pipeline_arguments.py b/tests/core_tests/unittests/test_pipeline_arguments.py
new file mode 100644
index 0000000..198db72
--- /dev/null
+++ b/tests/core_tests/unittests/test_pipeline_arguments.py
@@ -0,0 +1,213 @@
+from __future__ import annotations
+
+import subprocess
+
+import pytest
+
+
+def _check_wrong_argument_choice(cmd: str):
+    with pytest.raises(subprocess.CalledProcessError):
+        subprocess.check_call(cmd, shell=True)
+
+
+def _check_wrong_value(cmd: str):
+    with pytest.raises(ValueError):
+        try:
+            subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
+        except subprocess.CalledProcessError as e:
+            raise ValueError(e.output.decode()) from e
+
+
+# =========================================================
+# ARGUMENTS CHECKS
+# =========================================================
+def test_invalid_task_value():
+    # Define the cmd
+    cmd = "datadreamer --task invalid_task"
+    _check_wrong_argument_choice(cmd)
+
+
+def test_invalid_prompts_number_type():
+    # Define the cmd
+    cmd = "datadreamer --prompts_number value"
+    _check_wrong_argument_choice(cmd)
+
+
+def test_invalid_num_objects_range_type():
+    # Define the cmd
+    cmd = "datadreamer --num_objects_range value"
+    _check_wrong_argument_choice(cmd)
+
+
+def test_invalid_conf_threshold_range_type():
+    # Define the cmd
+    cmd = "datadreamer --conf_threshold value"
+    _check_wrong_argument_choice(cmd)
+
+
+def test_invalid_image_tester_patience_type():
+    # Define the cmd
+    cmd = "datadreamer --image_tester_patience value"
+    _check_wrong_argument_choice(cmd)
+
+
+def test_invalid_seed_type():
+    # Define the cmd
+    cmd = "datadreamer --seed value --device cpu"
+    _check_wrong_argument_choice(cmd)
+
+
+def test_invalid_prompt_generator():
+    # Define the cmd
+    cmd = "datadreamer --prompt_generator invalide_value"
+    _check_wrong_argument_choice(cmd)
+
+
+def test_invalid_image_generator():
+    # Define the cmd
+    cmd = "datadreamer --image_generator invalide_value"
+    _check_wrong_argument_choice(cmd)
+
+
+def test_invalid_image_annotator():
+    # Define the cmd
+    cmd = "datadreamer --image_annotator invalide_value"
+    _check_wrong_argument_choice(cmd)
+
+
+def test_invalid_det_image_annotator():
+    # Define the cmd
+    cmd = "datadreamer --image_annotator clip"
+    _check_wrong_argument_choice(cmd)
+
+
+def test_invalid_clf_image_annotator():
+    # Define the cmd
+    cmd = "datadreamer --image_annotator owlv2 --task classification"
+    _check_wrong_argument_choice(cmd)
+
+
+def test_invalid_device():
+    # Define the cmd
+    cmd = "datadreamer --device invalide_value"
+    _check_wrong_argument_choice(cmd)
+
+
+def test_invalid_annotator_size():
+    # Define the cmd
+    cmd = "datadreamer --annotator_size invalide_value"
+    _check_wrong_argument_choice(cmd)
+
+
+def test_empty_class_names():
+    # Define the cmd
+    cmd = "datadreamer --class_names []"
+    _check_wrong_value(cmd)
+
+
+def test_invalid_class_names():
+    # Define the cmd
+    cmd = "datadreamer --class_names [2, -1]"
+    _check_wrong_value(cmd)
+
+
+def test_invalid_prompts_number():
+    # Define the cmd
+    cmd = "datadreamer --prompts_number -1"
+    _check_wrong_value(cmd)
+
+
+def test_negative_conf_threshold():
+    # Define the cmd
+    cmd = "datadreamer --conf_threshold -1"
+    _check_wrong_value(cmd)
+
+
+def test_big_conf_threshold():
+    # Define the cmd
+    cmd = "datadreamer --conf_threshold 10"
+    _check_wrong_value(cmd)
+
+
+def test_negative_annotation_iou_threshold():
+    # Define the cmd
+    cmd = "datadreamer --annotation_iou_threshold -1"
+    _check_wrong_value(cmd)
+
+
+def test_big_annotation_iou_threshold():
+    # Define the cmd
+    cmd = "datadreamer --annotation_iou_threshold 10"
+    _check_wrong_value(cmd)
+
+
+def test_invalid_image_tester_patience():
+    # Define the cmd
+    cmd = "datadreamer --image_tester_patience -1"
+    _check_wrong_value(cmd)
+
+
+def test_invalid_seed():
+    # Define the cmd
+    cmd = "datadreamer --seed -1 --device cpu"
+    _check_wrong_value(cmd)
+
+
+def test_invalid_synonym_generator():
+    # Define the cmd
+    cmd = "datadreamer --device cpu --synonym_generator invalid"
+    _check_wrong_value(cmd)
+
+
+def test_invalid_lm_quantization():
+    # Use a valid device so the failure is caused by the bad --lm_quantization value
+    cmd = "datadreamer --device cuda --lm_quantization invalid"
+    _check_wrong_value(cmd)
+
+
+def test_invalid_device_lm_quantization():
+    # Define the cmd
+    cmd = "datadreamer --device cpu --lm_quantization 4bit"
+    _check_wrong_value(cmd)
+
+
+def test_invalid_batch_size_prompt():
+    # Define the cmd
+    cmd = "datadreamer --batch_size_prompt -1"
+    _check_wrong_value(cmd)
+
+
+def test_invalid_batch_size_annotation():
+    # Define the cmd
+    cmd = "datadreamer --batch_size_annotation -1"
+    _check_wrong_value(cmd)
+
+
+def test_invalid_batch_size_image():
+    # Define the cmd
+    cmd = "datadreamer --batch_size_image -1"
+    _check_wrong_value(cmd)
+
+
+def test_invalid_num_objects_range():
+    # Define the cmd
+    cmd = "datadreamer --num_objects_range 1"
+    _check_wrong_value(cmd)
+
+
+def test_many_num_objects_range():
+    # Define the cmd
+    cmd = "datadreamer --num_objects_range 1 2 3"
+    _check_wrong_value(cmd)
+
+
+def test_desc_num_objects_range():
+    # Define the cmd
+    cmd = "datadreamer --num_objects_range 3 1"
+    _check_wrong_value(cmd)
+
+
+def test_negative_num_objects_range():
+    # Define the cmd
+    cmd = "datadreamer --num_objects_range -3 1"
+    _check_wrong_value(cmd)
\ No newline at end of file
diff --git a/tests/unittests/test_prompt_generation.py b/tests/core_tests/unittests/test_prompt_generation.py
similarity index 78%
rename from tests/unittests/test_prompt_generation.py
rename to tests/core_tests/unittests/test_prompt_generation.py
index 0540611..fca6fc0 100644
--- a/tests/unittests/test_prompt_generation.py
+++ b/tests/core_tests/unittests/test_prompt_generation.py
@@ -15,10 +15,9 @@
 )
 
 # Get the total memory in GB
-total_memory = psutil.virtual_memory().total / (1000**3)
-total_memory = int(total_memory) + (total_memory > int(total_memory))
+total_memory = psutil.virtual_memory().total / (1024**3)
 # Get the total disk space in GB
-total_disk_space = psutil.disk_usage("/").total / (1000**3)
+total_disk_space = psutil.disk_usage("/").total / (1024**3)
 
 
 def test_simple_prompt_generator():
@@ -40,7 +39,6 @@ def test_simple_prompt_generator():
         # Check the generated text
         assert prompt_text == f"A photo of a {', a '.join(selected_objects)}"
 
-
 def _check_lm_prompt_generator(
     device: str, prompt_generator_class=LMPromptGenerator, quantization: str = "none"
 ):
@@ -68,15 +66,6 @@ def _check_lm_prompt_generator(
         assert len(prompt_text) > 0 and prompt_text.lower().startswith("a photo of")
     prompt_generator.release(empty_cuda_cache=True if device != "cpu" else False)
 
-
-@pytest.mark.skipif(
-    total_memory < 15 or not torch.cuda.is_available() or total_disk_space < 35,
-    reason="Test requires at least 15GB of RAM, 35GB of HDD and CUDA support",
-)
-def test_cuda_lm_prompt_generator():
-    _check_lm_prompt_generator("cuda")
-
-
 @pytest.mark.skipif(
     total_memory < 12 or not torch.cuda.is_available() or total_disk_space < 25,
     reason="Test requires at least 12GB of RAM, 25GB of HDD and CUDA support",
@@ -84,14 +73,12 @@ def test_cuda_lm_prompt_generator():
 def test_cuda_4bit_lm_prompt_generator():
     _check_lm_prompt_generator("cuda", quantization="4bit")
 
-
 @pytest.mark.skipif(
-    total_memory < 32 or total_disk_space < 35,
-    reason="Test requires at least 28GB of RAM and 35GB of HDD for running on CPU",
+    total_memory < 12 or total_disk_space < 12,
+    reason="Test requires at least 12GB of RAM and 12GB of HDD for running on CPU",
 )
-def test_cpu_lm_prompt_generator():
-    _check_lm_prompt_generator("cpu")
-
+def test_cpu_tinyllama_lm_prompt_generator():
+    _check_lm_prompt_generator("cpu", TinyLlamaLMPromptGenerator)
 
 @pytest.mark.skipif(
     total_memory < 8 or not torch.cuda.is_available() or total_disk_space < 12,
@@ -101,14 +88,6 @@ def test_cuda_tinyllama_lm_prompt_generator():
     _check_lm_prompt_generator("cuda", TinyLlamaLMPromptGenerator)
 
 
-@pytest.mark.skipif(
-    total_memory < 12 or total_disk_space < 12,
-    reason="Test requires at least 12GB of RAM and 12GB of HDD for running on CPU",
-)
-def test_cpu_tinyllama_lm_prompt_generator():
-    _check_lm_prompt_generator("cpu", TinyLlamaLMPromptGenerator)
-
-
 def _check_synonym_generator(device: str, synonym_generator_class=LMSynonymGenerator):
     synonyms_num = 3
     generator = synonym_generator_class(synonyms_number=synonyms_num, device=device)
@@ -127,28 +106,11 @@ def _check_synonym_generator(device: str, synonym_generator_class=LMSynonymGener
     generator.release(empty_cuda_cache=True if device != "cpu" else False)
 
 
-@pytest.mark.skipif(
-    total_memory < 15 or not torch.cuda.is_available() or total_disk_space < 35,
-    reason="Test requires at least 15GB of RAM, 35GB of HDD and CUDA support",
-)
-def test_cuda_synonym_generator():
-    _check_synonym_generator("cuda")
-
-
-@pytest.mark.skipif(
-    total_memory < 32 or total_disk_space < 35,
-    reason="Test requires at least 28GB of RAM and 35GB of HDD for running on CPU",
-)
-def test_cpu_synonym_generator():
-    _check_synonym_generator("cpu")
-
-
 def test_cpu_wordnet_synonym_generator():
     _check_synonym_generator("cpu", WordNetSynonymGenerator)
 
-
 @pytest.mark.skipif(
-    torch.cuda.is_available(),
+    not torch.cuda.is_available(),
     reason="Test requires CUDA support",
 )
 def test_cuda_wordnet_synonym_generator():
diff --git a/tests/unittests/test_utils.py b/tests/core_tests/unittests/test_utils.py
similarity index 100%
rename from tests/unittests/test_utils.py
rename to tests/core_tests/unittests/test_utils.py
diff --git a/tests/integration/test_pipeline.py b/tests/heavy_tests/integration/test_pipeline.py
similarity index 69%
rename from tests/integration/test_pipeline.py
rename to tests/heavy_tests/integration/test_pipeline.py
index 7cfb88a..d368dac 100644
--- a/tests/integration/test_pipeline.py
+++ b/tests/heavy_tests/integration/test_pipeline.py
@@ -8,10 +8,9 @@
 import torch
 
 # Get the total memory in GB
-total_memory = psutil.virtual_memory().total / (1000**3)
-total_memory = int(total_memory) + (total_memory > int(total_memory))
+total_memory = psutil.virtual_memory().total / (1024**3)  # bytes -> GiB
 # Get the total disk space in GB
-total_disk_space = psutil.disk_usage("/").total / (1000**3)
+total_disk_space = psutil.disk_usage("/").total / (1024**3)
 
 
 def _check_detection_pipeline(cmd: str, target_folder: str):
@@ -45,220 +44,12 @@ def _check_detection_pipeline(cmd: str, target_folder: str):
     ), "bboxes_visualization directory not created"
 
 
-def _check_wrong_argument_choice(cmd: str):
-    with pytest.raises(subprocess.CalledProcessError):
-        subprocess.check_call(cmd, shell=True)
-
-
-def _check_wrong_value(cmd: str):
-    with pytest.raises(ValueError):
-        try:
-            subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
-        except subprocess.CalledProcessError as e:
-            raise ValueError(e.output.decode()) from e
-
-
-# =========================================================
-# ARGUMENTS CHECKS
-# =========================================================
-def test_invalid_task_value():
-    # Define the cmd
-    cmd = "datadreamer --task invalid_task"
-    _check_wrong_argument_choice(cmd)
-
-
-def test_invalid_prompts_number_type():
-    # Define the cmd
-    cmd = "datadreamer --prompts_number value"
-    _check_wrong_argument_choice(cmd)
-
-
-def test_invalid_num_objects_range_type():
-    # Define the cmd
-    cmd = "datadreamer --num_objects_range value"
-    _check_wrong_argument_choice(cmd)
-
-
-def test_invalid_conf_threshold_range_type():
-    # Define the cmd
-    cmd = "datadreamer --conf_threshold value"
-    _check_wrong_argument_choice(cmd)
-
-
-def test_invalid_image_tester_patience_type():
-    # Define the cmd
-    cmd = "datadreamer --image_tester_patience value"
-    _check_wrong_argument_choice(cmd)
-
-
-def test_invalid_seed_type():
-    # Define the cmd
-    cmd = "datadreamer --seed value --device cpu"
-    _check_wrong_argument_choice(cmd)
-
-
-def test_invalid_prompt_generator():
-    # Define the cmd
-    cmd = "datadreamer --prompt_generator invalide_value"
-    _check_wrong_argument_choice(cmd)
-
-
-def test_invalid_image_generator():
-    # Define the cmd
-    cmd = "datadreamer --image_generator invalide_value"
-    _check_wrong_argument_choice(cmd)
-
-
-def test_invalid_image_annotator():
-    # Define the cmd
-    cmd = "datadreamer --image_annotator invalide_value"
-    _check_wrong_argument_choice(cmd)
-
-
-def test_invalid_det_image_annotator():
-    # Define the cmd
-    cmd = "datadreamer --image_annotator clip"
-    _check_wrong_argument_choice(cmd)
-
-
-def test_invalid_clf_image_annotator():
-    # Define the cmd
-    cmd = "datadreamer --image_annotator owlv2 --task classification"
-    _check_wrong_argument_choice(cmd)
-
-
-def test_invalid_device():
-    # Define the cmd
-    cmd = "datadreamer --device invalide_value"
-    _check_wrong_argument_choice(cmd)
-
-
-def test_invalid_annotator_size():
-    # Define the cmd
-    cmd = "datadreamer --annotator_size invalide_value"
-    _check_wrong_argument_choice(cmd)
-
-
-def test_empty_class_names():
-    # Define the cmd
-    cmd = "datadreamer --class_names []"
-    _check_wrong_value(cmd)
-
-
-def test_invalid_class_names():
-    # Define the cmd
-    cmd = "datadreamer --class_names [2, -1]"
-    _check_wrong_value(cmd)
-
-
-def test_invalid_prompts_number():
-    # Define the cmd
-    cmd = "datadreamer --prompts_number -1"
-    _check_wrong_value(cmd)
-
-
-def test_negative_conf_threshold():
-    # Define the cmd
-    cmd = "datadreamer --conf_threshold -1"
-    _check_wrong_value(cmd)
-
-
-def test_big_conf_threshold():
-    # Define the cmd
-    cmd = "datadreamer --conf_threshold 10"
-    _check_wrong_value(cmd)
-
-
-def test_negative_annotation_iou_threshold():
-    # Define the cmd
-    cmd = "datadreamer --annotation_iou_threshold -1"
-    _check_wrong_value(cmd)
-
-
-def test_big_annotation_iou_threshold():
-    # Define the cmd
-    cmd = "datadreamer --annotation_iou_threshold 10"
-    _check_wrong_value(cmd)
-
-
-def test_invalid_image_tester_patience():
-    # Define the cmd
-    cmd = "datadreamer --image_tester_patience -1"
-    _check_wrong_value(cmd)
-
-
-def test_invalid_seed():
-    # Define the cmd
-    cmd = "datadreamer --seed -1 --device cpu"
-    _check_wrong_value(cmd)
-
-
-def test_invalid_synonym_generator():
-    # Define the cmd
-    cmd = "datadreamer --device cpu --synonym_generator invalid"
-    _check_wrong_value(cmd)
-
-
-def test_invalid_lm_quantization():
-    # Define the cmd
-    cmd = "datadreamer --device cude --lm_quantization invalid"
-    _check_wrong_value(cmd)
-
-
-def test_invalid_device_lm_quantization():
-    # Define the cmd
-    cmd = "datadreamer --device cpu --lm_quantization 4bit"
-    _check_wrong_value(cmd)
-
-
-def test_invalid_batch_size_prompt():
-    # Define the cmd
-    cmd = "datadreamer --batch_size_prompt -1"
-    _check_wrong_value(cmd)
-
-
-def test_invalid_batch_size_annotation():
-    # Define the cmd
-    cmd = "datadreamer --batch_size_annotation -1"
-    _check_wrong_value(cmd)
-
-
-def test_invalid_batch_size_image():
-    # Define the cmd
-    cmd = "datadreamer --batch_size_image -1"
-    _check_wrong_value(cmd)
-
-
-def test_invalid_num_objects_range():
-    # Define the cmd
-    cmd = "datadreamer --num_objects_range 1"
-    _check_wrong_value(cmd)
-
-
-def test_many_num_objects_range():
-    # Define the cmd
-    cmd = "datadreamer --num_objects_range 1 2 3"
-    _check_wrong_value(cmd)
-
-
-def test_desc_num_objects_range():
-    # Define the cmd
-    cmd = "datadreamer --num_objects_range 3 1"
-    _check_wrong_value(cmd)
-
-
-def test_negative_num_objects_range():
-    # Define the cmd
-    cmd = "datadreamer --num_objects_range -3 1"
-    _check_wrong_value(cmd)
-
-
 # =========================================================
 # DETECTION - SIMPLE LM
 # =========================================================
 @pytest.mark.skipif(
-    total_memory < 15 or total_disk_space < 35,
-    reason="Test requires at least 15GB of RAM and 35GB of HDD",
+    total_memory < 16 or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM and 35GB of HDD",
 )
 def test_cpu_simple_sdxl_turbo_detection_pipeline():
     # Define target folder
@@ -279,8 +70,8 @@ def test_cpu_simple_sdxl_turbo_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
-    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
 )
 def test_cuda_simple_sdxl_turbo_detection_pipeline():
     # Define target folder
@@ -301,8 +92,8 @@ def test_cuda_simple_sdxl_turbo_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 55,
-    reason="Test requires GPU, at least 15GB of RAM and 55GB of HDD",
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 55,
+    reason="Test requires GPU, at least 16GB of RAM and 55GB of HDD",
 )
 def test_cuda_simple_llm_synonym_sdxl_turbo_detection_pipeline():
     # Define target folder
@@ -324,8 +115,8 @@ def test_cuda_simple_llm_synonym_sdxl_turbo_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
-    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
 )
 def test_cuda_simple_wordnet_synonym_sdxl_turbo_detection_pipeline():
     # Define target folder
@@ -347,8 +138,8 @@ def test_cuda_simple_wordnet_synonym_sdxl_turbo_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 15 or total_disk_space < 35,
-    reason="Test requires at least 15GB of RAM and 35GB of HDD",
+    total_memory < 16 or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM and 35GB of HDD",
 )
 def test_cpu_simple_sdxl_detection_pipeline():
     # Define target folder
@@ -369,8 +160,8 @@ def test_cpu_simple_sdxl_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
-    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
 )
 def test_cuda_simple_sdxl_detection_pipeline():
     # Define target folder
@@ -391,8 +182,8 @@ def test_cuda_simple_sdxl_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 15 or total_disk_space < 35,
-    reason="Test requires at least 15GB of RAM and 35GB of HDD",
+    total_memory < 16 or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM and 35GB of HDD",
 )
 def test_cpu_simple_sdxl_lightning_detection_pipeline():
     # Define target folder
@@ -413,8 +204,8 @@ def test_cpu_simple_sdxl_lightning_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
-    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
 )
 def test_cuda_simple_sdxl_lightning_detection_pipeline():
     # Define target folder
@@ -460,8 +251,8 @@ def test_cpu_lm_sdxl_turbo_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 15 or not torch.cuda.is_available() or total_disk_space < 55,
-    reason="Test requires at least 15GB of RAM, CUDA support and 55GB of HDD",
+    total_memory < 16 or not torch.cuda.is_available() or total_disk_space < 55,
+    reason="Test requires at least 16GB of RAM, CUDA support and 55GB of HDD",
 )
 def test_cuda_lm_sdxl_turbo_detection_pipeline():
     # Define target folder
@@ -527,8 +318,8 @@ def test_cpu_lm_sdxl_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 15 or not torch.cuda.is_available() or total_disk_space < 55,
-    reason="Test requires at least 15GB of RAM, CUDA support and 55GB of HDD",
+    total_memory < 16 or not torch.cuda.is_available() or total_disk_space < 55,
+    reason="Test requires at least 16GB of RAM, CUDA support and 55GB of HDD",
 )
 def test_cuda_lm_sdxl_detection_pipeline():
     # Define target folder
@@ -575,8 +366,8 @@ def test_cuda_4bit_lm_sdxl_detection_pipeline():
 # DETECTION - TinyLlama LLM
 # =========================================================
 @pytest.mark.skipif(
-    total_memory < 15 or total_disk_space < 35,
-    reason="Test requires at least 15GB of RAM and 35GB of HDD",
+    total_memory < 16 or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM and 35GB of HDD",
 )
 def test_cpu_tiny_sdxl_turbo_detection_pipeline():
     # Define target folder
@@ -597,8 +388,8 @@ def test_cpu_tiny_sdxl_turbo_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
-    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
 )
 def test_cuda_tiny_sdxl_turbo_detection_pipeline():
     # Define target folder
@@ -619,8 +410,8 @@ def test_cuda_tiny_sdxl_turbo_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 15 or total_disk_space < 35,
-    reason="Test requires at least 15GB of RAM and 35GB of HDD",
+    total_memory < 16 or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM and 35GB of HDD",
 )
 def test_cpu_tiny_sdxl_detection_pipeline():
     # Define target folder
@@ -641,8 +432,8 @@ def test_cpu_tiny_sdxl_detection_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
-    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
 )
 def test_cuda_tiny_sdxl_detection_pipeline():
     # Define target folder
@@ -666,8 +457,8 @@ def test_cuda_tiny_sdxl_detection_pipeline():
 # CLASSIFICATION - SIMPLE LM
 # =========================================================
 @pytest.mark.skipif(
-    total_memory < 15 or total_disk_space < 35,
-    reason="Test requires at least 15GB of RAM and 35GB of HDD",
+    total_memory < 16 or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM and 35GB of HDD",
 )
 def test_cpu_simple_sdxl_turbo_classification_pipeline():
     # Define target folder
@@ -690,8 +481,8 @@ def test_cpu_simple_sdxl_turbo_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
-    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
 )
 def test_cuda_simple_sdxl_turbo_classification_pipeline():
     # Define target folder
@@ -714,8 +505,8 @@ def test_cuda_simple_sdxl_turbo_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 55,
-    reason="Test requires GPU, at least 15GB of RAM and 55GB of HDD",
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 55,
+    reason="Test requires GPU, at least 16GB of RAM and 55GB of HDD",
 )
 def test_cuda_simple_llm_synonym_sdxl_turbo_classification_pipeline():
     # Define target folder
@@ -739,8 +530,8 @@ def test_cuda_simple_llm_synonym_sdxl_turbo_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
-    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
 )
 def test_cuda_simple_wordnet_synonym_sdxl_turbo_classification_pipeline():
     # Define target folder
@@ -764,8 +555,8 @@ def test_cuda_simple_wordnet_synonym_sdxl_turbo_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 15 or total_disk_space < 35,
-    reason="Test requires at least 15GB of RAM and 35GB of HDD",
+    total_memory < 16 or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM and 35GB of HDD",
 )
 def test_cpu_simple_sdxl_classification_pipeline():
     # Define target folder
@@ -788,8 +579,8 @@ def test_cpu_simple_sdxl_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
-    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
 )
 def test_cuda_simple_sdxl_classification_pipeline():
     # Define target folder
@@ -839,8 +630,8 @@ def test_cpu_lm_sdxl_turbo_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 15 or not torch.cuda.is_available() or total_disk_space < 55,
-    reason="Test requires at least 15GB of RAM, 55GB of HDD and CUDA support",
+    total_memory < 16 or not torch.cuda.is_available() or total_disk_space < 55,
+    reason="Test requires at least 16GB of RAM, 55GB of HDD and CUDA support",
 )
 def test_cuda_lm_sdxl_turbo_classification_pipeline():
     # Define target folder
@@ -912,8 +703,8 @@ def test_cpu_lm_sdxl_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 15 or not torch.cuda.is_available() or total_disk_space < 55,
-    reason="Test requires at least 15GB of RAM, CUDA support and 55GB of HDD",
+    total_memory < 16 or not torch.cuda.is_available() or total_disk_space < 55,
+    reason="Test requires at least 16GB of RAM, CUDA support and 55GB of HDD",
 )
 def test_cuda_lm_sdxl_classification_pipeline():
     # Define target folder
@@ -964,8 +755,8 @@ def test_cuda_4bit_lm_sdxl_classification_pipeline():
 # CLASSIFICATION - TinyLlama LLM
 # =========================================================
 @pytest.mark.skipif(
-    total_memory < 15 or total_disk_space < 35,
-    reason="Test requires at least 15GB of RAM and 35GB of HDD",
+    total_memory < 16 or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM and 35GB of HDD",
 )
 def test_cpu_tiny_sdxl_turbo_classification_pipeline():
     # Define target folder
@@ -988,8 +779,8 @@ def test_cpu_tiny_sdxl_turbo_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
-    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
 )
 def test_cuda_tiny_sdxl_turbo_classification_pipeline():
     # Define target folder
@@ -1012,8 +803,8 @@ def test_cuda_tiny_sdxl_turbo_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    total_memory < 15 or total_disk_space < 35,
-    reason="Test requires at least 15GB of RAM and 35GB of HDD",
+    total_memory < 16 or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM and 35GB of HDD",
 )
 def test_cpu_tiny_sdxl_classification_pipeline():
     # Define target folder
@@ -1036,8 +827,8 @@ def test_cpu_tiny_sdxl_classification_pipeline():
 
 
 @pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
-    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
 )
 def test_cuda_tiny_sdxl_classification_pipeline():
     # Define target folder
@@ -1056,83 +847,4 @@ def test_cuda_tiny_sdxl_classification_pipeline():
         f"--device cuda"
     )
     # Check the run of the pipeline
-    _check_detection_pipeline(cmd, target_folder)
-
-
-# =========================================================
-# TEST WITH CONFIG FILE
-# =========================================================
-@pytest.mark.skipif(
-    total_memory < 15 or total_disk_space < 35,
-    reason="Test requires at least 15GB of RAM and 35GB of HDD",
-)
-def test_cpu_simple_sdxl_turbo_config_detection_pipeline():
-    # Define target folder
-    target_folder = "data/data-det-cpu-simple-sdxl-turbo-config/"
-    # Define the command to run the datadreamer
-    cmd = (
-        f"datadreamer --save_dir {target_folder} "
-        f"--num_objects_range 1 2 "
-        f"--config ./sample_config.yaml "
-        f"--device cpu"
-    )
-    # Check the run of the pipeline
-    _check_detection_pipeline(cmd, target_folder)
-
-
-@pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
-    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
-)
-def test_cuda_simple_sdxl_turbo_config_detection_pipeline():
-    # Define target folder
-    target_folder = "data/data-det-cuda-simple-sdxl-turbo-config/"
-    # Define the command to run the datadreamer
-    cmd = (
-        f"datadreamer --save_dir {target_folder} "
-        f"--num_objects_range 1 2 "
-        f"--config ./sample_config.yaml "
-        f"--device cuda"
-    )
-    # Check the run of the pipeline
-    _check_detection_pipeline(cmd, target_folder)
-
-
-@pytest.mark.skipif(
-    total_memory < 15 or total_disk_space < 35,
-    reason="Test requires at least 15GB of RAM and 35GB of HDD",
-)
-def test_cpu_simple_sdxl_turbo_config_classification_pipeline():
-    # Define target folder
-    target_folder = "data/data-cls-cpu-simple-sdxl-turbo-config/"
-    # Define the command to run the datadreamer
-    cmd = (
-        f"datadreamer --task classification "
-        f"--save_dir {target_folder} "
-        f"--num_objects_range 1 2 "
-        f"--image_annotator clip "
-        f"--config ./sample_config.yaml "
-        f"--device cpu"
-    )
-    # Check the run of the pipeline
-    _check_detection_pipeline(cmd, target_folder)
-
-
-@pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 15 or total_disk_space < 35,
-    reason="Test requires GPU, at least 15GB of RAM and 35GB of HDD",
-)
-def test_cuda_simple_sdxl_turbo_config_classification_pipeline():
-    # Define target folder
-    target_folder = "data/data-cls-cuda-simple-sdxl-turbo-config/"
-    # Define the command to run the datadreamer
-    cmd = (
-        f"datadreamer --task classification "
-        f"--save_dir {target_folder} "
-        f"--num_objects_range 1 2 "
-        f"--image_annotator clip "
-        f"--config ./sample_config.yaml "
-        f"--device cuda"
-    )
-    # Check the run of the pipeline
-    _check_detection_pipeline(cmd, target_folder)
+    _check_detection_pipeline(cmd, target_folder)
\ No newline at end of file
diff --git a/tests/heavy_tests/unittests/test_image_generation.py b/tests/heavy_tests/unittests/test_image_generation.py
new file mode 100644
index 0000000..482fec2
--- /dev/null
+++ b/tests/heavy_tests/unittests/test_image_generation.py
@@ -0,0 +1,69 @@
+from __future__ import annotations
+
+from typing import Type, Union
+
+import psutil
+import pytest
+import requests
+import torch
+from PIL import Image
+
+from datadreamer.image_generation import (
+    StableDiffusionImageGenerator,
+    StableDiffusionLightningImageGenerator,
+    StableDiffusionTurboImageGenerator,
+)
+
+# Get the total memory in GB
+total_memory = psutil.virtual_memory().total / (1024**3)
+# Get the total disk space in GB
+total_disk_space = psutil.disk_usage("/").total / (1024**3)
+
+def _check_image_generator(
+    image_generator_class: Type[
+        Union[
+            StableDiffusionImageGenerator,
+            StableDiffusionTurboImageGenerator,
+            StableDiffusionLightningImageGenerator,
+        ]
+    ],
+    device: str,
+):
+    image_generator = image_generator_class(device=device)
+    # Check that the image generator is not None
+    assert image_generator is not None
+    # Generate images and check each of them
+    for generated_images_batch in image_generator.generate_images(
+        ["A photo of a cat, dog"], [["cat", "dog"]]
+    ):
+        generated_image = generated_images_batch[0]
+        assert generated_image is not None
+        assert isinstance(generated_image, Image.Image)
+
+    images = image_generator.generate_images_batch(
+        ["A photo of a cat, dog"],
+        "blurry, bad quality",
+    )
+    assert len(images) == 1
+    assert images[0] is not None
+    assert isinstance(images[0], Image.Image)
+
+    # Release the generator
+    image_generator.release(empty_cuda_cache=True if device != "cpu" else False)
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 25,
+    reason="Test requires GPU, at least 16GB of RAM and 25GB of HDD",
+)
+def test_cuda_sdxl_image_generator():
+    _check_image_generator(StableDiffusionImageGenerator, "cuda")
+
+
+@pytest.mark.skipif(
+    total_memory < 16 or total_disk_space < 25,
+    reason="Test requires at least 16GB of RAM and 25GB of HDD",
+)
+def test_cpu_sdxl_image_generator():
+    _check_image_generator(StableDiffusionImageGenerator, "cpu")
+
diff --git a/tests/heavy_tests/unittests/test_prompt_generation.py b/tests/heavy_tests/unittests/test_prompt_generation.py
new file mode 100644
index 0000000..7ffd077
--- /dev/null
+++ b/tests/heavy_tests/unittests/test_prompt_generation.py
@@ -0,0 +1,95 @@
+from __future__ import annotations
+
+import psutil
+import pytest
+import torch
+
+from datadreamer.prompt_generation.lm_prompt_generator import LMPromptGenerator
+from datadreamer.prompt_generation.lm_synonym_generator import LMSynonymGenerator
+from datadreamer.prompt_generation.simple_prompt_generator import SimplePromptGenerator
+from datadreamer.prompt_generation.tinyllama_lm_prompt_generator import (
+    TinyLlamaLMPromptGenerator,
+)
+from datadreamer.prompt_generation.wordnet_synonym_generator import (
+    WordNetSynonymGenerator,
+)
+
+# Get the total memory in GB
+total_memory = psutil.virtual_memory().total / (1024**3)
+# Get the total disk space in GB
+total_disk_space = psutil.disk_usage("/").total / (1024**3)
+
+def _check_lm_prompt_generator(
+    device: str, prompt_generator_class=LMPromptGenerator, quantization: str = "none"
+):
+    object_names = ["aeroplane", "bicycle", "bird", "boat"]
+    prompt_generator = prompt_generator_class(
+        class_names=object_names,
+        prompts_number=2,
+        device=device,
+        quantization=quantization,
+    )
+    prompts = prompt_generator.generate_prompts()
+    # Check that some prompts were generated
+    assert len(prompts) > 0
+    # Iterate through the prompts
+    for selected_objects, prompt_text in prompts:
+        # Selected objects aren't empty
+        assert len(selected_objects) > 0
+        # The selected objects are in the range
+        assert (
+            prompt_generator.num_objects_range[0]
+            <= len(selected_objects)
+            <= prompt_generator.num_objects_range[1]
+        )
+        # Check the generated text
+        assert len(prompt_text) > 0 and prompt_text.lower().startswith("a photo of")
+    prompt_generator.release(empty_cuda_cache=True if device != "cpu" else False)
+
+
+@pytest.mark.skipif(
+    total_memory < 16 or not torch.cuda.is_available() or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM, 35GB of HDD and CUDA support",
+)
+def test_cuda_lm_prompt_generator():
+    _check_lm_prompt_generator("cuda")
+
+@pytest.mark.skipif(
+    total_memory < 32 or total_disk_space < 35,
+    reason="Test requires at least 28GB of RAM and 35GB of HDD for running on CPU",
+)
+def test_cpu_lm_prompt_generator():
+    _check_lm_prompt_generator("cpu")
+
+def _check_synonym_generator(device: str, synonym_generator_class=LMSynonymGenerator):
+    synonyms_num = 3
+    generator = synonym_generator_class(synonyms_number=synonyms_num, device=device)
+    synonyms = generator.generate_synonyms_for_list(["astronaut", "cat", "dog"])
+    # Check that some synonyms were generated
+    assert len(synonyms) > 0
+    # Iterate through the synonyms
+    for word, synonym_list in synonyms.items():
+        # Check that the word is not empty
+        assert len(word) > 0
+        # Check that the synonym list is not empty
+        assert len(synonym_list) > 0
+        # Check that the synonyms are not empty
+        for synonym in synonym_list:
+            assert len(synonym) > 0
+    generator.release(empty_cuda_cache=True if device != "cpu" else False)
+
+
+@pytest.mark.skipif(
+    total_memory < 16 or not torch.cuda.is_available() or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM, 35GB of HDD and CUDA support",
+)
+def test_cuda_synonym_generator():
+    _check_synonym_generator("cuda")
+
+
+@pytest.mark.skipif(
+    total_memory < 32 or total_disk_space < 35,
+    reason="Test requires at least 28GB of RAM and 35GB of HDD for running on CPU",
+)
+def test_cpu_synonym_generator():
+    _check_synonym_generator("cpu")

From 153ac989f202341a530e5780ae358f43d472c40b Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Fri, 27 Sep 2024 00:31:34 +0000
Subject: [PATCH 23/31] style: tests formatting

---
 tests/core_tests/integration/test_pipeline.py         |  1 +
 tests/core_tests/unittests/test_image_generation.py   |  5 +++--
 tests/core_tests/unittests/test_pipeline_arguments.py |  2 +-
 tests/core_tests/unittests/test_prompt_generation.py  |  5 +++++
 tests/heavy_tests/integration/test_pipeline.py        |  4 ++--
 tests/heavy_tests/unittests/test_image_generation.py  |  3 +--
 tests/heavy_tests/unittests/test_prompt_generation.py | 10 +++-------
 7 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/tests/core_tests/integration/test_pipeline.py b/tests/core_tests/integration/test_pipeline.py
index 190c0ce..e76f916 100644
--- a/tests/core_tests/integration/test_pipeline.py
+++ b/tests/core_tests/integration/test_pipeline.py
@@ -43,6 +43,7 @@ def _check_detection_pipeline(cmd: str, target_folder: str):
         os.path.join(target_folder, "bboxes_visualization")
     ), "bboxes_visualization directory not created"
 
+
 # =========================================================
 # TEST WITH CONFIG FILE
 # =========================================================
diff --git a/tests/core_tests/unittests/test_image_generation.py b/tests/core_tests/unittests/test_image_generation.py
index 51fda42..2436f75 100644
--- a/tests/core_tests/unittests/test_image_generation.py
+++ b/tests/core_tests/unittests/test_image_generation.py
@@ -8,12 +8,12 @@
 import torch
 from PIL import Image
 
-from datadreamer.image_generation.clip_image_tester import ClipImageTester
 from datadreamer.image_generation import (
     StableDiffusionImageGenerator,
     StableDiffusionLightningImageGenerator,
     StableDiffusionTurboImageGenerator,
 )
+from datadreamer.image_generation.clip_image_tester import ClipImageTester
 
 # Get the total memory in GB
 total_memory = psutil.virtual_memory().total / (1024**3)
@@ -21,7 +21,6 @@
 total_disk_space = psutil.disk_usage("/").total / (1024**3)
 
 
-
 def _check_clip_image_tester(device: str):
     url = "https://ultralytics.com/images/bus.jpg"
     im = Image.open(requests.get(url, stream=True).raw)
@@ -64,6 +63,7 @@ def test_cuda_clip_image_tester():
 def test_cpu_clip_image_tester():
     _check_clip_image_tester("cpu")
 
+
 def _check_image_generator(
     image_generator_class: Type[
         Union[
@@ -96,6 +96,7 @@ def _check_image_generator(
     # Release the generator
     image_generator.release(empty_cuda_cache=True if device != "cpu" else False)
 
+
 @pytest.mark.skipif(
     not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 25,
     reason="Test requires GPU, at least 16GB of RAM and 25GB of HDD",
diff --git a/tests/core_tests/unittests/test_pipeline_arguments.py b/tests/core_tests/unittests/test_pipeline_arguments.py
index 198db72..f435da7 100644
--- a/tests/core_tests/unittests/test_pipeline_arguments.py
+++ b/tests/core_tests/unittests/test_pipeline_arguments.py
@@ -210,4 +210,4 @@ def test_desc_num_objects_range():
 def test_negative_num_objects_range():
     # Define the cmd
     cmd = "datadreamer --num_objects_range -3 1"
-    _check_wrong_value(cmd)
\ No newline at end of file
+    _check_wrong_value(cmd)
diff --git a/tests/core_tests/unittests/test_prompt_generation.py b/tests/core_tests/unittests/test_prompt_generation.py
index fca6fc0..f2dcd9f 100644
--- a/tests/core_tests/unittests/test_prompt_generation.py
+++ b/tests/core_tests/unittests/test_prompt_generation.py
@@ -39,6 +39,7 @@ def test_simple_prompt_generator():
         # Check the generated text
         assert prompt_text == f"A photo of a {', a '.join(selected_objects)}"
 
+
 def _check_lm_prompt_generator(
     device: str, prompt_generator_class=LMPromptGenerator, quantization: str = "none"
 ):
@@ -66,6 +67,7 @@ def _check_lm_prompt_generator(
         assert len(prompt_text) > 0 and prompt_text.lower().startswith("a photo of")
     prompt_generator.release(empty_cuda_cache=True if device != "cpu" else False)
 
+
 @pytest.mark.skipif(
     total_memory < 12 or not torch.cuda.is_available() or total_disk_space < 25,
     reason="Test requires at least 12GB of RAM, 25GB of HDD and CUDA support",
@@ -73,6 +75,7 @@ def _check_lm_prompt_generator(
 def test_cuda_4bit_lm_prompt_generator():
     _check_lm_prompt_generator("cuda", quantization="4bit")
 
+
 @pytest.mark.skipif(
     total_memory < 12 or total_disk_space < 12,
     reason="Test requires at least 12GB of RAM and 12GB of HDD for running on CPU",
@@ -80,6 +83,7 @@ def test_cuda_4bit_lm_prompt_generator():
 def test_cpu_tinyllama_lm_prompt_generator():
     _check_lm_prompt_generator("cpu", TinyLlamaLMPromptGenerator)
 
+
 @pytest.mark.skipif(
     total_memory < 8 or not torch.cuda.is_available() or total_disk_space < 12,
     reason="Test requires at least 8GB of RAM, 12GB of HDD and CUDA support",
@@ -109,6 +113,7 @@ def _check_synonym_generator(device: str, synonym_generator_class=LMSynonymGener
 def test_cpu_wordnet_synonym_generator():
     _check_synonym_generator("cpu", WordNetSynonymGenerator)
 
+
 @pytest.mark.skipif(
     not torch.cuda.is_available(),
     reason="Test requires CUDA support",
diff --git a/tests/heavy_tests/integration/test_pipeline.py b/tests/heavy_tests/integration/test_pipeline.py
index d368dac..3f48e91 100644
--- a/tests/heavy_tests/integration/test_pipeline.py
+++ b/tests/heavy_tests/integration/test_pipeline.py
@@ -8,7 +8,7 @@
 import torch
 
 # Get the total memory in GB
-total_memory = psutil.virtual_memory().total / (1024*3)
+total_memory = psutil.virtual_memory().total / (1024**3)
 # Get the total disk space in GB
 total_disk_space = psutil.disk_usage("/").total / (1024**3)
 
@@ -847,4 +847,4 @@ def test_cuda_tiny_sdxl_classification_pipeline():
         f"--device cuda"
     )
     # Check the run of the pipeline
-    _check_detection_pipeline(cmd, target_folder)
\ No newline at end of file
+    _check_detection_pipeline(cmd, target_folder)
diff --git a/tests/heavy_tests/unittests/test_image_generation.py b/tests/heavy_tests/unittests/test_image_generation.py
index 482fec2..30141cc 100644
--- a/tests/heavy_tests/unittests/test_image_generation.py
+++ b/tests/heavy_tests/unittests/test_image_generation.py
@@ -4,7 +4,6 @@
 
 import psutil
 import pytest
-import requests
 import torch
 from PIL import Image
 
@@ -19,6 +18,7 @@
 # Get the total disk space in GB
 total_disk_space = psutil.disk_usage("/").total / (1024**3)
 
+
 def _check_image_generator(
     image_generator_class: Type[
         Union[
@@ -66,4 +66,3 @@ def test_cuda_sdxl_image_generator():
 )
 def test_cpu_sdxl_image_generator():
     _check_image_generator(StableDiffusionImageGenerator, "cpu")
-
diff --git a/tests/heavy_tests/unittests/test_prompt_generation.py b/tests/heavy_tests/unittests/test_prompt_generation.py
index 7ffd077..a943f5c 100644
--- a/tests/heavy_tests/unittests/test_prompt_generation.py
+++ b/tests/heavy_tests/unittests/test_prompt_generation.py
@@ -6,19 +6,13 @@
 
 from datadreamer.prompt_generation.lm_prompt_generator import LMPromptGenerator
 from datadreamer.prompt_generation.lm_synonym_generator import LMSynonymGenerator
-from datadreamer.prompt_generation.simple_prompt_generator import SimplePromptGenerator
-from datadreamer.prompt_generation.tinyllama_lm_prompt_generator import (
-    TinyLlamaLMPromptGenerator,
-)
-from datadreamer.prompt_generation.wordnet_synonym_generator import (
-    WordNetSynonymGenerator,
-)
 
 # Get the total memory in GB
 total_memory = psutil.virtual_memory().total / (1024**3)
 # Get the total disk space in GB
 total_disk_space = psutil.disk_usage("/").total / (1024**3)
 
+
 def _check_lm_prompt_generator(
     device: str, prompt_generator_class=LMPromptGenerator, quantization: str = "none"
 ):
@@ -54,6 +48,7 @@ def _check_lm_prompt_generator(
 def test_cuda_lm_prompt_generator():
     _check_lm_prompt_generator("cuda")
 
+
 @pytest.mark.skipif(
     total_memory < 32 or total_disk_space < 35,
     reason="Test requires at least 28GB of RAM and 35GB of HDD for running on CPU",
@@ -61,6 +56,7 @@ def test_cuda_lm_prompt_generator():
 def test_cpu_lm_prompt_generator():
     _check_lm_prompt_generator("cpu")
 
+
 def _check_synonym_generator(device: str, synonym_generator_class=LMSynonymGenerator):
     synonyms_num = 3
     generator = synonym_generator_class(synonyms_number=synonyms_num, device=device)

From 280c9541f5bcd20483d2116e9f46b8642c0b4efb Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Fri, 27 Sep 2024 00:35:44 +0000
Subject: [PATCH 24/31] test: rename core tests

---
 .github/workflows/{core-tests.yaml => tests.yaml} | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename .github/workflows/{core-tests.yaml => tests.yaml} (98%)

diff --git a/.github/workflows/core-tests.yaml b/.github/workflows/tests.yaml
similarity index 98%
rename from .github/workflows/core-tests.yaml
rename to .github/workflows/tests.yaml
index 41c96a9..ae09afd 100644
--- a/.github/workflows/core-tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -1,4 +1,4 @@
-name: Core tests
+name: Tests
 
 on:
   pull_request:
@@ -6,7 +6,7 @@ on:
     paths:
       - 'datadreamer/**/**.py'
       - 'tests/core_tests/**/**.py'
-      - .github/workflows/core-tests.yaml
+      - .github/workflows/tests.yaml
   workflow_dispatch:
 
 jobs:

From b2da07ad681c5cbc3f0eab0b3b066eb2ea41a57a Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Fri, 27 Sep 2024 00:43:38 +0000
Subject: [PATCH 25/31] test: run core tests on pull to dev

---
 .github/workflows/tests.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index ae09afd..8a831bd 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -2,7 +2,7 @@ name: Tests
 
 on:
   pull_request:
-    branches: [ main ]
+    branches: [ main, dev ]
     paths:
       - 'datadreamer/**/**.py'
       - 'tests/core_tests/**/**.py'

From c4dae6e6d47a6609d9ba450ffd1c705240cb1f01 Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Fri, 27 Sep 2024 01:08:27 +0000
Subject: [PATCH 26/31] test: fix config paths

---
 .github/workflows/tests.yaml                  | 2 +-
 tests/core_tests/integration/test_pipeline.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 8a831bd..2fa2ef1 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -15,7 +15,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [buildjet-8vcpu-ubuntu-2204, windows-latest, macOS-latest]
-        version: ['3.10', '3.11']
+        version: ['3.10']
 
     runs-on: ${{ matrix.os }}
 
diff --git a/tests/core_tests/integration/test_pipeline.py b/tests/core_tests/integration/test_pipeline.py
index e76f916..397143a 100644
--- a/tests/core_tests/integration/test_pipeline.py
+++ b/tests/core_tests/integration/test_pipeline.py
@@ -58,7 +58,7 @@ def test_cpu_simple_sdxl_turbo_config_detection_pipeline():
     cmd = (
         f"datadreamer --save_dir {target_folder} "
         f"--num_objects_range 1 2 "
-        f"--config ./sample_config.yaml "
+        f"--config ./tests/core_tests/integration/sample_config.yaml "
         f"--device cpu"
     )
     # Check the run of the pipeline
@@ -76,7 +76,7 @@ def test_cuda_simple_sdxl_turbo_config_detection_pipeline():
     cmd = (
         f"datadreamer --save_dir {target_folder} "
         f"--num_objects_range 1 2 "
-        f"--config ./sample_config.yaml "
+        f"--config ./tests/core_tests/integration/sample_config.yaml "
         f"--device cuda"
     )
     # Check the run of the pipeline
@@ -96,7 +96,7 @@ def test_cpu_simple_sdxl_turbo_config_classification_pipeline():
         f"--save_dir {target_folder} "
         f"--num_objects_range 1 2 "
         f"--image_annotator clip "
-        f"--config ./sample_config.yaml "
+        f"--config ./tests/core_tests/integration/sample_config.yaml "
         f"--device cpu"
     )
     # Check the run of the pipeline
@@ -116,7 +116,7 @@ def test_cuda_simple_sdxl_turbo_config_classification_pipeline():
         f"--save_dir {target_folder} "
         f"--num_objects_range 1 2 "
         f"--image_annotator clip "
-        f"--config ./sample_config.yaml "
+        f"--config ./tests/core_tests/integration/sample_config.yaml "
         f"--device cuda"
     )
     # Check the run of the pipeline

From a256cbc10a8c61a57e465d947f6381fa59e9e192 Mon Sep 17 00:00:00 2001
From: GitHub Actions <actions@github.com>
Date: Fri, 27 Sep 2024 01:28:16 +0000
Subject: [PATCH 27/31] [Automated] Updated coverage badge

---
 media/coverage_badge.svg | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg
index 2fad913..6d24dca 100644
--- a/media/coverage_badge.svg
+++ b/media/coverage_badge.svg
@@ -15,7 +15,7 @@
     <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
         <text x="31.5" y="15" fill="#010101" fill-opacity=".3">coverage</text>
         <text x="31.5" y="14">coverage</text>
-        <text x="80" y="15" fill="#010101" fill-opacity=".3">62%</text>
-        <text x="80" y="14">62%</text>
+        <text x="80" y="15" fill="#010101" fill-opacity=".3">73%</text>
+        <text x="80" y="14">73%</text>
     </g>
 </svg>

From 9504f9351e6171c513034e49c84b396e0b52e9a2 Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Fri, 27 Sep 2024 01:46:50 +0000
Subject: [PATCH 28/31] test: update tests

---
 .github/workflows/tests.yaml                  |  7 ++-
 tests/core_tests/integration/test_pipeline.py | 49 +++++++++++++++++++
 .../heavy_tests/integration/test_pipeline.py  | 44 -----------------
 3 files changed, 54 insertions(+), 46 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 2fa2ef1..7553b16 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -15,7 +15,10 @@ jobs:
       fail-fast: false
       matrix:
         os: [buildjet-8vcpu-ubuntu-2204, windows-latest, macOS-latest]
-        version: ['3.10']
+        version: ['3.10', '3.11']
+        exclude:
+          - os: buildjet-8vcpu-ubuntu-2204
+            version: '3.11'
 
     runs-on: ${{ matrix.os }}
 
@@ -94,7 +97,7 @@ jobs:
   publish-test-results:
     name: "Publish Tests Results"
     needs: run_tests
-    runs-on: buildjet-8vcpu-ubuntu-2204
+    runs-on: ubuntu-latest
     permissions:
       checks: write
       pull-requests: write
diff --git a/tests/core_tests/integration/test_pipeline.py b/tests/core_tests/integration/test_pipeline.py
index 397143a..a6eba19 100644
--- a/tests/core_tests/integration/test_pipeline.py
+++ b/tests/core_tests/integration/test_pipeline.py
@@ -44,6 +44,55 @@ def _check_detection_pipeline(cmd: str, target_folder: str):
     ), "bboxes_visualization directory not created"
 
 
+# =========================================================
+# DETECTION - SIMPLE LM
+# =========================================================
+@pytest.mark.skipif(
+    total_memory < 16 or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+)
+def test_cpu_simple_sdxl_turbo_detection_pipeline():
+    # Define target folder
+    target_folder = "data/data-det-cpu-simple-sdxl-turbo/"
+    # Define the command to run the datadreamer
+    cmd = (
+        f"datadreamer --save_dir {target_folder} "
+        f"--class_names alien mars cat "
+        f"--prompts_number 1 "
+        f"--prompt_generator simple "
+        f"--num_objects_range 1 2 "
+        f"--image_generator sdxl-turbo "
+        f"--use_image_tester "
+        f"--synonym_generator wordnet "
+        f"--device cpu"
+    )
+    # Check the run of the pipeline
+    _check_detection_pipeline(cmd, target_folder)
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+)
+def test_cuda_simple_sdxl_turbo_detection_pipeline():
+    # Define target folder
+    target_folder = "data/data-det-cuda-simple-sdxl-turbo/"
+    # Define the command to run the datadreamer
+    cmd = (
+        f"datadreamer --save_dir {target_folder} "
+        f"--class_names alien mars cat "
+        f"--prompts_number 1 "
+        f"--prompt_generator simple "
+        f"--num_objects_range 1 2 "
+        f"--image_generator sdxl-turbo "
+        f"--use_image_tester "
+        f"--synonym_generator wordnet "
+        f"--device cuda"
+    )
+    # Check the run of the pipeline
+    _check_detection_pipeline(cmd, target_folder)
+
+
 # =========================================================
 # TEST WITH CONFIG FILE
 # =========================================================
diff --git a/tests/heavy_tests/integration/test_pipeline.py b/tests/heavy_tests/integration/test_pipeline.py
index 3f48e91..03750ea 100644
--- a/tests/heavy_tests/integration/test_pipeline.py
+++ b/tests/heavy_tests/integration/test_pipeline.py
@@ -181,50 +181,6 @@ def test_cuda_simple_sdxl_detection_pipeline():
     _check_detection_pipeline(cmd, target_folder)
 
 
-@pytest.mark.skipif(
-    total_memory < 16 or total_disk_space < 35,
-    reason="Test requires at least 16GB of RAM and 35GB of HDD",
-)
-def test_cpu_simple_sdxl_lightning_detection_pipeline():
-    # Define target folder
-    target_folder = "data/data-det-cpu-simple-sdxl-lightning/"
-    # Define the command to run the datadreamer
-    cmd = (
-        f"datadreamer --save_dir {target_folder} "
-        f"--class_names alien mars cat "
-        f"--prompts_number 1 "
-        f"--prompt_generator simple "
-        f"--num_objects_range 1 2 "
-        f"--image_generator sdxl-lightning "
-        f"--use_image_tester "
-        f"--device cpu"
-    )
-    # Check the run of the pipeline
-    _check_detection_pipeline(cmd, target_folder)
-
-
-@pytest.mark.skipif(
-    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
-    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
-)
-def test_cuda_simple_sdxl_lightning_detection_pipeline():
-    # Define target folder
-    target_folder = "data/data-det-cuda-simple-sdxl-lightning/"
-    # Define the command to run the datadreamer
-    cmd = (
-        f"datadreamer --save_dir {target_folder} "
-        f"--class_names alien mars cat "
-        f"--prompts_number 1 "
-        f"--prompt_generator simple "
-        f"--num_objects_range 1 2 "
-        f"--image_generator sdxl-lightning "
-        f"--use_image_tester "
-        f"--device cuda"
-    )
-    # Check the run of the pipeline
-    _check_detection_pipeline(cmd, target_folder)
-
-
 # =========================================================
 # DETECTION - LLM
 # =========================================================

From c16f5d744b1532f33268694d95d564af655ab8f4 Mon Sep 17 00:00:00 2001
From: GitHub Actions <actions@github.com>
Date: Fri, 27 Sep 2024 01:59:30 +0000
Subject: [PATCH 29/31] [Automated] Updated coverage badge

---
 media/coverage_badge.svg | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg
index 6d24dca..2fad913 100644
--- a/media/coverage_badge.svg
+++ b/media/coverage_badge.svg
@@ -15,7 +15,7 @@
     <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
         <text x="31.5" y="15" fill="#010101" fill-opacity=".3">coverage</text>
         <text x="31.5" y="14">coverage</text>
-        <text x="80" y="15" fill="#010101" fill-opacity=".3">73%</text>
-        <text x="80" y="14">73%</text>
+        <text x="80" y="15" fill="#010101" fill-opacity=".3">62%</text>
+        <text x="80" y="14">62%</text>
     </g>
 </svg>

From ba9fb3bda857d017aa65bbbd9e52c107bce09ebb Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Fri, 27 Sep 2024 12:58:25 +0000
Subject: [PATCH 30/31] test: run core tests on pr to main

---
 .github/workflows/tests.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 7553b16..6f964ac 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -2,7 +2,7 @@ name: Tests
 
 on:
   pull_request:
-    branches: [ main, dev ]
+    branches: [ main ]
     paths:
       - 'datadreamer/**/**.py'
       - 'tests/core_tests/**/**.py'

From 8de4fe9cc9bb3fb389a83aa141cb4d65349b681b Mon Sep 17 00:00:00 2001
From: Nikita Sokovnin <nikitoos67@gmail.com>
Date: Fri, 27 Sep 2024 12:59:46 +0000
Subject: [PATCH 31/31] test: rename heavy test scripts

---
 .../integration/{test_pipeline.py => test_pipeline_heavy.py}      | 0
 .../{test_image_generation.py => test_image_generation_heavy.py}  | 0
 ...{test_prompt_generation.py => test_prompt_generation_heavy.py} | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename tests/heavy_tests/integration/{test_pipeline.py => test_pipeline_heavy.py} (100%)
 rename tests/heavy_tests/unittests/{test_image_generation.py => test_image_generation_heavy.py} (100%)
 rename tests/heavy_tests/unittests/{test_prompt_generation.py => test_prompt_generation_heavy.py} (100%)

diff --git a/tests/heavy_tests/integration/test_pipeline.py b/tests/heavy_tests/integration/test_pipeline_heavy.py
similarity index 100%
rename from tests/heavy_tests/integration/test_pipeline.py
rename to tests/heavy_tests/integration/test_pipeline_heavy.py
diff --git a/tests/heavy_tests/unittests/test_image_generation.py b/tests/heavy_tests/unittests/test_image_generation_heavy.py
similarity index 100%
rename from tests/heavy_tests/unittests/test_image_generation.py
rename to tests/heavy_tests/unittests/test_image_generation_heavy.py
diff --git a/tests/heavy_tests/unittests/test_prompt_generation.py b/tests/heavy_tests/unittests/test_prompt_generation_heavy.py
similarity index 100%
rename from tests/heavy_tests/unittests/test_prompt_generation.py
rename to tests/heavy_tests/unittests/test_prompt_generation_heavy.py