From e8a9e116a6f894671caa394f528c55f91cddf954 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Honza=20=C4=8Cuhel?= <79118988+HonzaCuhel@users.noreply.github.com> Date: Fri, 23 Feb 2024 15:15:37 +0100 Subject: [PATCH] Batch annotation (#35) * Add batch annotation * Update docs & add test & fix batched annotation * Change default batch annotation * Fix annotation tests * Fix tests * [Automated] Updated coverage badge * Update annotation example & docstrings * Fix formatting * Fix docstring * [Automated] Updated coverage badge * refactor: replace annotate() with annotate_batch() * feature: replace owlv2 resize --------- Co-authored-by: Jan Cuhel Co-authored-by: GitHub Actions Co-authored-by: Nikita Sokovnin --- README.md | 1 + .../dataset_annotation/image_annotator.py | 4 +- .../dataset_annotation/owlv2_annotator.py | 211 +++++++++++++----- datadreamer/dataset_annotation/utils.py | 4 +- .../generate_dataset_from_scratch.py | 116 +++++----- examples/image_annotation_example.py | 8 +- ..._by_step_dataset_generation_pipeline.ipynb | 122 +++++----- media/coverage_badge.svg | 4 +- tests/integration/test_pipeline.py | 6 + tests/unittests/test_annotators.py | 23 +- 10 files changed, 306 insertions(+), 193 deletions(-) diff --git a/README.md b/README.md index db3450b..2e47f38 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,7 @@ datadreamer --save_dir --class_names --prompts_number List[dict[str, torch.Tensor]]: + """Generates annotations for the given images and prompts. Args: - image: The image to be annotated. + images: The images to be annotated. + prompts: Prompts to guide the annotation. + conf_threshold (float, optional): Confidence threshold for the annotations. Defaults to 0.1. + + Returns: + dict: A dictionary containing the annotations for the images. + """ + n = len(images) + batched_prompts = [prompts] * n + target_sizes = torch.Tensor(images[0].size[::-1]).repeat((n, 1)).to(self.device) + + # resize the images to the model's input size + images = [images[i].resize((960, 960)) for i in range(n)] + inputs = self.processor( + text=batched_prompts, images=images, return_tensors="pt" + ).to(self.device) + with torch.no_grad(): + outputs = self.model(**inputs) + # print(outputs) + preds = self.processor.post_process_object_detection( + outputs=outputs, target_sizes=target_sizes, threshold=conf_threshold + ) + + return preds + + def _get_annotations( + self, + pred: dict[str, torch.Tensor], + use_tta: bool, + img_dim: int, + synonym_dict: dict[str, List[str]] | None, + synonym_dict_rev: dict[int, int] | None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Extracts the annotations from the predictions. + + Args: + pred: The predictions from the model. + use_tta (bool): Flag to whether the test-time augmentation was applied. + img_dim (int): The dimension of the image. + synonym_dict (dict): Dictionary for handling synonyms in labels. + synonym_dict_rev (dict): Dictionary for handling synonyms in labels. + + Returns: + tuple: A tuple containing the final bounding boxes, scores, and labels for the annotations. 
+ """ + + boxes, scores, labels = ( + pred["boxes"], + pred["scores"], + pred["labels"], + ) + # Flip boxes back if using TTA + if use_tta: + boxes[:, [0, 2]] = img_dim - boxes[:, [2, 0]] + + if synonym_dict is not None: + labels = torch.tensor([synonym_dict_rev[label.item()] for label in labels]) + + return boxes, scores, labels + + def annotate_batch( + self, + images: List[PIL.Image.Image], + prompts: List[str], + conf_threshold: float = 0.1, + use_tta: bool = False, + synonym_dict: dict[str, List[str]] | None = None, + ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]: + """Annotates images using the OWLv2 model. + + Args: + images: The images to be annotated. prompts: Prompts to guide the annotation. conf_threshold (float, optional): Confidence threshold for the annotations. Defaults to 0.1. use_tta (bool, optional): Flag to apply test-time augmentation. Defaults to False. @@ -75,9 +153,7 @@ def annotate( tuple: A tuple containing the final bounding boxes, scores, and labels for the annotations. """ if use_tta: - augmented_images = apply_tta(image) - else: - augmented_images = [image] + augmented_images = [apply_tta(image)[0] for image in images] if synonym_dict is not None: prompts_syn = [] @@ -93,69 +169,88 @@ def annotate( synonym_dict_rev[prompts_syn.index(v)] = prompts.index(key) prompts = prompts_syn - all_boxes = [] - all_scores = [] - all_labels = [] + preds = self._generate_annotations(images, prompts, conf_threshold) + if use_tta: + augmented_preds = self._generate_annotations( + augmented_images, prompts, conf_threshold + ) + else: + augmented_preds = [None] * len(images) - target_sizes = torch.Tensor([augmented_images[0].size[::-1]]).to(self.device) + final_boxes = [] + final_scores = [] + final_labels = [] - for aug_image in augmented_images: - inputs = self.processor( - text=prompts, images=aug_image, return_tensors="pt" - ).to(self.device) - with torch.no_grad(): - outputs = self.model(**inputs) - # print(outputs) - preds = self.processor.post_process_object_detection( - outputs=outputs, target_sizes=target_sizes, threshold=conf_threshold + for i, (pred, aug_pred) in enumerate(zip(preds, augmented_preds)): + boxes, scores, labels = self._get_annotations( + pred, + False, + images[i].size[0], + synonym_dict, + synonym_dict_rev if synonym_dict is not None else None, ) - boxes, scores, labels = ( - preds[0]["boxes"], - preds[0]["scores"], - preds[0]["labels"], - ) - # Flip boxes back if using TTA - if use_tta and len(all_boxes) == 1: - boxes[:, [0, 2]] = image.size[0] - boxes[:, [2, 0]] + all_boxes = [boxes.to("cpu")] + all_scores = [scores.to("cpu")] + all_labels = [labels.to("cpu")] - if synonym_dict is not None: - labels = torch.tensor( - [synonym_dict_rev[label.item()] for label in labels] + # Flip boxes back if using TTA + if use_tta: + aug_boxes, aug_scores, aug_labels = self._get_annotations( + aug_pred, + True, + images[i].size[0], + synonym_dict, + synonym_dict_rev if synonym_dict is not None else None, ) - all_boxes.append(boxes.to("cpu")) - all_scores.append(scores.to("cpu")) - all_labels.append(labels.to("cpu")) + all_boxes.append(aug_boxes.to("cpu")) + all_scores.append(aug_scores.to("cpu")) + all_labels.append(aug_labels.to("cpu")) - # Convert list of tensors to a single tensor for NMS - all_boxes_cat = torch.cat(all_boxes) - all_scores_cat = torch.cat(all_scores) - all_labels_cat = torch.cat(all_labels) + one_hot_labels = torch.nn.functional.one_hot( + torch.cat(all_labels), num_classes=len(prompts) + ) - one_hot_labels = 
torch.nn.functional.one_hot( - all_labels_cat, num_classes=len(prompts) - ) + # Apply NMS + # transform predictions to shape [N, 5 + num_classes], N is the number of bboxes for nms function + all_boxes_cat = torch.cat( + ( + torch.cat(all_boxes), + torch.cat(all_scores).unsqueeze(-1), + one_hot_labels, + ), + dim=1, + ) - # Apply NMS - # transform predictions to shape [N, 5 + num_classes], N is the number of bboxes for nms function - all_boxes_cat = torch.cat( - (all_boxes_cat, all_scores_cat.unsqueeze(-1), one_hot_labels), - dim=1, - ) + # output is a list of detections, each item is one tensor with shape (num_boxes, 6), 6 is for [xyxy, conf, cls]. + output = non_max_suppression( + all_boxes_cat.unsqueeze(0), conf_thres=conf_threshold, iou_thres=0.2 + ) - # output is a list of detections, each item is one tensor with shape (num_boxes, 6), 6 is for [xyxy, conf, cls]. - output = non_max_suppression( - all_boxes_cat.unsqueeze(0), conf_thres=conf_threshold, iou_thres=0.2 - ) + output_boxes = output[0][:, :4] + output_scores = output[0][:, 4] + output_local_labels = output[0][:, 5].long() - final_boxes = output[0][:, :4] - final_scores = output[0][:, 4] - final_labels = output[0][:, 5].long() + final_boxes.append( + output_boxes.detach().cpu().numpy() + if not isinstance(output_boxes, np.ndarray) + else output_boxes + ) + final_scores.append( + output_scores.detach().cpu().numpy() + if not isinstance(output_scores, np.ndarray) + else output_scores + ) + final_labels.append( + output_local_labels.detach().cpu().numpy() + if not isinstance(output_local_labels, np.ndarray) + else output_local_labels + ) return final_boxes, final_scores, final_labels - def release(self, empty_cuda_cache=False) -> None: + def release(self, empty_cuda_cache: bool = False) -> None: """Releases the model and optionally empties the CUDA cache. Args: diff --git a/datadreamer/dataset_annotation/utils.py b/datadreamer/dataset_annotation/utils.py index acf9734..2b0ae88 100644 --- a/datadreamer/dataset_annotation/utils.py +++ b/datadreamer/dataset_annotation/utils.py @@ -8,7 +8,7 @@ def apply_tta(image): image: The image to be augmented. Returns: - list: A list of augmented images, including the original and transformed versions. + list: A list of augmented images. Note: Currently, only horizontal flip is enabled. 
Additional transformations like @@ -16,7 +16,7 @@ def apply_tta(image): """ tta_transforms = [ # Original image - transforms.Compose([]), + # transforms.Compose([]), # Horizontal Flip transforms.Compose([transforms.RandomHorizontalFlip(p=1)]), # Vertical Flip diff --git a/datadreamer/pipelines/generate_dataset_from_scratch.py b/datadreamer/pipelines/generate_dataset_from_scratch.py index d106f60..db52cfb 100644 --- a/datadreamer/pipelines/generate_dataset_from_scratch.py +++ b/datadreamer/pipelines/generate_dataset_from_scratch.py @@ -145,6 +145,13 @@ def parse_args(): help="Batch size for prompt generation", ) + parser.add_argument( + "--batch_size_annotation", + type=int, + default=1, + help="Batch size for annotation", + ) + parser.add_argument( "--batch_size_image", type=int, @@ -228,6 +235,10 @@ def check_args(args): if args.batch_size_prompt < 1: raise ValueError("--batch_size_prompt must be a positive integer") + # Check batch_size_prompt + if args.batch_size_annotation < 1: + raise ValueError("--batch_size_annotation must be a positive integer") + # Check batch_size_image if args.batch_size_image < 1: raise ValueError("--batch_size_image must be a positive integer") @@ -366,69 +377,66 @@ def main(): scores_list = [] labels_list = [] - for i, image_path in tqdm( - enumerate(image_paths), + # Split image_paths into batches + image_batches = [ + image_paths[i : i + args.batch_size_annotation] + for i in range(0, len(image_paths), args.batch_size_annotation) + ] + + for i, image_batch in tqdm( + enumerate(image_batches), desc="Annotating images", - total=len(image_paths), + total=len(image_batches), ): - image = Image.open(image_path) - boxes, scores, local_labels = annotator.annotate( - image, + images = [Image.open(image_path) for image_path in image_batch] + boxes_batch, scores_batch, local_labels_batch = annotator.annotate_batch( + images, args.class_names, conf_threshold=args.conf_threshold, use_tta=args.use_tta, synonym_dict=synonym_dict, ) - # Convert to numpy arrays - boxes = ( - boxes.detach().cpu().numpy() - if not isinstance(boxes, np.ndarray) - else boxes - ) - scores = ( - scores.detach().cpu().numpy() - if not isinstance(scores, np.ndarray) - else scores - ) - local_labels = ( - local_labels - if isinstance(local_labels, np.ndarray) - else local_labels.detach().cpu().numpy() - ) - - boxes_list.append(boxes) - scores_list.append(scores) - labels = [] - # Save bbox visualizations - fig, ax = plt.subplots(1) - ax.imshow(image) - for box, score, label in zip(boxes, scores, local_labels): - labels.append(label) - x1, y1, x2, y2 = box - rect = patches.Rectangle( - (x1, y1), - x2 - x1, - y2 - y1, - linewidth=2, - edgecolor="r", - facecolor="none", - ) - ax.add_patch(rect) - label_text = args.class_names[label] - plt.text( - x1, - y1, - f"{label_text} {score:.2f}", - bbox=dict(facecolor="yellow", alpha=0.5), + boxes_list.extend(boxes_batch) + scores_list.extend(scores_batch) + + for j, image in enumerate(images): + labels = [] + # Save bbox visualizations + fig, ax = plt.subplots(1) + ax.imshow(image) + for box, score, label in zip( + boxes_batch[j], scores_batch[j], local_labels_batch[j] + ): + labels.append(label) + x1, y1, x2, y2 = box + rect = patches.Rectangle( + (x1, y1), + x2 - x1, + y2 - y1, + linewidth=2, + edgecolor="r", + facecolor="none", + ) + ax.add_patch(rect) + label_text = args.class_names[label] + plt.text( + x1, + y1, + f"{label_text} {score:.2f}", + bbox=dict(facecolor="yellow", alpha=0.5), + ) + # Add prompt text as title + plt.title(generated_prompts[i * 
args.batch_size_annotation + j][1]) + + labels_list.append(np.array(labels)) + + plt.savefig( + os.path.join( + bbox_dir, f"bbox_{i * args.batch_size_annotation + j}.jpg" + ) ) - # Add prompt text as title - plt.title(generated_prompts[i][1]) - - labels_list.append(np.array(labels)) - - plt.savefig(os.path.join(bbox_dir, f"bbox_{i}.jpg")) - plt.close() + plt.close() # Save annotations as JSON files save_det_annotations_to_json( diff --git a/examples/image_annotation_example.py b/examples/image_annotation_example.py index c1ce649..411ed20 100644 --- a/examples/image_annotation_example.py +++ b/examples/image_annotation_example.py @@ -8,7 +8,7 @@ # Initialize the OWLv2Annotator annotator = OWLv2Annotator( seed=42, - device="cuda", # Use "cuda" for GPU or "cpu" for CPU + device="cpu", # Use "cuda" for GPU or "cpu" for CPU ) # Load your image @@ -22,10 +22,12 @@ prompts = list(class_map.keys()) # Perform object detection -boxes, scores, labels = annotator.annotate( - image, prompts, conf_threshold=0.15, use_tta=True +boxes, scores, labels = annotator.annotate_batch( + [image], prompts, conf_threshold=0.15, use_tta=True ) +boxes, scores, labels = boxes[0], scores[0], labels[0] + # Convert to numpy arrays if not isinstance(boxes, np.ndarray): boxes = boxes.detach().cpu().numpy() diff --git a/examples/step_by_step_dataset_generation_pipeline.ipynb b/examples/step_by_step_dataset_generation_pipeline.ipynb index de29456..b3466a1 100644 --- a/examples/step_by_step_dataset_generation_pipeline.ipynb +++ b/examples/step_by_step_dataset_generation_pipeline.ipynb @@ -2,23 +2,20 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-02-20 14:37:33.592243: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2024-02-20 14:37:33.645672: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", - "2024-02-20 14:37:33.645721: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", - "2024-02-20 14:37:33.647238: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2024-02-20 14:37:33.655817: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2024-02-20 14:37:34.821585: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" - ] - } - ], + "outputs": [], + "source": [ + "!pip install datadreamer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], "source": [ "import matplotlib.patches as patches\n", "import matplotlib.pyplot as plt\n", @@ -39,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -52,7 +49,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "35bb357df74b426d8811d38f375c4a84", + "model_id": "14dc4004b7d14980b1b2bf3346ef64c6", "version_major": 2, "version_minor": 0 }, @@ -84,16 +81,16 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Generating prompts...: 70%|███████ | 7/10 [00:23<00:09, 3.16s/it]/opt/conda/lib/python3.11/site-packages/transformers/pipelines/base.py:1123: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n", + "Generating prompts...: 70%|███████ | 7/10 [00:23<00:09, 3.17s/it]/opt/conda/lib/python3.11/site-packages/transformers/pipelines/base.py:1157: UserWarning: You seem to be using the pipelines sequentially on GPU. 
In order to maximize efficiency please use a dataset\n", " warnings.warn(\n", - "Generating prompts...: 100%|██████████| 10/10 [00:35<00:00, 3.53s/it]" + "Generating prompts...: 100%|██████████| 10/10 [00:35<00:00, 3.54s/it]" ] }, { @@ -119,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -135,7 +132,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -148,7 +145,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e25d658ea3ae4fe18d045879a0b92a9b", + "model_id": "91eba34a2c3140bf9e5502a1d09c75d8", "version_major": 2, "version_minor": 0 }, @@ -172,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -182,7 +179,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -195,7 +192,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e6879668be6f4c1aa00fa45076a79cf6", + "model_id": "9f69944f168940e6a9db2cb4ceab9acf", "version_major": 2, "version_minor": 0 }, @@ -210,13 +207,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "Generating images: 10%|█ | 1/10 [00:11<01:42, 11.40s/it]" + "Generating images: 10%|█ | 1/10 [01:45<15:47, 105.22s/it]" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2db33d6d57914c808859e1bbec22ba59", + "model_id": "a2160b68d9aa4df2bc36a669f3e03201", "version_major": 2, "version_minor": 0 }, @@ -231,13 +228,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "Generating images: 20%|██ | 2/10 [00:16<01:00, 7.57s/it]" + "Generating images: 20%|██ | 2/10 [01:50<06:10, 46.36s/it] " ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7d6228e28ce246d18069cef6f949932a", + "model_id": "948e56dc4e164492bbd7225f12afb2c9", "version_major": 2, "version_minor": 0 }, @@ -252,13 +249,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "Generating images: 30%|███ | 3/10 [00:20<00:42, 6.00s/it]" + "Generating images: 30%|███ | 3/10 [01:54<03:10, 27.19s/it]" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1cb852722f4c4ca98a446f204fee9a22", + "model_id": "b5b5222207274b10b1ba75469fec282b", "version_major": 2, "version_minor": 0 }, @@ -273,13 +270,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "Generating images: 40%|████ | 4/10 [00:24<00:31, 5.28s/it]" + "Generating images: 40%|████ | 4/10 [01:59<01:48, 18.14s/it]" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5dc49c660c8145918d395e5ba3ab481c", + "model_id": "6566828bc2a74385991dfd82efe50dd6", "version_major": 2, "version_minor": 0 }, @@ -294,13 +291,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "Generating images: 50%|█████ | 5/10 [00:28<00:24, 4.88s/it]" + "Generating images: 50%|█████ | 5/10 [02:03<01:05, 13.09s/it]" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9007293187954f6eae3c88f81acb7d84", + "model_id": "e047acdff8e54df5886790251dfb33fa", "version_major": 2, "version_minor": 0 }, @@ -315,13 +312,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "Generating images: 60%|██████ | 6/10 [00:32<00:18, 4.59s/it]" + "Generating images: 60%|██████ | 6/10 [02:07<00:40, 10.05s/it]" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1488a0b44c90479e8efa63981340ad9c", + 
"model_id": "a1b80c1feaaf4ec4952cc7cdb2464fd5", "version_major": 2, "version_minor": 0 }, @@ -336,13 +333,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "Generating images: 70%|███████ | 7/10 [00:36<00:13, 4.41s/it]" + "Generating images: 70%|███████ | 7/10 [02:11<00:24, 8.13s/it]" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2382da1c5de848dd8c4db0200448cdf3", + "model_id": "c09c5adc383e4e82b968caf1a6a9cc37", "version_major": 2, "version_minor": 0 }, @@ -357,13 +354,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "Generating images: 80%|████████ | 8/10 [00:40<00:08, 4.31s/it]" + "Generating images: 80%|████████ | 8/10 [02:15<00:13, 6.85s/it]" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1d466fe6bb9d4e3fbe1bafd339bfbfdc", + "model_id": "1febee5bc5db4bee90c4aff3924942a1", "version_major": 2, "version_minor": 0 }, @@ -378,13 +375,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "Generating images: 90%|█████████ | 9/10 [00:44<00:04, 4.21s/it]" + "Generating images: 90%|█████████ | 9/10 [02:19<00:06, 6.00s/it]" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a963d42464434619bb883fa16de2e899", + "model_id": "79404d67bd804f0a927eb9c19098919c", "version_major": 2, "version_minor": 0 }, @@ -399,7 +396,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Generating images: 100%|██████████| 10/10 [00:48<00:00, 4.89s/it]\n" + "Generating images: 100%|██████████| 10/10 [02:24<00:00, 14.40s/it]\n" ] } ], @@ -412,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -420,7 +417,7 @@ "output_type": "stream", "text": [ "(['aeroplane'], 'A photo of an aeroplane soaring over a cityscape at sunset, casting a golden glow over the rooftops and creating a stunning backdrop for the bustling metropolis below.')\n", - "\n" + "\n" ] }, { @@ -438,7 +435,7 @@ "output_type": "stream", "text": [ "(['car', 'aeroplane'], 'A photo of a car and a helicopter in the city skyline, showing the coexistence of man-made machines in urban life.')\n", - "\n" + "\n" ] }, { @@ -456,7 +453,7 @@ "output_type": "stream", "text": [ "(['unicorn'], 'A photo of majestic unicorns frolicking in a lush green meadow, surrounded by vibrant wildflowers and the clear blue sky.')\n", - "\n" + "\n" ] }, { @@ -474,7 +471,7 @@ "output_type": "stream", "text": [ "(['car'], 'A photo of car: A rusty old car sits abandoned in a fields, surrounded by towering wheat stalks. The cars doors are thrown open, revealing a worn-out interior. The scene is bathed in the golden light of the setting sun, casting shadows across the field.')\n", - "\n" + "\n" ] }, { @@ -492,7 +489,7 @@ "output_type": "stream", "text": [ "(['aeroplane'], 'A photo of an aeroplane soaring over a beautiful sunset and a bustling city.')\n", - "\n" + "\n" ] }, { @@ -510,7 +507,7 @@ "output_type": "stream", "text": [ "(['unicorn'], 'A photo of unicorns grazing in a serene meadow, their ethereal beauty and pureness illuminating the scene.')\n", - "\n" + "\n" ] }, { @@ -528,7 +525,7 @@ "output_type": "stream", "text": [ "(['aeroplane', 'unicorn'], 'A photo of an aeroplane and a unicorn soaring above the clouds in the sunset - A serene image of two majestic creatures, gracefully flying together in harmony.')\n", - "\n" + "\n" ] }, { @@ -546,7 +543,7 @@ "output_type": "stream", "text": [ "(['car'], 'A photo of a car. 
A sleek, red sports car speeds down a winding mountain road, surrounded by the breathtaking views of a scenic landscape.')\n", - "\n" + "\n" ] }, { @@ -564,7 +561,7 @@ "output_type": "stream", "text": [ "(['aeroplane', 'car'], 'A photo of aeroplane flying above a busy city, soaring high above the buildings and the cars below.')\n", - "\n" + "\n" ] }, { @@ -582,7 +579,7 @@ "output_type": "stream", "text": [ "(['person', 'aeroplane'], 'A photo of a person waving goodbye to an aeroplane, as they bid farewell to a loved one who has left for a foreign land. The image captures the bittersweet moment of separation and the longing for a loved one to return, set against the backdrop of an aeroplane.')\n", - "\n" + "\n" ] }, { @@ -608,7 +605,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -617,7 +614,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -638,7 +635,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -837,7 +834,8 @@ " prompts = prompt_objs\n", "\n", " # Perform object detection\n", - " boxes, scores, labels = annotator.annotate(image, prompts, conf_threshold=0.2, use_tta=True)\n", + " boxes_batch, scores_batch, labels_batch = annotator.annotate_batch([image], prompts, conf_threshold=0.2, use_tta=True)\n", + " boxes, scores, labels = boxes_batch[0], scores_batch[0], labels_batch[0]\n", "\n", " # Convert to numpy arrays\n", " if not isinstance(boxes, np.ndarray):\n", @@ -905,7 +903,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg index d7667be..1581a9a 100644 --- a/media/coverage_badge.svg +++ b/media/coverage_badge.svg @@ -15,7 +15,7 @@ coverage coverage - 47% - 47% + 48% + 48% diff --git a/tests/integration/test_pipeline.py b/tests/integration/test_pipeline.py index 06a4e6b..2a5cdc8 100644 --- a/tests/integration/test_pipeline.py +++ b/tests/integration/test_pipeline.py @@ -168,6 +168,12 @@ def test_invalid_batch_size_prompt(): _check_wrong_value(cmd) +def test_invalid_batch_size_annotation(): + # Define the cmd + cmd = "datadreamer --batch_size_annotation -1" + _check_wrong_value(cmd) + + def test_invalid_batch_size_image(): # Define the cmd cmd = "datadreamer --batch_size_image -1" diff --git a/tests/unittests/test_annotators.py b/tests/unittests/test_annotators.py index 0926d85..638e35b 100644 --- a/tests/unittests/test_annotators.py +++ b/tests/unittests/test_annotators.py @@ -1,3 +1,4 @@ +import numpy as np import psutil import pytest import requests @@ -14,23 +15,25 @@ def _check_owlv2_annotator(device: str): url = "https://ultralytics.com/images/bus.jpg" im = Image.open(requests.get(url, stream=True).raw) annotator = OWLv2Annotator(device=device) - final_boxes, final_scores, final_labels = annotator.annotate(im, ["bus", "people"]) + final_boxes, final_scores, final_labels = annotator.annotate_batch( + [im], ["bus", "people"] + ) # Assert that the boxes, scores and labels are tensors - assert type(final_boxes) == torch.Tensor - assert type(final_scores) == torch.Tensor - assert type(final_labels) == torch.Tensor + assert isinstance(final_boxes, list) and len(final_boxes) == 1 + assert isinstance(final_scores, list) and len(final_scores) == 1 + assert isinstance(final_labels, list) and len(final_labels) == 1 # Get 
the number of objects detected
-    num_objects = final_boxes.shape[0]
+    num_objects = final_boxes[0].shape[0]
     # Check that the boxes has correct shape
-    assert final_boxes.shape == (num_objects, 4)
+    assert final_boxes[0].shape == (num_objects, 4)
     # Check that the scores has correct shape
-    assert final_scores.shape == (num_objects,)
+    assert final_scores[0].shape == (num_objects,)
     # Check that the labels has correct shape
-    assert final_labels.shape == (num_objects,)
+    assert final_labels[0].shape == (num_objects,)
     # Check that the scores are not zero
-    assert torch.all(final_scores > 0)
+    assert np.all(final_scores[0] > 0)
     # Check that the labels are bigger or equal to zero
-    assert torch.all(final_labels >= 0)
+    assert np.all(final_labels[0] >= 0)

 @pytest.mark.skipif(
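
A minimal usage sketch of the batched annotation API introduced by this patch, assembled from the calls shown in examples/image_annotation_example.py and tests/unittests/test_annotators.py above. The import path and the image file names are assumptions for illustration; the constructor, annotate_batch(), and release() calls follow the signatures added in owlv2_annotator.py.

from PIL import Image

# Import path assumed from the module layout in this patch.
from datadreamer.dataset_annotation import OWLv2Annotator

# Mirrors examples/image_annotation_example.py: use "cuda" for GPU or "cpu" for CPU.
annotator = OWLv2Annotator(seed=42, device="cpu")

# Placeholder file names; any RGB images work.
image_paths = ["image_0.jpg", "image_1.jpg"]
images = [Image.open(p) for p in image_paths]
prompts = ["bus", "people"]

# annotate_batch() replaces the old per-image annotate(): it takes a list of
# images and returns three lists with one numpy array per input image.
boxes_batch, scores_batch, labels_batch = annotator.annotate_batch(
    images, prompts, conf_threshold=0.15, use_tta=False
)

for path, boxes, scores, labels in zip(image_paths, boxes_batch, scores_batch, labels_batch):
    # boxes: (N, 4) xyxy pixel coordinates; scores: (N,); labels: (N,) indices into `prompts`.
    for box, score, label in zip(boxes, scores, labels):
        print(path, prompts[int(label)], round(float(score), 2), box.tolist())

annotator.release(empty_cuda_cache=False)

On the command line, the same batching is controlled by the new --batch_size_annotation option added to generate_dataset_from_scratch.py; it defaults to 1 and must be a positive integer.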