From 4dd5ab3e2585baef339a870f181a4bba1ff8e9dd Mon Sep 17 00:00:00 2001
From: Christina Dionysio
Date: Fri, 7 Feb 2025 18:03:03 +0100
Subject: [PATCH] [SYSTEMDS-3830] Add join operator to Scuro

This patch adds a new join operator to Scuro. The join operation takes two
modalities and a join condition as input and joins the two modalities on
their common dimension (temporal for now). The patch introduces two new
modality classes (JoinedModality and JoinedTransformedModality) and the
ability to apply new representations on top of a joined modality. In the
future, the join operator will also serve as a simple alignment operator by
joining two modalities at a given offset.

Closes #2220
---
 .github/workflows/python.yml                  |   6 +-
 src/main/python/systemds/__init__.py          |   5 +-
 .../systemds/scuro/dataloader/audio_loader.py |   9 +-
 .../systemds/scuro/dataloader/base_loader.py  |  55 +++-
 .../systemds/scuro/dataloader/json_loader.py  |   6 +-
 .../systemds/scuro/dataloader/text_loader.py  |   5 +-
 .../systemds/scuro/dataloader/video_loader.py |  21 +-
 .../python/systemds/scuro/modality/joined.py  | 274 ++++++++++++++++++
 .../scuro/modality/joined_transformed.py      |  70 +++++
 .../systemds/scuro/modality/modality.py       |  79 ++++-
 .../systemds/scuro/modality/transformed.py    |  54 +++-
 .../python/systemds/scuro/modality/type.py    |  93 +++++-
 .../scuro/modality/unimodal_modality.py       |  56 +++-
 .../scuro/representations/aggregate.py        |  51 ++++
 .../systemds/scuro/representations/bert.py    |  29 +-
 .../systemds/scuro/representations/bow.py     |  11 +-
 .../systemds/scuro/representations/glove.py   |   5 +-
 .../systemds/scuro/representations/lstm.py    |   6 +-
 .../scuro/representations/mel_spectrogram.py  |  50 ++--
 .../systemds/scuro/representations/resnet.py  | 126 +++++---
 .../systemds/scuro/representations/tfidf.py   |  12 +-
 .../systemds/scuro/representations/window.py  |  49 ++++
 .../scuro/representations/word2vec.py         |  19 +-
 .../python/systemds/scuro/utils/__init__.py   |  20 ++
 .../systemds/scuro/utils/schema_helpers.py    |  43 +++
 src/main/python/tests/scuro/data_generator.py |  56 +++-
 .../python/tests/scuro/test_data_loaders.py   |  63 ++--
 src/main/python/tests/scuro/test_dr_search.py |  58 ++--
 .../tests/scuro/test_multimodal_join.py       | 135 +++++++++
 .../scuro/test_unimodal_representations.py    | 120 ++++++++
 30 files changed, 1362 insertions(+), 224 deletions(-)
 create mode 100644 src/main/python/systemds/scuro/modality/joined.py
 create mode 100644 src/main/python/systemds/scuro/modality/joined_transformed.py
 create mode 100644 src/main/python/systemds/scuro/representations/aggregate.py
 create mode 100644 src/main/python/systemds/scuro/representations/window.py
 create mode 100644 src/main/python/systemds/scuro/utils/__init__.py
 create mode 100644 src/main/python/systemds/scuro/utils/schema_helpers.py
 create mode 100644 src/main/python/tests/scuro/test_multimodal_join.py
 create mode 100644 src/main/python/tests/scuro/test_unimodal_representations.py

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 9f39f07ecb7..54da49f8fb5 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -114,10 +114,10 @@ jobs:
           torch \
           librosa \
           h5py \
-          nltk \
           gensim \
-          black
-
+          black \
+          opt-einsum
+
       - name: Build Python Package
         run: |
           cd src/main/python
diff --git a/src/main/python/systemds/__init__.py b/src/main/python/systemds/__init__.py
index 443b5d23d90..a618ff6e9dd 100644
--- a/src/main/python/systemds/__init__.py
+++ b/src/main/python/systemds/__init__.py
@@ -26,13 +26,14 @@
 __all__ = ["context", "operator", "examples"]
 
 required_packages = [
-    ("torch", "2.4.1"),
-    ("torchvision",
"0.19.1"), + ("torch", "2.5.1"), + ("torchvision", "0.20.1"), ("librosa", "0.10.2"), ("opencv-python", "4.10.0.84"), ("opt-einsum", "3.3.0"), ("h5py", "3.11.0"), ("transformers", "4.46.3"), + ("nltk", "3.9.1"), ("gensim", "4.3.3"), ] diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py index f85b1b80faa..f7319fe1912 100644 --- a/src/main/python/systemds/scuro/dataloader/audio_loader.py +++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py @@ -18,10 +18,11 @@ # under the License. # # ------------------------------------------------------------- -from typing import List, Optional +from typing import List, Optional, Union import librosa from systemds.scuro.dataloader.base_loader import BaseLoader +from systemds.scuro.utils.schema_helpers import create_timestamps class AudioLoader(BaseLoader): @@ -33,7 +34,11 @@ def __init__( ): super().__init__(source_path, indices, chunk_size) - def extract(self, file: str): + def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): self.file_sanity_check(file) audio, sr = librosa.load(file) + self.metadata[file] = {"sample_rate": sr, "length": audio.shape[0]} + self.metadata[file]["timestamp"] = create_timestamps( + self.metadata[file]["sample_rate"], self.metadata[file]["length"] + ) self.data.append(audio) diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py index 2ef60677c67..5cdf63f584c 100644 --- a/src/main/python/systemds/scuro/dataloader/base_loader.py +++ b/src/main/python/systemds/scuro/dataloader/base_loader.py @@ -35,32 +35,68 @@ def __init__( (otherwise please provide your own Dataloader that knows about the file name convention) """ self.data = [] + self.metadata = ( + {} + ) # TODO: check what the index should be for storing the metadata (file_name, counter, ...) 
        self.source_path = source_path
         self.indices = indices
-        self.chunk_size = chunk_size
-        self.next_chunk = 0
+        self._next_chunk = 0
+        self._num_chunks = 1
+        self._chunk_size = None
 
-        if self.chunk_size:
-            self.num_chunks = int(len(self.indices) / self.chunk_size)
+        if chunk_size:
+            self.chunk_size = chunk_size
+
+    @property
+    def chunk_size(self):
+        return self._chunk_size
+
+    @chunk_size.setter
+    def chunk_size(self, value):
+        self._chunk_size = value
+        self._num_chunks = int(len(self.indices) / self._chunk_size)
+
+    @property
+    def num_chunks(self):
+        return self._num_chunks
+
+    @property
+    def next_chunk(self):
+        return self._next_chunk
 
     def load(self):
         """
         Takes care of loading the raw data either chunk wise (if chunk size is defined) or all at once
         """
-        if self.chunk_size:
+        if self._chunk_size:
             return self._load_next_chunk()
 
         return self._load(self.indices)
 
+    def update_chunk_sizes(self, other):
+        if not self._chunk_size and not other.chunk_size:
+            return
+
+        # Propagate the smaller chunk size so both loaders stay in sync.
+        if self._chunk_size and (
+            not other.chunk_size or self._chunk_size < other.chunk_size
+        ):
+            other.chunk_size = self.chunk_size
+        else:
+            self.chunk_size = other.chunk_size
+
     def _load_next_chunk(self):
         """
         Loads the next chunk of data
         """
         self.data = []
         next_chunk_indices = self.indices[
-            self.next_chunk * self.chunk_size : (self.next_chunk + 1) * self.chunk_size
+            self._next_chunk
+            * self._chunk_size : (self._next_chunk + 1)
+            * self._chunk_size
         ]
-        self.next_chunk += 1
+        self._next_chunk += 1
         return self._load(next_chunk_indices)
 
     def _load(self, indices: List[str]):
@@ -73,13 +109,14 @@ def _load(self, indices: List[str]):
         else:
             self.extract(self.source_path, indices)
 
-        return self.data
+        return self.data, self.metadata
 
     @abstractmethod
     def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         pass
 
-    def file_sanity_check(self, file):
+    @staticmethod
+    def file_sanity_check(file):
         """
         Checks that the file can be found and is not empty
         """
diff --git a/src/main/python/systemds/scuro/dataloader/json_loader.py b/src/main/python/systemds/scuro/dataloader/json_loader.py
index c4e3b956111..ac375451888 100644
--- a/src/main/python/systemds/scuro/dataloader/json_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/json_loader.py
@@ -21,7 +21,7 @@
 import json
 
 from systemds.scuro.dataloader.base_loader import BaseLoader
-from typing import Optional, List
+from typing import Optional, List, Union
 
 
 class JSONLoader(BaseLoader):
@@ -35,9 +35,9 @@ def __init__(
         super().__init__(source_path, indices, chunk_size)
         self.field = field
 
-    def extract(self, file: str, indices: List[str]):
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
         with open(file) as f:
             json_file = json.load(f)
-            for idx in indices:
+            for idx in index:
                 self.data.append(json_file[idx][self.field])
diff --git a/src/main/python/systemds/scuro/dataloader/text_loader.py b/src/main/python/systemds/scuro/dataloader/text_loader.py
index f614472bce6..bf34cf85c7f 100644
--- a/src/main/python/systemds/scuro/dataloader/text_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/text_loader.py
@@ -19,7 +19,7 @@
 #
 # -------------------------------------------------------------
 from systemds.scuro.dataloader.base_loader import BaseLoader
-from typing import Optional, Pattern, List
+from typing import Optional, Pattern, List, Union
 import re
 
 
@@ -34,11 +34,12 @@ def __init__(
         super().__init__(source_path, indices, chunk_size)
         self.prefix = prefix
 
-    def extract(self, file: str):
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
         with open(file) as text_file:
             for i, line in enumerate(text_file):
                 if self.prefix:
                     line = re.sub(self.prefix, "", line)
                 line = line.replace("\n", "")
+                self.metadata[file] = {"length": len(line.split())}
                 self.data.append(line)
diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
index 6da20b34756..807a43b21cc 100644
--- a/src/main/python/systemds/scuro/dataloader/video_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -18,11 +18,12 @@
 # under the License.
 #
 # -------------------------------------------------------------
-from typing import List, Optional
+from typing import List, Optional, Union
 
 import numpy as np
 
 from systemds.scuro.dataloader.base_loader import BaseLoader
+from systemds.scuro.utils.schema_helpers import create_timestamps
 import cv2
 
 
@@ -35,9 +36,25 @@ def __init__(
     ):
         super().__init__(source_path, indices, chunk_size)
 
-    def extract(self, file: str):
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
         cap = cv2.VideoCapture(file)
+
+        if not cap.isOpened():
+            raise ValueError(f"Could not read video at path: {file}")
+
+        self.metadata[file] = {
+            "fps": cap.get(cv2.CAP_PROP_FPS),
+            "length": int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
+            "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
+            "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
+            "num_channels": 3,
+        }
+
+        self.metadata[file]["timestamp"] = create_timestamps(
+            self.metadata[file]["fps"], self.metadata[file]["length"]
+        )
+
         frames = []
         while cap.isOpened():
             ret, frame = cap.read()
diff --git a/src/main/python/systemds/scuro/modality/joined.py b/src/main/python/systemds/scuro/modality/joined.py
new file mode 100644
index 00000000000..acdf4fb94f1
--- /dev/null
+++ b/src/main/python/systemds/scuro/modality/joined.py
@@ -0,0 +1,274 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# +# ------------------------------------------------------------- +import sys + +import numpy as np + +from systemds.scuro.modality.joined_transformed import JoinedTransformedModality +from systemds.scuro.modality.modality import Modality +from systemds.scuro.representations.aggregate import Aggregation +from systemds.scuro.representations.utils import pad_sequences + + +class JoinCondition: + def __init__(self, leftField, rightField, joinType, alignment=None): + self.leftField = leftField + self.rightField = rightField + self.join_type = joinType + self.alignment = alignment + + +class JoinedModality(Modality): + + def __init__( + self, + modality_type, + left_modality, + right_modality, + join_condition: JoinCondition, + chunked_execution=False, + ): + """ + TODO + :param modality_type: Type of the original modality(ies) + """ + super().__init__(modality_type) + self.aggregation = None + self.joined_right = None + self.left_modality = left_modality + self.right_modality = right_modality + self.condition = join_condition + self.chunked_execution = ( + chunked_execution # TODO: maybe move this into parent class + ) + self.left_type = type(left_modality) + self.right_type = type(right_modality) + self.chunk_left = False + if self.chunked_execution and self.left_type.__name__.__contains__("Unimodal"): + self.chunk_left = left_modality.data_loader.chunk_size is not None + + def execute(self, starting_idx=0): + self.joined_right = self.right_modality.copy_from_instance() + + start, end = 0, len(self.left_modality.data) + if self.chunked_execution and not self.chunk_left: + start = starting_idx + end = ( + self.right_modality.data_loader.chunk_size + * self.right_modality.data_loader.next_chunk + ) + + for i in range(start, end): + idx_1 = list(self.left_modality.metadata.values())[i + starting_idx][ + self.condition.leftField + ] + if ( + self.condition.alignment is None and self.condition.join_type == "<" + ): # TODO compute correct alignment timestamps/spatial params + nextIdx = np.zeros(len(idx_1), dtype=int) + nextIdx[:-1] = idx_1[1:] + nextIdx[-1] = sys.maxsize + + if self.chunk_left: + i = i + starting_idx + + idx_2 = list(self.right_modality.metadata.values())[i][ + self.condition.rightField + ] + self.joined_right.data.append([]) + + c = 0 + # Assumes ordered lists (temporal) + # TODO: need to extract the shape of the data from the metadata + # video: list of lists of numpy array + # audio: list of numpy array + for j in range(0, len(idx_1)): + self.joined_right.data[i - starting_idx].append([]) + right = np.array([]) + if self.condition.join_type == "<": + while c < len(idx_2) and idx_2[c] < nextIdx[j]: + if right.size == 0: + right = self.right_modality.data[i][c] + if right.ndim == 1: + right = right[np.newaxis, :] + else: + if self.right_modality.data[i][c].ndim == 1: + right = np.concatenate( + [ + right, + self.right_modality.data[i][c][np.newaxis, :], + ], + axis=0, + ) + else: + right = np.concatenate( + [right, self.right_modality.data[i][c]], + axis=0, + ) + c = c + 1 + else: + while c < len(idx_2) and idx_2[c] <= idx_1[j]: + if idx_2[c] == idx_1[j]: + right.append(self.right_modality.data[i][c]) + c = c + 1 + + if ( + len(right) == 0 + ): # Audio and video length sometimes do not match so we add the average all audio samples for this specific frame + right = np.mean(self.right_modality.data[i][c - 1 : c], axis=0) + if right.ndim == 1: + right = right[ + np.newaxis, : + ] # TODO: check correct loading for all data layouts, this is similar to missing data, add a different 
operation for this + + self.joined_right.data[i - starting_idx][j] = right + + def apply_representation(self, representation, aggregation): + self.aggregation = aggregation + if self.chunked_execution: + return self._handle_chunked_execution(representation) + elif self.left_type.__name__.__contains__("Unimodal"): + self.left_modality.extract_raw_data() + if self.left_type == self.right_type: + self.right_modality.extract_raw_data() + elif self.right_type.__name__.__contains__("Unimodal"): + self.right_modality.extract_raw_data() + + self.execute() + left_transformed = self._apply_representation( + self.left_modality, representation + ) + right_transformed = self._apply_representation( + self.joined_right, representation + ) + left_transformed.update_metadata() + right_transformed.update_metadata() + return JoinedTransformedModality( + left_transformed, right_transformed, f"joined_{representation.name}" + ) + + def aggregate( + self, aggregation_function, field_name + ): # TODO: use the filed name to extract data entries from modalities + self.aggregation = Aggregation(aggregation_function, field_name) + + if not self.chunked_execution and self.joined_right: + return self.aggregation.aggregate(self.joined_right) + + return self + + def combine(self, fusion_method): + """ + Combines two or more modalities with each other using a dedicated fusion method + :param other: The modality to be combined + :param fusion_method: The fusion method to be used to combine modalities + """ + modalities = [self.left_modality, self.right_modality] + self.data = [] + reshape = False + if self.left_modality.get_data_shape() != self.joined_right.get_data_shape(): + reshape = True + for i in range(0, len(self.left_modality.data)): + self.data.append([]) + for j in range(0, len(self.left_modality.data[i])): + self.data[i].append([]) + if reshape: + self.joined_right.data[i][j] = self.joined_right.data[i][j].reshape( + self.left_modality.get_data_shape() + ) + fused = np.concatenate( + [self.left_modality.data[i][j], self.joined_right.data[i][j]], + axis=0, + ) + self.data[i][j] = fused + # self.data = fusion_method.transform(modalities) + + for i, instance in enumerate( + self.data + ): # TODO: only if the layout is list_of_lists_of_numpy_array + r = [] + [r.extend(l) for l in instance] + self.data[i] = np.array(r) + self.data = pad_sequences(self.data) + return self + + def _handle_chunked_execution(self, representation): + if self.left_type == self.right_type: + return self._apply_representation_chunked( + self.left_modality, self.right_modality, True, representation + ) + elif self.chunk_left: + return self._apply_representation_chunked( + self.left_modality, self.right_modality, False, representation + ) + else: # TODO: refactor this approach (it is changing the way the modalities are joined) + return self._apply_representation_chunked( + self.right_modality, self.left_modality, False, representation + ) + + def _apply_representation_chunked( + self, left_modality, right_modality, chunk_right, representation + ): + new_left = Modality(left_modality.modality_type, {}) + new_right = Modality(right_modality.modality_type, {}) + + while ( + left_modality.data_loader.next_chunk < left_modality.data_loader.num_chunks + ): + if chunk_right: + right_modality.extract_raw_data() + starting_idx = 0 + else: + starting_idx = ( + left_modality.data_loader.next_chunk + * left_modality.data_loader.chunk_size + ) + left_modality.extract_raw_data() + + self.execute(starting_idx) + + right_transformed = 
self._apply_representation( + self.joined_right, representation + ) + new_right.data.extend(right_transformed.data) + new_right.metadata.update(right_transformed.metadata) + + left_transformed = self._apply_representation(left_modality, representation) + new_left.data.extend(left_transformed.data) + new_left.metadata.update(left_transformed.metadata) + + new_left.update_metadata() + new_right.update_metadata() + return JoinedTransformedModality( + new_left, new_right, f"joined_{representation.name}" + ) + + def _apply_representation(self, modality, representation): + transformed = representation.transform(modality) + if self.aggregation: + aggregated_data_left = self.aggregation.window(transformed) + transformed = Modality( + transformed.modality_type, + transformed.metadata, + ) + transformed.data = aggregated_data_left + + return transformed diff --git a/src/main/python/systemds/scuro/modality/joined_transformed.py b/src/main/python/systemds/scuro/modality/joined_transformed.py new file mode 100644 index 00000000000..e2b53671aa8 --- /dev/null +++ b/src/main/python/systemds/scuro/modality/joined_transformed.py @@ -0,0 +1,70 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# ------------------------------------------------------------- +from functools import reduce +from operator import or_ + +import numpy as np + +from systemds.scuro.modality.modality import Modality +from systemds.scuro.representations.utils import pad_sequences + + +class JoinedTransformedModality(Modality): + + def __init__(self, left_modality, right_modality, transformation): + """ + Parent class of the different Modalities (unimodal & multimodal) + :param transformation: Representation to be applied on the modality + """ + super().__init__( + reduce(or_, [left_modality.modality_type], right_modality.modality_type) + ) + self.transformation = transformation + self.left_modality = left_modality + self.right_modality = right_modality + + def combine(self, fusion_method): + """ + Combines two or more modalities with each other using a dedicated fusion method + :param other: The modality to be combined + :param fusion_method: The fusion method to be used to combine modalities + """ + modalities = [self.left_modality, self.right_modality] + self.data = [] + for i in range(0, len(self.left_modality.data)): + self.data.append([]) + for j in range(0, len(self.left_modality.data[i])): + self.data[i].append([]) + fused = np.concatenate( + [self.left_modality.data[i][j], self.right_modality.data[i][j]], + axis=0, + ) + self.data[i][j] = fused + # self.data = fusion_method.transform(modalities) + + for i, instance in enumerate( + self.data + ): # TODO: only if the layout is list_of_lists_of_numpy_array + r = [] + [r.extend(l) for l in instance] + self.data[i] = np.array(r) + self.data = pad_sequences(self.data) + return self diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py index 9a3d1b148d2..cce26eee014 100644 --- a/src/main/python/systemds/scuro/modality/modality.py +++ b/src/main/python/systemds/scuro/modality/modality.py @@ -20,25 +20,94 @@ # ------------------------------------------------------------- from typing import List +import numpy as np + from systemds.scuro.modality.type import ModalityType class Modality: - def __init__(self, modality_type: ModalityType): + def __init__(self, modalityType: ModalityType, metadata=None): """ Parent class of the different Modalities (unimodal & multimodal) :param modality_type: Type of the modality """ - self.type = modality_type - self.data = None + self.modality_type = modalityType + self.schema = modalityType.get_schema() + self.data = [] self.data_type = None self.cost = None self.shape = None - self.schema = {} + self.dataIndex = None + self.metadata = metadata def get_modality_names(self) -> List[str]: """ Extracts the individual unimodal modalities for a given transformed modality. 
""" - return [modality.name for modality in ModalityType if modality in self.type] + return [ + modality.name for modality in ModalityType if modality in self.modality_type + ] + + def copy_from_instance(self): + return type(self)(self.modality_type, self.metadata) + + def update_metadata(self): + md_copy = self.metadata + self.metadata = {} + for i, (md_k, md_v) in enumerate(md_copy.items()): + updated_md = self.modality_type.update_metadata(md_v, self.data[i]) + self.metadata[md_k] = updated_md + + def get_metadata_at_position(self, position: int): + return self.metadata[self.dataIndex][position] + + def flatten(self): + for num_instance, instance in enumerate(self.data): + if type(instance) is np.ndarray: + self.data[num_instance] = instance.flatten() + elif type(instance) is list: + self.data[num_instance] = np.array( + [item for sublist in instance for item in sublist] + ) + + self.data = np.array(self.data) + return self + + def get_data_layout(self): + if not self.data: + return self.data + + if isinstance(self.data[0], list): + return "list_of_lists_of_numpy_array" + elif isinstance(self.data[0], np.ndarray): + return "list_of_numpy_array" + + def get_data_shape(self): + layout = self.get_data_layout() + if not layout: + return None + + if layout == "list_of_lists_of_numpy_array": + return self.data[0][0].shape + elif layout == "list_of_numpy_array": + return self.data[0].shape + + def get_data_dtype(self): + layout = self.get_data_layout() + if not layout: + return None + + if layout == "list_of_lists_of_numpy_array": + return self.data[0][0].dtype + elif layout == "list_of_numpy_array": + return self.data[0].dtype + + def update_data_layout(self): + if not self.data: + return + + self.schema["data_layout"]["representation"] = self.get_data_layout() + + self.shape = self.get_data_shape() + self.schema["data_layout"]["type"] = self.get_data_dtype() diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py index 61c327e469e..64bfba0819f 100644 --- a/src/main/python/systemds/scuro/modality/transformed.py +++ b/src/main/python/systemds/scuro/modality/transformed.py @@ -21,20 +21,64 @@ from functools import reduce from operator import or_ +from systemds.scuro.modality.joined import JoinedModality from systemds.scuro.modality.modality import Modality -from systemds.scuro.modality.type import ModalityType +from systemds.scuro.representations.window import WindowAggregation class TransformedModality(Modality): - def __init__(self, modality_type: ModalityType, transformation): + def __init__(self, modality_type, transformation, metadata): """ Parent class of the different Modalities (unimodal & multimodal) :param modality_type: Type of the original modality(ies) :param transformation: Representation to be applied on the modality """ - super().__init__(modality_type) + super().__init__(modality_type, metadata) self.transformation = transformation + self.data = [] + + def copy_from_instance(self): + return type(self)(self.modality_type, self.transformation, self.metadata) + + def join(self, right, join_condition): + chunked_execution = False + if type(right).__name__.__contains__("Unimodal"): + if right.data_loader.chunk_size: + chunked_execution = True + elif right.data is None or len(right.data) == 0: + right.extract_raw_data() + + joined_modality = JoinedModality( + reduce(or_, [right.modality_type], self.modality_type), + self, + right, + join_condition, + chunked_execution, + ) + + if not chunked_execution: + 
joined_modality.execute(0) + + return joined_modality + + def window(self, windowSize, aggregationFunction, fieldName=None): + transformed_modality = TransformedModality( + self.modality_type, "window", self.metadata + ) + w = WindowAggregation(windowSize, aggregationFunction) + transformed_modality.data = w.window(self) + + return transformed_modality + + def apply_representation(self, representation, aggregation): + new_modality = representation.transform(self) + + if aggregation: + new_modality.data = aggregation.window(new_modality) + + new_modality.update_metadata() + return new_modality def combine(self, other, fusion_method): """ @@ -43,7 +87,9 @@ def combine(self, other, fusion_method): :param fusion_method: The fusion method to be used to combine modalities """ fused_modality = TransformedModality( - reduce(or_, (o.type for o in other), self.type), fusion_method + reduce(or_, (o.modality_type for o in other), self.modality_type), + fusion_method, + self.metadata, ) modalities = [self] modalities.extend(other) diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py index c451eea6f1d..197ad23c540 100644 --- a/src/main/python/systemds/scuro/modality/type.py +++ b/src/main/python/systemds/scuro/modality/type.py @@ -18,7 +18,90 @@ # under the License. # # ------------------------------------------------------------- -from enum import Enum, Flag, auto +from enum import Flag, auto +from systemds.scuro.utils.schema_helpers import ( + calculate_new_frequency, + create_timestamps, +) + + +# TODO: needs a way to define if data comes from a dataset with multiple instances or is like a streaming scenario where we only have one instance +# right now it is a list of instances (if only one instance the list would contain only a single item) +class ModalitySchemas: + TEXT_SCHEMA = {"type": "string", "length": "int"} + + AUDIO_SCHEMA = { + "timestamp": "array", + "data_layout": {"type": "?", "representation": "?"}, + "sample_rate": "integer", + "length": "integer", + } + + VIDEO_SCHEMA = { + "timestamp": "array", + "data_layout": {"type": "?", "representation": "?"}, + "fps": "integer", + "length": "integer", + "width": "integer", + "height": "integer", + "num_channels": "integer", + } + + _metadata_handlers = {} + + @classmethod + def get(cls, name): + return getattr(cls, f"{name}_SCHEMA", None) + + @classmethod + def add_schema(cls, name, schema): + setattr(cls, f"{name}_SCHEMA", schema) + + @classmethod + def register_metadata_handler(cls, name): + def decorator(metadata_handler): + cls._metadata_handlers[name] = metadata_handler + return metadata_handler + + return decorator + + @classmethod + def update_metadata(cls, name, md, data): + mdHandler = cls._metadata_handlers.get(name) + if mdHandler: + return mdHandler(md, data) + + def extract_data(self, data, index): + if self.get("data_layout").get("representation") == "list_array": + return data[index] + else: + return data[index] + + +@ModalitySchemas.register_metadata_handler("AUDIO") +def handle_audio_metadata(md, data): + new_frequency = calculate_new_frequency(len(data), md["length"], md["sample_rate"]) + md.update( + { + "length": len(data), + "sample_rate": new_frequency, + "timestamp": create_timestamps(new_frequency, len(data)), + } + ) + return md + + +@ModalitySchemas.register_metadata_handler("VIDEO") +def handle_video_metadata(md, data): + new_frequency = calculate_new_frequency(len(data), md["length"], md["fps"]) + md.update( + { + "length": len(data), + "fps": new_frequency, 
+ "timestamp": create_timestamps(new_frequency, len(data)), + } + ) + return md class ModalityType(Flag): @@ -26,6 +109,8 @@ class ModalityType(Flag): AUDIO = auto() VIDEO = auto() - # def __init__(self, value, name): - # self._value_ = value - # self.name = name + def get_schema(self): + return ModalitySchemas.get(self.name) + + def update_metadata(self, md, data): + return ModalitySchemas.update_metadata(self.name, md, data) diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py index 976d4194d47..ae33b6605ba 100644 --- a/src/main/python/systemds/scuro/modality/unimodal_modality.py +++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py @@ -18,8 +18,13 @@ # under the License. # # ------------------------------------------------------------- +from functools import reduce +from operator import or_ + + from systemds.scuro.dataloader.base_loader import BaseLoader from systemds.scuro.modality.modality import Modality +from systemds.scuro.modality.joined import JoinedModality from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.modality.type import ModalityType @@ -32,28 +37,65 @@ def __init__(self, data_loader: BaseLoader, modality_type: ModalityType): :param data_loader: Defines how the raw data should be loaded :param modality_type: Type of the modality """ - super().__init__(modality_type) + super().__init__(modality_type, None) self.data_loader = data_loader + def copy_from_instance(self): + new_instance = type(self)(self.data_loader, self.modality_type) + if self.metadata: + new_instance.metadata = self.metadata.copy() + return new_instance + + def get_metadata_at_position(self, position: int): + if self.data_loader.chunk_size: + return self.metadata[ + self.data_loader.chunk_size * self.data_loader.next_chunk + position + ] + + return self.metadata[self.dataIndex][position] + def extract_raw_data(self): """ Uses the data loader to read the raw data from a specified location and stores the data in the data location. 
- TODO: schema """ - self.data = self.data_loader.load() + self.data, self.metadata = self.data_loader.load() + + def join(self, other, join_condition): + if isinstance(other, UnimodalModality): + self.data_loader.update_chunk_sizes(other.data_loader) + + joined_modality = JoinedModality( + reduce(or_, [other.modality_type], self.modality_type), + self, + other, + join_condition, + self.data_loader.chunk_size is not None, + ) - def apply_representation(self, representation): - new_modality = TransformedModality(self.type, representation) + return joined_modality + + def apply_representation(self, representation, aggregation=None): + new_modality = TransformedModality( + self.modality_type, representation.name, self.data_loader.metadata.copy() + ) new_modality.data = [] if self.data_loader.chunk_size: while self.data_loader.next_chunk < self.data_loader.num_chunks: self.extract_raw_data() - new_modality.data.extend(representation.transform(self.data)) + transformed_chunk = representation.transform(self) + if aggregation: + transformed_chunk.data = aggregation.window(transformed_chunk) + new_modality.data.extend(transformed_chunk.data) + new_modality.metadata.update(transformed_chunk.metadata) else: if not self.data: self.extract_raw_data() - new_modality.data = representation.transform(self.data) + new_modality = representation.transform(self) + + if aggregation: + new_modality.data = aggregation.window(new_modality) + new_modality.update_metadata() return new_modality diff --git a/src/main/python/systemds/scuro/representations/aggregate.py b/src/main/python/systemds/scuro/representations/aggregate.py new file mode 100644 index 00000000000..7c8d1c68d12 --- /dev/null +++ b/src/main/python/systemds/scuro/representations/aggregate.py @@ -0,0 +1,51 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# ------------------------------------------------------------- +import numpy as np + +from systemds.scuro.modality.modality import Modality + + +# TODO: make this a Representation and add a fusion method that fuses two modalities with each other + + +class Aggregation: + def __init__(self, aggregation_function, field_name): + self.aggregation_function = aggregation_function + self.field_name = field_name + + def aggregate(self, modality): + aggregated_modality = Modality(modality.modality_type, modality.metadata) + aggregated_modality.data = [] + for i, instance in enumerate(modality.data): + aggregated_modality.data.append([]) + for j, entry in enumerate(instance): + if self.aggregation_function == "sum": + aggregated_modality.data[i].append(np.sum(entry, axis=0)) + elif self.aggregation_function == "mean": + aggregated_modality.data[i].append(np.mean(entry, axis=0)) + elif self.aggregation_function == "min": + aggregated_modality.data[i].append(np.min(entry, axis=0)) + elif self.aggregation_function == "max": + aggregated_modality.data[i].append(np.max(entry, axis=0)) + else: + raise ValueError("Invalid aggregation function") + + return aggregated_modality diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py index 0fcf1e8d280..bfaaa22642a 100644 --- a/src/main/python/systemds/scuro/representations/bert.py +++ b/src/main/python/systemds/scuro/representations/bert.py @@ -21,6 +21,7 @@ import numpy as np +from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation import torch from transformers import BertTokenizer, BertModel @@ -28,30 +29,29 @@ class Bert(UnimodalRepresentation): - def __init__(self, avg_layers=None, output_file=None): + def __init__(self, output_file=None): super().__init__("Bert") - self.avg_layers = avg_layers self.output_file = output_file - def transform(self, data): - + def transform(self, modality): + transformed_modality = TransformedModality( + modality.modality_type, self, modality.metadata + ) model_name = "bert-base-uncased" tokenizer = BertTokenizer.from_pretrained( model_name, clean_up_tokenization_spaces=True ) - if self.avg_layers is not None: - model = BertModel.from_pretrained(model_name, output_hidden_states=True) - else: - model = BertModel.from_pretrained(model_name) + model = BertModel.from_pretrained(model_name) - embeddings = self.create_embeddings(data, model, tokenizer) + embeddings = self.create_embeddings(modality.data, model, tokenizer) if self.output_file is not None: save_embeddings(embeddings, self.output_file) - return embeddings + transformed_modality.data = embeddings + return transformed_modality def create_embeddings(self, data, model, tokenizer): embeddings = [] @@ -61,15 +61,8 @@ def create_embeddings(self, data, model, tokenizer): with torch.no_grad(): outputs = model(**inputs) - if self.avg_layers is not None: - cls_embedding = [ - outputs.hidden_states[i][:, 0, :] - for i in range(-self.avg_layers, 0) - ] - cls_embedding = torch.mean(torch.stack(cls_embedding), dim=0).numpy() - else: cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy() - embeddings.append(cls_embedding) + embeddings.append(cls_embedding) embeddings = np.array(embeddings) return embeddings.reshape((embeddings.shape[0], embeddings.shape[-1])) diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py index bd54654a5cb..f16f6ec04d8 
100644 --- a/src/main/python/systemds/scuro/representations/bow.py +++ b/src/main/python/systemds/scuro/representations/bow.py @@ -21,6 +21,7 @@ from sklearn.feature_extraction.text import CountVectorizer +from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation from systemds.scuro.representations.utils import save_embeddings @@ -32,14 +33,18 @@ def __init__(self, ngram_range, min_df, output_file=None): self.min_df = min_df self.output_file = output_file - def transform(self, data): + def transform(self, modality): + transformed_modality = TransformedModality( + modality.modality_type, self, modality.metadata + ) vectorizer = CountVectorizer( ngram_range=(1, self.ngram_range), min_df=self.min_df ) - X = vectorizer.fit_transform(data).toarray() + X = vectorizer.fit_transform(modality.data).toarray() if self.output_file is not None: save_embeddings(X, self.output_file) - return X + transformed_modality.data = X + return transformed_modality diff --git a/src/main/python/systemds/scuro/representations/glove.py b/src/main/python/systemds/scuro/representations/glove.py index cf13c717d2f..767fc8d375e 100644 --- a/src/main/python/systemds/scuro/representations/glove.py +++ b/src/main/python/systemds/scuro/representations/glove.py @@ -19,7 +19,8 @@ # # ------------------------------------------------------------- import numpy as np -from nltk import word_tokenize +from gensim.utils import tokenize + from systemds.scuro.representations.unimodal import UnimodalRepresentation from systemds.scuro.representations.utils import read_data_from_file, save_embeddings @@ -47,7 +48,7 @@ def transform(self, data): embeddings = [] for sentences in data: - tokens = word_tokenize(sentences.lower()) + tokens = list(tokenize(sentences.lower())) embeddings.append( np.mean( [ diff --git a/src/main/python/systemds/scuro/representations/lstm.py b/src/main/python/systemds/scuro/representations/lstm.py index 649b81117b2..6f06e762a56 100644 --- a/src/main/python/systemds/scuro/representations/lstm.py +++ b/src/main/python/systemds/scuro/representations/lstm.py @@ -46,11 +46,11 @@ def transform(self, modalities: List[Modality]): result = np.zeros((size, 0)) for modality in modalities: - if modality.type in self.unimodal_embeddings.keys(): - out = self.unimodal_embeddings.get(modality.type) + if modality.modality_type in self.unimodal_embeddings.keys(): + out = self.unimodal_embeddings.get(modality.modality_type) else: out = self.run_lstm(modality.data) - self.unimodal_embeddings[modality.type] = out + self.unimodal_embeddings[modality.modality_type] = out result = np.concatenate([result, out], axis=-1) diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py index 57a7fab83e2..483ea181b8f 100644 --- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py +++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py @@ -18,44 +18,40 @@ # under the License. 
# # ------------------------------------------------------------- - -import pickle - import librosa import numpy as np -from systemds.scuro.representations.utils import pad_sequences +from systemds.scuro.modality.transformed import TransformedModality + +# import matplotlib.pyplot as plt from systemds.scuro.representations.unimodal import UnimodalRepresentation class MelSpectrogram(UnimodalRepresentation): - def __init__(self, avg=True, output_file=None): + def __init__(self): super().__init__("MelSpectrogram") - self.avg = avg - self.output_file = output_file - def transform(self, data): + def transform(self, modality): + transformed_modality = TransformedModality( + modality.modality_type, self, modality.metadata + ) result = [] max_length = 0 - for sample in data: - S = librosa.feature.melspectrogram(y=sample) + for sample in modality.data: + S = librosa.feature.melspectrogram(y=sample, sr=22050) S_dB = librosa.power_to_db(S, ref=np.max) if S_dB.shape[-1] > max_length: max_length = S_dB.shape[-1] - result.append(S_dB) - - r = [] - for elem in result: - d = pad_sequences(elem, maxlen=max_length, dtype="float32") - r.append(d) - - np_array_r = np.array(r) if not self.avg else np.mean(np.array(r), axis=1) - - if self.output_file is not None: - data = [] - for i in range(0, np_array_r.shape[0]): - data.append(np_array_r[i]) - with open(self.output_file, "wb") as file: - pickle.dump(data, file) - - return np_array_r + result.append(S_dB.T) + + transformed_modality.data = result + return transformed_modality + + # def plot_spectrogram(self, spectrogram): + # plt.figure(figsize=(10, 4)) + # librosa.display.specshow( + # spectrogram, x_axis="time", y_axis="mel", sr=22050, cmap="viridis" + # ) + # plt.colorbar(format="%+2.0f dB") + # plt.title("Mel Spectrogram") + # plt.savefig("spectrogram.jpg") diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py index 1c1bfa1d5ec..ff63e6766b6 100644 --- a/src/main/python/systemds/scuro/representations/resnet.py +++ b/src/main/python/systemds/scuro/representations/resnet.py @@ -19,9 +19,7 @@ # # ------------------------------------------------------------- - -import h5py - +from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation from typing import Callable, Dict, Tuple, Any import torch.utils.data @@ -30,23 +28,61 @@ import torchvision.transforms as transforms import numpy as np -DEVICE = "cpu" +if torch.backends.mps.is_available(): + DEVICE = torch.device("mps") +elif torch.cuda.is_available(): + DEVICE = torch.device("cuda") +else: + DEVICE = torch.device("cpu") class ResNet(UnimodalRepresentation): - def __init__(self, layer="avgpool", output_file=None): + def __init__(self, layer="avgpool", model_name="ResNet18", output_file=None): super().__init__("ResNet") self.output_file = output_file self.layer_name = layer + self.model = model_name + self.model.eval() + for param in self.model.parameters(): + param.requires_grad = False - def transform(self, data): - - resnet = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to(DEVICE) - resnet.eval() + class Identity(torch.nn.Module): + def forward(self, input_: torch.Tensor) -> torch.Tensor: + return input_ - for param in resnet.parameters(): - param.requires_grad = False + self.model.fc = Identity() + + @property + def model(self): + return self._model + + @model.setter + def model(self, model): + if model == "ResNet18": + self._model = 
models.resnet18(weights=models.ResNet18_Weights.DEFAULT).to( + DEVICE + ) + elif model == "ResNet34": + self._model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT).to( + DEVICE + ) + elif model == "ResNet50": + self._model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT).to( + DEVICE + ) + elif model == "ResNet101": + self._model = models.resnet101(weights=models.ResNet101_Weights.DEFAULT).to( + DEVICE + ) + elif model == "ResNet152": + self._model = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to( + DEVICE + ) + else: + raise NotImplementedError + + def transform(self, modality): t = transforms.Compose( [ @@ -60,15 +96,9 @@ def transform(self, data): ] ) - dataset = ResNetDataset(data, t) + dataset = ResNetDataset(modality.data, t) embeddings = {} - class Identity(torch.nn.Module): - def forward(self, input_: torch.Tensor) -> torch.Tensor: - return input_ - - resnet.fc = Identity() - res5c_output = None def get_features(name_): @@ -81,14 +111,14 @@ def hook( return hook if self.layer_name: - for name, layer in resnet.named_modules(): + for name, layer in self.model.named_modules(): if name == self.layer_name: layer.register_forward_hook(get_features(name)) break for instance in torch.utils.data.DataLoader(dataset): video_id = instance["id"][0] - frames = instance["frames"][0].to(DEVICE) + frames = instance["data"][0].to(DEVICE) embeddings[video_id] = [] batch_size = 64 @@ -97,32 +127,21 @@ def hook( frame_ids_range = range(start_index, end_index) frame_batch = frames[frame_ids_range] - _ = resnet(frame_batch) + _ = self.model(frame_batch) values = res5c_output + pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1)) - if self.layer_name == "avgpool" or self.layer_name == "maxpool": - embeddings[video_id].extend( - torch.flatten(values, 1).detach().cpu().numpy() - ) - - else: - pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1)) - - embeddings[video_id].extend( - torch.flatten(pooled, 1).detach().cpu().numpy() - ) + embeddings[video_id].extend( + torch.flatten(pooled, 1).detach().cpu().numpy() + ) - if self.output_file is not None: - with h5py.File(self.output_file, "w") as hdf: - for key, value in embeddings.items(): - hdf.create_dataset(key, data=value) - - emb = [] - - for video in embeddings.values(): - emb.append(np.array(video).mean(axis=0).tolist()) + transformed_modality = TransformedModality( + modality.modality_type, "resnet", modality.metadata + ) + transformed_modality.data = list(embeddings.values()) + transformed_modality.update_data_layout() - return np.array(emb) + return transformed_modality class ResNetDataset(torch.utils.data.Dataset): @@ -131,12 +150,23 @@ def __init__(self, data: str, tf: Callable = None): self.tf = tf def __getitem__(self, index) -> Dict[str, object]: - video = self.data[index] - frames = torch.empty((len(video), 3, 224, 224)) - - for i, frame in enumerate(video): - frames[i] = self.tf(frame) - return {"id": index, "frames": frames} + data = self.data[index] + if type(data) is np.ndarray: + output = torch.empty((1, 3, 224, 224)) + d = torch.tensor(data) + d = d.repeat(3, 1, 1) + output[0] = self.tf(d) + else: + output = torch.empty((len(data), 3, 224, 224)) + + for i, d in enumerate(data): + if data[0].ndim < 3: + d = torch.tensor(d) + d = d.repeat(3, 1, 1) + + output[i] = self.tf(d) + + return {"id": index, "data": output} def __len__(self) -> int: return len(self.data) diff --git a/src/main/python/systemds/scuro/representations/tfidf.py 
b/src/main/python/systemds/scuro/representations/tfidf.py index 4849aba1360..02cfb927c71 100644 --- a/src/main/python/systemds/scuro/representations/tfidf.py +++ b/src/main/python/systemds/scuro/representations/tfidf.py @@ -20,7 +20,7 @@ # ------------------------------------------------------------- from sklearn.feature_extraction.text import TfidfVectorizer - +from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation from systemds.scuro.representations.utils import read_data_from_file, save_embeddings @@ -31,13 +31,17 @@ def __init__(self, min_df, output_file=None): self.min_df = min_df self.output_file = output_file - def transform(self, data): + def transform(self, modality): + transformed_modality = TransformedModality( + modality.modality_type, self, modality.metadata + ) vectorizer = TfidfVectorizer(min_df=self.min_df) - X = vectorizer.fit_transform(data) + X = vectorizer.fit_transform(modality.data) X = X.toarray() if self.output_file is not None: save_embeddings(X, self.output_file) - return X + transformed_modality.data = X + return transformed_modality diff --git a/src/main/python/systemds/scuro/representations/window.py b/src/main/python/systemds/scuro/representations/window.py new file mode 100644 index 00000000000..af0301d0e3b --- /dev/null +++ b/src/main/python/systemds/scuro/representations/window.py @@ -0,0 +1,49 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- +import numpy as np +import math + + +# TODO: move this into the aggregation class and add an aggregate() and a window(window_size) function there so they can use the same functionality. 
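+# WindowAggregation reduces each window of window_size consecutive entries of
+# an instance to a single entry. The reduction is currently a fixed np.mean;
+# see the TODO in window() below.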
+class WindowAggregation: + def __init__(self, window_size, aggregation_function): + self.window_size = window_size + self.aggregation_function = aggregation_function + + def window(self, modality): + # data is a 2d array + windowed_data = [] + for instance in modality.data: + window_length = math.ceil(len(instance) / self.window_size) + result = [[] for _ in range(0, window_length)] + # if modality.schema["data_layout"]["representation"] == "list_of_lists_of_numpy_array": + data = np.stack(instance) + for i in range(0, window_length): + result[i] = np.mean( + data[ + i * self.window_size : i * self.window_size + self.window_size + ], + axis=0, + ) # TODO: add actual aggregation function here + + windowed_data.append(result) + + return windowed_data diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py index 209091648d5..b68a9fd3eb4 100644 --- a/src/main/python/systemds/scuro/representations/word2vec.py +++ b/src/main/python/systemds/scuro/representations/word2vec.py @@ -19,11 +19,11 @@ # # ------------------------------------------------------------- import numpy as np - +from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation from systemds.scuro.representations.utils import save_embeddings from gensim.models import Word2Vec -from nltk.tokenize import word_tokenize +from gensim.utils import tokenize def get_embedding(sentence, model): @@ -43,8 +43,11 @@ def __init__(self, vector_size, min_count, window, output_file=None): self.window = window self.output_file = output_file - def transform(self, data): - t = [word_tokenize(s.lower()) for s in data] + def transform(self, modality): + transformed_modality = TransformedModality( + modality.modality_type, self, modality.metadata + ) + t = [list(tokenize(s.lower())) for s in modality.data] model = Word2Vec( sentences=t, vector_size=self.vector_size, @@ -52,11 +55,11 @@ def transform(self, data): min_count=self.min_count, ) embeddings = [] - for sentences in data: - tokens = word_tokenize(sentences.lower()) + for sentences in modality.data: + tokens = list(tokenize(sentences.lower())) embeddings.append(get_embedding(tokens, model)) if self.output_file is not None: save_embeddings(np.array(embeddings), self.output_file) - - return np.array(embeddings) + transformed_modality.data = np.array(embeddings) + return transformed_modality diff --git a/src/main/python/systemds/scuro/utils/__init__.py b/src/main/python/systemds/scuro/utils/__init__.py new file mode 100644 index 00000000000..e66abb4646f --- /dev/null +++ b/src/main/python/systemds/scuro/utils/__init__.py @@ -0,0 +1,20 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- diff --git a/src/main/python/systemds/scuro/utils/schema_helpers.py b/src/main/python/systemds/scuro/utils/schema_helpers.py new file mode 100644 index 00000000000..a88e81f7161 --- /dev/null +++ b/src/main/python/systemds/scuro/utils/schema_helpers.py @@ -0,0 +1,43 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- +import math +import numpy as np + + +def create_timestamps(frequency, sample_length, start_datetime=None): + start_time = ( + start_datetime + if start_datetime is not None + else np.datetime64("1970-01-01T00:00:00.000000") + ) + time_increment = 1 / frequency + time_increments_array = np.arange(sample_length) * np.timedelta64( + int(time_increment * 1e6) + ) + timestamps = start_time + time_increments_array + + return timestamps.astype(np.int64) + + +def calculate_new_frequency(new_length, old_length, old_frequency): + duration = old_length / old_frequency + new_frequency = new_length / duration + return new_frequency diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py index 6856ee70442..ec0783df9cb 100644 --- a/src/main/python/tests/scuro/data_generator.py +++ b/src/main/python/tests/scuro/data_generator.py @@ -18,30 +18,78 @@ # under the License. 
 #
 # -------------------------------------------------------------
+import shutil
+
 import cv2
 import numpy as np
 from scipy.io.wavfile import write
 import random
 import os
+
+from systemds.scuro import VideoLoader, AudioLoader, TextLoader, UnimodalModality
 from systemds.scuro.modality.type import ModalityType
 
 
+def setup_data(modalities, num_instances, path):
+    if os.path.isdir(path):
+        shutil.rmtree(path)
+
+    os.makedirs(path)
+
+    indizes = [str(i) for i in range(0, num_instances)]
+
+    modalities_to_create = []
+    for modality in modalities:
+        mod_path = path + "/" + modality.name + "/"
+
+        if modality == ModalityType.VIDEO:
+            data_loader = VideoLoader(mod_path, indizes)
+        elif modality == ModalityType.AUDIO:
+            data_loader = AudioLoader(mod_path, indizes)
+        elif modality == ModalityType.TEXT:
+            data_loader = TextLoader(mod_path, indizes)
+        else:
+            raise ValueError("Modality not supported in DataGenerator")
+
+        modalities_to_create.append(UnimodalModality(data_loader, modality))
+
+    data_generator = TestDataGenerator(modalities_to_create, path)
+    data_generator.create_multimodal_data(num_instances)
+    return data_generator
+
+
 class TestDataGenerator:
     def __init__(self, modalities, path, balanced=True):
+        self.modalities = modalities
+        self.modalities_by_type = {}
+        for modality in modalities:
+            self.modalities_by_type[modality.modality_type] = modality
+
+        self._indices = None
         self.path = path
         self.balanced = balanced
 
         for modality in modalities:
-            mod_path = f"{self.path}/{modality.type.name}/"
+            mod_path = f"{self.path}/{modality.modality_type.name}/"
             os.mkdir(mod_path)
             modality.file_path = mod_path
 
         self.labels = []
         self.label_path = f"{path}/labels.npy"
 
+    def get_modality_path(self, modality_type):
+        return self.modalities_by_type[modality_type].data_loader.source_path
+
+    @property
+    def indices(self):
+        if self._indices is None:
+            raise ValueError("No indices available, please call setup_data first")
+        return self._indices
+
     def create_multimodal_data(self, num_instances, duration=2, seed=42):
         speed_fast = 0
         speed_slow = 0
+        self._indices = [str(i) for i in range(0, num_instances)]
         for idx in range(num_instances):
             np.random.seed(seed)
             if self.balanced:
@@ -69,11 +117,11 @@ def create_multimodal_data(self, num_instances, duration=2, seed=42):
                 speed_slow += 1
 
             for modality in self.modalities:
-                if modality.type == ModalityType.VIDEO:
+                if modality.modality_type == ModalityType.VIDEO:
                     self.__create_video_data(idx, duration, 30, speed_factor)
-                if modality.type == ModalityType.AUDIO:
+                if modality.modality_type == ModalityType.AUDIO:
                     self.__create_audio_data(idx, duration, speed_factor)
-                if modality.type == ModalityType.TEXT:
+                if modality.modality_type == ModalityType.TEXT:
                     self.__create_text_data(idx, speed_factor)
 
         np.save(f"{self.path}/labels.npy", np.array(self.labels))
diff --git a/src/main/python/tests/scuro/test_data_loaders.py b/src/main/python/tests/scuro/test_data_loaders.py
index 55704b8d8af..4ca77b205d0 100644
--- a/src/main/python/tests/scuro/test_data_loaders.py
+++ b/src/main/python/tests/scuro/test_data_loaders.py
@@ -26,7 +26,7 @@
 from systemds.scuro.representations.bert import Bert
 from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
 from systemds.scuro.representations.resnet import ResNet
-from tests.scuro.data_generator import TestDataGenerator
+from tests.scuro.data_generator import setup_data
 
 from systemds.scuro.dataloader.audio_loader import AudioLoader
 from systemds.scuro.dataloader.video_loader import VideoLoader
@@ -42,39 +42,25 @@ class
TestDataLoaders(unittest.TestCase): video = None data_generator = None num_instances = 0 - indizes = [] @classmethod def setUpClass(cls): cls.test_file_path = "test_data" - - if os.path.isdir(cls.test_file_path): - shutil.rmtree(cls.test_file_path) - - os.makedirs(f"{cls.test_file_path}/embeddings") - cls.num_instances = 2 - cls.indizes = [str(i) for i in range(0, cls.num_instances)] + cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) - cls.video_path = cls.test_file_path + "/" + ModalityType.VIDEO.name + "/" - cls.audio_path = cls.test_file_path + "/" + ModalityType.AUDIO.name + "/" - cls.text_path = cls.test_file_path + "/" + ModalityType.TEXT.name + "/" - - video_data_loader = VideoLoader(cls.video_path, cls.indizes) - audio_data_loader = AudioLoader(cls.audio_path, cls.indizes) - text_data_loader = TextLoader(cls.text_path, cls.indizes) - - # Load modalities (audio, video, text) - video = UnimodalModality(video_data_loader, ModalityType.VIDEO) - audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO) - text = UnimodalModality(text_data_loader, ModalityType.TEXT) + os.makedirs(f"{cls.test_file_path}/embeddings") - cls.mods = [video, audio, text] - cls.data_generator = TestDataGenerator(cls.mods, cls.test_file_path) - cls.data_generator.create_multimodal_data(cls.num_instances) - cls.text_ref = text.apply_representation(Bert()) - cls.audio_ref = audio.apply_representation(MelSpectrogram()) - cls.video_ref = video.apply_representation(ResNet()) + cls.text_ref = cls.data_generator.modalities_by_type[ + ModalityType.TEXT + ].apply_representation(Bert()) + cls.audio_ref = cls.data_generator.modalities_by_type[ + ModalityType.AUDIO + ].apply_representation(MelSpectrogram()) + cls.video_ref = cls.data_generator.modalities_by_type[ + ModalityType.VIDEO + ].apply_representation(ResNet()) @classmethod def tearDownClass(cls): @@ -82,25 +68,38 @@ def tearDownClass(cls): shutil.rmtree(cls.test_file_path) def test_load_audio_data_from_file(self): - audio_data_loader = AudioLoader(self.audio_path, self.indizes) + audio_data_loader = AudioLoader( + self.data_generator.get_modality_path(ModalityType.AUDIO), + self.data_generator.indices, + ) audio = UnimodalModality( audio_data_loader, ModalityType.AUDIO ).apply_representation(MelSpectrogram()) for i in range(0, self.num_instances): - assert round(sum(self.audio_ref.data[i]), 4) == round(sum(audio.data[i]), 4) + assert round(sum(sum(self.audio_ref.data[i])), 4) == round( + sum(sum(audio.data[i])), 4 + ) def test_load_video_data_from_file(self): - video_data_loader = VideoLoader(self.video_path, self.indizes) + video_data_loader = VideoLoader( + self.data_generator.get_modality_path(ModalityType.VIDEO), + self.data_generator.indices, + ) video = UnimodalModality( video_data_loader, ModalityType.VIDEO ).apply_representation(ResNet()) for i in range(0, self.num_instances): - assert round(sum(self.video_ref.data[i]), 4) == round(sum(video.data[i]), 4) + assert round(sum(sum(self.video_ref.data[i])), 4) == round( + sum(sum(video.data[i])), 4 + ) def test_load_text_data_from_file(self): - text_data_loader = TextLoader(self.text_path, self.indizes) + text_data_loader = TextLoader( + self.data_generator.get_modality_path(ModalityType.TEXT), + self.data_generator.indices, + ) text = UnimodalModality( text_data_loader, ModalityType.TEXT ).apply_representation(Bert()) diff --git a/src/main/python/tests/scuro/test_dr_search.py 
b/src/main/python/tests/scuro/test_dr_search.py
index d0d7ef50770..f2ba9d2d790 100644
--- a/src/main/python/tests/scuro/test_dr_search.py
+++ b/src/main/python/tests/scuro/test_dr_search.py
@@ -25,14 +25,10 @@
 import numpy as np
 from sklearn import svm
 from sklearn.metrics import classification_report
-from sklearn.model_selection import train_test_split, KFold
+from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import MinMaxScaler
 
-from systemds.scuro.modality.unimodal_modality import UnimodalModality
 from systemds.scuro.modality.type import ModalityType
-from systemds.scuro.dataloader.text_loader import TextLoader
-from systemds.scuro.dataloader.audio_loader import AudioLoader
-from systemds.scuro.dataloader.video_loader import VideoLoader
 from systemds.scuro.aligner.dr_search import DRSearch
 from systemds.scuro.aligner.task import Task
 from systemds.scuro.models.model import Model
@@ -45,7 +41,7 @@
 from systemds.scuro.representations.multiplication import Multiplication
 from systemds.scuro.representations.resnet import ResNet
 from systemds.scuro.representations.sum import Sum
-from tests.scuro.data_generator import TestDataGenerator
+from tests.scuro.data_generator import setup_data
 
 import warnings
 
@@ -89,56 +85,54 @@ class TestDataLoaders(unittest.TestCase):
     video = None
     data_generator = None
     num_instances = 0
-    indizes = []
     representations = None
 
     @classmethod
     def setUpClass(cls):
         cls.test_file_path = "test_data_dr_search"
+        cls.num_instances = 20
+        modalities = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
 
-        if os.path.isdir(cls.test_file_path):
-            shutil.rmtree(cls.test_file_path)
-
+        cls.data_generator = setup_data(
+            modalities, cls.num_instances, cls.test_file_path
+        )
         os.makedirs(f"{cls.test_file_path}/embeddings")
 
-        cls.num_instances = 8
-        cls.indizes = [str(i) for i in range(0, cls.num_instances)]
+        # TODO: adapt the representations so they return non-aggregated values.
Apply windowing operation instead - video_data_loader = VideoLoader( - cls.test_file_path + "/" + ModalityType.VIDEO.name + "/", cls.indizes - ) - audio_data_loader = AudioLoader( - cls.test_file_path + "/" + ModalityType.AUDIO.name + "/", cls.indizes + cls.bert = cls.data_generator.modalities_by_type[ + ModalityType.TEXT + ].apply_representation(Bert()) + cls.mel_spe = ( + cls.data_generator.modalities_by_type[ModalityType.AUDIO] + .apply_representation(MelSpectrogram()) + .flatten() ) - text_data_loader = TextLoader( - cls.test_file_path + "/" + ModalityType.TEXT.name + "/", cls.indizes + cls.resnet = ( + cls.data_generator.modalities_by_type[ModalityType.VIDEO] + .apply_representation(ResNet()) + .window(10, "avg") + .flatten() ) - video = UnimodalModality(video_data_loader, ModalityType.VIDEO) - audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO) - text = UnimodalModality(text_data_loader, ModalityType.TEXT) - cls.data_generator = TestDataGenerator([video, audio, text], cls.test_file_path) - cls.data_generator.create_multimodal_data(cls.num_instances) - - cls.bert = text.apply_representation(Bert()) - cls.mel_spe = audio.apply_representation(MelSpectrogram()) - cls.resnet = video.apply_representation(ResNet()) - cls.mods = [cls.bert, cls.mel_spe, cls.resnet] split = train_test_split( - cls.indizes, cls.data_generator.labels, test_size=0.2, random_state=42 + cls.data_generator.indices, + cls.data_generator.labels, + test_size=0.2, + random_state=42, ) cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ int(i) for i in split[1] ] for m in cls.mods: - m.data = scale_data(m.data, [int(i) for i in cls.train_indizes]) + m.data = scale_data(m.data, cls.train_indizes) cls.representations = [ Concatenation(), Average(), - RowMax(), + RowMax(100), Multiplication(), Sum(), LSTM(width=256, depth=3), diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py new file mode 100644 index 00000000000..c48f5f56b2a --- /dev/null +++ b/src/main/python/tests/scuro/test_multimodal_join.py @@ -0,0 +1,135 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +# TODO: Test edge cases: unequal number of audio-video timestamps (should still work and add the average over all audio/video samples) + +import shutil +import unittest + +from systemds.scuro.modality.joined import JoinCondition +from systemds.scuro.representations.window import WindowAggregation +from systemds.scuro.modality.unimodal_modality import UnimodalModality +from systemds.scuro.representations.mel_spectrogram import MelSpectrogram +from systemds.scuro.representations.resnet import ResNet +from tests.scuro.data_generator import setup_data + +from systemds.scuro.dataloader.audio_loader import AudioLoader +from systemds.scuro.dataloader.video_loader import VideoLoader +from systemds.scuro.modality.type import ModalityType + + +class TestMultimodalJoin(unittest.TestCase): + test_file_path = None + mods = None + text = None + audio = None + video = None + data_generator = None + num_instances = 0 + indizes = [] + + @classmethod + def setUpClass(cls): + cls.test_file_path = "join_test_data" + cls.num_instances = 4 + cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO] + + cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) + + @classmethod + def tearDownClass(cls): + print("Cleaning up test data") + shutil.rmtree(cls.test_file_path) + + def test_video_audio_join(self): + self._execute_va_join() + + def test_chunked_video_audio_join(self): + self._execute_va_join(2) + + def test_video_chunked_audio_join(self): + self._execute_va_join(None, 2) + + def test_chunked_video_chunked_audio_join(self): + self._execute_va_join(2, 2) + + def test_audio_video_join(self): + # Audio has a much higher frequency than video, hence we would need to + # duplicate or interpolate frames to match them to the audio frequency + self._execute_av_join() + + # TODO + # def test_chunked_audio_video_join(self): + # self._execute_av_join(2) + + # TODO + # def test_chunked_audio_chunked_video_join(self): + # self._execute_av_join(2, 2) + + def _execute_va_join(self, l_chunk_size=None, r_chunk_size=None): + video, audio = self._prepare_data(l_chunk_size, r_chunk_size) + self._join(video, audio, 2) + + def _execute_av_join(self, l_chunk_size=None, r_chunk_size=None): + video, audio = self._prepare_data(l_chunk_size, r_chunk_size) + self._join(audio, video, 2) + + def _prepare_data(self, l_chunk_size=None, r_chunk_size=None): + video_data_loader = VideoLoader( + self.data_generator.get_modality_path(ModalityType.VIDEO), + self.data_generator.indices, + chunk_size=l_chunk_size, + ) + video = UnimodalModality(video_data_loader, ModalityType.VIDEO) + + audio_data_loader = AudioLoader( + self.data_generator.get_modality_path(ModalityType.AUDIO), + self.data_generator.indices, + r_chunk_size, + ) + audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO) + + mel_audio = audio.apply_representation(MelSpectrogram()) + + return video, mel_audio + + def _join(self, left_modality, right_modality, window_size): + resnet_modality = ( + left_modality.join( + right_modality, JoinCondition("timestamp", "timestamp", "<") + ) + .apply_representation( + ResNet(layer="layer1.0.conv2", model_name="ResNet50"), + WindowAggregation(window_size=window_size, aggregation_function="mean"), + ) + .combine("concat") + ) + + assert resnet_modality.left_modality is not None + assert resnet_modality.right_modality is not None + assert len(resnet_modality.left_modality.data) == self.num_instances + assert len(resnet_modality.right_modality.data) == self.num_instances + assert resnet_modality.data is not None + 
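+        # combine("concat") fuses the windowed left and right representations
+        # by concatenation, so the joined modality is expected to hold one
+        # combined representation per instance alongside the retained
+        # left/right modalities checked above.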
+ return resnet_modality + + +if __name__ == "__main__": + unittest.main() diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py new file mode 100644 index 00000000000..d566830697f --- /dev/null +++ b/src/main/python/tests/scuro/test_unimodal_representations.py @@ -0,0 +1,120 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +import os +import shutil +import unittest + +from systemds.scuro.representations.bow import BoW +from systemds.scuro.representations.word2vec import W2V +from systemds.scuro.representations.tfidf import TfIdf +from systemds.scuro.modality.unimodal_modality import UnimodalModality +from systemds.scuro.representations.bert import Bert +from systemds.scuro.representations.mel_spectrogram import MelSpectrogram +from systemds.scuro.representations.resnet import ResNet +from tests.scuro.data_generator import setup_data + +from systemds.scuro.dataloader.audio_loader import AudioLoader +from systemds.scuro.dataloader.video_loader import VideoLoader +from systemds.scuro.dataloader.text_loader import TextLoader +from systemds.scuro.modality.type import ModalityType + + +class TestUnimodalRepresentations(unittest.TestCase): + test_file_path = None + mods = None + text = None + audio = None + video = None + data_generator = None + num_instances = 0 + indizes = [] + + @classmethod + def setUpClass(cls): + cls.test_file_path = "unimodal_test_data" + + cls.num_instances = 4 + cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + + cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) + os.makedirs(f"{cls.test_file_path}/embeddings") + + @classmethod + def tearDownClass(cls): + print("Cleaning up test data") + shutil.rmtree(cls.test_file_path) + + def test_audio_representations(self): + audio_representations = [MelSpectrogram()] # TODO: add FFT, TFN, 1DCNN + audio_data_loader = AudioLoader( + self.data_generator.get_modality_path(ModalityType.AUDIO), + self.data_generator.indices, + ) + audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO) + + for representation in audio_representations: + r = audio.apply_representation(representation) + assert r.data is not None + assert len(r.data) == self.num_instances + + def test_video_representations(self): + video_representations = [ResNet()] # Todo: add other video representations + video_data_loader = VideoLoader( + self.data_generator.get_modality_path(ModalityType.VIDEO), + self.data_generator.indices, + ) + video = UnimodalModality(video_data_loader, ModalityType.VIDEO) + for representation in video_representations: + r = 
video.apply_representation(representation)
+            assert r.data is not None
+            assert len(r.data) == self.num_instances
+
+    def test_text_representations(self):
+        # TODO: check params for BoW, W2V, TfIdf
+        test_representations = [BoW(2, 2), W2V(5, 2, 2), TfIdf(2), Bert()]
+        text_data_loader = TextLoader(
+            self.data_generator.get_modality_path(ModalityType.TEXT),
+            self.data_generator.indices,
+        )
+        text = UnimodalModality(text_data_loader, ModalityType.TEXT)
+
+        for representation in test_representations:
+            r = text.apply_representation(representation)
+            assert r.data is not None
+            assert len(r.data) == self.num_instances
+
+    def test_chunked_video_representations(self):
+        video_representations = [ResNet()]
+        video_data_loader = VideoLoader(
+            self.data_generator.get_modality_path(ModalityType.VIDEO),
+            self.data_generator.indices,
+            chunk_size=2,
+        )
+        video = UnimodalModality(video_data_loader, ModalityType.VIDEO)
+        for representation in video_representations:
+            r = video.apply_representation(representation)
+            assert r.data is not None
+            assert len(r.data) == self.num_instances
+
+
+if __name__ == "__main__":
+    unittest.main()
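
A minimal end-to-end sketch of the new join API, distilled from
test_multimodal_join.py above. The data directory, instance ids, window
size, and ResNet layer are illustrative values taken from the tests, not
fixed defaults:

    from systemds.scuro.dataloader.audio_loader import AudioLoader
    from systemds.scuro.dataloader.video_loader import VideoLoader
    from systemds.scuro.modality.joined import JoinCondition
    from systemds.scuro.modality.type import ModalityType
    from systemds.scuro.modality.unimodal_modality import UnimodalModality
    from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
    from systemds.scuro.representations.resnet import ResNet
    from systemds.scuro.representations.window import WindowAggregation

    indices = [str(i) for i in range(4)]  # placeholder instance ids

    # Load the raw video modality and an audio modality with a
    # mel-spectrogram representation applied.
    video = UnimodalModality(
        VideoLoader("data/VIDEO/", indices), ModalityType.VIDEO
    )
    audio = UnimodalModality(
        AudioLoader("data/AUDIO/", indices), ModalityType.AUDIO
    ).apply_representation(MelSpectrogram())

    # Join the two modalities on their common temporal dimension, apply a
    # new representation on top of the joined modality, aggregate it into
    # windows, and fuse both sides by concatenation.
    joined = (
        video.join(audio, JoinCondition("timestamp", "timestamp", "<"))
        .apply_representation(
            ResNet(layer="layer1.0.conv2", model_name="ResNet50"),
            WindowAggregation(window_size=2, aggregation_function="mean"),
        )
        .combine("concat")
    )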