From 715bc3794606037ae0d7b96d656ac88d1bedf346 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Mon, 13 Jan 2025 10:18:10 +0100 Subject: [PATCH 01/16] add schema for base modalities and create timestamps for audio+video --- .../systemds/scuro/dataloader/audio_loader.py | 6 ++-- .../systemds/scuro/dataloader/base_loader.py | 15 ++++++-- .../systemds/scuro/dataloader/json_loader.py | 6 ++-- .../systemds/scuro/dataloader/text_loader.py | 5 +-- .../systemds/scuro/dataloader/video_loader.py | 20 +++++++++-- .../systemds/scuro/modality/modality.py | 3 +- .../python/systemds/scuro/modality/type.py | 36 ++++++++++++++++--- 7 files changed, 75 insertions(+), 16 deletions(-) diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py index f85b1b80faa..d20042c84bd 100644 --- a/src/main/python/systemds/scuro/dataloader/audio_loader.py +++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py @@ -18,7 +18,7 @@ # under the License. # # ------------------------------------------------------------- -from typing import List, Optional +from typing import List, Optional, Union import librosa from systemds.scuro.dataloader.base_loader import BaseLoader @@ -33,7 +33,9 @@ def __init__( ): super().__init__(source_path, indices, chunk_size) - def extract(self, file: str): + def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): self.file_sanity_check(file) audio, sr = librosa.load(file) + self.metadata[file] = {"sample_rate": sr, "length": audio.shape[0]} + self.metadata[file]["timestamp"] = self.create_timestamps(self.metadata[file]["sample_rate"], self.metadata[file]["length"]) self.data.append(audio) diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py index 2ef60677c67..1cb7e625105 100644 --- a/src/main/python/systemds/scuro/dataloader/base_loader.py +++ b/src/main/python/systemds/scuro/dataloader/base_loader.py @@ -21,7 +21,7 @@ import os from abc import ABC, abstractmethod from typing import List, Optional, Union - +import numpy as np class BaseLoader(ABC): def __init__( @@ -35,6 +35,9 @@ def __init__( (otherwise please provide your own Dataloader that knows about the file name convention) """ self.data = [] + self.metadata = ( + {} + ) # TODO: check what the index should be for storing the metadata (file_name, counter, ...) 
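+        # Note: each concrete loader fills self.metadata[file] with the fields
+        # of its modality schema (e.g., sample_rate, length, and per-sample
+        # timestamps for audio), while the raw samples are appended to self.data.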
         self.source_path = source_path
         self.indices = indices
         self.chunk_size = chunk_size
@@ -78,7 +81,15 @@ def _load(self, indices: List[str]):
     @abstractmethod
     def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         pass
-
+
+    def create_timestamps(self, frequency, sample_length, start_datetime=None):
+        start_time = start_datetime if start_datetime is not None else np.datetime64('1970-01-01T00:00:00.000000')
+        time_increment = 1 / frequency
+        time_increments_array = np.arange(sample_length) * np.timedelta64(int(time_increment * 1e6))
+        timestamps = start_time + time_increments_array
+        return timestamps
+
+
     def file_sanity_check(self, file):
         """
         Checks if the file can be found and is not empty
diff --git a/src/main/python/systemds/scuro/dataloader/json_loader.py b/src/main/python/systemds/scuro/dataloader/json_loader.py
index c4e3b956111..ac375451888 100644
--- a/src/main/python/systemds/scuro/dataloader/json_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/json_loader.py
@@ -21,7 +21,7 @@
 import json

 from systemds.scuro.dataloader.base_loader import BaseLoader
-from typing import Optional, List
+from typing import Optional, List, Union


 class JSONLoader(BaseLoader):
@@ -35,9 +35,9 @@ def __init__(
         super().__init__(source_path, indices, chunk_size)
         self.field = field

-    def extract(self, file: str, indices: List[str]):
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
         with open(file) as f:
             json_file = json.load(f)
-            for idx in indices:
+            for idx in index:
                 self.data.append(json_file[idx][self.field])
diff --git a/src/main/python/systemds/scuro/dataloader/text_loader.py b/src/main/python/systemds/scuro/dataloader/text_loader.py
index f614472bce6..bf34cf85c7f 100644
--- a/src/main/python/systemds/scuro/dataloader/text_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/text_loader.py
@@ -19,7 +19,7 @@
 #
 # -------------------------------------------------------------
 from systemds.scuro.dataloader.base_loader import BaseLoader
-from typing import Optional, Pattern, List
+from typing import Optional, Pattern, List, Union
 import re


@@ -34,11 +34,12 @@ def __init__(
         super().__init__(source_path, indices, chunk_size)
         self.prefix = prefix

-    def extract(self, file: str):
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
         with open(file) as text_file:
             for i, line in enumerate(text_file):
                 if self.prefix:
                     line = re.sub(self.prefix, "", line)
                 line = line.replace("\n", "")
+                self.metadata[file] = {"length": len(line.split())}
                 self.data.append(line)
diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
index 6da20b34756..505ae111ffb 100644
--- a/src/main/python/systemds/scuro/dataloader/video_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -18,7 +18,7 @@
 # under the License.
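For intuition, `create_timestamps` maps a sampling frequency onto microsecond offsets from a fixed epoch, one offset per sample. A minimal standalone sketch of the same computation (the frequency and sample count are chosen purely for illustration):

```python
import numpy as np

frequency, sample_length = 2, 4  # 2 Hz -> one sample every 500000 microseconds
start = np.datetime64("1970-01-01T00:00:00.000000")
step = np.timedelta64(int(1 / frequency * 1e6), "us")
timestamps = start + np.arange(sample_length) * step
print(timestamps.astype(np.int64))  # [0 500000 1000000 1500000]
```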
 #
 # -------------------------------------------------------------
-from typing import List, Optional
+from typing import List, Optional, Union

 import numpy as np

@@ -35,9 +35,25 @@ def __init__(
     ):
         super().__init__(source_path, indices, chunk_size)

-    def extract(self, file: str):
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
         cap = cv2.VideoCapture(file)
+
+        if not cap.isOpened():
+            raise ValueError(f"Could not read video at path: {file}")
+
+        self.metadata[file] = {
+            "fps": int(cap.get(cv2.CAP_PROP_FPS)),
+            "length": int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
+            "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
+            "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
+            "num_channels": 3,
+        }
+
+        self.metadata[file]["timestamp"] = self.create_timestamps(
+            self.metadata[file]["fps"], self.metadata[file]["length"]
+        )
+
         frames = []
         while cap.isOpened():
             ret, frame = cap.read()
diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py
index 9a3d1b148d2..6479c6247c4 100644
--- a/src/main/python/systemds/scuro/modality/modality.py
+++ b/src/main/python/systemds/scuro/modality/modality.py
@@ -31,11 +31,12 @@ def __init__(self, modality_type: ModalityType):
         :param modality_type: Type of the modality
         """
         self.type = modality_type
+        self.schema = modality_type.get_schema()
         self.data = None
         self.data_type = None
         self.cost = None
         self.shape = None
-        self.schema = {}
+        self.data_index = None

     def get_modality_names(self) -> List[str]:
         """
diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py
index c451eea6f1d..7da2744d0b3 100644
--- a/src/main/python/systemds/scuro/modality/type.py
+++ b/src/main/python/systemds/scuro/modality/type.py
@@ -18,7 +18,36 @@
 # under the License.
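The schema registry introduced below resolves schemas by name via `getattr`, so custom modalities can be registered at runtime without touching the class. A small usage sketch (the `DEPTH` schema here is made up for illustration):

```python
from systemds.scuro.modality.type import ModalitySchemas, ModalityType

# built-in schemas are looked up from the enum member's name
assert ModalityType.AUDIO.get_schema()["sample_rate"] == "integer"

# user-defined schemas can be attached and retrieved the same way
ModalitySchemas.add_schema("DEPTH", {"timestamp": "array", "length": "integer"})
assert ModalitySchemas.get("DEPTH")["length"] == "integer"
```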
# # ------------------------------------------------------------- -from enum import Enum, Flag, auto +from enum import Flag, auto + + +class ModalitySchemas: + TEXT_SCHEMA = {"type": "string", "length": "int"} + + AUDIO_SCHEMA = { + "timestamp": "array", + "type": "float32", + "sample_rate": "integer", + "length": "integer", + } + + VIDEO_SCHEMA = { + "timestamp": "array", + "type": "object", + "fps": "integer", + "length": "integer", + "width": "integer", + "height": "integer", + "num_channels": "integer", + } + + @classmethod + def get(cls, name): + return getattr(cls, f"{name}_SCHEMA", None) + + @classmethod + def add_schema(cls, name, schema): + setattr(cls, f"{name}_SCHEMA", schema) class ModalityType(Flag): @@ -26,6 +55,5 @@ class ModalityType(Flag): AUDIO = auto() VIDEO = auto() - # def __init__(self, value, name): - # self._value_ = value - # self.name = name + def get_schema(self): + return ModalitySchemas.get(self.name) From e0ebd6936995a0942e191891fd6a19149e945577 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Tue, 14 Jan 2025 13:16:14 +0100 Subject: [PATCH 02/16] add join for raw modalities --- .../systemds/scuro/dataloader/audio_loader.py | 4 +- .../systemds/scuro/dataloader/base_loader.py | 49 +++++--- .../python/systemds/scuro/modality/joined.py | 109 ++++++++++++++++++ .../systemds/scuro/modality/transformed.py | 4 +- .../scuro/modality/unimodal_modality.py | 27 ++++- .../python/systemds/scuro/utils/__init__.py | 20 ++++ .../systemds/scuro/utils/join_condition.py | 28 +++++ 7 files changed, 222 insertions(+), 19 deletions(-) create mode 100644 src/main/python/systemds/scuro/modality/joined.py create mode 100644 src/main/python/systemds/scuro/utils/__init__.py create mode 100644 src/main/python/systemds/scuro/utils/join_condition.py diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py index d20042c84bd..121b5513502 100644 --- a/src/main/python/systemds/scuro/dataloader/audio_loader.py +++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py @@ -37,5 +37,7 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): self.file_sanity_check(file) audio, sr = librosa.load(file) self.metadata[file] = {"sample_rate": sr, "length": audio.shape[0]} - self.metadata[file]["timestamp"] = self.create_timestamps(self.metadata[file]["sample_rate"], self.metadata[file]["length"]) + self.metadata[file]["timestamp"] = self.create_timestamps( + self.metadata[file]["sample_rate"], self.metadata[file]["length"] + ) self.data.append(audio) diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py index 1cb7e625105..142f99ffc0e 100644 --- a/src/main/python/systemds/scuro/dataloader/base_loader.py +++ b/src/main/python/systemds/scuro/dataloader/base_loader.py @@ -23,6 +23,7 @@ from typing import List, Optional, Union import numpy as np + class BaseLoader(ABC): def __init__( self, source_path: str, indices: List[str], chunk_size: Optional[int] = None @@ -40,30 +41,46 @@ def __init__( ) # TODO: check what the index should be for storing the metadata (file_name, counter, ...) 
         self.source_path = source_path
         self.indices = indices
-        self.chunk_size = chunk_size
-        self.next_chunk = 0
+        self._next_chunk = 0
+        self._num_chunks = 1
+        self._chunk_size = None

-        if self.chunk_size:
-            self.num_chunks = int(len(self.indices) / self.chunk_size)
+        if chunk_size:
+            self.update_chunk_size(chunk_size)

     def load(self):
         """
         Takes care of loading the raw data either chunk-wise (if chunk size is
         defined) or all at once
         """
-        if self.chunk_size:
+        if self._chunk_size:
             return self._load_next_chunk()

         return self._load(self.indices)

+    def update_chunk_size(self, new_chunk_size):
+        self._chunk_size = new_chunk_size
+        self._num_chunks = int(len(self.indices) / self._chunk_size)
+
+    def get_chunk_size(self):
+        return self._chunk_size
+
+    def get_next_chunk_number(self):
+        return self._next_chunk
+
+    def get_num_total_chunks(self):
+        return self._num_chunks
+
     def _load_next_chunk(self):
         """
         Loads the next chunk of data
         """
         self.data = []
         next_chunk_indices = self.indices[
-            self.next_chunk * self.chunk_size : (self.next_chunk + 1) * self.chunk_size
+            self._next_chunk
+            * self._chunk_size : (self._next_chunk + 1)
+            * self._chunk_size
         ]
-        self.next_chunk += 1
+        self._next_chunk += 1
         return self._load(next_chunk_indices)

     def _load(self, indices: List[str]):
@@ -81,15 +98,21 @@ def _load(self, indices: List[str]):
     @abstractmethod
     def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         pass
-
+
     def create_timestamps(self, frequency, sample_length, start_datetime=None):
-        start_time = start_datetime if start_datetime is not None else np.datetime64('1970-01-01T00:00:00.000000')
+        start_time = (
+            start_datetime
+            if start_datetime is not None
+            else np.datetime64("1970-01-01T00:00:00.000000")
+        )
         time_increment = 1 / frequency
-        time_increments_array = np.arange(sample_length) * np.timedelta64(int(time_increment * 1e6))
+        time_increments_array = np.arange(sample_length) * np.timedelta64(
+            int(time_increment * 1e6)
+        )
         timestamps = start_time + time_increments_array
-        return timestamps
-
-
+
+        return timestamps.astype(np.int64)
+
     def file_sanity_check(self, file):
         """
         Checks if the file can be found and is not empty
diff --git a/src/main/python/systemds/scuro/modality/joined.py b/src/main/python/systemds/scuro/modality/joined.py
new file mode 100644
index 00000000000..d5ab6b3406d
--- /dev/null
+++ b/src/main/python/systemds/scuro/modality/joined.py
@@ -0,0 +1,109 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
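The join added in this patch is driven by a `JoinCondition` (defined in `utils/join_condition.py` below), which names the metadata field on each side and the comparison to use. A hedged sketch of constructing one for a timestamp-based interval join:

```python
from systemds.scuro.utils.join_condition import JoinCondition

# match every right-hand sample whose timestamp falls before the next
# left-hand timestamp ("<"); an equality-style join keeps exact matches only
condition = JoinCondition(field_1="timestamp", field_2="timestamp", join_type="<")
```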
+# +# ------------------------------------------------------------- +import sys + +import numpy as np + +from systemds.scuro.modality.modality import Modality +from systemds.scuro.modality.transformed import TransformedModality +from systemds.scuro.utils.join_condition import JoinCondition + + +class JoinedModality(Modality): + + def __init__(self, modality_type, primary, other, join_condition: JoinCondition): + """ + TODO + :param modality_type: Type of the original modality(ies) + """ + super().__init__(modality_type) + self.primary_modality = primary + self.other_modality = other + self.condition = join_condition + self.chunked_execution = False + self._check_chunked_data_extraction() + + def execute(self): + self.primary_modality.extract_raw_data() + self.data = {"other": []} + self.other_modality.extract_raw_data() + + for i, element in enumerate(self.primary_modality.data): + idx_1 = list(self.primary_modality.data_loader.metadata.values())[i][ + self.condition.field_1 + ] + if ( + self.condition.alignment is None and self.condition.join_type == "<" + ): # TODO compute correct alignment timestamps/spatial params + next_idx = np.zeros(len(idx_1), dtype=int) + next_idx[:-1] = idx_1[1:] + next_idx[-1] = sys.maxsize + + idx_2 = list(self.other_modality.data_loader.metadata.values())[i][ + self.condition.field_2 + ] + + c = 0 + for j in range(0, len(idx_1)): + other = [] + if self.condition.join_type == "<": + while c < len(idx_2) and idx_2[c] < next_idx[j]: + other.append(self.other_modality.data[i][c]) + c = c + 1 + else: + while c < len(idx_2) and idx_2[c] <= idx_1[j]: + if idx_2[c] == idx_1[j]: + other.append(self.other_modality.data[i][c]) + c = c + 1 + + self.data["other"].append(other) + + def apply_representation(self, representation): + if self.chunked_execution: + new_modality = TransformedModality( + self.primary_modality.type, representation + ) + + while ( + self.primary_modality.data_loader.get_next_chunk_number() + < self.primary_modality.data_loader.get_num_total_chunks() + ): + self.execute() + + def _check_chunked_data_extraction(self): + if self.primary_modality.data_loader.get_chunk_size(): + if not self.other_modality.data_loader.get_chunk_size(): + self.other_modality.data_loader.update_chunk_size( + self.primary_modality.data_loader.get_chunk_size() + ) + elif ( + self.other_modality.data_loader.get_chunk_size() + > self.primary_modality.data_loader.get_chunk_size() + ): + self.primary_modality.data_loader.update_chunk_size( + self.other_modality.data_loader.get_chunk_size() + ) + self.chunked_execution = True + elif self.other_modality.data_loader.get_chunk_size(): + self.primary_modality.data_loader.update_chunk_size( + self.other_modality.data_loader.get_chunk_size() + ) + self.chunked_execution = True diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py index 61c327e469e..4e17b1e4975 100644 --- a/src/main/python/systemds/scuro/modality/transformed.py +++ b/src/main/python/systemds/scuro/modality/transformed.py @@ -22,12 +22,11 @@ from operator import or_ from systemds.scuro.modality.modality import Modality -from systemds.scuro.modality.type import ModalityType class TransformedModality(Modality): - def __init__(self, modality_type: ModalityType, transformation): + def __init__(self, modality_type, transformation): """ Parent class of the different Modalities (unimodal & multimodal) :param modality_type: Type of the original modality(ies) @@ -35,6 +34,7 @@ def __init__(self, modality_type: 
ModalityType, transformation): """ super().__init__(modality_type) self.transformation = transformation + self.data = [] def combine(self, other, fusion_method): """ diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py index 976d4194d47..84fbf537649 100644 --- a/src/main/python/systemds/scuro/modality/unimodal_modality.py +++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py @@ -18,7 +18,12 @@ # under the License. # # ------------------------------------------------------------- +from functools import reduce +from operator import or_ + + from systemds.scuro.dataloader.base_loader import BaseLoader +from systemds.scuro.modality.joined import JoinedModality from systemds.scuro.modality.modality import Modality from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.modality.type import ModalityType @@ -34,12 +39,12 @@ def __init__(self, data_loader: BaseLoader, modality_type: ModalityType): """ super().__init__(modality_type) self.data_loader = data_loader + self.join_modality = None def extract_raw_data(self): """ Uses the data loader to read the raw data from a specified location and stores the data in the data location. - TODO: schema """ self.data = self.data_loader.load() @@ -47,8 +52,11 @@ def apply_representation(self, representation): new_modality = TransformedModality(self.type, representation) new_modality.data = [] - if self.data_loader.chunk_size: - while self.data_loader.next_chunk < self.data_loader.num_chunks: + if self.data_loader.get_chunk_size(): + while ( + self.data_loader.get_next_chunk_number() + < self.data_loader.get_num_total_chunks() + ): self.extract_raw_data() new_modality.data.extend(representation.transform(self.data)) else: @@ -57,3 +65,16 @@ def apply_representation(self, representation): new_modality.data = representation.transform(self.data) return new_modality + + def join(self, other, join_condition): + joined_modality = JoinedModality( + reduce(or_, other.type, self.type), self, other, join_condition + ) + + if ( + not self.data_loader.get_chunk_size() + and not other.data_loader.get_chunk_size() + ): + joined_modality.execute() + + return joined_modality diff --git a/src/main/python/systemds/scuro/utils/__init__.py b/src/main/python/systemds/scuro/utils/__init__.py new file mode 100644 index 00000000000..0a47bfff92c --- /dev/null +++ b/src/main/python/systemds/scuro/utils/__init__.py @@ -0,0 +1,20 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
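Putting the pieces of this patch together, two raw modalities can now be joined on the timestamps their loaders produce. A usage sketch (the paths and instance ids are illustrative, not taken from the patch):

```python
from systemds.scuro.dataloader.audio_loader import AudioLoader
from systemds.scuro.dataloader.video_loader import VideoLoader
from systemds.scuro.modality.type import ModalityType
from systemds.scuro.modality.unimodal_modality import UnimodalModality
from systemds.scuro.utils.join_condition import JoinCondition

indices = [str(i) for i in range(10)]  # illustrative instance ids
video = UnimodalModality(VideoLoader("data/VIDEO/", indices), ModalityType.VIDEO)
audio = UnimodalModality(AudioLoader("data/AUDIO/", indices), ModalityType.AUDIO)

# when neither loader is chunked, the join is executed eagerly
joined = video.join(audio, JoinCondition("timestamp", "timestamp", "<"))
```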
+# +# ------------------------------------------------------------- \ No newline at end of file diff --git a/src/main/python/systemds/scuro/utils/join_condition.py b/src/main/python/systemds/scuro/utils/join_condition.py new file mode 100644 index 00000000000..62c8a4d0623 --- /dev/null +++ b/src/main/python/systemds/scuro/utils/join_condition.py @@ -0,0 +1,28 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + + +class JoinCondition: + def __init__(self, field_1, field_2, join_type, alignment=None): + self.field_1 = field_1 + self.field_2 = field_2 + self.join_type = join_type + self.alignment = alignment From d0058b05e5d8d255667d1136b6cc71936dccd17c Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Tue, 28 Jan 2025 22:14:20 +0100 Subject: [PATCH 03/16] audio video join (initial working version) --- .../systemds/scuro/dataloader/audio_loader.py | 4 +- .../systemds/scuro/dataloader/base_loader.py | 61 +++--- .../systemds/scuro/dataloader/video_loader.py | 5 +- .../python/systemds/scuro/modality/joined.py | 179 +++++++++++++----- .../scuro/modality/joined_transformed.py | 62 ++++++ .../systemds/scuro/modality/modality.py | 26 ++- .../systemds/scuro/modality/transformed.py | 11 +- .../python/systemds/scuro/modality/type.py | 48 ++++- .../scuro/modality/unimodal_modality.py | 49 ++--- .../scuro/representations/aggregate.py | 51 +++++ .../systemds/scuro/representations/lstm.py | 6 +- .../scuro/representations/mel_spectrogram.py | 42 ++-- .../systemds/scuro/representations/resnet.py | 70 ++++--- .../systemds/scuro/representations/window.py | 46 +++++ .../{join_condition.py => schema_helpers.py} | 27 ++- src/main/python/tests/scuro/data_generator.py | 8 +- .../python/tests/scuro/test_data_loaders.py | 2 +- src/main/python/tests/scuro/test_dr_search.py | 1 + 18 files changed, 524 insertions(+), 174 deletions(-) create mode 100644 src/main/python/systemds/scuro/modality/joined_transformed.py create mode 100644 src/main/python/systemds/scuro/representations/aggregate.py create mode 100644 src/main/python/systemds/scuro/representations/window.py rename src/main/python/systemds/scuro/utils/{join_condition.py => schema_helpers.py} (57%) diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py index 121b5513502..b86d8a28763 100644 --- a/src/main/python/systemds/scuro/dataloader/audio_loader.py +++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py @@ -22,7 +22,7 @@ import librosa from systemds.scuro.dataloader.base_loader import BaseLoader - +from systemds.scuro.utils.schema_helpers import create_timestamps class AudioLoader(BaseLoader): def 
__init__(
@@ -37,5 +37,7 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
         audio, sr = librosa.load(file)
         self.metadata[file] = {"sample_rate": sr, "length": audio.shape[0]}
-        self.metadata[file]["timestamp"] = self.create_timestamps(
+        self.metadata[file]["timestamp"] = create_timestamps(
             self.metadata[file]["sample_rate"], self.metadata[file]["length"]
         )
         self.data.append(audio)
diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py
index 142f99ffc0e..33d4e4920e1 100644
--- a/src/main/python/systemds/scuro/dataloader/base_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/base_loader.py
@@ -21,7 +21,6 @@
 import os
 from abc import ABC, abstractmethod
 from typing import List, Optional, Union
-import numpy as np


 class BaseLoader(ABC):
     def __init__(
@@ -46,7 +45,24 @@ def __init__(
         self._chunk_size = None

         if chunk_size:
-            self.update_chunk_size(chunk_size)
+            self.chunk_size = chunk_size
+
+    @property
+    def chunk_size(self):
+        return self._chunk_size
+
+    @chunk_size.setter
+    def chunk_size(self, value):
+        self._chunk_size = value
+        self._num_chunks = int(len(self.indices) / self._chunk_size)
+
+    @property
+    def num_chunks(self):
+        return self._num_chunks
+
+    @property
+    def next_chunk(self):
+        return self._next_chunk

     def load(self):
         """
@@ -57,18 +73,18 @@ def load(self):

         return self._load(self.indices)

-    def update_chunk_size(self, new_chunk_size):
-        self._chunk_size = new_chunk_size
-        self._num_chunks = int(len(self.indices) / self._chunk_size)
+    def update_chunk_sizes(self, other):
+        if not self._chunk_size and not other.chunk_size:
+            return

-    def get_chunk_size(self):
-        return self._chunk_size
-
-    def get_next_chunk_number(self):
-        return self._next_chunk
-
-    def get_num_total_chunks(self):
-        return self._num_chunks
+        if self._chunk_size and (
+            not other.chunk_size
+            or self._chunk_size < other.chunk_size
+        ):
+            other.chunk_size = self.chunk_size
+        else:
+            self.chunk_size = other.chunk_size

     def _load_next_chunk(self):
         """
@@ -93,27 +109,14 @@ def _load(self, indices: List[str]):
         else:
             self.extract(self.source_path, indices)

-        return self.data
+        return self.data, self.metadata

     @abstractmethod
     def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         pass

-    def create_timestamps(self, frequency, sample_length, start_datetime=None):
-        start_time = (
-            start_datetime
-            if start_datetime is not None
-            else np.datetime64("1970-01-01T00:00:00.000000")
-        )
-        time_increment = 1 / frequency
-        time_increments_array = np.arange(sample_length) * np.timedelta64(
-            int(time_increment * 1e6)
-        )
-        timestamps = start_time + time_increments_array
-
-        return timestamps.astype(np.int64)
-
-    def file_sanity_check(self, file):
+    @staticmethod
+    def file_sanity_check(file):
         """
         Checks if the file can be found and is not empty
         """
diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
index 505ae111ffb..807a43b21cc 100644
--- a/src/main/python/systemds/scuro/dataloader/video_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -23,6 +23,7 @@
 import numpy as np

 from systemds.scuro.dataloader.base_loader import BaseLoader
+from systemds.scuro.utils.schema_helpers import create_timestamps
 import cv2


@@ -43,14 +44,14 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
             raise ValueError(f"Could not read video at path: {file}")

         self.metadata[file] = {
-            "fps":
int(cap.get(cv2.CAP_PROP_FPS)), + "fps": cap.get(cv2.CAP_PROP_FPS), "length": int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)), "num_channels": 3, } - self.metadata[file]["timestamp"] = self.create_timestamps( + self.metadata[file]["timestamp"] = create_timestamps( self.metadata[file]["fps"], self.metadata[file]["length"] ) diff --git a/src/main/python/systemds/scuro/modality/joined.py b/src/main/python/systemds/scuro/modality/joined.py index d5ab6b3406d..aeccdd3d321 100644 --- a/src/main/python/systemds/scuro/modality/joined.py +++ b/src/main/python/systemds/scuro/modality/joined.py @@ -22,88 +22,165 @@ import numpy as np +from systemds.scuro.modality.joined_transformed import JoinedTransformedModality from systemds.scuro.modality.modality import Modality from systemds.scuro.modality.transformed import TransformedModality -from systemds.scuro.utils.join_condition import JoinCondition +from systemds.scuro.representations.aggregate import Aggregation + + +class JoinCondition: + def __init__(self, leftField, rightField, joinType, alignment=None): + self.leftField = leftField + self.rightField = rightField + self.join_type = joinType + self.alignment = alignment class JoinedModality(Modality): - def __init__(self, modality_type, primary, other, join_condition: JoinCondition): + def __init__( + self, + modality_type, + left_modality, + right_modality, + join_condition: JoinCondition, + chunked_execution=False, + ): """ TODO :param modality_type: Type of the original modality(ies) """ super().__init__(modality_type) - self.primary_modality = primary - self.other_modality = other + self.aggregation = None + self.joined_right = None + self.left_modality = left_modality + self.right_modality = right_modality self.condition = join_condition - self.chunked_execution = False - self._check_chunked_data_extraction() + self.chunked_execution = chunked_execution # TODO: maybe move this into parent class + self.left_type = type(left_modality) + self.right_type = type(right_modality) + if self.chunked_execution: + self.chunk_left = left_modality.data_loader.chunk_size is not None - def execute(self): - self.primary_modality.extract_raw_data() - self.data = {"other": []} - self.other_modality.extract_raw_data() + def execute(self, right_starting_idx=0): + self.joined_right = self.right_modality.copy_from_instance() - for i, element in enumerate(self.primary_modality.data): - idx_1 = list(self.primary_modality.data_loader.metadata.values())[i][ - self.condition.field_1 + for i, element in enumerate(self.left_modality.data): + idx_1 = list(self.left_modality.metadata.values())[i + right_starting_idx][ + self.condition.leftField ] if ( self.condition.alignment is None and self.condition.join_type == "<" ): # TODO compute correct alignment timestamps/spatial params - next_idx = np.zeros(len(idx_1), dtype=int) - next_idx[:-1] = idx_1[1:] - next_idx[-1] = sys.maxsize + nextIdx = np.zeros(len(idx_1), dtype=int) + nextIdx[:-1] = idx_1[1:] + nextIdx[-1] = sys.maxsize - idx_2 = list(self.other_modality.data_loader.metadata.values())[i][ - self.condition.field_2 + idx_2 = list(self.right_modality.metadata.values())[i + right_starting_idx][ + self.condition.rightField ] + self.joined_right.data.append([]) c = 0 + # Assumes ordered lists (temporal) + # TODO: need to extract the shape of the data from the metadata + # video: list of lists of numpy array + # audio: list of numpy array for j in range(0, len(idx_1)): - other = [] + 
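+                # Two-pointer sweep over the ordered timestamp arrays: for a
+                # "<" join, frame j collects every right-hand sample whose
+                # timestamp is smaller than the next frame's timestamp
+                # (nextIdx[j]); otherwise only exact timestamp matches are kept.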
                self.joined_right.data[i].append([])
+                other = np.array([])
                 if self.condition.join_type == "<":
-                    while c < len(idx_2) and idx_2[c] < next_idx[j]:
-                        other.append(self.other_modality.data[i][c])
+                    while c < len(idx_2) and idx_2[c] < nextIdx[j]:
+                        if other.size == 0:
+                            other = self.right_modality.data[i + right_starting_idx][c][np.newaxis, :]
+                        else:
+                            other = np.concatenate([other, self.right_modality.data[i + right_starting_idx][c][np.newaxis, :]], axis=0)
                         c = c + 1
                 else:
                     while c < len(idx_2) and idx_2[c] <= idx_1[j]:
                         if idx_2[c] == idx_1[j]:
-                            other.append(self.other_modality.data[i][c])
+                            if other.size == 0:
+                                other = self.right_modality.data[i + right_starting_idx][c][np.newaxis, :]
+                            else:
+                                other = np.concatenate([other, self.right_modality.data[i + right_starting_idx][c][np.newaxis, :]], axis=0)
                         c = c + 1
+
+                if len(other) == 0:  # Audio and video lengths sometimes do not match, so we fall back to the average of all audio samples for this specific frame
+                    other = np.mean(self.right_modality.data[i + right_starting_idx], axis=0)[np.newaxis, :]  # TODO: check correct loading for all data layouts; this is similar to missing data, add a dedicated operation for this
+                self.joined_right.data[i][j] = other

-                self.data["other"].append(other)
-
-    def apply_representation(self, representation):
+    def apply_representation(self, representation, aggregation):
+        self.aggregation = aggregation
         if self.chunked_execution:
-            new_modality = TransformedModality(
-                self.primary_modality.type, representation
+            return self._handle_chunked_execution(representation)
+        elif self.left_type.__name__.__contains__("Unimodal"):
+            self.left_modality.extract_raw_data()
+            if self.left_type == self.right_type:
+                self.right_modality.extract_raw_data()
+        elif self.right_type.__name__.__contains__("Unimodal"):
+            self.right_modality.extract_raw_data()
+
+        self.execute()
+
+    def aggregate(self, aggregation_function, field_name):  # TODO: use the field name to extract data entries from modalities
+        self.aggregation = Aggregation(aggregation_function, field_name)
+
+        if not self.chunked_execution and self.joined_right:
+            return self.aggregation.aggregate(self.joined_right)
+
+        return self
+
+    def _handle_chunked_execution(self, representation):
+        if self.left_type == self.right_type:
+            return self._apply_representation_chunked(
+                self.left_modality, self.right_modality, True, representation
+            )
+        elif self.chunk_left:
+            return self._apply_representation_chunked(
+                self.left_modality, self.right_modality, False, representation
+            )
+        else:
+            return self._apply_representation_chunked(
+                self.right_modality, self.left_modality, False, representation
+            )
+
+    def _apply_representation_chunked(
+        self, chunk_modality, other_modality, chunk_other, representation
+    ):
+        new_left = TransformedModality(
+            self.left_modality.modality_type,
+            representation,
+            self.left_modality.metadata,
+        )
+        new_right = TransformedModality(
+            self.right_modality.modality_type,
+            representation,
+            self.right_modality.metadata,
+        )
+        while (
+            chunk_modality.data_loader.next_chunk
+            < chunk_modality.data_loader.num_chunks
+        ):
+            if chunk_other:
+                other_modality.extract_raw_data()
+                starting_idx = 0
+            else:
+                starting_idx = chunk_modality.data_loader.next_chunk * chunk_modality.data_loader.chunk_size
+                chunk_modality.extract_raw_data()
+
+            self.execute(starting_idx)
+            left_transformed = representation.transform(self.left_modality)
+            left_aggregated = self.aggregation.window(left_transformed)
+            new_left.data.extend(
+                left_aggregated.data
             )
-            while (
-                self.primary_modality.data_loader.get_next_chunk_number()
-                <
self.primary_modality.data_loader.get_num_total_chunks() - ): - self.execute() - - def _check_chunked_data_extraction(self): - if self.primary_modality.data_loader.get_chunk_size(): - if not self.other_modality.data_loader.get_chunk_size(): - self.other_modality.data_loader.update_chunk_size( - self.primary_modality.data_loader.get_chunk_size() - ) - elif ( - self.other_modality.data_loader.get_chunk_size() - > self.primary_modality.data_loader.get_chunk_size() - ): - self.primary_modality.data_loader.update_chunk_size( - self.other_modality.data_loader.get_chunk_size() - ) - self.chunked_execution = True - elif self.other_modality.data_loader.get_chunk_size(): - self.primary_modality.data_loader.update_chunk_size( - self.other_modality.data_loader.get_chunk_size() + right_transformed = representation.transform(self.joined_right) + right_aggregated = self.aggregation.window(right_transformed) + new_right.data.extend( + right_aggregated.data ) - self.chunked_execution = True + + return JoinedTransformedModality(new_left, new_right, f'joined_{representation.name}') + diff --git a/src/main/python/systemds/scuro/modality/joined_transformed.py b/src/main/python/systemds/scuro/modality/joined_transformed.py new file mode 100644 index 00000000000..558b0e3760e --- /dev/null +++ b/src/main/python/systemds/scuro/modality/joined_transformed.py @@ -0,0 +1,62 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
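To make the interval join implemented above concrete, a toy sketch of the grouping that `execute` computes for a "<" condition (the timestamps are invented for illustration):

```python
import numpy as np

# toy timestamps: 3 video frames (~3 fps) and 9 audio samples (~9 Hz)
frames = np.array([0, 333333, 666666])
samples = np.arange(9) * 111111

# a "<" join groups each frame with the samples before the next frame
bounds = np.append(frames[1:], np.iinfo(np.int64).max)
groups = [samples[(samples >= lo) & (samples < hi)] for lo, hi in zip(frames, bounds)]
assert [len(g) for g in groups] == [3, 3, 3]
```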
+# +# ------------------------------------------------------------- +from functools import reduce +from operator import or_ + +import numpy as np + +from systemds.scuro.modality.modality import Modality +from systemds.scuro.representations.utils import pad_sequences + +class JoinedTransformedModality(Modality): + + def __init__(self, left_modality, right_modality, transformation): + """ + Parent class of the different Modalities (unimodal & multimodal) + :param transformation: Representation to be applied on the modality + """ + super().__init__(reduce(or_, [left_modality.modality_type], right_modality.modality_type)) + self.transformation = transformation + self.left_modality = left_modality + self.right_modality = right_modality + + def combine(self, fusion_method): + """ + Combines two or more modalities with each other using a dedicated fusion method + :param other: The modality to be combined + :param fusion_method: The fusion method to be used to combine modalities + """ + modalities = [self.left_modality, self.right_modality] + self.data = [] + for i in range(0, len(self.left_modality.data)): + self.data.append([]) + for j in range(0, len(self.left_modality.data[i])): + self.data[i].append([]) + fused = np.concatenate([self.left_modality.data[i][j], self.right_modality.data[i][j]], axis=0) + self.data[i][j] = fused + # self.data = fusion_method.transform(modalities) + + for i, instance in enumerate(self.data): # TODO: only if the layout is list_of_lists_of_numpy_array + r = [] + [r.extend(l) for l in instance] + self.data[i] = np.array(r) + self.data = pad_sequences(self.data) + return self diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py index 6479c6247c4..28d27b04144 100644 --- a/src/main/python/systemds/scuro/modality/modality.py +++ b/src/main/python/systemds/scuro/modality/modality.py @@ -20,26 +20,42 @@ # ------------------------------------------------------------- from typing import List +import numpy as np + from systemds.scuro.modality.type import ModalityType class Modality: - def __init__(self, modality_type: ModalityType): + def __init__(self, modalityType: ModalityType, metadata=None): """ Parent class of the different Modalities (unimodal & multimodal) :param modality_type: Type of the modality """ - self.type = modality_type - self.schema = modality_type.get_schema() + self.modality_type = modalityType + self.schema = modalityType.get_schema() self.data = None self.data_type = None self.cost = None self.shape = None - self.data_index = None + self.dataIndex = None + self.metadata = metadata def get_modality_names(self) -> List[str]: """ Extracts the individual unimodal modalities for a given transformed modality. 
""" - return [modality.name for modality in ModalityType if modality in self.type] + return [modality.name for modality in ModalityType if modality in self.modality_type] + + + def update_metadata(self): + md_copy = self.metadata + self.metadata = {} + for i, (md_k, md_v) in enumerate(md_copy.items()): + updated_md = self.modality_type.update_metadata(md_v, self.data[i]) + self.metadata[md_k] = updated_md + + + def window(self, windowSize, aggregationFunction, fieldName): + pass + \ No newline at end of file diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py index 4e17b1e4975..e13395045f2 100644 --- a/src/main/python/systemds/scuro/modality/transformed.py +++ b/src/main/python/systemds/scuro/modality/transformed.py @@ -26,16 +26,20 @@ class TransformedModality(Modality): - def __init__(self, modality_type, transformation): + def __init__(self, modality_type, transformation, metadata): """ Parent class of the different Modalities (unimodal & multimodal) :param modality_type: Type of the original modality(ies) :param transformation: Representation to be applied on the modality """ - super().__init__(modality_type) + super().__init__(modality_type, metadata) self.transformation = transformation self.data = [] + def copy_from_instance(self): + return type(self)(self.modality_type, self.transformation, self.metadata) + + def combine(self, other, fusion_method): """ Combines two or more modalities with each other using a dedicated fusion method @@ -43,7 +47,8 @@ def combine(self, other, fusion_method): :param fusion_method: The fusion method to be used to combine modalities """ fused_modality = TransformedModality( - reduce(or_, (o.type for o in other), self.type), fusion_method + reduce(or_, (o.modality_type for o in other), self.modality_type), + fusion_method, self.metadata ) modalities = [self] modalities.extend(other) diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py index 7da2744d0b3..0dbacccef5a 100644 --- a/src/main/python/systemds/scuro/modality/type.py +++ b/src/main/python/systemds/scuro/modality/type.py @@ -19,21 +19,27 @@ # # ------------------------------------------------------------- from enum import Flag, auto +from systemds.scuro.utils.schema_helpers import ( + calculate_new_frequency, + create_timestamps, +) +# TODO: needs a way to define if data comes from a dataset with multiple instances or is like a streaming scenario where we only have one instance +# right now it is a list of instances (if only one instance the list would contain only a single item) class ModalitySchemas: TEXT_SCHEMA = {"type": "string", "length": "int"} AUDIO_SCHEMA = { "timestamp": "array", - "type": "float32", + "data_layout": {"type": "?", "representation": "?"}, "sample_rate": "integer", "length": "integer", } VIDEO_SCHEMA = { "timestamp": "array", - "type": "object", + "data_layout": {"type": "?", "representation": "?"}, "fps": "integer", "length": "integer", "width": "integer", @@ -41,6 +47,8 @@ class ModalitySchemas: "num_channels": "integer", } + _metadata_handlers = {} + @classmethod def get(cls, name): return getattr(cls, f"{name}_SCHEMA", None) @@ -49,6 +57,39 @@ def get(cls, name): def add_schema(cls, name, schema): setattr(cls, f"{name}_SCHEMA", schema) + @classmethod + def register_metadata_handler(cls, name): + def decorator(metadata_handler): + cls._metadata_handlers[name] = metadata_handler + return metadata_handler + + return decorator + + @classmethod 
+ def update_metadata(cls, name, md, data): + mdHandler = cls._metadata_handlers.get(name) + if mdHandler: + return mdHandler(md, data) + + def extract_data(self, data, index): + if self.get("data_layout").get("representation") == "list_array": + return data[index] + else: + return data[index] + + +@ModalitySchemas.register_metadata_handler("AUDIO") +def handle_audio_metadata(md, data): + new_frequency = calculate_new_frequency(len(data), md["length"], md["sample_rate"]) + md.update( + { + "length": len(data), + "sample_rate": new_frequency, + "timestamp": create_timestamps(new_frequency, len(data)), + } + ) + return md + class ModalityType(Flag): TEXT = auto() @@ -57,3 +98,6 @@ class ModalityType(Flag): def get_schema(self): return ModalitySchemas.get(self.name) + + def update_metadata(self, md, data): + return ModalitySchemas.update_metadata(self.name, md, data) diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py index 84fbf537649..4fcf091afea 100644 --- a/src/main/python/systemds/scuro/modality/unimodal_modality.py +++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py @@ -37,25 +37,42 @@ def __init__(self, data_loader: BaseLoader, modality_type: ModalityType): :param data_loader: Defines how the raw data should be loaded :param modality_type: Type of the modality """ - super().__init__(modality_type) + super().__init__(modality_type, None) self.data_loader = data_loader - self.join_modality = None + + def copy_from_instance(self): + return type(self)(self.data_loader, self.modality_type) def extract_raw_data(self): """ Uses the data loader to read the raw data from a specified location and stores the data in the data location. """ - self.data = self.data_loader.load() + self.data, self.metadata = self.data_loader.load() - def apply_representation(self, representation): - new_modality = TransformedModality(self.type, representation) + def join(self, other, join_condition): + if isinstance(other, UnimodalModality): + self.data_loader.update_chunk_sizes(other.data_loader) + + joined_modality = JoinedModality( + reduce(or_, [other.modality_type], self.modality_type), + self, + other, + join_condition, + self.data_loader.chunk_size is not None + ) + + return joined_modality + + # TODO: add aggregation method like in join + def apply_representation(self, representation, aggregation): + new_modality = TransformedModality(self.modality_type, representation, self.data_loader.metadata) new_modality.data = [] - if self.data_loader.get_chunk_size(): + if self.data_loader.chunk_size: while ( - self.data_loader.get_next_chunk_number() - < self.data_loader.get_num_total_chunks() + self.data_loader.next_chunk + < self.data_loader.num_chunks ): self.extract_raw_data() new_modality.data.extend(representation.transform(self.data)) @@ -63,18 +80,6 @@ def apply_representation(self, representation): if not self.data: self.extract_raw_data() new_modality.data = representation.transform(self.data) - + + new_modality.update_metadata() return new_modality - - def join(self, other, join_condition): - joined_modality = JoinedModality( - reduce(or_, other.type, self.type), self, other, join_condition - ) - - if ( - not self.data_loader.get_chunk_size() - and not other.data_loader.get_chunk_size() - ): - joined_modality.execute() - - return joined_modality diff --git a/src/main/python/systemds/scuro/representations/aggregate.py b/src/main/python/systemds/scuro/representations/aggregate.py new file mode 100644 index 
00000000000..7c8d1c68d12 --- /dev/null +++ b/src/main/python/systemds/scuro/representations/aggregate.py @@ -0,0 +1,51 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- +import numpy as np + +from systemds.scuro.modality.modality import Modality + + +# TODO: make this a Representation and add a fusion method that fuses two modalities with each other + + +class Aggregation: + def __init__(self, aggregation_function, field_name): + self.aggregation_function = aggregation_function + self.field_name = field_name + + def aggregate(self, modality): + aggregated_modality = Modality(modality.modality_type, modality.metadata) + aggregated_modality.data = [] + for i, instance in enumerate(modality.data): + aggregated_modality.data.append([]) + for j, entry in enumerate(instance): + if self.aggregation_function == "sum": + aggregated_modality.data[i].append(np.sum(entry, axis=0)) + elif self.aggregation_function == "mean": + aggregated_modality.data[i].append(np.mean(entry, axis=0)) + elif self.aggregation_function == "min": + aggregated_modality.data[i].append(np.min(entry, axis=0)) + elif self.aggregation_function == "max": + aggregated_modality.data[i].append(np.max(entry, axis=0)) + else: + raise ValueError("Invalid aggregation function") + + return aggregated_modality diff --git a/src/main/python/systemds/scuro/representations/lstm.py b/src/main/python/systemds/scuro/representations/lstm.py index 649b81117b2..6f06e762a56 100644 --- a/src/main/python/systemds/scuro/representations/lstm.py +++ b/src/main/python/systemds/scuro/representations/lstm.py @@ -46,11 +46,11 @@ def transform(self, modalities: List[Modality]): result = np.zeros((size, 0)) for modality in modalities: - if modality.type in self.unimodal_embeddings.keys(): - out = self.unimodal_embeddings.get(modality.type) + if modality.modality_type in self.unimodal_embeddings.keys(): + out = self.unimodal_embeddings.get(modality.modality_type) else: out = self.run_lstm(modality.data) - self.unimodal_embeddings[modality.type] = out + self.unimodal_embeddings[modality.modality_type] = out result = np.concatenate([result, out], axis=-1) diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py index 57a7fab83e2..31b7f222cad 100644 --- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py +++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py @@ -24,7 +24,7 @@ import librosa import numpy as np from systemds.scuro.representations.utils import pad_sequences - +import matplotlib.pyplot as plt from systemds.scuro.representations.unimodal import 
UnimodalRepresentation @@ -38,24 +38,34 @@ def transform(self, data): result = [] max_length = 0 for sample in data: - S = librosa.feature.melspectrogram(y=sample) + S = librosa.feature.melspectrogram( + y=sample, sr=22050 + ) S_dB = librosa.power_to_db(S, ref=np.max) if S_dB.shape[-1] > max_length: max_length = S_dB.shape[-1] - result.append(S_dB) - - r = [] - for elem in result: - d = pad_sequences(elem, maxlen=max_length, dtype="float32") - r.append(d) + result.append(S_dB.T) - np_array_r = np.array(r) if not self.avg else np.mean(np.array(r), axis=1) + # r = [] + # for elem in result: + # d = pad_sequences(elem, maxlen=max_length, dtype="float32") + # r.append(d) - if self.output_file is not None: - data = [] - for i in range(0, np_array_r.shape[0]): - data.append(np_array_r[i]) - with open(self.output_file, "wb") as file: - pickle.dump(data, file) + # np_array_r = np.array(r) if not self.avg else np.mean(np.array(r), axis=1) + # + # if self.output_file is not None: + # data = [] + # for i in range(0, np_array_r.shape[0]): + # data.append(np_array_r[i]) + # with open(self.output_file, "wb") as file: + # pickle.dump(data, file) - return np_array_r + return result + + + def plot_spectrogram(self, spectrogram): + plt.figure(figsize=(10, 4)) + librosa.display.specshow(spectrogram, x_axis='time', y_axis='mel', sr=22050, cmap='viridis') + plt.colorbar(format='%+2.0f dB') + plt.title('Mel Spectrogram') + plt.savefig('spectrogram.jpg') diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py index 1c1bfa1d5ec..de80562b16f 100644 --- a/src/main/python/systemds/scuro/representations/resnet.py +++ b/src/main/python/systemds/scuro/representations/resnet.py @@ -22,6 +22,7 @@ import h5py +from systemds.scuro.modality.modality import Modality from systemds.scuro.representations.unimodal import UnimodalRepresentation from typing import Callable, Dict, Tuple, Any import torch.utils.data @@ -30,8 +31,10 @@ import torchvision.transforms as transforms import numpy as np -DEVICE = "cpu" - +if torch.backends.mps.is_available(): + DEVICE = torch.device("mps") +else: + DEVICE = torch.device("cpu") class ResNet(UnimodalRepresentation): def __init__(self, layer="avgpool", output_file=None): @@ -40,7 +43,7 @@ def __init__(self, layer="avgpool", output_file=None): self.output_file = output_file self.layer_name = layer - def transform(self, data): + def transform(self, modality): resnet = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to(DEVICE) resnet.eval() @@ -60,7 +63,7 @@ def transform(self, data): ] ) - dataset = ResNetDataset(data, t) + dataset = ResNetDataset(modality.data, t) embeddings = {} class Identity(torch.nn.Module): @@ -88,7 +91,7 @@ def hook( for instance in torch.utils.data.DataLoader(dataset): video_id = instance["id"][0] - frames = instance["frames"][0].to(DEVICE) + frames = instance["data"][0].to(DEVICE) embeddings[video_id] = [] batch_size = 64 @@ -99,30 +102,36 @@ def hook( _ = resnet(frame_batch) values = res5c_output - - if self.layer_name == "avgpool" or self.layer_name == "maxpool": - embeddings[video_id].extend( - torch.flatten(values, 1).detach().cpu().numpy() - ) - - else: - pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1)) - - embeddings[video_id].extend( - torch.flatten(pooled, 1).detach().cpu().numpy() - ) - + # if self.layer_name == "avgpool" or self.layer_name == "maxpool": + # embeddings[video_id].extend( + # torch.flatten(values, 1).detach().cpu().numpy() + # ) + # + # else: + 
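+                    # Note: with the layer check above commented out, every
+                    # hooked output is adaptively average-pooled to 1x1 before
+                    # flattening; this is a no-op for the avgpool layer's
+                    # (N, 2048, 1, 1) output and pools conv feature maps.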
pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1)) + + embeddings[video_id].extend( + torch.flatten(pooled, 1).detach().cpu().numpy() + ) + + # TODO: this functionality could be used for operator reuse if the data stays the same if self.output_file is not None: with h5py.File(self.output_file, "w") as hdf: for key, value in embeddings.items(): hdf.create_dataset(key, data=value) - emb = [] + # emb = [] + + # TODO: this should be moved out to a windowing function + # for video in embeddings.values(): + # emb.append(np.array(video).mean(axis=0).tolist()) - for video in embeddings.values(): - emb.append(np.array(video).mean(axis=0).tolist()) + transformed_modality = Modality(modality.modality_type, modality.metadata) + transformed_modality.data = list(embeddings.values()) + transformed_modality.schema["data_layout"]["representation"] = "list_of_lists_of_numpy_array" # TODO: create infer data_layout method in modality + transformed_modality.schema["data_layout"]["type"] = transformed_modality.data[0][0].dtype # TODO: create infer data_layout method in modality - return np.array(emb) + return transformed_modality class ResNetDataset(torch.utils.data.Dataset): @@ -131,12 +140,17 @@ def __init__(self, data: str, tf: Callable = None): self.tf = tf def __getitem__(self, index) -> Dict[str, object]: - video = self.data[index] - frames = torch.empty((len(video), 3, 224, 224)) - - for i, frame in enumerate(video): - frames[i] = self.tf(frame) - return {"id": index, "frames": frames} + data = self.data[index] + output = torch.empty((len(data), 3, 224, 224)) + + for i, d in enumerate(data): + if data[0].ndim < 3: + d = torch.tensor(d) + d = d.repeat(3, 1, 1) + + output[i] = self.tf(d) + + return {"id": index, "data": output} def __len__(self) -> int: return len(self.data) diff --git a/src/main/python/systemds/scuro/representations/window.py b/src/main/python/systemds/scuro/representations/window.py new file mode 100644 index 00000000000..b589c8664b5 --- /dev/null +++ b/src/main/python/systemds/scuro/representations/window.py @@ -0,0 +1,46 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
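The windowing added below reduces a variable-length sequence of feature vectors to `ceil(n / window_size)` aggregated vectors. A standalone sketch of the mean case (the array shape and window size are illustrative):

```python
import math
import numpy as np

data = np.arange(12, dtype=float).reshape(6, 2)  # 6 feature vectors of size 2
window_size = 4
num_windows = math.ceil(len(data) / window_size)
windowed = [
    data[i * window_size : (i + 1) * window_size].mean(axis=0)
    for i in range(num_windows)
]
assert len(windowed) == 2  # the last window covers the remaining 2 rows
```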
+# +# ------------------------------------------------------------- +import numpy as np +import math +from systemds.scuro import TransformedModality +from systemds.scuro.representations.aggregate import Aggregation + + +class WindowAggregation: + def __init__(self, window_size, aggregation_function): + self.window_size = window_size + self.aggregation_function = aggregation_function + + def window(self, modality): + # data is a 2d array + transformed_modality = TransformedModality(modality.modality_type, "window", modality.metadata) + for instance in modality.data: + window_length = math.ceil(len(instance) / self.window_size) + result = [[] for _ in range(0, window_length)] + # if modality.schema["data_layout"]["representation"] == "list_of_lists_of_numpy_array": + data = np.stack(instance) + for i in range(0, window_length): + result[i] = np.mean(data[i * self.window_size: i * self.window_size + self.window_size], axis=0) # TODO: add actual aggregation function here + + transformed_modality.data.append(result) + + return transformed_modality + \ No newline at end of file diff --git a/src/main/python/systemds/scuro/utils/join_condition.py b/src/main/python/systemds/scuro/utils/schema_helpers.py similarity index 57% rename from src/main/python/systemds/scuro/utils/join_condition.py rename to src/main/python/systemds/scuro/utils/schema_helpers.py index 62c8a4d0623..dfad21012cd 100644 --- a/src/main/python/systemds/scuro/utils/join_condition.py +++ b/src/main/python/systemds/scuro/utils/schema_helpers.py @@ -18,11 +18,26 @@ # under the License. # # ------------------------------------------------------------- +import math +import numpy as np -class JoinCondition: - def __init__(self, field_1, field_2, join_type, alignment=None): - self.field_1 = field_1 - self.field_2 = field_2 - self.join_type = join_type - self.alignment = alignment +def create_timestamps(frequency, sample_length, start_datetime=None): + start_time = ( + start_datetime + if start_datetime is not None + else np.datetime64("1970-01-01T00:00:00.000000") + ) + time_increment = 1 / frequency + time_increments_array = np.arange(sample_length) * np.timedelta64( + int(time_increment * 1e6) + ) + timestamps = start_time + time_increments_array + + return timestamps.astype(np.int64) + + +def calculate_new_frequency(new_length, old_length, old_frequency): + duration = old_length / old_frequency + new_frequency = new_length / duration + return math.floor(new_frequency) diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py index 6856ee70442..03bdb243d15 100644 --- a/src/main/python/tests/scuro/data_generator.py +++ b/src/main/python/tests/scuro/data_generator.py @@ -33,7 +33,7 @@ def __init__(self, modalities, path, balanced=True): self.balanced = balanced for modality in modalities: - mod_path = f"{self.path}/{modality.type.name}/" + mod_path = f"{self.path}/{modality.modality_type.name}/" os.mkdir(mod_path) modality.file_path = mod_path self.labels = [] @@ -69,11 +69,11 @@ def create_multimodal_data(self, num_instances, duration=2, seed=42): speed_slow += 1 for modality in self.modalities: - if modality.type == ModalityType.VIDEO: + if modality.modality_type == ModalityType.VIDEO: self.__create_video_data(idx, duration, 30, speed_factor) - if modality.type == ModalityType.AUDIO: + if modality.modality_type == ModalityType.AUDIO: self.__create_audio_data(idx, duration, speed_factor) - if modality.type == ModalityType.TEXT: + if modality.modality_type == ModalityType.TEXT: 
self.__create_text_data(idx, speed_factor) np.save(f"{self.path}/labels.npy", np.array(self.labels)) diff --git a/src/main/python/tests/scuro/test_data_loaders.py b/src/main/python/tests/scuro/test_data_loaders.py index 55704b8d8af..acd48113762 100644 --- a/src/main/python/tests/scuro/test_data_loaders.py +++ b/src/main/python/tests/scuro/test_data_loaders.py @@ -88,7 +88,7 @@ def test_load_audio_data_from_file(self): ).apply_representation(MelSpectrogram()) for i in range(0, self.num_instances): - assert round(sum(self.audio_ref.data[i]), 4) == round(sum(audio.data[i]), 4) + assert round(sum(sum(self.audio_ref.data[i])), 4) == round(sum(sum(audio.data[i])), 4) def test_load_video_data_from_file(self): video_data_loader = VideoLoader(self.video_path, self.indizes) diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py index d0d7ef50770..88e063eef63 100644 --- a/src/main/python/tests/scuro/test_dr_search.py +++ b/src/main/python/tests/scuro/test_dr_search.py @@ -119,6 +119,7 @@ def setUpClass(cls): cls.data_generator = TestDataGenerator([video, audio, text], cls.test_file_path) cls.data_generator.create_multimodal_data(cls.num_instances) + #TODO: adapt the representation so they return non aggregated values. Apply windowing operation instead cls.bert = text.apply_representation(Bert()) cls.mel_spe = audio.apply_representation(MelSpectrogram()) cls.resnet = video.apply_representation(ResNet()) From 67966d063b9b67c3fa9439766b3049013e529509 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 29 Jan 2025 16:38:38 +0100 Subject: [PATCH 04/16] adapt existing representations to handle context and add tests --- .../python/systemds/scuro/modality/joined.py | 37 ++++-- .../scuro/modality/unimodal_modality.py | 14 +- .../systemds/scuro/representations/bert.py | 31 ++--- .../systemds/scuro/representations/bow.py | 11 +- .../scuro/representations/mel_spectrogram.py | 34 ++--- .../systemds/scuro/representations/tfidf.py | 12 +- .../scuro/representations/word2vec.py | 13 +- src/main/python/tests/scuro/data_generator.py | 50 +++++++- .../python/tests/scuro/test_data_loaders.py | 46 ++----- src/main/python/tests/scuro/test_dr_search.py | 46 ++----- .../tests/scuro/test_multimodal_join.py | 103 +++++++++++++++ .../scuro/test_unimodal_representations.py | 120 ++++++++++++++++++ 12 files changed, 375 insertions(+), 142 deletions(-) create mode 100644 src/main/python/tests/scuro/test_multimodal_join.py create mode 100644 src/main/python/tests/scuro/test_unimodal_representations.py diff --git a/src/main/python/systemds/scuro/modality/joined.py b/src/main/python/systemds/scuro/modality/joined.py index aeccdd3d321..5cb3e0f6de8 100644 --- a/src/main/python/systemds/scuro/modality/joined.py +++ b/src/main/python/systemds/scuro/modality/joined.py @@ -119,6 +119,11 @@ def apply_representation(self, representation, aggregation): self.right_modality.extract_raw_data() self.execute() + left_transformed, right_transformed = self._apply_representation(representation) + left_transformed.update_metadata() + right_transformed.update_metadata() + return JoinedTransformedModality(left_transformed, right_transformed, f'joined_{representation.name}') + def aggregate(self, aggregation_function, field_name): # TODO: use the filed name to extract data entries from modalities self.aggregation = Aggregation(aggregation_function, field_name) @@ -149,12 +154,12 @@ def _apply_representation_chunked( new_left= TransformedModality( self.left_modality.modality_type, 
representation, - self.left_modality.metadata, + {}, ) new_right = TransformedModality( self.right_modality.modality_type, representation, - self.right_modality.metadata, + {}, ) while ( chunk_modality.data_loader.next_chunk @@ -170,17 +175,29 @@ def _apply_representation_chunked( chunk_modality.extract_raw_data() self.execute(starting_idx) - left_transformed = representation.transform(self.left_modality) - left_aggregated = self.aggregation.window(left_transformed) + + left_transformed, right_transformed = self._apply_representation(representation) new_left.data.extend( - left_aggregated.data + left_transformed.data ) - - right_transformed = representation.transform(self.joined_right) - right_aggregated = self.aggregation.window(right_transformed) + new_left.metadata.update(left_transformed.metadata) new_right.data.extend( - right_aggregated.data + right_transformed.data ) - + new_right.metadata.update(right_transformed.metadata) + + new_left.update_metadata() + new_right.update_metadata() return JoinedTransformedModality(new_left, new_right, f'joined_{representation.name}') + + def _apply_representation(self, representation): + left_transformed = representation.transform(self.left_modality) + if self.aggregation: + left_transformed = self.aggregation.window(left_transformed) + + right_transformed = representation.transform(self.joined_right) + if self.aggregation: + right_transformed = self.aggregation.window(right_transformed) + + return left_transformed, right_transformed \ No newline at end of file diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py index 4fcf091afea..60d9ad004fe 100644 --- a/src/main/python/systemds/scuro/modality/unimodal_modality.py +++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py @@ -64,8 +64,8 @@ def join(self, other, join_condition): return joined_modality - # TODO: add aggregation method like in join - def apply_representation(self, representation, aggregation): + # TODO: maybe this can be made generic so it can be used in the join class as well + def apply_representation(self, representation, aggregation=None): new_modality = TransformedModality(self.modality_type, representation, self.data_loader.metadata) new_modality.data = [] @@ -75,11 +75,17 @@ def apply_representation(self, representation, aggregation): < self.data_loader.num_chunks ): self.extract_raw_data() - new_modality.data.extend(representation.transform(self.data)) + transformed_chunk = representation.transform(self) + if aggregation: + transformed_chunk = aggregation.window(transformed_chunk) + new_modality.data.extend(transformed_chunk.data) else: if not self.data: self.extract_raw_data() - new_modality.data = representation.transform(self.data) + new_modality = representation.transform(self) + if aggregation: + new_modality = aggregation.window(new_modality) + new_modality.update_metadata() return new_modality diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py index 0fcf1e8d280..08cb85e7395 100644 --- a/src/main/python/systemds/scuro/representations/bert.py +++ b/src/main/python/systemds/scuro/representations/bert.py @@ -21,6 +21,7 @@ import numpy as np +from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation import torch from transformers import BertTokenizer, BertModel @@ -28,30 +29,27 @@ class Bert(UnimodalRepresentation): - def 
__init__(self, avg_layers=None, output_file=None): + def __init__(self, output_file=None): super().__init__("Bert") - self.avg_layers = avg_layers self.output_file = output_file - def transform(self, data): - + def transform(self, modality): + transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata) model_name = "bert-base-uncased" tokenizer = BertTokenizer.from_pretrained( model_name, clean_up_tokenization_spaces=True ) - if self.avg_layers is not None: - model = BertModel.from_pretrained(model_name, output_hidden_states=True) - else: - model = BertModel.from_pretrained(model_name) + model = BertModel.from_pretrained(model_name) - embeddings = self.create_embeddings(data, model, tokenizer) + embeddings = self.create_embeddings(modality.data, model, tokenizer) if self.output_file is not None: save_embeddings(embeddings, self.output_file) - - return embeddings + + transformed_modality.data = embeddings + return transformed_modality def create_embeddings(self, data, model, tokenizer): embeddings = [] @@ -60,16 +58,9 @@ def create_embeddings(self, data, model, tokenizer): with torch.no_grad(): outputs = model(**inputs) - - if self.avg_layers is not None: - cls_embedding = [ - outputs.hidden_states[i][:, 0, :] - for i in range(-self.avg_layers, 0) - ] - cls_embedding = torch.mean(torch.stack(cls_embedding), dim=0).numpy() - else: + cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy() - embeddings.append(cls_embedding) + embeddings.append(cls_embedding) embeddings = np.array(embeddings) return embeddings.reshape((embeddings.shape[0], embeddings.shape[-1])) diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py index bd54654a5cb..52863aaae3e 100644 --- a/src/main/python/systemds/scuro/representations/bow.py +++ b/src/main/python/systemds/scuro/representations/bow.py @@ -21,6 +21,7 @@ from sklearn.feature_extraction.text import CountVectorizer +from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation from systemds.scuro.representations.utils import save_embeddings @@ -32,14 +33,16 @@ def __init__(self, ngram_range, min_df, output_file=None): self.min_df = min_df self.output_file = output_file - def transform(self, data): + def transform(self, modality): + transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata) vectorizer = CountVectorizer( ngram_range=(1, self.ngram_range), min_df=self.min_df ) - X = vectorizer.fit_transform(data).toarray() + X = vectorizer.fit_transform(modality.data).toarray() if self.output_file is not None: save_embeddings(X, self.output_file) - - return X + + transformed_modality.data = X + return transformed_modality diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py index 31b7f222cad..3ac026374d7 100644 --- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py +++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py @@ -18,26 +18,23 @@ # under the License. 
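# An illustrative sketch of what the MelSpectrogram change below produces:
# each waveform becomes a time-major (frames x n_mels) matrix instead of a
# padded or averaged vector, so it can be windowed later. Assumes librosa's
# default n_mels=128 and the 22050 Hz rate used elsewhere in this patch set.
import librosa
import numpy as np

y = np.random.default_rng(0).standard_normal(22050).astype(np.float32)  # ~1 s
S = librosa.feature.melspectrogram(y=y, sr=22050)  # (n_mels, frames)
S_dB = librosa.power_to_db(S, ref=np.max)  # log-scaled decibels
print(S_dB.T.shape)  # time-major, e.g. (44, 128)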
# # ------------------------------------------------------------- - -import pickle - import librosa import numpy as np -from systemds.scuro.representations.utils import pad_sequences + +from systemds.scuro.modality.transformed import TransformedModality import matplotlib.pyplot as plt from systemds.scuro.representations.unimodal import UnimodalRepresentation class MelSpectrogram(UnimodalRepresentation): - def __init__(self, avg=True, output_file=None): + def __init__(self): super().__init__("MelSpectrogram") - self.avg = avg - self.output_file = output_file - def transform(self, data): + def transform(self, modality): + transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata) result = [] max_length = 0 - for sample in data: + for sample in modality.data: S = librosa.feature.melspectrogram( y=sample, sr=22050 ) @@ -45,22 +42,9 @@ def transform(self, data): if S_dB.shape[-1] > max_length: max_length = S_dB.shape[-1] result.append(S_dB.T) - - # r = [] - # for elem in result: - # d = pad_sequences(elem, maxlen=max_length, dtype="float32") - # r.append(d) - - # np_array_r = np.array(r) if not self.avg else np.mean(np.array(r), axis=1) - # - # if self.output_file is not None: - # data = [] - # for i in range(0, np_array_r.shape[0]): - # data.append(np_array_r[i]) - # with open(self.output_file, "wb") as file: - # pickle.dump(data, file) - - return result + + transformed_modality.data = result + return transformed_modality def plot_spectrogram(self, spectrogram): diff --git a/src/main/python/systemds/scuro/representations/tfidf.py b/src/main/python/systemds/scuro/representations/tfidf.py index 4849aba1360..0d149f30a79 100644 --- a/src/main/python/systemds/scuro/representations/tfidf.py +++ b/src/main/python/systemds/scuro/representations/tfidf.py @@ -20,7 +20,7 @@ # ------------------------------------------------------------- from sklearn.feature_extraction.text import TfidfVectorizer - +from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation from systemds.scuro.representations.utils import read_data_from_file, save_embeddings @@ -31,13 +31,15 @@ def __init__(self, min_df, output_file=None): self.min_df = min_df self.output_file = output_file - def transform(self, data): + def transform(self, modality): + transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata) vectorizer = TfidfVectorizer(min_df=self.min_df) - X = vectorizer.fit_transform(data) + X = vectorizer.fit_transform(modality.data) X = X.toarray() if self.output_file is not None: save_embeddings(X, self.output_file) - - return X + + transformed_modality.data = X + return transformed_modality diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py index 209091648d5..a460e918bfe 100644 --- a/src/main/python/systemds/scuro/representations/word2vec.py +++ b/src/main/python/systemds/scuro/representations/word2vec.py @@ -19,7 +19,7 @@ # # ------------------------------------------------------------- import numpy as np - +from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation from systemds.scuro.representations.utils import save_embeddings from gensim.models import Word2Vec @@ -43,8 +43,9 @@ def __init__(self, vector_size, min_count, window, output_file=None): self.window = window self.output_file = output_file - def 
transform(self, data):
-        t = [word_tokenize(s.lower()) for s in data]
+    def transform(self, modality):
+        transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata)
+        t = [word_tokenize(s.lower()) for s in modality.data]
         model = Word2Vec(
             sentences=t,
             vector_size=self.vector_size,
@@ -52,11 +53,11 @@ def transform(self, data):
             min_count=self.min_count,
         )
         embeddings = []
-        for sentences in data:
+        for sentences in modality.data:
             tokens = word_tokenize(sentences.lower())
             embeddings.append(get_embedding(tokens, model))

         if self.output_file is not None:
             save_embeddings(np.array(embeddings), self.output_file)
-
-        return np.array(embeddings)
+        transformed_modality.data = np.array(embeddings)
+        return transformed_modality
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index 03bdb243d15..bce71ebefaf 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -18,17 +18,55 @@
 # under the License.
 #
 # -------------------------------------------------------------
+import shutil
+
 import cv2
 import numpy as np
 from scipy.io.wavfile import write
 import random
 import os
+
+from systemds.scuro import VideoLoader, AudioLoader, TextLoader, UnimodalModality
 from systemds.scuro.modality.type import ModalityType


+def setup_data(modalities, num_instances, path):
+    if os.path.isdir(path):
+        shutil.rmtree(path)
+
+    os.makedirs(path)
+
+    indizes = [str(i) for i in range(0, num_instances)]
+
+    modalities_to_create = []
+    for modality in modalities:
+        mod_path = path + "/" + modality.name + "/"
+
+        if modality == ModalityType.VIDEO:
+            data_loader = VideoLoader(mod_path, indizes)
+        elif modality == ModalityType.AUDIO:
+            data_loader = AudioLoader(mod_path, indizes)
+        elif modality == ModalityType.TEXT:
+            data_loader = TextLoader(mod_path, indizes)
+        else:
+            raise ValueError("Modality not supported in DataGenerator")
+
+        modalities_to_create.append(UnimodalModality(data_loader, modality))
+
+    data_generator = TestDataGenerator(modalities_to_create, path)
+    data_generator.create_multimodal_data(num_instances)
+    return data_generator
+
+
 class TestDataGenerator:
     def __init__(self, modalities, path, balanced=True):
+        self.modalities = modalities
+        self.modalities_by_type = {}
+        for modality in modalities:
+            self.modalities_by_type[modality.modality_type] = modality
+
+        self._indices = None
         self.path = path
         self.balanced = balanced

@@ -38,10 +76,20 @@ def __init__(self, modalities, path, balanced=True):
             modality.file_path = mod_path
         self.labels = []
         self.label_path = f"{path}/labels.npy"
-
+
+    def get_modality_path(self, modality_type):
+        return self.modalities_by_type[modality_type].data_loader.source_path
+
+    @property
+    def indices(self):
+        if self._indices is None:
+            raise ValueError("No indices available, please call setup_data first")
+        return self._indices
+
     def create_multimodal_data(self, num_instances, duration=2, seed=42):
         speed_fast = 0
         speed_slow = 0
+        self._indices = [str(i) for i in range(0, num_instances)]
         for idx in range(num_instances):
             np.random.seed(seed)
             if self.balanced:
diff --git a/src/main/python/tests/scuro/test_data_loaders.py b/src/main/python/tests/scuro/test_data_loaders.py
index acd48113762..a8058c1391b 100644
--- a/src/main/python/tests/scuro/test_data_loaders.py
+++ b/src/main/python/tests/scuro/test_data_loaders.py
@@ -26,7 +26,7 @@
 from systemds.scuro.representations.bert import Bert
 from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
 from 
systemds.scuro.representations.resnet import ResNet -from tests.scuro.data_generator import TestDataGenerator +from tests.scuro.data_generator import setup_data from systemds.scuro.dataloader.audio_loader import AudioLoader from systemds.scuro.dataloader.video_loader import VideoLoader @@ -42,39 +42,19 @@ class TestDataLoaders(unittest.TestCase): video = None data_generator = None num_instances = 0 - indizes = [] @classmethod def setUpClass(cls): cls.test_file_path = "test_data" - - if os.path.isdir(cls.test_file_path): - shutil.rmtree(cls.test_file_path) - - os.makedirs(f"{cls.test_file_path}/embeddings") - cls.num_instances = 2 - cls.indizes = [str(i) for i in range(0, cls.num_instances)] - - cls.video_path = cls.test_file_path + "/" + ModalityType.VIDEO.name + "/" - cls.audio_path = cls.test_file_path + "/" + ModalityType.AUDIO.name + "/" - cls.text_path = cls.test_file_path + "/" + ModalityType.TEXT.name + "/" - - video_data_loader = VideoLoader(cls.video_path, cls.indizes) - audio_data_loader = AudioLoader(cls.audio_path, cls.indizes) - text_data_loader = TextLoader(cls.text_path, cls.indizes) - - # Load modalities (audio, video, text) - video = UnimodalModality(video_data_loader, ModalityType.VIDEO) - audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO) - text = UnimodalModality(text_data_loader, ModalityType.TEXT) - - cls.mods = [video, audio, text] - cls.data_generator = TestDataGenerator(cls.mods, cls.test_file_path) - cls.data_generator.create_multimodal_data(cls.num_instances) - cls.text_ref = text.apply_representation(Bert()) - cls.audio_ref = audio.apply_representation(MelSpectrogram()) - cls.video_ref = video.apply_representation(ResNet()) + cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) + + os.makedirs(f"{cls.test_file_path}/embeddings") + + cls.text_ref = cls.data_generator.modalities_by_type[ModalityType.TEXT].apply_representation(Bert()) + cls.audio_ref = cls.data_generator.modalities_by_type[ModalityType.AUDIO].apply_representation(MelSpectrogram()) + cls.video_ref = cls.data_generator.modalities_by_type[ModalityType.VIDEO].apply_representation(ResNet()) @classmethod def tearDownClass(cls): @@ -82,7 +62,7 @@ def tearDownClass(cls): shutil.rmtree(cls.test_file_path) def test_load_audio_data_from_file(self): - audio_data_loader = AudioLoader(self.audio_path, self.indizes) + audio_data_loader = AudioLoader(self.data_generator.get_modality_path(ModalityType.AUDIO), self.data_generator.indices) audio = UnimodalModality( audio_data_loader, ModalityType.AUDIO ).apply_representation(MelSpectrogram()) @@ -91,16 +71,16 @@ def test_load_audio_data_from_file(self): assert round(sum(sum(self.audio_ref.data[i])), 4) == round(sum(sum(audio.data[i])), 4) def test_load_video_data_from_file(self): - video_data_loader = VideoLoader(self.video_path, self.indizes) + video_data_loader = VideoLoader(self.data_generator.get_modality_path(ModalityType.VIDEO), self.data_generator.indices) video = UnimodalModality( video_data_loader, ModalityType.VIDEO ).apply_representation(ResNet()) for i in range(0, self.num_instances): - assert round(sum(self.video_ref.data[i]), 4) == round(sum(video.data[i]), 4) + assert round(sum(sum(self.video_ref.data[i])), 4) == round(sum(sum(video.data[i])), 4) def test_load_text_data_from_file(self): - text_data_loader = TextLoader(self.text_path, self.indizes) + text_data_loader = TextLoader(self.data_generator.get_modality_path(ModalityType.TEXT), 
self.data_generator.indices) text = UnimodalModality( text_data_loader, ModalityType.TEXT ).apply_representation(Bert()) diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py index 88e063eef63..eda23348404 100644 --- a/src/main/python/tests/scuro/test_dr_search.py +++ b/src/main/python/tests/scuro/test_dr_search.py @@ -25,14 +25,10 @@ import numpy as np from sklearn import svm from sklearn.metrics import classification_report -from sklearn.model_selection import train_test_split, KFold +from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler -from systemds.scuro.modality.unimodal_modality import UnimodalModality from systemds.scuro.modality.type import ModalityType -from systemds.scuro.dataloader.text_loader import TextLoader -from systemds.scuro.dataloader.audio_loader import AudioLoader -from systemds.scuro.dataloader.video_loader import VideoLoader from systemds.scuro.aligner.dr_search import DRSearch from systemds.scuro.aligner.task import Task from systemds.scuro.models.model import Model @@ -45,7 +41,7 @@ from systemds.scuro.representations.multiplication import Multiplication from systemds.scuro.representations.resnet import ResNet from systemds.scuro.representations.sum import Sum -from tests.scuro.data_generator import TestDataGenerator +from tests.scuro.data_generator import setup_data import warnings @@ -89,52 +85,34 @@ class TestDataLoaders(unittest.TestCase): video = None data_generator = None num_instances = 0 - indizes = [] representations = None @classmethod def setUpClass(cls): cls.test_file_path = "test_data_dr_search" - - if os.path.isdir(cls.test_file_path): - shutil.rmtree(cls.test_file_path) - - os.makedirs(f"{cls.test_file_path}/embeddings") - cls.num_instances = 8 - cls.indizes = [str(i) for i in range(0, cls.num_instances)] - - video_data_loader = VideoLoader( - cls.test_file_path + "/" + ModalityType.VIDEO.name + "/", cls.indizes - ) - audio_data_loader = AudioLoader( - cls.test_file_path + "/" + ModalityType.AUDIO.name + "/", cls.indizes - ) - text_data_loader = TextLoader( - cls.test_file_path + "/" + ModalityType.TEXT.name + "/", cls.indizes - ) - video = UnimodalModality(video_data_loader, ModalityType.VIDEO) - audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO) - text = UnimodalModality(text_data_loader, ModalityType.TEXT) - cls.data_generator = TestDataGenerator([video, audio, text], cls.test_file_path) - cls.data_generator.create_multimodal_data(cls.num_instances) + modalities = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + + cls.data_generator = setup_data(modalities, cls.num_instances, cls.test_file_path) + os.makedirs(f"{cls.test_file_path}/embeddings") #TODO: adapt the representation so they return non aggregated values. 
Apply windowing operation instead - cls.bert = text.apply_representation(Bert()) - cls.mel_spe = audio.apply_representation(MelSpectrogram()) - cls.resnet = video.apply_representation(ResNet()) + + cls.bert = cls.data_generator.modalities_by_type[ModalityType.TEXT].apply_representation(Bert()) + cls.mel_spe = cls.data_generator.modalities_by_type[ModalityType.AUDIO].apply_representation(MelSpectrogram()) + cls.resnet = cls.data_generator.modalities_by_type[ModalityType.VIDEO].apply_representation(ResNet()) cls.mods = [cls.bert, cls.mel_spe, cls.resnet] split = train_test_split( - cls.indizes, cls.data_generator.labels, test_size=0.2, random_state=42 + cls.data_generator.indices, cls.data_generator.labels, test_size=0.2, random_state=42 ) cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ int(i) for i in split[1] ] for m in cls.mods: - m.data = scale_data(m.data, [int(i) for i in cls.train_indizes]) + m.data = scale_data(m.data, cls.train_indizes) cls.representations = [ Concatenation(), diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py new file mode 100644 index 00000000000..a21895c98f0 --- /dev/null +++ b/src/main/python/tests/scuro/test_multimodal_join.py @@ -0,0 +1,103 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
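# An illustrative sketch of the "<" timestamp join that the tests below
# exercise: right-hand timestamps earlier than the *next* left-hand timestamp
# are grouped with the current left-hand frame, mirroring what
# JoinedModality.execute does with the loaders' metadata. The timestamps here
# are made up for the example.
import numpy as np

left_ts = np.array([0, 10, 20])  # e.g. video frame timestamps
right_ts = np.array([0, 4, 8, 12, 16, 24])  # e.g. audio sample timestamps
next_left = np.append(left_ts[1:], np.iinfo(np.int64).max)
groups = [
    right_ts[(right_ts >= lo) & (right_ts < hi)]
    for lo, hi in zip(left_ts, next_left)
]
print(groups)  # [array([0, 4, 8]), array([12, 16]), array([24])]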
+# + +# Test edge cases: unequal number of audio-video timestamps (should still work and add the average over all audio/video samples) + + +import os +import shutil +import unittest + +from systemds.scuro.modality.joined import JoinCondition +from systemds.scuro.representations.window import WindowAggregation +from systemds.scuro.modality.unimodal_modality import UnimodalModality +from systemds.scuro.representations.mel_spectrogram import MelSpectrogram +from systemds.scuro.representations.resnet import ResNet +from tests.scuro.data_generator import setup_data + +from systemds.scuro.dataloader.audio_loader import AudioLoader +from systemds.scuro.dataloader.video_loader import VideoLoader +from systemds.scuro.modality.type import ModalityType + + +class TestUnimodalRepresentations(unittest.TestCase): + test_file_path = None + mods = None + text = None + audio = None + video = None + data_generator = None + num_instances = 0 + indizes = [] + + @classmethod + def setUpClass(cls): + cls.test_file_path = "join_test_data" + cls.num_instances = 4 + cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO] + + cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) + + @classmethod + def tearDownClass(cls): + print("Cleaning up test data") + shutil.rmtree(cls.test_file_path) + + def test_video_audio_join(self): + self._execute_av_join() + + def test_chunked_video_audio_join(self): + self._execute_av_join(2) + + def test_video_chunked_audio_join(self): + self._execute_av_join(None, 2) + + def test_chunked_video_chunked_audio_join(self): + self._execute_av_join(2, 2) + + def _execute_av_join(self, l_chunk_size=None, r_chunk_size=None): + window_size = 2 + video_data_loader = VideoLoader( + self.data_generator.get_modality_path(ModalityType.VIDEO), self.data_generator.indices, chunk_size=l_chunk_size + ) + video = UnimodalModality(video_data_loader, ModalityType.VIDEO) + + audio_data_loader = AudioLoader(self.data_generator.get_modality_path(ModalityType.AUDIO), self.data_generator.indices, r_chunk_size) + audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO) + + mel_audio = audio.apply_representation(MelSpectrogram()) + + resnet_modality = ( + video.join(mel_audio, JoinCondition("timestamp", "timestamp", "<")) + .apply_representation( + ResNet(layer="layer1.0.conv2"), + WindowAggregation(window_size=window_size, aggregation_function="mean"), + ) + .combine("concat") + ) + + assert resnet_modality.left_modality is not None + assert resnet_modality.right_modality is not None + assert len(resnet_modality.left_modality.data) == self.num_instances + assert len(resnet_modality.right_modality.data) == self.num_instances + assert resnet_modality.data is not None + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py new file mode 100644 index 00000000000..aea37eb93b0 --- /dev/null +++ b/src/main/python/tests/scuro/test_unimodal_representations.py @@ -0,0 +1,120 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +import os +import shutil +import unittest + +from systemds.scuro.representations.bow import BoW +from systemds.scuro.representations.word2vec import W2V +from systemds.scuro.representations.tfidf import TfIdf +from systemds.scuro.modality.unimodal_modality import UnimodalModality +from systemds.scuro.representations.bert import Bert +from systemds.scuro.representations.mel_spectrogram import MelSpectrogram +from systemds.scuro.representations.resnet import ResNet +from tests.scuro.data_generator import setup_data + +from systemds.scuro.dataloader.audio_loader import AudioLoader +from systemds.scuro.dataloader.video_loader import VideoLoader +from systemds.scuro.dataloader.text_loader import TextLoader +from systemds.scuro.modality.type import ModalityType + + +class TestUnimodalRepresentations(unittest.TestCase): + test_file_path = None + mods = None + text = None + audio = None + video = None + data_generator = None + num_instances = 0 + indizes = [] + + @classmethod + def setUpClass(cls): + cls.test_file_path = "unimodal_test_data" + + cls.num_instances = 4 + cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + + cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) + os.makedirs(f"{cls.test_file_path}/embeddings") + + @classmethod + def tearDownClass(cls): + print("Cleaning up test data") + shutil.rmtree(cls.test_file_path) + + def test_audio_representations(self): + audio_representations = [MelSpectrogram()] # TODO: add FFT, TFN, 1DCNN + audio_data_loader = AudioLoader( + self.data_generator.get_modality_path(ModalityType.AUDIO), + self.data_generator.indices, + ) + audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO) + + for representation in audio_representations: + r = audio.apply_representation(representation) + assert r.data is not None + assert len(r.data) == self.num_instances + + def test_video_representations(self): + video_representations = [ResNet()] # Todo: add other video representations + video_data_loader = VideoLoader( + self.data_generator.get_modality_path(ModalityType.VIDEO), + self.data_generator.indices, + ) + video = UnimodalModality(video_data_loader, ModalityType.VIDEO) + for representation in video_representations: + r = video.apply_representation(representation) + assert r.data is not None + assert len(r.data) == self.num_instances + + def test_text_representations(self): + # TODO: check params fro BOW, W2V, TfIdf + test_representations = [BoW(2, 2), W2V(5, 2, 2), TfIdf(2), Bert()] + text_data_loader = TextLoader( + self.data_generator.get_modality_path(ModalityType.TEXT), + self.data_generator.indices, + ) + text = UnimodalModality(text_data_loader, ModalityType.TEXT) + + for representation in test_representations: + r = text.apply_representation(representation) + assert r.data is not None + assert len(r.data) == self.num_instances + + def test_chunked_video_representations(self): + video_representations = [ResNet()] + video_data_loader = VideoLoader( + self.data_generator.get_modality_path(ModalityType.VIDEO), + 
self.data_generator.indices, + chunk_size=2, + ) + video = UnimodalModality(video_data_loader, ModalityType.VIDEO) + for representation in video_representations: + r = video.apply_representation(representation) + assert r.data is not None + assert len(r.data) == self.num_instances + + +if __name__ == "__main__": + unittest.main() From 57217c4241cd10b181a0f5954c6a80d23f3df7b7 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 5 Feb 2025 10:20:12 +0100 Subject: [PATCH 05/16] add join for transformed modalities --- .../systemds/scuro/dataloader/audio_loader.py | 1 + .../systemds/scuro/dataloader/base_loader.py | 4 +- .../python/systemds/scuro/modality/joined.py | 213 ++++++++++++------ .../scuro/modality/joined_transformed.py | 16 +- .../systemds/scuro/modality/modality.py | 70 +++++- .../systemds/scuro/modality/transformed.py | 43 +++- .../python/systemds/scuro/modality/type.py | 15 +- .../scuro/modality/unimodal_modality.py | 40 ++-- .../systemds/scuro/representations/bert.py | 8 +- .../systemds/scuro/representations/bow.py | 6 +- .../scuro/representations/mel_spectrogram.py | 23 +- .../systemds/scuro/representations/resnet.py | 97 ++++---- .../systemds/scuro/representations/tfidf.py | 6 +- .../systemds/scuro/representations/window.py | 23 +- .../scuro/representations/word2vec.py | 4 +- .../python/systemds/scuro/utils/__init__.py | 2 +- .../systemds/scuro/utils/schema_helpers.py | 2 +- src/main/python/tests/scuro/data_generator.py | 26 +-- .../python/tests/scuro/test_data_loaders.py | 39 +++- src/main/python/tests/scuro/test_dr_search.py | 33 ++- .../tests/scuro/test_multimodal_join.py | 70 ++++-- .../scuro/test_unimodal_representations.py | 2 +- 22 files changed, 518 insertions(+), 225 deletions(-) diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py index b86d8a28763..f7319fe1912 100644 --- a/src/main/python/systemds/scuro/dataloader/audio_loader.py +++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py @@ -24,6 +24,7 @@ from systemds.scuro.dataloader.base_loader import BaseLoader from systemds.scuro.utils.schema_helpers import create_timestamps + class AudioLoader(BaseLoader): def __init__( self, diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py index 33d4e4920e1..5cdf63f584c 100644 --- a/src/main/python/systemds/scuro/dataloader/base_loader.py +++ b/src/main/python/systemds/scuro/dataloader/base_loader.py @@ -55,11 +55,11 @@ def chunk_size(self): def chunk_size(self, value): self._chunk_size = value self._num_chunks = int(len(self.indices) / self._chunk_size) - + @property def num_chunks(self): return self._num_chunks - + @property def next_chunk(self): return self._next_chunk diff --git a/src/main/python/systemds/scuro/modality/joined.py b/src/main/python/systemds/scuro/modality/joined.py index 5cb3e0f6de8..08d9a1b7ae7 100644 --- a/src/main/python/systemds/scuro/modality/joined.py +++ b/src/main/python/systemds/scuro/modality/joined.py @@ -24,8 +24,8 @@ from systemds.scuro.modality.joined_transformed import JoinedTransformedModality from systemds.scuro.modality.modality import Modality -from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.aggregate import Aggregation +from systemds.scuro.representations.utils import pad_sequences class JoinCondition: @@ -56,17 +56,28 @@ def __init__( self.left_modality = left_modality self.right_modality = right_modality 
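        # Descriptive note: `condition` names the metadata fields and the
        # comparison operator ("<" or "==") that execute() joins on, while
        # `chunk_left` (set below) records whether the left-hand side is a
        # unimodal modality whose data loader streams its files in chunks.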
self.condition = join_condition - self.chunked_execution = chunked_execution # TODO: maybe move this into parent class + self.chunked_execution = ( + chunked_execution # TODO: maybe move this into parent class + ) self.left_type = type(left_modality) self.right_type = type(right_modality) - if self.chunked_execution: + self.chunk_left = False + if self.chunked_execution and self.left_type.__name__.__contains__("Unimodal"): self.chunk_left = left_modality.data_loader.chunk_size is not None - def execute(self, right_starting_idx=0): + def execute(self, starting_idx=0): self.joined_right = self.right_modality.copy_from_instance() - for i, element in enumerate(self.left_modality.data): - idx_1 = list(self.left_modality.metadata.values())[i + right_starting_idx][ + start, end = 0, len(self.left_modality.data) + if self.chunked_execution and not self.chunk_left: + start = starting_idx + end = ( + self.right_modality.data_loader.chunk_size + * self.right_modality.data_loader.next_chunk + ) + + for i in range(start, end): + idx_1 = list(self.left_modality.metadata.values())[i + starting_idx][ self.condition.leftField ] if ( @@ -76,7 +87,10 @@ def execute(self, right_starting_idx=0): nextIdx[:-1] = idx_1[1:] nextIdx[-1] = sys.maxsize - idx_2 = list(self.right_modality.metadata.values())[i + right_starting_idx][ + if self.chunk_left: + i = i + starting_idx + + idx_2 = list(self.right_modality.metadata.values())[i][ self.condition.rightField ] self.joined_right.data.append([]) @@ -87,25 +101,53 @@ def execute(self, right_starting_idx=0): # video: list of lists of numpy array # audio: list of numpy array for j in range(0, len(idx_1)): - self.joined_right.data[i].append([]) - other = np.array([]) + self.joined_right.data[i - starting_idx].append([]) + right = np.array([]) if self.condition.join_type == "<": while c < len(idx_2) and idx_2[c] < nextIdx[j]: - if other.size == 0: - other = self.right_modality.data[i + right_starting_idx][c][np.newaxis, :] + if right.size == 0: + right = self.right_modality.data[i][c] + if right.ndim == 1: + right = right[np.newaxis, :] else: - other = np.concatenate([other, self.right_modality.data[i + right_starting_idx][c][np.newaxis, :]], axis=0) - # other.append(self.right_modality.data[i][c]) + if len(self.right_modality.data) < i: + print(f"i:{i}") + print(f"starting_index:{starting_idx}") + print( + f"right mod length:{len(self.right_modality.data)}" + ) + print(f"left mod length:{len(self.left_modality.data)}") + + if self.right_modality.data[i][c].ndim == 1: + right = np.concatenate( + [ + right, + self.right_modality.data[i][c][np.newaxis, :], + ], + axis=0, + ) + else: + right = np.concatenate( + [right, self.right_modality.data[i][c]], + axis=0, + ) c = c + 1 else: while c < len(idx_2) and idx_2[c] <= idx_1[j]: if idx_2[c] == idx_1[j]: - other.append(self.right_modality.data[i + right_starting_idx][c]) + right.append(self.right_modality.data[i][c]) c = c + 1 - - if len(other) == 0: # Audio and video length sometimes do not match so we add the average all audio samples for this specific frame - other = np.mean(self.right_modality.data[i + right_starting_idx], axis=0)[np.newaxis,:] # TODO: check correct loading for all data layouts, this is similar to missing data, add a different operation for htis - self.joined_right.data[i][j] = other + + if ( + len(right) == 0 + ): # Audio and video length sometimes do not match so we add the average all audio samples for this specific frame + right = np.mean(self.right_modality.data[i][c - 1 : c], axis=0) + if right.ndim == 
1: + right = right[ + np.newaxis, : + ] # TODO: check correct loading for all data layouts, this is similar to missing data, add a different operation for this + + self.joined_right.data[i - starting_idx][j] = right def apply_representation(self, representation, aggregation): self.aggregation = aggregation @@ -119,20 +161,62 @@ def apply_representation(self, representation, aggregation): self.right_modality.extract_raw_data() self.execute() - left_transformed, right_transformed = self._apply_representation(representation) + left_transformed = self._apply_representation( + self.left_modality, representation + ) + right_transformed = self._apply_representation( + self.joined_right, representation + ) left_transformed.update_metadata() right_transformed.update_metadata() - return JoinedTransformedModality(left_transformed, right_transformed, f'joined_{representation.name}') - - - def aggregate(self, aggregation_function, field_name): # TODO: use the filed name to extract data entries from modalities + return JoinedTransformedModality( + left_transformed, right_transformed, f"joined_{representation.name}" + ) + + def aggregate( + self, aggregation_function, field_name + ): # TODO: use the filed name to extract data entries from modalities self.aggregation = Aggregation(aggregation_function, field_name) - + if not self.chunked_execution and self.joined_right: return self.aggregation.aggregate(self.joined_right) - + + return self + + def combine(self, fusion_method): + """ + Combines two or more modalities with each other using a dedicated fusion method + :param other: The modality to be combined + :param fusion_method: The fusion method to be used to combine modalities + """ + modalities = [self.left_modality, self.right_modality] + self.data = [] + reshape = False + if self.left_modality.get_data_shape() != self.joined_right.get_data_shape(): + reshape = True + for i in range(0, len(self.left_modality.data)): + self.data.append([]) + for j in range(0, len(self.left_modality.data[i])): + self.data[i].append([]) + if reshape: + self.joined_right.data[i][j] = self.joined_right.data[i][j].reshape( + self.left_modality.get_data_shape() + ) + fused = np.concatenate( + [self.left_modality.data[i][j], self.joined_right.data[i][j]], + axis=0, + ) + self.data[i][j] = fused + # self.data = fusion_method.transform(modalities) + + for i, instance in enumerate( + self.data + ): # TODO: only if the layout is list_of_lists_of_numpy_array + r = [] + [r.extend(l) for l in instance] + self.data[i] = np.array(r) + self.data = pad_sequences(self.data) return self - def _handle_chunked_execution(self, representation): if self.left_type == self.right_type: @@ -143,61 +227,58 @@ def _handle_chunked_execution(self, representation): return self._apply_representation_chunked( self.left_modality, self.right_modality, False, representation ) - else: + else: # TODO: refactor this approach (it is changing the way the modalities are joined) return self._apply_representation_chunked( self.right_modality, self.left_modality, False, representation ) def _apply_representation_chunked( - self, chunk_modality, other_modality, chunk_other, representation + self, left_modality, right_modality, chunk_right, representation ): - new_left= TransformedModality( - self.left_modality.modality_type, - representation, - {}, - ) - new_right = TransformedModality( - self.right_modality.modality_type, - representation, - {}, - ) + new_left = Modality(left_modality.modality_type, {}) + new_right = Modality(right_modality.modality_type, {}) + + 
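        # Sketch of the loop below: advance the left loader one chunk at a
        # time, re-run the join for that chunk via execute(starting_idx), then
        # transform (and, if requested, window) both sides, accumulating data
        # and metadata into the fresh `new_left`/`new_right` built above.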
transform_right = True while ( - chunk_modality.data_loader.next_chunk - < chunk_modality.data_loader.num_chunks + left_modality.data_loader.next_chunk < left_modality.data_loader.num_chunks ): - print(chunk_modality.data_loader.next_chunk - ) - if chunk_other: - other_modality.extract_raw_data() + print(left_modality.data_loader.next_chunk) + if chunk_right: + right_modality.extract_raw_data() starting_idx = 0 else: - starting_idx = chunk_modality.data_loader.next_chunk * chunk_modality.data_loader.chunk_size - chunk_modality.extract_raw_data() + starting_idx = ( + left_modality.data_loader.next_chunk + * left_modality.data_loader.chunk_size + ) + left_modality.extract_raw_data() self.execute(starting_idx) - - left_transformed, right_transformed = self._apply_representation(representation) - new_left.data.extend( - left_transformed.data - ) - new_left.metadata.update(left_transformed.metadata) - new_right.data.extend( - right_transformed.data + + right_transformed = self._apply_representation( + self.joined_right, representation ) + new_right.data.extend(right_transformed.data) new_right.metadata.update(right_transformed.metadata) - + + left_transformed = self._apply_representation(left_modality, representation) + new_left.data.extend(left_transformed.data) + new_left.metadata.update(left_transformed.metadata) + new_left.update_metadata() new_right.update_metadata() - return JoinedTransformedModality(new_left, new_right, f'joined_{representation.name}') - + return JoinedTransformedModality( + new_left, new_right, f"joined_{representation.name}" + ) - def _apply_representation(self, representation): - left_transformed = representation.transform(self.left_modality) - if self.aggregation: - left_transformed = self.aggregation.window(left_transformed) - - right_transformed = representation.transform(self.joined_right) + def _apply_representation(self, modality, representation): + transformed = representation.transform(modality) if self.aggregation: - right_transformed = self.aggregation.window(right_transformed) - - return left_transformed, right_transformed \ No newline at end of file + aggregated_data_left = self.aggregation.window(transformed) + transformed = Modality( + transformed.modality_type, + transformed.metadata, + ) + transformed.data = aggregated_data_left + + return transformed diff --git a/src/main/python/systemds/scuro/modality/joined_transformed.py b/src/main/python/systemds/scuro/modality/joined_transformed.py index 558b0e3760e..e2b53671aa8 100644 --- a/src/main/python/systemds/scuro/modality/joined_transformed.py +++ b/src/main/python/systemds/scuro/modality/joined_transformed.py @@ -26,6 +26,7 @@ from systemds.scuro.modality.modality import Modality from systemds.scuro.representations.utils import pad_sequences + class JoinedTransformedModality(Modality): def __init__(self, left_modality, right_modality, transformation): @@ -33,7 +34,9 @@ def __init__(self, left_modality, right_modality, transformation): Parent class of the different Modalities (unimodal & multimodal) :param transformation: Representation to be applied on the modality """ - super().__init__(reduce(or_, [left_modality.modality_type], right_modality.modality_type)) + super().__init__( + reduce(or_, [left_modality.modality_type], right_modality.modality_type) + ) self.transformation = transformation self.left_modality = left_modality self.right_modality = right_modality @@ -50,11 +53,16 @@ def combine(self, fusion_method): self.data.append([]) for j in range(0, len(self.left_modality.data[i])): 
self.data[i].append([]) - fused = np.concatenate([self.left_modality.data[i][j], self.right_modality.data[i][j]], axis=0) + fused = np.concatenate( + [self.left_modality.data[i][j], self.right_modality.data[i][j]], + axis=0, + ) self.data[i][j] = fused # self.data = fusion_method.transform(modalities) - - for i, instance in enumerate(self.data): # TODO: only if the layout is list_of_lists_of_numpy_array + + for i, instance in enumerate( + self.data + ): # TODO: only if the layout is list_of_lists_of_numpy_array r = [] [r.extend(l) for l in instance] self.data[i] = np.array(r) diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py index 28d27b04144..cce26eee014 100644 --- a/src/main/python/systemds/scuro/modality/modality.py +++ b/src/main/python/systemds/scuro/modality/modality.py @@ -34,7 +34,7 @@ def __init__(self, modalityType: ModalityType, metadata=None): """ self.modality_type = modalityType self.schema = modalityType.get_schema() - self.data = None + self.data = [] self.data_type = None self.cost = None self.shape = None @@ -45,17 +45,69 @@ def get_modality_names(self) -> List[str]: """ Extracts the individual unimodal modalities for a given transformed modality. """ - return [modality.name for modality in ModalityType if modality in self.modality_type] - - + return [ + modality.name for modality in ModalityType if modality in self.modality_type + ] + + def copy_from_instance(self): + return type(self)(self.modality_type, self.metadata) + def update_metadata(self): md_copy = self.metadata self.metadata = {} for i, (md_k, md_v) in enumerate(md_copy.items()): updated_md = self.modality_type.update_metadata(md_v, self.data[i]) self.metadata[md_k] = updated_md - - - def window(self, windowSize, aggregationFunction, fieldName): - pass - \ No newline at end of file + + def get_metadata_at_position(self, position: int): + return self.metadata[self.dataIndex][position] + + def flatten(self): + for num_instance, instance in enumerate(self.data): + if type(instance) is np.ndarray: + self.data[num_instance] = instance.flatten() + elif type(instance) is list: + self.data[num_instance] = np.array( + [item for sublist in instance for item in sublist] + ) + + self.data = np.array(self.data) + return self + + def get_data_layout(self): + if not self.data: + return self.data + + if isinstance(self.data[0], list): + return "list_of_lists_of_numpy_array" + elif isinstance(self.data[0], np.ndarray): + return "list_of_numpy_array" + + def get_data_shape(self): + layout = self.get_data_layout() + if not layout: + return None + + if layout == "list_of_lists_of_numpy_array": + return self.data[0][0].shape + elif layout == "list_of_numpy_array": + return self.data[0].shape + + def get_data_dtype(self): + layout = self.get_data_layout() + if not layout: + return None + + if layout == "list_of_lists_of_numpy_array": + return self.data[0][0].dtype + elif layout == "list_of_numpy_array": + return self.data[0].dtype + + def update_data_layout(self): + if not self.data: + return + + self.schema["data_layout"]["representation"] = self.get_data_layout() + + self.shape = self.get_data_shape() + self.schema["data_layout"]["type"] = self.get_data_dtype() diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py index e13395045f2..64bfba0819f 100644 --- a/src/main/python/systemds/scuro/modality/transformed.py +++ b/src/main/python/systemds/scuro/modality/transformed.py @@ -21,7 +21,9 @@ 
from functools import reduce from operator import or_ +from systemds.scuro.modality.joined import JoinedModality from systemds.scuro.modality.modality import Modality +from systemds.scuro.representations.window import WindowAggregation class TransformedModality(Modality): @@ -39,6 +41,44 @@ def __init__(self, modality_type, transformation, metadata): def copy_from_instance(self): return type(self)(self.modality_type, self.transformation, self.metadata) + def join(self, right, join_condition): + chunked_execution = False + if type(right).__name__.__contains__("Unimodal"): + if right.data_loader.chunk_size: + chunked_execution = True + elif right.data is None or len(right.data) == 0: + right.extract_raw_data() + + joined_modality = JoinedModality( + reduce(or_, [right.modality_type], self.modality_type), + self, + right, + join_condition, + chunked_execution, + ) + + if not chunked_execution: + joined_modality.execute(0) + + return joined_modality + + def window(self, windowSize, aggregationFunction, fieldName=None): + transformed_modality = TransformedModality( + self.modality_type, "window", self.metadata + ) + w = WindowAggregation(windowSize, aggregationFunction) + transformed_modality.data = w.window(self) + + return transformed_modality + + def apply_representation(self, representation, aggregation): + new_modality = representation.transform(self) + + if aggregation: + new_modality.data = aggregation.window(new_modality) + + new_modality.update_metadata() + return new_modality def combine(self, other, fusion_method): """ @@ -48,7 +88,8 @@ def combine(self, other, fusion_method): """ fused_modality = TransformedModality( reduce(or_, (o.modality_type for o in other), self.modality_type), - fusion_method, self.metadata + fusion_method, + self.metadata, ) modalities = [self] modalities.extend(other) diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py index 0dbacccef5a..197ad23c540 100644 --- a/src/main/python/systemds/scuro/modality/type.py +++ b/src/main/python/systemds/scuro/modality/type.py @@ -70,7 +70,7 @@ def update_metadata(cls, name, md, data): mdHandler = cls._metadata_handlers.get(name) if mdHandler: return mdHandler(md, data) - + def extract_data(self, data, index): if self.get("data_layout").get("representation") == "list_array": return data[index] @@ -91,6 +91,19 @@ def handle_audio_metadata(md, data): return md +@ModalitySchemas.register_metadata_handler("VIDEO") +def handle_video_metadata(md, data): + new_frequency = calculate_new_frequency(len(data), md["length"], md["fps"]) + md.update( + { + "length": len(data), + "fps": new_frequency, + "timestamp": create_timestamps(new_frequency, len(data)), + } + ) + return md + + class ModalityType(Flag): TEXT = auto() AUDIO = auto() diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py index 60d9ad004fe..ae33b6605ba 100644 --- a/src/main/python/systemds/scuro/modality/unimodal_modality.py +++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py @@ -23,8 +23,8 @@ from systemds.scuro.dataloader.base_loader import BaseLoader -from systemds.scuro.modality.joined import JoinedModality from systemds.scuro.modality.modality import Modality +from systemds.scuro.modality.joined import JoinedModality from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.modality.type import ModalityType @@ -39,9 +39,20 @@ def __init__(self, data_loader: BaseLoader, 
         """
         super().__init__(modality_type, None)
         self.data_loader = data_loader
-
+
     def copy_from_instance(self):
-        return type(self)(self.data_loader, self.modality_type)
+        new_instance = type(self)(self.data_loader, self.modality_type)
+        if self.metadata:
+            new_instance.metadata = self.metadata.copy()
+        return new_instance
+
+    def get_metadata_at_position(self, position: int):
+        if self.data_loader.chunk_size:
+            return self.metadata[
+                self.data_loader.chunk_size * self.data_loader.next_chunk + position
+            ]
+
+        return self.metadata[self.dataIndex][position]
 
     def extract_raw_data(self):
         """
@@ -53,39 +64,38 @@ def extract_raw_data(self):
 
     def join(self, other, join_condition):
         if isinstance(other, UnimodalModality):
             self.data_loader.update_chunk_sizes(other.data_loader)
-
+
         joined_modality = JoinedModality(
             reduce(or_, [other.modality_type], self.modality_type),
             self,
             other,
             join_condition,
-            self.data_loader.chunk_size is not None
+            self.data_loader.chunk_size is not None,
         )
 
         return joined_modality
 
-    # TODO: maybe this can be made generic so it can be used in the join class as well
     def apply_representation(self, representation, aggregation=None):
-        new_modality = TransformedModality(self.modality_type, representation, self.data_loader.metadata)
+        new_modality = TransformedModality(
+            self.modality_type, representation.name, self.data_loader.metadata.copy()
+        )
         new_modality.data = []
         if self.data_loader.chunk_size:
-            while (
-                self.data_loader.next_chunk
-                < self.data_loader.num_chunks
-            ):
+            while self.data_loader.next_chunk < self.data_loader.num_chunks:
                 self.extract_raw_data()
                 transformed_chunk = representation.transform(self)
                 if aggregation:
-                    transformed_chunk = aggregation.window(transformed_chunk)
+                    transformed_chunk.data = aggregation.window(transformed_chunk)
                 new_modality.data.extend(transformed_chunk.data)
+                new_modality.metadata.update(transformed_chunk.metadata)
         else:
             if not self.data:
                 self.extract_raw_data()
             new_modality = representation.transform(self)
-
+
             if aggregation:
-                new_modality = aggregation.window(new_modality)
-
+                new_modality.data = aggregation.window(new_modality)
+
+        new_modality.update_metadata()
         return new_modality
diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py
index 08cb85e7395..bfaaa22642a 100644
--- a/src/main/python/systemds/scuro/representations/bert.py
+++ b/src/main/python/systemds/scuro/representations/bert.py
@@ -35,7 +35,9 @@ def __init__(self, output_file=None):
         self.output_file = output_file
 
     def transform(self, modality):
-        transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata)
+        transformed_modality = TransformedModality(
+            modality.modality_type, self, modality.metadata
+        )
         model_name = "bert-base-uncased"
         tokenizer = BertTokenizer.from_pretrained(
             model_name, clean_up_tokenization_spaces=True
@@ -47,7 +49,7 @@ def transform(self, modality):
 
         if self.output_file is not None:
             save_embeddings(embeddings, self.output_file)
-
+
         transformed_modality.data = embeddings
         return transformed_modality
 
@@ -58,7 +60,7 @@ def create_embeddings(self, data, model, tokenizer):
 
             with torch.no_grad():
                 outputs = model(**inputs)
-
+
             cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
             embeddings.append(cls_embedding)
 
diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py
index 52863aaae3e..f16f6ec04d8 100644
--- a/src/main/python/systemds/scuro/representations/bow.py
+++ b/src/main/python/systemds/scuro/representations/bow.py
@@ -34,7 +34,9 @@ def __init__(self, ngram_range, min_df, output_file=None):
         self.output_file = output_file
 
     def transform(self, modality):
-        transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata)
+        transformed_modality = TransformedModality(
+            modality.modality_type, self, modality.metadata
+        )
         vectorizer = CountVectorizer(
             ngram_range=(1, self.ngram_range), min_df=self.min_df
         )
@@ -43,6 +45,6 @@ def transform(self, modality):
 
         if self.output_file is not None:
             save_embeddings(X, self.output_file)
-
+
         transformed_modality.data = X
         return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
index 3ac026374d7..73da83b74b9 100644
--- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
@@ -31,25 +31,26 @@ def __init__(self):
         super().__init__("MelSpectrogram")
 
     def transform(self, modality):
-        transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata)
+        transformed_modality = TransformedModality(
+            modality.modality_type, self, modality.metadata
+        )
         result = []
         max_length = 0
         for sample in modality.data:
-            S = librosa.feature.melspectrogram(
-                y=sample, sr=22050
-            )
+            S = librosa.feature.melspectrogram(y=sample, sr=22050)
             S_dB = librosa.power_to_db(S, ref=np.max)
             if S_dB.shape[-1] > max_length:
                 max_length = S_dB.shape[-1]
             result.append(S_dB.T)
-
+
         transformed_modality.data = result
         return transformed_modality
-
-
+
     def plot_spectrogram(self, spectrogram):
         plt.figure(figsize=(10, 4))
-        librosa.display.specshow(spectrogram, x_axis='time', y_axis='mel', sr=22050, cmap='viridis')
-        plt.colorbar(format='%+2.0f dB')
-        plt.title('Mel Spectrogram')
-        plt.savefig('spectrogram.jpg')
+        librosa.display.specshow(
+            spectrogram, x_axis="time", y_axis="mel", sr=22050, cmap="viridis"
+        )
+        plt.colorbar(format="%+2.0f dB")
+        plt.title("Mel Spectrogram")
+        plt.savefig("spectrogram.jpg")
diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py
index de80562b16f..2b80436aa8f 100644
--- a/src/main/python/systemds/scuro/representations/resnet.py
+++ b/src/main/python/systemds/scuro/representations/resnet.py
@@ -22,7 +22,7 @@
 
 import h5py
 
-from systemds.scuro.modality.modality import Modality
+from systemds.scuro.modality.transformed import TransformedModality
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 from typing import Callable, Dict, Tuple, Any
 import torch.utils.data
@@ -33,23 +33,49 @@
 
 if torch.backends.mps.is_available():
     DEVICE = torch.device("mps")
+elif torch.backends.cudnn.is_available():
+    DEVICE = torch.device("cuda")
 else:
     DEVICE = torch.device("cpu")
 
+
 class ResNet(UnimodalRepresentation):
-    def __init__(self, layer="avgpool", output_file=None):
+    def __init__(self, layer="avgpool", model_name="ResNet18", output_file=None):
         super().__init__("ResNet")
 
         self.output_file = output_file
         self.layer_name = layer
+        self.model = model_name
+        self.model.eval()
+        for param in self.model.parameters():
+            param.requires_grad = False
 
-    def transform(self, modality):
+        class Identity(torch.nn.Module):
+            def forward(self, input_: torch.Tensor) -> torch.Tensor:
+                return input_
 
-        resnet = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to(DEVICE)
-        resnet.eval()
+        self.model.fc = Identity()
+
+    @property
+    def model(self):
+        return self._model
+
+    @model.setter
+    def model(self, model):
+        if model == "ResNet18":
+            self._model = models.resnet18(pretrained=True).to(DEVICE)
+        elif model == "ResNet34":
+            self._model = models.resnet34(pretrained=True).to(DEVICE)
+        elif model == "ResNet50":
+            self._model = models.resnet50(pretrained=True).to(DEVICE)
+        elif model == "ResNet101":
+            self._model = models.resnet101(pretrained=True).to(DEVICE)
+        elif model == "ResNet152":
+            self._model = models.resnet152(pretrained=True).to(DEVICE)
+        else:
+            raise NotImplementedError
 
-        for param in resnet.parameters():
-            param.requires_grad = False
+    def transform(self, modality):
 
         t = transforms.Compose(
             [
@@ -66,12 +92,6 @@ def transform(self, modality):
         dataset = ResNetDataset(modality.data, t)
         embeddings = {}
 
-        class Identity(torch.nn.Module):
-            def forward(self, input_: torch.Tensor) -> torch.Tensor:
-                return input_
-
-        resnet.fc = Identity()
-
         res5c_output = None
 
         def get_features(name_):
@@ -84,7 +104,7 @@ def hook(
             return hook
 
         if self.layer_name:
-            for name, layer in resnet.named_modules():
+            for name, layer in self.model.named_modules():
                 if name == self.layer_name:
                     layer.register_forward_hook(get_features(name))
                     break
@@ -100,36 +120,19 @@ def hook(
                 frame_ids_range = range(start_index, end_index)
                 frame_batch = frames[frame_ids_range]
 
-                _ = resnet(frame_batch)
+                _ = self.model(frame_batch)
 
                 values = res5c_output
 
-                # if self.layer_name == "avgpool" or self.layer_name == "maxpool":
-                #     embeddings[video_id].extend(
-                #         torch.flatten(values, 1).detach().cpu().numpy()
-                #     )
-                #
-                # else:
                 pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1))
 
                 embeddings[video_id].extend(
                     torch.flatten(pooled, 1).detach().cpu().numpy()
                 )
 
-        # TODO: this functionality could be used for operator reuse if the data stays the same
-        if self.output_file is not None:
-            with h5py.File(self.output_file, "w") as hdf:
-                for key, value in embeddings.items():
-                    hdf.create_dataset(key, data=value)
-
-        # emb = []
-
-        # TODO: this should be moved out to a windowing function
-        # for video in embeddings.values():
-        #     emb.append(np.array(video).mean(axis=0).tolist())
-
-        transformed_modality = Modality(modality.modality_type, modality.metadata)
+        transformed_modality = TransformedModality(
+            modality.modality_type, "resnet", modality.metadata
+        )
         transformed_modality.data = list(embeddings.values())
-        transformed_modality.schema["data_layout"]["representation"] = "list_of_lists_of_numpy_array" # TODO: create infer data_layout method in modality
-        transformed_modality.schema["data_layout"]["type"] = transformed_modality.data[0][0].dtype # TODO: create infer data_layout method in modality
+        transformed_modality.update_data_layout()
 
         return transformed_modality
 
@@ -141,14 +144,20 @@ def __init__(self, data: str, tf: Callable = None):
 
     def __getitem__(self, index) -> Dict[str, object]:
         data = self.data[index]
-        output = torch.empty((len(data), 3, 224, 224))
-
-        for i, d in enumerate(data):
-            if data[0].ndim < 3:
-                d = torch.tensor(d)
-                d = d.repeat(3, 1, 1)
-
-            output[i] = self.tf(d)
+        if type(data) is np.ndarray:
+            output = torch.empty((1, 3, 224, 224))
+            d = torch.tensor(data)
+            d = d.repeat(3, 1, 1)
+            output[0] = self.tf(d)
+        else:
+            output = torch.empty((len(data), 3, 224, 224))
+
+            for i, d in enumerate(data):
+                if data[0].ndim < 3:
+                    d = torch.tensor(d)
+                    d = d.repeat(3, 1, 1)
+
+                output[i] = self.tf(d)
 
         return {"id": index, "data": output}
diff --git a/src/main/python/systemds/scuro/representations/tfidf.py b/src/main/python/systemds/scuro/representations/tfidf.py
index 0d149f30a79..02cfb927c71 100644
--- a/src/main/python/systemds/scuro/representations/tfidf.py
+++ b/src/main/python/systemds/scuro/representations/tfidf.py
@@ -32,7 +32,9 @@ def __init__(self, min_df, output_file=None):
         self.output_file = output_file
 
     def transform(self, modality):
-        transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata)
+        transformed_modality = TransformedModality(
+            modality.modality_type, self, modality.metadata
+        )
         vectorizer = TfidfVectorizer(min_df=self.min_df)
 
         X = vectorizer.fit_transform(modality.data)
@@ -40,6 +42,6 @@ def transform(self, modality):
 
         if self.output_file is not None:
             save_embeddings(X, self.output_file)
-
+
         transformed_modality.data = X
         return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/window.py b/src/main/python/systemds/scuro/representations/window.py
index b589c8664b5..af0301d0e3b 100644
--- a/src/main/python/systemds/scuro/representations/window.py
+++ b/src/main/python/systemds/scuro/representations/window.py
@@ -20,27 +20,30 @@
 # -------------------------------------------------------------
 import numpy as np
 import math
-from systemds.scuro import TransformedModality
-from systemds.scuro.representations.aggregate import Aggregation
 
+# TODO: move this into the aggregation class and add an aggregate() and a window(window_size) function there so they can use the same functionality.
 class WindowAggregation:
     def __init__(self, window_size, aggregation_function):
         self.window_size = window_size
         self.aggregation_function = aggregation_function
-
+
     def window(self, modality):
         # data is a 2d array
-        transformed_modality = TransformedModality(modality.modality_type, "window", modality.metadata)
+        windowed_data = []
 
         for instance in modality.data:
             window_length = math.ceil(len(instance) / self.window_size)
             result = [[] for _ in range(0, window_length)]
             # if modality.schema["data_layout"]["representation"] == "list_of_lists_of_numpy_array":
             data = np.stack(instance)
             for i in range(0, window_length):
-                result[i] = np.mean(data[i * self.window_size: i * self.window_size + self.window_size], axis=0) # TODO: add actual aggregation function here
-
-            transformed_modality.data.append(result)
-
-        return transformed_modality
-
\ No newline at end of file
+                result[i] = np.mean(
+                    data[
+                        i * self.window_size : i * self.window_size + self.window_size
+                    ],
+                    axis=0,
+                )  # TODO: add actual aggregation function here
+
+            windowed_data.append(result)
+
+        return windowed_data
diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py
index a460e918bfe..51729d635fa 100644
--- a/src/main/python/systemds/scuro/representations/word2vec.py
+++ b/src/main/python/systemds/scuro/representations/word2vec.py
@@ -44,7 +44,9 @@ def __init__(self, vector_size, min_count, window, output_file=None):
         self.output_file = output_file
 
     def transform(self, modality):
-        transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata)
+        transformed_modality = TransformedModality(
+            modality.modality_type, self, modality.metadata
+        )
         t = [word_tokenize(s.lower()) for s in modality.data]
         model = Word2Vec(
             sentences=t,
diff --git a/src/main/python/systemds/scuro/utils/__init__.py b/src/main/python/systemds/scuro/utils/__init__.py
index 0a47bfff92c..e66abb4646f 100644
--- a/src/main/python/systemds/scuro/utils/__init__.py
+++ b/src/main/python/systemds/scuro/utils/__init__.py
@@ -17,4 +17,4 @@
 # specific language governing permissions and limitations
 # under the License.
 #
-# -------------------------------------------------------------
\ No newline at end of file
+# -------------------------------------------------------------
diff --git a/src/main/python/systemds/scuro/utils/schema_helpers.py b/src/main/python/systemds/scuro/utils/schema_helpers.py
index dfad21012cd..a88e81f7161 100644
--- a/src/main/python/systemds/scuro/utils/schema_helpers.py
+++ b/src/main/python/systemds/scuro/utils/schema_helpers.py
@@ -40,4 +40,4 @@ def create_timestamps(frequency, sample_length, start_datetime=None):
 def calculate_new_frequency(new_length, old_length, old_frequency):
     duration = old_length / old_frequency
     new_frequency = new_length / duration
-    return math.floor(new_frequency)
+    return new_frequency
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index bce71ebefaf..ec0783df9cb 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -33,15 +33,15 @@ def setup_data(modalities, num_instances, path):
 
     if os.path.isdir(path):
         shutil.rmtree(path)
-
+
     os.makedirs(path)
-
+
     indizes = [str(i) for i in range(0, num_instances)]
-
+
     modalities_to_create = []
     for modality in modalities:
         mod_path = path + "/" + modality.name + "/"
-
+
         if modality == ModalityType.VIDEO:
             data_loader = VideoLoader(mod_path, indizes)
         elif modality == ModalityType.AUDIO:
@@ -49,10 +49,10 @@ def setup_data(modalities, num_instances, path):
         elif modality == ModalityType.TEXT:
             data_loader = TextLoader(mod_path, indizes)
         else:
-            raise 'Modality not supported in DataGenerator'
-
+            raise ValueError("Modality not supported in DataGenerator")
+
         modalities_to_create.append(UnimodalModality(data_loader, modality))
-
+
     data_generator = TestDataGenerator(modalities_to_create, path)
     data_generator.create_multimodal_data(num_instances)
     return data_generator
@@ -60,12 +60,12 @@
 
 class TestDataGenerator:
     def __init__(self, modalities, path, balanced=True):
-
+
         self.modalities = modalities
         self.modalities_by_type = {}
         for modality in modalities:
             self.modalities_by_type[modality.modality_type] = modality
-
+
         self._indices = None
         self.path = path
         self.balanced = balanced
@@ -76,16 +76,16 @@ def __init__(self, modalities, path, balanced=True):
             modality.file_path = mod_path
         self.labels = []
         self.label_path = f"{path}/labels.npy"
-
+
     def get_modality_path(self, modality_type):
        return self.modalities_by_type[modality_type].data_loader.source_path
-
+
     @property
     def indices(self):
         if self._indices is None:
-            raise 'No indices available, please call setup_data first'
+            raise ValueError("No indices available, please call setup_data first")
         return self._indices
-
+
     def create_multimodal_data(self, num_instances, duration=2, seed=42):
         speed_fast = 0
         speed_slow = 0
diff --git a/src/main/python/tests/scuro/test_data_loaders.py b/src/main/python/tests/scuro/test_data_loaders.py
index a8058c1391b..4ca77b205d0 100644
--- a/src/main/python/tests/scuro/test_data_loaders.py
+++ b/src/main/python/tests/scuro/test_data_loaders.py
@@ -49,12 +49,18 @@ def setUpClass(cls):
         cls.num_instances = 2
         cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
         cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path)
-
+
         os.makedirs(f"{cls.test_file_path}/embeddings")
-
-        cls.text_ref = cls.data_generator.modalities_by_type[ModalityType.TEXT].apply_representation(Bert())
-        cls.audio_ref = cls.data_generator.modalities_by_type[ModalityType.AUDIO].apply_representation(MelSpectrogram())
-        cls.video_ref = cls.data_generator.modalities_by_type[ModalityType.VIDEO].apply_representation(ResNet())
+
+        cls.text_ref = cls.data_generator.modalities_by_type[
+            ModalityType.TEXT
+        ].apply_representation(Bert())
+        cls.audio_ref = cls.data_generator.modalities_by_type[
+            ModalityType.AUDIO
+        ].apply_representation(MelSpectrogram())
+        cls.video_ref = cls.data_generator.modalities_by_type[
+            ModalityType.VIDEO
+        ].apply_representation(ResNet())
 
     @classmethod
     def tearDownClass(cls):
@@ -62,25 +68,38 @@ def tearDownClass(cls):
         shutil.rmtree(cls.test_file_path)
 
     def test_load_audio_data_from_file(self):
-        audio_data_loader = AudioLoader(self.data_generator.get_modality_path(ModalityType.AUDIO), self.data_generator.indices)
+        audio_data_loader = AudioLoader(
+            self.data_generator.get_modality_path(ModalityType.AUDIO),
+            self.data_generator.indices,
+        )
         audio = UnimodalModality(
             audio_data_loader, ModalityType.AUDIO
         ).apply_representation(MelSpectrogram())
 
         for i in range(0, self.num_instances):
-            assert round(sum(sum(self.audio_ref.data[i])), 4) == round(sum(sum(audio.data[i])), 4)
+            assert round(sum(sum(self.audio_ref.data[i])), 4) == round(
+                sum(sum(audio.data[i])), 4
+            )
 
     def test_load_video_data_from_file(self):
-        video_data_loader = VideoLoader(self.data_generator.get_modality_path(ModalityType.VIDEO), self.data_generator.indices)
+        video_data_loader = VideoLoader(
+            self.data_generator.get_modality_path(ModalityType.VIDEO),
+            self.data_generator.indices,
+        )
         video = UnimodalModality(
             video_data_loader, ModalityType.VIDEO
         ).apply_representation(ResNet())
 
         for i in range(0, self.num_instances):
-            assert round(sum(sum(self.video_ref.data[i])), 4) == round(sum(sum(video.data[i])), 4)
+            assert round(sum(sum(self.video_ref.data[i])), 4) == round(
+                sum(sum(video.data[i])), 4
+            )
 
     def test_load_text_data_from_file(self):
-        text_data_loader = TextLoader(self.data_generator.get_modality_path(ModalityType.TEXT), self.data_generator.indices)
+        text_data_loader = TextLoader(
+            self.data_generator.get_modality_path(ModalityType.TEXT),
+            self.data_generator.indices,
+        )
         text = UnimodalModality(
             text_data_loader, ModalityType.TEXT
         ).apply_representation(Bert())
diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py
index eda23348404..0e9b01557d0 100644
--- a/src/main/python/tests/scuro/test_dr_search.py
+++ b/src/main/python/tests/scuro/test_dr_search.py
@@ -92,20 +92,35 @@ def setUpClass(cls):
         cls.test_file_path = "test_data_dr_search"
         cls.num_instances = 8
         modalities = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
-
-        cls.data_generator = setup_data(modalities, cls.num_instances, cls.test_file_path)
+
+        cls.data_generator = setup_data(
+            modalities, cls.num_instances, cls.test_file_path
+        )
 
         os.makedirs(f"{cls.test_file_path}/embeddings")
 
-        #TODO: adapt the representation so they return non aggregated values. Apply windowing operation instead
-
-        cls.bert = cls.data_generator.modalities_by_type[ModalityType.TEXT].apply_representation(Bert())
-        cls.mel_spe = cls.data_generator.modalities_by_type[ModalityType.AUDIO].apply_representation(MelSpectrogram())
-        cls.resnet = cls.data_generator.modalities_by_type[ModalityType.VIDEO].apply_representation(ResNet())
+        # TODO: adapt the representation so they return non aggregated values. Apply windowing operation instead
+        cls.bert = cls.data_generator.modalities_by_type[
+            ModalityType.TEXT
+        ].apply_representation(Bert())
+        cls.mel_spe = (
+            cls.data_generator.modalities_by_type[ModalityType.AUDIO]
+            .apply_representation(MelSpectrogram())
+            .flatten()
+        )
+        cls.resnet = (
+            cls.data_generator.modalities_by_type[ModalityType.VIDEO]
+            .apply_representation(ResNet())
+            .window(10, "avg")
+            .flatten()
+        )
 
         cls.mods = [cls.bert, cls.mel_spe, cls.resnet]
         split = train_test_split(
-            cls.data_generator.indices, cls.data_generator.labels, test_size=0.2, random_state=42
+            cls.data_generator.indices,
+            cls.data_generator.labels,
+            test_size=0.2,
+            random_state=42,
         )
         cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [
             int(i) for i in split[1]
@@ -117,7 +132,7 @@ def setUpClass(cls):
         cls.representations = [
             Concatenation(),
             Average(),
-            RowMax(),
+            RowMax(100),
             Multiplication(),
             Sum(),
             LSTM(width=256, depth=3),
diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py
index a21895c98f0..c48f5f56b2a 100644
--- a/src/main/python/tests/scuro/test_multimodal_join.py
+++ b/src/main/python/tests/scuro/test_multimodal_join.py
@@ -18,10 +18,8 @@
 # under the License.
 #
 
-# Test edge cases: unequal number of audio-video timestamps (should still work and add the average over all audio/video samples)
+# TODO: Test edge cases: unequal number of audio-video timestamps (should still work and add the average over all audio/video samples)
 
-
-import os
 import shutil
 import unittest
 
@@ -37,7 +35,7 @@
 from systemds.scuro.modality.type import ModalityType
 
 
-class TestUnimodalRepresentations(unittest.TestCase):
+class TestMultimodalJoin(unittest.TestCase):
     test_file_path = None
     mods = None
     text = None
@@ -52,7 +50,7 @@ def setUpClass(cls):
         cls.test_file_path = "join_test_data"
         cls.num_instances = 4
         cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO]
-
+
         cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path)
 
     @classmethod
@@ -61,33 +59,64 @@ def tearDownClass(cls):
         shutil.rmtree(cls.test_file_path)
 
     def test_video_audio_join(self):
-        self._execute_av_join()
+        self._execute_va_join()
 
     def test_chunked_video_audio_join(self):
-        self._execute_av_join(2)
-
+        self._execute_va_join(2)
+
     def test_video_chunked_audio_join(self):
-        self._execute_av_join(None, 2)
+        self._execute_va_join(None, 2)
 
     def test_chunked_video_chunked_audio_join(self):
-        self._execute_av_join(2, 2)
+        self._execute_va_join(2, 2)
+
+    def test_audio_video_join(self):
+        # Audio has a much higher frequency than video, hence we would need to
+        # duplicate or interpolate frames to match them to the audio frequency
+        self._execute_av_join()
+
+    # TODO
+    # def test_chunked_audio_video_join(self):
+    #     self._execute_av_join(2)
+
+    # TODO
+    # def test_chunked_audio_chunked_video_join(self):
+    #     self._execute_av_join(2, 2)
+
+    def _execute_va_join(self, l_chunk_size=None, r_chunk_size=None):
+        video, audio = self._prepare_data(l_chunk_size, r_chunk_size)
+        self._join(video, audio, 2)
 
     def _execute_av_join(self, l_chunk_size=None, r_chunk_size=None):
-        window_size = 2
+        video, audio = self._prepare_data(l_chunk_size, r_chunk_size)
+        self._join(audio, video, 2)
+
+    def _prepare_data(self, l_chunk_size=None, r_chunk_size=None):
         video_data_loader = VideoLoader(
-            self.data_generator.get_modality_path(ModalityType.VIDEO), self.data_generator.indices, chunk_size=l_chunk_size
+            self.data_generator.get_modality_path(ModalityType.VIDEO),
+            self.data_generator.indices,
+            chunk_size=l_chunk_size,
         )
         video = UnimodalModality(video_data_loader, ModalityType.VIDEO)
-
-        audio_data_loader = AudioLoader(self.data_generator.get_modality_path(ModalityType.AUDIO), self.data_generator.indices, r_chunk_size)
+
+        audio_data_loader = AudioLoader(
+            self.data_generator.get_modality_path(ModalityType.AUDIO),
+            self.data_generator.indices,
+            r_chunk_size,
+        )
        audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO)
-
+
         mel_audio = audio.apply_representation(MelSpectrogram())
-
+
+        return video, mel_audio
+
+    def _join(self, left_modality, right_modality, window_size):
         resnet_modality = (
-            video.join(mel_audio, JoinCondition("timestamp", "timestamp", "<"))
+            left_modality.join(
+                right_modality, JoinCondition("timestamp", "timestamp", "<")
+            )
             .apply_representation(
-                ResNet(layer="layer1.0.conv2"),
+                ResNet(layer="layer1.0.conv2", model_name="ResNet50"),
                 WindowAggregation(window_size=window_size, aggregation_function="mean"),
             )
             .combine("concat")
@@ -99,5 +128,8 @@ def _execute_av_join(self, l_chunk_size=None, r_chunk_size=None):
         assert len(resnet_modality.right_modality.data) == self.num_instances
         assert resnet_modality.data is not None
 
+        return resnet_modality
+
+
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py
index aea37eb93b0..d566830697f 100644
--- a/src/main/python/tests/scuro/test_unimodal_representations.py
+++ b/src/main/python/tests/scuro/test_unimodal_representations.py
@@ -101,7 +101,7 @@ def test_text_representations(self):
             r = text.apply_representation(representation)
             assert r.data is not None
             assert len(r.data) == self.num_instances
-
+
     def test_chunked_video_representations(self):
         video_representations = [ResNet()]
         video_data_loader = VideoLoader(

From 000441df3c7a06fd25e0406ffa2e18091c73c0c1 Mon Sep 17 00:00:00 2001
From: Christina Dionysio
Date: Wed, 5 Feb 2025 10:22:09 +0100
Subject: [PATCH 06/16] check python tests

---
 .github/workflows/python.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 9f39f07ecb7..c1dac76e640 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -31,7 +31,7 @@ on:
       - 'src/assembly/**'
       - 'dev/**'
     branches:
-      - main
+      - scuro_join
   pull_request:
     paths-ignore:
       - 'docs/**'

From c6f8ca60ab4bcc7005e5f009c3d6a9b330ac688a Mon Sep 17 00:00:00 2001
From: Christina Dionysio
Date: Wed, 5 Feb 2025 10:32:30 +0100
Subject: [PATCH 07/16] remove plot

---
 .../scuro/representations/mel_spectrogram.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
index 73da83b74b9..05b7f37a6ca 100644
--- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
@@ -22,7 +22,7 @@
 import numpy as np
 
 from systemds.scuro.modality.transformed import TransformedModality
-import matplotlib.pyplot as plt
+# import matplotlib.pyplot as plt
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 
@@ -46,11 +46,11 @@ def transform(self, modality):
         transformed_modality.data = result
         return transformed_modality
 
-    def plot_spectrogram(self, spectrogram):
-        plt.figure(figsize=(10, 4))
-        librosa.display.specshow(
-            spectrogram, x_axis="time", y_axis="mel", sr=22050, cmap="viridis"
-        )
-        plt.colorbar(format="%+2.0f dB")
-        plt.title("Mel Spectrogram")
-        plt.savefig("spectrogram.jpg")
+    # def plot_spectrogram(self, spectrogram):
+    #     plt.figure(figsize=(10, 4))
+    #     librosa.display.specshow(
+    #         spectrogram, x_axis="time", y_axis="mel", sr=22050, cmap="viridis"
+    #     )
+    #     plt.colorbar(format="%+2.0f dB")
+    #     plt.title("Mel Spectrogram")
+    #     plt.savefig("spectrogram.jpg")

From 884aba17e6d31d995c86426d52060284139253b8 Mon Sep 17 00:00:00 2001
From: Christina Dionysio
Date: Wed, 5 Feb 2025 10:36:08 +0100
Subject: [PATCH 08/16] check package versions

---
 src/main/python/systemds/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/main/python/systemds/__init__.py b/src/main/python/systemds/__init__.py
index a618ff6e9dd..f9bc19ff817 100644
--- a/src/main/python/systemds/__init__.py
+++ b/src/main/python/systemds/__init__.py
@@ -40,6 +40,8 @@ def check_package_version(package_name, required_version):
     try:
+        print(f"Checking package version for {package_name}...")
+        print(f"Required version: {required_version} - actual version: {version(package_name)}")
         return version(package_name) >= required_version
     except PackageNotFoundError:
         return False

From 0e8ba13a08243dc36ddfabe2f361aab38c3ec0cf Mon Sep 17 00:00:00 2001
From: Christina Dionysio
Date: Wed, 5 Feb 2025 10:49:07 +0100
Subject: [PATCH 09/16] check if cuda is available

---
 .github/workflows/python.yml                              | 5 +++--
 src/main/python/systemds/__init__.py                      | 4 ++--
 src/main/python/systemds/scuro/representations/resnet.py  | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index c1dac76e640..9f372695ed9 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -116,8 +116,9 @@ jobs:
             h5py \
             nltk \
             gensim \
-            black
-
+            black \
+            opt-einsum
+
       - name: Build Python Package
         run: |
           cd src/main/python
diff --git a/src/main/python/systemds/__init__.py b/src/main/python/systemds/__init__.py
index f9bc19ff817..609ec571c9f 100644
--- a/src/main/python/systemds/__init__.py
+++ b/src/main/python/systemds/__init__.py
@@ -26,8 +26,8 @@
 __all__ = ["context", "operator", "examples"]
 
 required_packages = [
-    ("torch", "2.5.1"),
-    ("torchvision", "0.20.1"),
+    ("torch", "2.4.1"),
+    ("torchvision", "0.19.1"),
     ("librosa", "0.10.2"),
     ("opencv-python", "4.10.0.84"),
     ("opt-einsum", "3.3.0"),
     ("h5py", "3.11.0"),
     ("transformers", "4.46.3"),
diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py
index 2b80436aa8f..eaa3d99e479 100644
--- a/src/main/python/systemds/scuro/representations/resnet.py
+++ b/src/main/python/systemds/scuro/representations/resnet.py
@@ -33,7 +33,7 @@
 
 if torch.backends.mps.is_available():
     DEVICE = torch.device("mps")
-elif torch.backends.cudnn.is_available():
+elif torch.cuda.is_available():
     DEVICE = torch.device("cuda")
 else:
     DEVICE = torch.device("cpu")

From eccf7922e428d53b474b2830ca26dbe1e7455883 Mon Sep 17 00:00:00 2001
From: Christina Dionysio
Date: Wed, 5 Feb 2025 12:05:38 +0100
Subject: [PATCH 10/16] remove nltk

---
 .github/workflows/python.yml                                | 1 -
 src/main/python/systemds/__init__.py                        | 3 ---
 src/main/python/systemds/scuro/representations/glove.py     | 5 +++--
 src/main/python/systemds/scuro/representations/word2vec.py  | 6 +++---
 4 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 9f372695ed9..c8ee2521046 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -114,7 +114,6 @@ jobs:
             torch \
             librosa \
             h5py \
-            nltk \
             gensim \
             black \
             opt-einsum
diff --git a/src/main/python/systemds/__init__.py b/src/main/python/systemds/__init__.py
index 609ec571c9f..443b5d23d90 100644
--- a/src/main/python/systemds/__init__.py
+++ b/src/main/python/systemds/__init__.py
@@ -33,15 +33,12 @@
     ("opt-einsum", "3.3.0"),
     ("h5py", "3.11.0"),
     ("transformers", "4.46.3"),
-    ("nltk", "3.9.1"),
     ("gensim", "4.3.3"),
 ]
 
 
 def check_package_version(package_name, required_version):
     try:
-        print(f"Checking package version for {package_name}...")
-        print(f"Required version: {required_version} - actual version: {version(package_name)}")
         return version(package_name) >= required_version
     except PackageNotFoundError:
         return False
diff --git a/src/main/python/systemds/scuro/representations/glove.py b/src/main/python/systemds/scuro/representations/glove.py
index cf13c717d2f..767fc8d375e 100644
--- a/src/main/python/systemds/scuro/representations/glove.py
+++ b/src/main/python/systemds/scuro/representations/glove.py
@@ -19,7 +19,8 @@
 #
 # -------------------------------------------------------------
 import numpy as np
-from nltk import word_tokenize
+from gensim.utils import tokenize
+
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
 
@@ -47,7 +48,7 @@ def transform(self, data):
 
         embeddings = []
         for sentences in data:
-            tokens = word_tokenize(sentences.lower())
+            tokens = list(tokenize(sentences.lower()))
             embeddings.append(
                 np.mean(
                     [
diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py
index 51729d635fa..b68a9fd3eb4 100644
--- a/src/main/python/systemds/scuro/representations/word2vec.py
+++ b/src/main/python/systemds/scuro/representations/word2vec.py
@@ -23,7 +23,7 @@
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 from systemds.scuro.representations.utils import save_embeddings
 from gensim.models import Word2Vec
-from nltk.tokenize import word_tokenize
+from gensim.utils import tokenize
 
 
 def get_embedding(sentence, model):
@@ -47,7 +47,7 @@ def transform(self, modality):
         transformed_modality = TransformedModality(
             modality.modality_type, self, modality.metadata
         )
-        t = [word_tokenize(s.lower()) for s in modality.data]
+        t = [list(tokenize(s.lower())) for s in modality.data]
         model = Word2Vec(
             sentences=t,
             vector_size=self.vector_size,
@@ -56,7 +56,7 @@ def transform(self, modality):
         )
         embeddings = []
         for sentences in modality.data:
-            tokens = word_tokenize(sentences.lower())
+            tokens = list(tokenize(sentences.lower()))
             embeddings.append(get_embedding(tokens, model))
 
         if self.output_file is not None:

From 19190aa1b70f3012276239e0371e82fc00021a9a Mon Sep 17 00:00:00 2001
From: Christina Dionysio
Date: Wed, 5 Feb 2025 12:06:57 +0100
Subject: [PATCH 11/16] remove prints

---
 src/main/python/systemds/scuro/modality/joined.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/main/python/systemds/scuro/modality/joined.py b/src/main/python/systemds/scuro/modality/joined.py
index 08d9a1b7ae7..acdf4fb94f1 100644
--- a/src/main/python/systemds/scuro/modality/joined.py
+++ b/src/main/python/systemds/scuro/modality/joined.py
@@ -110,14 +110,6 @@ def execute(self, starting_idx=0):
                         if right.ndim == 1:
                             right = right[np.newaxis, :]
                     else:
-                        if len(self.right_modality.data) < i:
-                            print(f"i:{i}")
-                            print(f"starting_index:{starting_idx}")
-                            print(
f"right mod length:{len(self.right_modality.data)}" - ) - print(f"left mod length:{len(self.left_modality.data)}") - if self.right_modality.data[i][c].ndim == 1: right = np.concatenate( [ @@ -238,11 +230,9 @@ def _apply_representation_chunked( new_left = Modality(left_modality.modality_type, {}) new_right = Modality(right_modality.modality_type, {}) - transform_right = True while ( left_modality.data_loader.next_chunk < left_modality.data_loader.num_chunks ): - print(left_modality.data_loader.next_chunk) if chunk_right: right_modality.extract_raw_data() starting_idx = 0 From 425ce2c39040c72642798a42665df35df77c5490 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 5 Feb 2025 12:56:54 +0100 Subject: [PATCH 12/16] add resnet weights --- .../python/systemds/scuro/representations/resnet.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py index eaa3d99e479..3f91c09bf8d 100644 --- a/src/main/python/systemds/scuro/representations/resnet.py +++ b/src/main/python/systemds/scuro/representations/resnet.py @@ -63,15 +63,15 @@ def model(self): @model.setter def model(self, model): if model == "ResNet18": - self._model = models.resnet18(pretrained=True).to(DEVICE) + self._model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT).to(DEVICE) elif model == "ResNet34": - self._model = models.resnet34(pretrained=True).to(DEVICE) + self._model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT).to(DEVICE) elif model == "ResNet50": - self._model = models.resnet50(pretrained=True).to(DEVICE) + self._model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT).to(DEVICE) elif model == "ResNet101": - self._model = models.resnet101(pretrained=True).to(DEVICE) + self._model = models.resnet101(weights=models.ResNet101_Weights.DEFAULT).to(DEVICE) elif model == "ResNet152": - self._model = models.resnet152(pretrained=True).to(DEVICE) + self._model = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to(DEVICE) else: raise NotImplementedError From 5e7a678b4e303bb7d245a99cd28a8671a90b0dac Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 5 Feb 2025 12:58:25 +0100 Subject: [PATCH 13/16] formatting --- .../scuro/representations/mel_spectrogram.py | 1 + .../systemds/scuro/representations/resnet.py | 20 ++++++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py index 05b7f37a6ca..483ea181b8f 100644 --- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py +++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py @@ -22,6 +22,7 @@ import numpy as np from systemds.scuro.modality.transformed import TransformedModality + # import matplotlib.pyplot as plt from systemds.scuro.representations.unimodal import UnimodalRepresentation diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py index 3f91c09bf8d..3a73e702a95 100644 --- a/src/main/python/systemds/scuro/representations/resnet.py +++ b/src/main/python/systemds/scuro/representations/resnet.py @@ -63,15 +63,25 @@ def model(self): @model.setter def model(self, model): if model == "ResNet18": - self._model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT).to(DEVICE) + self._model = 
models.resnet18(weights=models.ResNet18_Weights.DEFAULT).to( + DEVICE + ) elif model == "ResNet34": - self._model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT).to(DEVICE) + self._model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT).to( + DEVICE + ) elif model == "ResNet50": - self._model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT).to(DEVICE) + self._model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT).to( + DEVICE + ) elif model == "ResNet101": - self._model = models.resnet101(weights=models.ResNet101_Weights.DEFAULT).to(DEVICE) + self._model = models.resnet101(weights=models.ResNet101_Weights.DEFAULT).to( + DEVICE + ) elif model == "ResNet152": - self._model = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to(DEVICE) + self._model = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to( + DEVICE + ) else: raise NotImplementedError From 57b98c500aced4074df1d3ced04f2414ea7a1595 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 5 Feb 2025 13:14:49 +0100 Subject: [PATCH 14/16] increase number of instances created in drsearch test --- src/main/python/systemds/scuro/representations/resnet.py | 3 --- src/main/python/tests/scuro/test_dr_search.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py index 3a73e702a95..ff63e6766b6 100644 --- a/src/main/python/systemds/scuro/representations/resnet.py +++ b/src/main/python/systemds/scuro/representations/resnet.py @@ -19,9 +19,6 @@ # # ------------------------------------------------------------- - -import h5py - from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation from typing import Callable, Dict, Tuple, Any diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py index 0e9b01557d0..f2ba9d2d790 100644 --- a/src/main/python/tests/scuro/test_dr_search.py +++ b/src/main/python/tests/scuro/test_dr_search.py @@ -90,7 +90,7 @@ class TestDataLoaders(unittest.TestCase): @classmethod def setUpClass(cls): cls.test_file_path = "test_data_dr_search" - cls.num_instances = 8 + cls.num_instances = 20 modalities = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] cls.data_generator = setup_data( From 81342aa78e2516110218e67d04f34599c707178e Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 5 Feb 2025 13:33:41 +0100 Subject: [PATCH 15/16] reset branch in workflow --- .github/workflows/python.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index c8ee2521046..54da49f8fb5 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -31,7 +31,7 @@ on: - 'src/assembly/**' - 'dev/**' branches: - - scuro_join + - main pull_request: paths-ignore: - 'docs/**' From 9aab9914a7291f8810c58caefef6e8e13c69f303 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Fri, 7 Feb 2025 16:01:14 +0100 Subject: [PATCH 16/16] remove version update in init --- src/main/python/systemds/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/python/systemds/__init__.py b/src/main/python/systemds/__init__.py index 443b5d23d90..a618ff6e9dd 100644 --- a/src/main/python/systemds/__init__.py +++ b/src/main/python/systemds/__init__.py @@ -26,13 +26,14 @@ __all__ = ["context", "operator", 
"examples"] required_packages = [ - ("torch", "2.4.1"), - ("torchvision", "0.19.1"), + ("torch", "2.5.1"), + ("torchvision", "0.20.1"), ("librosa", "0.10.2"), ("opencv-python", "4.10.0.84"), ("opt-einsum", "3.3.0"), ("h5py", "3.11.0"), ("transformers", "4.46.3"), + ("nltk", "3.9.1"), ("gensim", "4.3.3"), ]