From 715bc3794606037ae0d7b96d656ac88d1bedf346 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Mon, 13 Jan 2025 10:18:10 +0100 Subject: [PATCH 01/16] add schema for base modalities and create timestamps for audio+video --- .../systemds/scuro/dataloader/audio_loader.py | 6 ++-- .../systemds/scuro/dataloader/base_loader.py | 15 ++++++-- .../systemds/scuro/dataloader/json_loader.py | 6 ++-- .../systemds/scuro/dataloader/text_loader.py | 5 +-- .../systemds/scuro/dataloader/video_loader.py | 20 +++++++++-- .../systemds/scuro/modality/modality.py | 3 +- .../python/systemds/scuro/modality/type.py | 36 ++++++++++++++++--- 7 files changed, 75 insertions(+), 16 deletions(-) diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py index f85b1b80faa..d20042c84bd 100644 --- a/src/main/python/systemds/scuro/dataloader/audio_loader.py +++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py @@ -18,7 +18,7 @@ # under the License. # # ------------------------------------------------------------- -from typing import List, Optional +from typing import List, Optional, Union import librosa from systemds.scuro.dataloader.base_loader import BaseLoader @@ -33,7 +33,9 @@ def __init__( ): super().__init__(source_path, indices, chunk_size) - def extract(self, file: str): + def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): self.file_sanity_check(file) audio, sr = librosa.load(file) + self.metadata[file] = {"sample_rate": sr, "length": audio.shape[0]} + self.metadata[file]["timestamp"] = self.create_timestamps(self.metadata[file]["sample_rate"], self.metadata[file]["length"]) self.data.append(audio) diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py index 2ef60677c67..1cb7e625105 100644 --- a/src/main/python/systemds/scuro/dataloader/base_loader.py +++ b/src/main/python/systemds/scuro/dataloader/base_loader.py @@ -21,7 +21,7 @@ import os from abc import ABC, abstractmethod from typing import List, Optional, Union - +import numpy as np class BaseLoader(ABC): def __init__( @@ -35,6 +35,9 @@ def __init__( (otherwise please provide your own Dataloader that knows about the file name convention) """ self.data = [] + self.metadata = ( + {} + ) # TODO: check what the index should be for storing the metadata (file_name, counter, ...) 
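+        # Note: each concrete loader fills self.metadata[file] with the fields
+        # of its modality schema (e.g., sample_rate, length, and per-sample
+        # timestamps for audio), while the raw samples are appended to self.data.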
         self.source_path = source_path
         self.indices = indices
         self.chunk_size = chunk_size
@@ -78,7 +81,15 @@ def _load(self, indices: List[str]):
     @abstractmethod
     def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         pass
-
+
+    def create_timestamps(self, frequency, sample_length, start_datetime=None):
+        start_time = start_datetime if start_datetime is not None else np.datetime64('1970-01-01T00:00:00.000000')
+        time_increment = 1 / frequency
+        time_increments_array = np.arange(sample_length) * np.timedelta64(int(time_increment * 1e6))
+        timestamps = start_time + time_increments_array
+        return timestamps
+
+
     def file_sanity_check(self, file):
         """
         Checks if the file can be found and is not empty
diff --git a/src/main/python/systemds/scuro/dataloader/json_loader.py b/src/main/python/systemds/scuro/dataloader/json_loader.py
index c4e3b956111..ac375451888 100644
--- a/src/main/python/systemds/scuro/dataloader/json_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/json_loader.py
@@ -21,7 +21,7 @@
 import json

 from systemds.scuro.dataloader.base_loader import BaseLoader
-from typing import Optional, List
+from typing import Optional, List, Union


 class JSONLoader(BaseLoader):
@@ -35,9 +35,9 @@ def __init__(
         super().__init__(source_path, indices, chunk_size)
         self.field = field

-    def extract(self, file: str, indices: List[str]):
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
         with open(file) as f:
             json_file = json.load(f)
-            for idx in indices:
+            for idx in index:
                 self.data.append(json_file[idx][self.field])
diff --git a/src/main/python/systemds/scuro/dataloader/text_loader.py b/src/main/python/systemds/scuro/dataloader/text_loader.py
index f614472bce6..bf34cf85c7f 100644
--- a/src/main/python/systemds/scuro/dataloader/text_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/text_loader.py
@@ -19,7 +19,7 @@
 #
 # -------------------------------------------------------------
 from systemds.scuro.dataloader.base_loader import BaseLoader
-from typing import Optional, Pattern, List
+from typing import Optional, Pattern, List, Union
 import re


@@ -34,11 +34,12 @@ def __init__(
         super().__init__(source_path, indices, chunk_size)
         self.prefix = prefix

-    def extract(self, file: str):
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
         with open(file) as text_file:
             for i, line in enumerate(text_file):
                 if self.prefix:
                     line = re.sub(self.prefix, "", line)
                 line = line.replace("\n", "")
+                self.metadata[file] = {"length": len(line.split())}
                 self.data.append(line)
diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
index 6da20b34756..505ae111ffb 100644
--- a/src/main/python/systemds/scuro/dataloader/video_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -18,7 +18,7 @@
 # under the License.
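For intuition, `create_timestamps` maps a sampling frequency onto microsecond offsets from a fixed epoch, one offset per sample. A minimal standalone sketch of the same computation (the frequency and sample count are chosen purely for illustration):

```python
import numpy as np

frequency, sample_length = 2, 4  # 2 Hz -> one sample every 500000 microseconds
start = np.datetime64("1970-01-01T00:00:00.000000")
step = np.timedelta64(int(1 / frequency * 1e6), "us")
timestamps = start + np.arange(sample_length) * step
print(timestamps.astype(np.int64))  # [0 500000 1000000 1500000]
```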
 #
 # -------------------------------------------------------------
-from typing import List, Optional
+from typing import List, Optional, Union

 import numpy as np

@@ -35,9 +35,25 @@ def __init__(
     ):
         super().__init__(source_path, indices, chunk_size)

-    def extract(self, file: str):
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
         cap = cv2.VideoCapture(file)
+
+        if not cap.isOpened():
+            raise ValueError(f"Could not read video at path: {file}")
+
+        self.metadata[file] = {
+            "fps": int(cap.get(cv2.CAP_PROP_FPS)),
+            "length": int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
+            "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
+            "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
+            "num_channels": 3,
+        }
+
+        self.metadata[file]["timestamp"] = self.create_timestamps(
+            self.metadata[file]["fps"], self.metadata[file]["length"]
+        )
+
         frames = []
         while cap.isOpened():
             ret, frame = cap.read()
diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py
index 9a3d1b148d2..6479c6247c4 100644
--- a/src/main/python/systemds/scuro/modality/modality.py
+++ b/src/main/python/systemds/scuro/modality/modality.py
@@ -31,11 +31,12 @@ def __init__(self, modality_type: ModalityType):
         :param modality_type: Type of the modality
         """
         self.type = modality_type
+        self.schema = modality_type.get_schema()
         self.data = None
         self.data_type = None
         self.cost = None
         self.shape = None
-        self.schema = {}
+        self.data_index = None

     def get_modality_names(self) -> List[str]:
         """
diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py
index c451eea6f1d..7da2744d0b3 100644
--- a/src/main/python/systemds/scuro/modality/type.py
+++ b/src/main/python/systemds/scuro/modality/type.py
@@ -18,7 +18,36 @@
 # under the License.
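The schema registry introduced below resolves schemas by name via `getattr`, so custom modalities can be registered at runtime without touching the class. A small usage sketch (the `DEPTH` schema here is made up for illustration):

```python
from systemds.scuro.modality.type import ModalitySchemas, ModalityType

# built-in schemas are looked up from the enum member's name
assert ModalityType.AUDIO.get_schema()["sample_rate"] == "integer"

# user-defined schemas can be attached and retrieved the same way
ModalitySchemas.add_schema("DEPTH", {"timestamp": "array", "length": "integer"})
assert ModalitySchemas.get("DEPTH")["length"] == "integer"
```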
# # ------------------------------------------------------------- -from enum import Enum, Flag, auto +from enum import Flag, auto + + +class ModalitySchemas: + TEXT_SCHEMA = {"type": "string", "length": "int"} + + AUDIO_SCHEMA = { + "timestamp": "array", + "type": "float32", + "sample_rate": "integer", + "length": "integer", + } + + VIDEO_SCHEMA = { + "timestamp": "array", + "type": "object", + "fps": "integer", + "length": "integer", + "width": "integer", + "height": "integer", + "num_channels": "integer", + } + + @classmethod + def get(cls, name): + return getattr(cls, f"{name}_SCHEMA", None) + + @classmethod + def add_schema(cls, name, schema): + setattr(cls, f"{name}_SCHEMA", schema) class ModalityType(Flag): @@ -26,6 +55,5 @@ class ModalityType(Flag): AUDIO = auto() VIDEO = auto() - # def __init__(self, value, name): - # self._value_ = value - # self.name = name + def get_schema(self): + return ModalitySchemas.get(self.name) From e0ebd6936995a0942e191891fd6a19149e945577 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Tue, 14 Jan 2025 13:16:14 +0100 Subject: [PATCH 02/16] add join for raw modalities --- .../systemds/scuro/dataloader/audio_loader.py | 4 +- .../systemds/scuro/dataloader/base_loader.py | 49 +++++--- .../python/systemds/scuro/modality/joined.py | 109 ++++++++++++++++++ .../systemds/scuro/modality/transformed.py | 4 +- .../scuro/modality/unimodal_modality.py | 27 ++++- .../python/systemds/scuro/utils/__init__.py | 20 ++++ .../systemds/scuro/utils/join_condition.py | 28 +++++ 7 files changed, 222 insertions(+), 19 deletions(-) create mode 100644 src/main/python/systemds/scuro/modality/joined.py create mode 100644 src/main/python/systemds/scuro/utils/__init__.py create mode 100644 src/main/python/systemds/scuro/utils/join_condition.py diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py index d20042c84bd..121b5513502 100644 --- a/src/main/python/systemds/scuro/dataloader/audio_loader.py +++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py @@ -37,5 +37,7 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): self.file_sanity_check(file) audio, sr = librosa.load(file) self.metadata[file] = {"sample_rate": sr, "length": audio.shape[0]} - self.metadata[file]["timestamp"] = self.create_timestamps(self.metadata[file]["sample_rate"], self.metadata[file]["length"]) + self.metadata[file]["timestamp"] = self.create_timestamps( + self.metadata[file]["sample_rate"], self.metadata[file]["length"] + ) self.data.append(audio) diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py index 1cb7e625105..142f99ffc0e 100644 --- a/src/main/python/systemds/scuro/dataloader/base_loader.py +++ b/src/main/python/systemds/scuro/dataloader/base_loader.py @@ -23,6 +23,7 @@ from typing import List, Optional, Union import numpy as np + class BaseLoader(ABC): def __init__( self, source_path: str, indices: List[str], chunk_size: Optional[int] = None @@ -40,30 +41,46 @@ def __init__( ) # TODO: check what the index should be for storing the metadata (file_name, counter, ...) 
         self.source_path = source_path
         self.indices = indices
-        self.chunk_size = chunk_size
-        self.next_chunk = 0
+        self._next_chunk = 0
+        self._num_chunks = 1
+        self._chunk_size = None

-        if self.chunk_size:
-            self.num_chunks = int(len(self.indices) / self.chunk_size)
+        if chunk_size:
+            self.update_chunk_size(chunk_size)

     def load(self):
         """
         Takes care of loading the raw data either chunk-wise (if chunk size is
         defined) or all at once
         """
-        if self.chunk_size:
+        if self._chunk_size:
             return self._load_next_chunk()

         return self._load(self.indices)

+    def update_chunk_size(self, new_chunk_size):
+        self._chunk_size = new_chunk_size
+        self._num_chunks = int(len(self.indices) / self._chunk_size)
+
+    def get_chunk_size(self):
+        return self._chunk_size
+
+    def get_next_chunk_number(self):
+        return self._next_chunk
+
+    def get_num_total_chunks(self):
+        return self._num_chunks
+
     def _load_next_chunk(self):
         """
         Loads the next chunk of data
         """
         self.data = []
         next_chunk_indices = self.indices[
-            self.next_chunk * self.chunk_size : (self.next_chunk + 1) * self.chunk_size
+            self._next_chunk
+            * self._chunk_size : (self._next_chunk + 1)
+            * self._chunk_size
         ]
-        self.next_chunk += 1
+        self._next_chunk += 1
         return self._load(next_chunk_indices)

     def _load(self, indices: List[str]):
@@ -81,15 +98,21 @@ def _load(self, indices: List[str]):
     @abstractmethod
     def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         pass
-
+
     def create_timestamps(self, frequency, sample_length, start_datetime=None):
-        start_time = start_datetime if start_datetime is not None else np.datetime64('1970-01-01T00:00:00.000000')
+        start_time = (
+            start_datetime
+            if start_datetime is not None
+            else np.datetime64("1970-01-01T00:00:00.000000")
+        )
         time_increment = 1 / frequency
-        time_increments_array = np.arange(sample_length) * np.timedelta64(int(time_increment * 1e6))
+        time_increments_array = np.arange(sample_length) * np.timedelta64(
+            int(time_increment * 1e6)
+        )
         timestamps = start_time + time_increments_array
-        return timestamps
-
-
+
+        return timestamps.astype(np.int64)
+
     def file_sanity_check(self, file):
         """
         Checks if the file can be found and is not empty
diff --git a/src/main/python/systemds/scuro/modality/joined.py b/src/main/python/systemds/scuro/modality/joined.py
new file mode 100644
index 00000000000..d5ab6b3406d
--- /dev/null
+++ b/src/main/python/systemds/scuro/modality/joined.py
@@ -0,0 +1,109 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
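The join added in this patch is driven by a `JoinCondition` (defined in `utils/join_condition.py` below), which names the metadata field on each side and the comparison to use. A hedged sketch of constructing one for a timestamp-based interval join:

```python
from systemds.scuro.utils.join_condition import JoinCondition

# match every right-hand sample whose timestamp falls before the next
# left-hand timestamp ("<"); an equality-style join keeps exact matches only
condition = JoinCondition(field_1="timestamp", field_2="timestamp", join_type="<")
```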
+# +# ------------------------------------------------------------- +import sys + +import numpy as np + +from systemds.scuro.modality.modality import Modality +from systemds.scuro.modality.transformed import TransformedModality +from systemds.scuro.utils.join_condition import JoinCondition + + +class JoinedModality(Modality): + + def __init__(self, modality_type, primary, other, join_condition: JoinCondition): + """ + TODO + :param modality_type: Type of the original modality(ies) + """ + super().__init__(modality_type) + self.primary_modality = primary + self.other_modality = other + self.condition = join_condition + self.chunked_execution = False + self._check_chunked_data_extraction() + + def execute(self): + self.primary_modality.extract_raw_data() + self.data = {"other": []} + self.other_modality.extract_raw_data() + + for i, element in enumerate(self.primary_modality.data): + idx_1 = list(self.primary_modality.data_loader.metadata.values())[i][ + self.condition.field_1 + ] + if ( + self.condition.alignment is None and self.condition.join_type == "<" + ): # TODO compute correct alignment timestamps/spatial params + next_idx = np.zeros(len(idx_1), dtype=int) + next_idx[:-1] = idx_1[1:] + next_idx[-1] = sys.maxsize + + idx_2 = list(self.other_modality.data_loader.metadata.values())[i][ + self.condition.field_2 + ] + + c = 0 + for j in range(0, len(idx_1)): + other = [] + if self.condition.join_type == "<": + while c < len(idx_2) and idx_2[c] < next_idx[j]: + other.append(self.other_modality.data[i][c]) + c = c + 1 + else: + while c < len(idx_2) and idx_2[c] <= idx_1[j]: + if idx_2[c] == idx_1[j]: + other.append(self.other_modality.data[i][c]) + c = c + 1 + + self.data["other"].append(other) + + def apply_representation(self, representation): + if self.chunked_execution: + new_modality = TransformedModality( + self.primary_modality.type, representation + ) + + while ( + self.primary_modality.data_loader.get_next_chunk_number() + < self.primary_modality.data_loader.get_num_total_chunks() + ): + self.execute() + + def _check_chunked_data_extraction(self): + if self.primary_modality.data_loader.get_chunk_size(): + if not self.other_modality.data_loader.get_chunk_size(): + self.other_modality.data_loader.update_chunk_size( + self.primary_modality.data_loader.get_chunk_size() + ) + elif ( + self.other_modality.data_loader.get_chunk_size() + > self.primary_modality.data_loader.get_chunk_size() + ): + self.primary_modality.data_loader.update_chunk_size( + self.other_modality.data_loader.get_chunk_size() + ) + self.chunked_execution = True + elif self.other_modality.data_loader.get_chunk_size(): + self.primary_modality.data_loader.update_chunk_size( + self.other_modality.data_loader.get_chunk_size() + ) + self.chunked_execution = True diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py index 61c327e469e..4e17b1e4975 100644 --- a/src/main/python/systemds/scuro/modality/transformed.py +++ b/src/main/python/systemds/scuro/modality/transformed.py @@ -22,12 +22,11 @@ from operator import or_ from systemds.scuro.modality.modality import Modality -from systemds.scuro.modality.type import ModalityType class TransformedModality(Modality): - def __init__(self, modality_type: ModalityType, transformation): + def __init__(self, modality_type, transformation): """ Parent class of the different Modalities (unimodal & multimodal) :param modality_type: Type of the original modality(ies) @@ -35,6 +34,7 @@ def __init__(self, modality_type: 
ModalityType, transformation): """ super().__init__(modality_type) self.transformation = transformation + self.data = [] def combine(self, other, fusion_method): """ diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py index 976d4194d47..84fbf537649 100644 --- a/src/main/python/systemds/scuro/modality/unimodal_modality.py +++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py @@ -18,7 +18,12 @@ # under the License. # # ------------------------------------------------------------- +from functools import reduce +from operator import or_ + + from systemds.scuro.dataloader.base_loader import BaseLoader +from systemds.scuro.modality.joined import JoinedModality from systemds.scuro.modality.modality import Modality from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.modality.type import ModalityType @@ -34,12 +39,12 @@ def __init__(self, data_loader: BaseLoader, modality_type: ModalityType): """ super().__init__(modality_type) self.data_loader = data_loader + self.join_modality = None def extract_raw_data(self): """ Uses the data loader to read the raw data from a specified location and stores the data in the data location. - TODO: schema """ self.data = self.data_loader.load() @@ -47,8 +52,11 @@ def apply_representation(self, representation): new_modality = TransformedModality(self.type, representation) new_modality.data = [] - if self.data_loader.chunk_size: - while self.data_loader.next_chunk < self.data_loader.num_chunks: + if self.data_loader.get_chunk_size(): + while ( + self.data_loader.get_next_chunk_number() + < self.data_loader.get_num_total_chunks() + ): self.extract_raw_data() new_modality.data.extend(representation.transform(self.data)) else: @@ -57,3 +65,16 @@ def apply_representation(self, representation): new_modality.data = representation.transform(self.data) return new_modality + + def join(self, other, join_condition): + joined_modality = JoinedModality( + reduce(or_, other.type, self.type), self, other, join_condition + ) + + if ( + not self.data_loader.get_chunk_size() + and not other.data_loader.get_chunk_size() + ): + joined_modality.execute() + + return joined_modality diff --git a/src/main/python/systemds/scuro/utils/__init__.py b/src/main/python/systemds/scuro/utils/__init__.py new file mode 100644 index 00000000000..0a47bfff92c --- /dev/null +++ b/src/main/python/systemds/scuro/utils/__init__.py @@ -0,0 +1,20 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
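Putting the pieces of this patch together, two raw modalities can now be joined on the timestamps their loaders produce. A usage sketch (the paths and instance ids are illustrative, not taken from the patch):

```python
from systemds.scuro.dataloader.audio_loader import AudioLoader
from systemds.scuro.dataloader.video_loader import VideoLoader
from systemds.scuro.modality.type import ModalityType
from systemds.scuro.modality.unimodal_modality import UnimodalModality
from systemds.scuro.utils.join_condition import JoinCondition

indices = [str(i) for i in range(10)]  # illustrative instance ids
video = UnimodalModality(VideoLoader("data/VIDEO/", indices), ModalityType.VIDEO)
audio = UnimodalModality(AudioLoader("data/AUDIO/", indices), ModalityType.AUDIO)

# when neither loader is chunked, the join is executed eagerly
joined = video.join(audio, JoinCondition("timestamp", "timestamp", "<"))
```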
+# +# ------------------------------------------------------------- \ No newline at end of file diff --git a/src/main/python/systemds/scuro/utils/join_condition.py b/src/main/python/systemds/scuro/utils/join_condition.py new file mode 100644 index 00000000000..62c8a4d0623 --- /dev/null +++ b/src/main/python/systemds/scuro/utils/join_condition.py @@ -0,0 +1,28 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + + +class JoinCondition: + def __init__(self, field_1, field_2, join_type, alignment=None): + self.field_1 = field_1 + self.field_2 = field_2 + self.join_type = join_type + self.alignment = alignment From d0058b05e5d8d255667d1136b6cc71936dccd17c Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Tue, 28 Jan 2025 22:14:20 +0100 Subject: [PATCH 03/16] audio video join (initial working version) --- .../systemds/scuro/dataloader/audio_loader.py | 4 +- .../systemds/scuro/dataloader/base_loader.py | 61 +++--- .../systemds/scuro/dataloader/video_loader.py | 5 +- .../python/systemds/scuro/modality/joined.py | 179 +++++++++++++----- .../scuro/modality/joined_transformed.py | 62 ++++++ .../systemds/scuro/modality/modality.py | 26 ++- .../systemds/scuro/modality/transformed.py | 11 +- .../python/systemds/scuro/modality/type.py | 48 ++++- .../scuro/modality/unimodal_modality.py | 49 ++--- .../scuro/representations/aggregate.py | 51 +++++ .../systemds/scuro/representations/lstm.py | 6 +- .../scuro/representations/mel_spectrogram.py | 42 ++-- .../systemds/scuro/representations/resnet.py | 70 ++++--- .../systemds/scuro/representations/window.py | 46 +++++ .../{join_condition.py => schema_helpers.py} | 27 ++- src/main/python/tests/scuro/data_generator.py | 8 +- .../python/tests/scuro/test_data_loaders.py | 2 +- src/main/python/tests/scuro/test_dr_search.py | 1 + 18 files changed, 524 insertions(+), 174 deletions(-) create mode 100644 src/main/python/systemds/scuro/modality/joined_transformed.py create mode 100644 src/main/python/systemds/scuro/representations/aggregate.py create mode 100644 src/main/python/systemds/scuro/representations/window.py rename src/main/python/systemds/scuro/utils/{join_condition.py => schema_helpers.py} (57%) diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py index 121b5513502..b86d8a28763 100644 --- a/src/main/python/systemds/scuro/dataloader/audio_loader.py +++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py @@ -22,7 +22,7 @@ import librosa from systemds.scuro.dataloader.base_loader import BaseLoader - +from systemds.scuro.utils.schema_helpers import create_timestamps class AudioLoader(BaseLoader): def 
__init__(
@@ -37,5 +37,7 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
         audio, sr = librosa.load(file)
         self.metadata[file] = {"sample_rate": sr, "length": audio.shape[0]}
-        self.metadata[file]["timestamp"] = self.create_timestamps(
+        self.metadata[file]["timestamp"] = create_timestamps(
             self.metadata[file]["sample_rate"], self.metadata[file]["length"]
         )
         self.data.append(audio)
diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py
index 142f99ffc0e..33d4e4920e1 100644
--- a/src/main/python/systemds/scuro/dataloader/base_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/base_loader.py
@@ -21,7 +21,6 @@
 import os
 from abc import ABC, abstractmethod
 from typing import List, Optional, Union
-import numpy as np


 class BaseLoader(ABC):
     def __init__(
@@ -46,7 +45,24 @@ def __init__(
         self._chunk_size = None

         if chunk_size:
-            self.update_chunk_size(chunk_size)
+            self.chunk_size = chunk_size
+
+    @property
+    def chunk_size(self):
+        return self._chunk_size
+
+    @chunk_size.setter
+    def chunk_size(self, value):
+        self._chunk_size = value
+        self._num_chunks = int(len(self.indices) / self._chunk_size)
+
+    @property
+    def num_chunks(self):
+        return self._num_chunks
+
+    @property
+    def next_chunk(self):
+        return self._next_chunk

     def load(self):
         """
@@ -57,18 +73,18 @@ def load(self):

         return self._load(self.indices)

-    def update_chunk_size(self, new_chunk_size):
-        self._chunk_size = new_chunk_size
-        self._num_chunks = int(len(self.indices) / self._chunk_size)
+    def update_chunk_sizes(self, other):
+        if not self._chunk_size and not other.chunk_size:
+            return

-    def get_chunk_size(self):
-        return self._chunk_size
-
-    def get_next_chunk_number(self):
-        return self._next_chunk
-
-    def get_num_total_chunks(self):
-        return self._num_chunks
+        if self._chunk_size and (
+            not other.chunk_size
+            or self._chunk_size < other.chunk_size
+        ):
+            other.chunk_size = self.chunk_size
+        else:
+            self.chunk_size = other.chunk_size

     def _load_next_chunk(self):
         """
@@ -93,27 +109,14 @@ def _load(self, indices: List[str]):
         else:
             self.extract(self.source_path, indices)

-        return self.data
+        return self.data, self.metadata

     @abstractmethod
     def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         pass

-    def create_timestamps(self, frequency, sample_length, start_datetime=None):
-        start_time = (
-            start_datetime
-            if start_datetime is not None
-            else np.datetime64("1970-01-01T00:00:00.000000")
-        )
-        time_increment = 1 / frequency
-        time_increments_array = np.arange(sample_length) * np.timedelta64(
-            int(time_increment * 1e6)
-        )
-        timestamps = start_time + time_increments_array
-
-        return timestamps.astype(np.int64)
-
-    def file_sanity_check(self, file):
+    @staticmethod
+    def file_sanity_check(file):
         """
         Checks if the file can be found and is not empty
         """
diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
index 505ae111ffb..807a43b21cc 100644
--- a/src/main/python/systemds/scuro/dataloader/video_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -23,6 +23,7 @@
 import numpy as np

 from systemds.scuro.dataloader.base_loader import BaseLoader
+from systemds.scuro.utils.schema_helpers import create_timestamps
 import cv2


@@ -43,14 +44,14 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
             raise ValueError(f"Could not read video at path: {file}")

         self.metadata[file] = {
-            "fps":
int(cap.get(cv2.CAP_PROP_FPS)), + "fps": cap.get(cv2.CAP_PROP_FPS), "length": int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)), "num_channels": 3, } - self.metadata[file]["timestamp"] = self.create_timestamps( + self.metadata[file]["timestamp"] = create_timestamps( self.metadata[file]["fps"], self.metadata[file]["length"] ) diff --git a/src/main/python/systemds/scuro/modality/joined.py b/src/main/python/systemds/scuro/modality/joined.py index d5ab6b3406d..aeccdd3d321 100644 --- a/src/main/python/systemds/scuro/modality/joined.py +++ b/src/main/python/systemds/scuro/modality/joined.py @@ -22,88 +22,165 @@ import numpy as np +from systemds.scuro.modality.joined_transformed import JoinedTransformedModality from systemds.scuro.modality.modality import Modality from systemds.scuro.modality.transformed import TransformedModality -from systemds.scuro.utils.join_condition import JoinCondition +from systemds.scuro.representations.aggregate import Aggregation + + +class JoinCondition: + def __init__(self, leftField, rightField, joinType, alignment=None): + self.leftField = leftField + self.rightField = rightField + self.join_type = joinType + self.alignment = alignment class JoinedModality(Modality): - def __init__(self, modality_type, primary, other, join_condition: JoinCondition): + def __init__( + self, + modality_type, + left_modality, + right_modality, + join_condition: JoinCondition, + chunked_execution=False, + ): """ TODO :param modality_type: Type of the original modality(ies) """ super().__init__(modality_type) - self.primary_modality = primary - self.other_modality = other + self.aggregation = None + self.joined_right = None + self.left_modality = left_modality + self.right_modality = right_modality self.condition = join_condition - self.chunked_execution = False - self._check_chunked_data_extraction() + self.chunked_execution = chunked_execution # TODO: maybe move this into parent class + self.left_type = type(left_modality) + self.right_type = type(right_modality) + if self.chunked_execution: + self.chunk_left = left_modality.data_loader.chunk_size is not None - def execute(self): - self.primary_modality.extract_raw_data() - self.data = {"other": []} - self.other_modality.extract_raw_data() + def execute(self, right_starting_idx=0): + self.joined_right = self.right_modality.copy_from_instance() - for i, element in enumerate(self.primary_modality.data): - idx_1 = list(self.primary_modality.data_loader.metadata.values())[i][ - self.condition.field_1 + for i, element in enumerate(self.left_modality.data): + idx_1 = list(self.left_modality.metadata.values())[i + right_starting_idx][ + self.condition.leftField ] if ( self.condition.alignment is None and self.condition.join_type == "<" ): # TODO compute correct alignment timestamps/spatial params - next_idx = np.zeros(len(idx_1), dtype=int) - next_idx[:-1] = idx_1[1:] - next_idx[-1] = sys.maxsize + nextIdx = np.zeros(len(idx_1), dtype=int) + nextIdx[:-1] = idx_1[1:] + nextIdx[-1] = sys.maxsize - idx_2 = list(self.other_modality.data_loader.metadata.values())[i][ - self.condition.field_2 + idx_2 = list(self.right_modality.metadata.values())[i + right_starting_idx][ + self.condition.rightField ] + self.joined_right.data.append([]) c = 0 + # Assumes ordered lists (temporal) + # TODO: need to extract the shape of the data from the metadata + # video: list of lists of numpy array + # audio: list of numpy array for j in range(0, len(idx_1)): - other = [] + 
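+                # Two-pointer sweep over the ordered timestamp arrays: for a
+                # "<" join, frame j collects every right-hand sample whose
+                # timestamp is smaller than the next frame's timestamp
+                # (nextIdx[j]); otherwise only exact timestamp matches are kept.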
                self.joined_right.data[i].append([])
+                other = np.array([])
                 if self.condition.join_type == "<":
-                    while c < len(idx_2) and idx_2[c] < next_idx[j]:
-                        other.append(self.other_modality.data[i][c])
+                    while c < len(idx_2) and idx_2[c] < nextIdx[j]:
+                        if other.size == 0:
+                            other = self.right_modality.data[i + right_starting_idx][c][np.newaxis, :]
+                        else:
+                            other = np.concatenate([other, self.right_modality.data[i + right_starting_idx][c][np.newaxis, :]], axis=0)
                         c = c + 1
                 else:
                     while c < len(idx_2) and idx_2[c] <= idx_1[j]:
                         if idx_2[c] == idx_1[j]:
-                            other.append(self.other_modality.data[i][c])
+                            if other.size == 0:
+                                other = self.right_modality.data[i + right_starting_idx][c][np.newaxis, :]
+                            else:
+                                other = np.concatenate([other, self.right_modality.data[i + right_starting_idx][c][np.newaxis, :]], axis=0)
                         c = c + 1
+
+                if len(other) == 0:  # Audio and video lengths sometimes do not match, so we fall back to the average of all audio samples for this specific frame
+                    other = np.mean(self.right_modality.data[i + right_starting_idx], axis=0)[np.newaxis, :]  # TODO: check correct loading for all data layouts; this is similar to missing data, add a dedicated operation for this
+                self.joined_right.data[i][j] = other

-                self.data["other"].append(other)
-
-    def apply_representation(self, representation):
+    def apply_representation(self, representation, aggregation):
+        self.aggregation = aggregation
         if self.chunked_execution:
-            new_modality = TransformedModality(
-                self.primary_modality.type, representation
+            return self._handle_chunked_execution(representation)
+        elif self.left_type.__name__.__contains__("Unimodal"):
+            self.left_modality.extract_raw_data()
+            if self.left_type == self.right_type:
+                self.right_modality.extract_raw_data()
+        elif self.right_type.__name__.__contains__("Unimodal"):
+            self.right_modality.extract_raw_data()
+
+        self.execute()
+
+    def aggregate(self, aggregation_function, field_name):  # TODO: use the field name to extract data entries from modalities
+        self.aggregation = Aggregation(aggregation_function, field_name)
+
+        if not self.chunked_execution and self.joined_right:
+            return self.aggregation.aggregate(self.joined_right)
+
+        return self
+
+    def _handle_chunked_execution(self, representation):
+        if self.left_type == self.right_type:
+            return self._apply_representation_chunked(
+                self.left_modality, self.right_modality, True, representation
+            )
+        elif self.chunk_left:
+            return self._apply_representation_chunked(
+                self.left_modality, self.right_modality, False, representation
+            )
+        else:
+            return self._apply_representation_chunked(
+                self.right_modality, self.left_modality, False, representation
+            )
+
+    def _apply_representation_chunked(
+        self, chunk_modality, other_modality, chunk_other, representation
+    ):
+        new_left = TransformedModality(
+            self.left_modality.modality_type,
+            representation,
+            self.left_modality.metadata,
+        )
+        new_right = TransformedModality(
+            self.right_modality.modality_type,
+            representation,
+            self.right_modality.metadata,
+        )
+        while (
+            chunk_modality.data_loader.next_chunk
+            < chunk_modality.data_loader.num_chunks
+        ):
+            if chunk_other:
+                other_modality.extract_raw_data()
+                starting_idx = 0
+            else:
+                starting_idx = chunk_modality.data_loader.next_chunk * chunk_modality.data_loader.chunk_size
+                chunk_modality.extract_raw_data()
+
+            self.execute(starting_idx)
+            left_transformed = representation.transform(self.left_modality)
+            left_aggregated = self.aggregation.window(left_transformed)
+            new_left.data.extend(
+                left_aggregated.data
             )
-            while (
-                self.primary_modality.data_loader.get_next_chunk_number()
-                <
self.primary_modality.data_loader.get_num_total_chunks() - ): - self.execute() - - def _check_chunked_data_extraction(self): - if self.primary_modality.data_loader.get_chunk_size(): - if not self.other_modality.data_loader.get_chunk_size(): - self.other_modality.data_loader.update_chunk_size( - self.primary_modality.data_loader.get_chunk_size() - ) - elif ( - self.other_modality.data_loader.get_chunk_size() - > self.primary_modality.data_loader.get_chunk_size() - ): - self.primary_modality.data_loader.update_chunk_size( - self.other_modality.data_loader.get_chunk_size() - ) - self.chunked_execution = True - elif self.other_modality.data_loader.get_chunk_size(): - self.primary_modality.data_loader.update_chunk_size( - self.other_modality.data_loader.get_chunk_size() + right_transformed = representation.transform(self.joined_right) + right_aggregated = self.aggregation.window(right_transformed) + new_right.data.extend( + right_aggregated.data ) - self.chunked_execution = True + + return JoinedTransformedModality(new_left, new_right, f'joined_{representation.name}') + diff --git a/src/main/python/systemds/scuro/modality/joined_transformed.py b/src/main/python/systemds/scuro/modality/joined_transformed.py new file mode 100644 index 00000000000..558b0e3760e --- /dev/null +++ b/src/main/python/systemds/scuro/modality/joined_transformed.py @@ -0,0 +1,62 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
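To make the interval join implemented above concrete, a toy sketch of the grouping that `execute` computes for a "<" condition (the timestamps are invented for illustration):

```python
import numpy as np

# toy timestamps: 3 video frames (~3 fps) and 9 audio samples (~9 Hz)
frames = np.array([0, 333333, 666666])
samples = np.arange(9) * 111111

# a "<" join groups each frame with the samples before the next frame
bounds = np.append(frames[1:], np.iinfo(np.int64).max)
groups = [samples[(samples >= lo) & (samples < hi)] for lo, hi in zip(frames, bounds)]
assert [len(g) for g in groups] == [3, 3, 3]
```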
+# +# ------------------------------------------------------------- +from functools import reduce +from operator import or_ + +import numpy as np + +from systemds.scuro.modality.modality import Modality +from systemds.scuro.representations.utils import pad_sequences + +class JoinedTransformedModality(Modality): + + def __init__(self, left_modality, right_modality, transformation): + """ + Parent class of the different Modalities (unimodal & multimodal) + :param transformation: Representation to be applied on the modality + """ + super().__init__(reduce(or_, [left_modality.modality_type], right_modality.modality_type)) + self.transformation = transformation + self.left_modality = left_modality + self.right_modality = right_modality + + def combine(self, fusion_method): + """ + Combines two or more modalities with each other using a dedicated fusion method + :param other: The modality to be combined + :param fusion_method: The fusion method to be used to combine modalities + """ + modalities = [self.left_modality, self.right_modality] + self.data = [] + for i in range(0, len(self.left_modality.data)): + self.data.append([]) + for j in range(0, len(self.left_modality.data[i])): + self.data[i].append([]) + fused = np.concatenate([self.left_modality.data[i][j], self.right_modality.data[i][j]], axis=0) + self.data[i][j] = fused + # self.data = fusion_method.transform(modalities) + + for i, instance in enumerate(self.data): # TODO: only if the layout is list_of_lists_of_numpy_array + r = [] + [r.extend(l) for l in instance] + self.data[i] = np.array(r) + self.data = pad_sequences(self.data) + return self diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py index 6479c6247c4..28d27b04144 100644 --- a/src/main/python/systemds/scuro/modality/modality.py +++ b/src/main/python/systemds/scuro/modality/modality.py @@ -20,26 +20,42 @@ # ------------------------------------------------------------- from typing import List +import numpy as np + from systemds.scuro.modality.type import ModalityType class Modality: - def __init__(self, modality_type: ModalityType): + def __init__(self, modalityType: ModalityType, metadata=None): """ Parent class of the different Modalities (unimodal & multimodal) :param modality_type: Type of the modality """ - self.type = modality_type - self.schema = modality_type.get_schema() + self.modality_type = modalityType + self.schema = modalityType.get_schema() self.data = None self.data_type = None self.cost = None self.shape = None - self.data_index = None + self.dataIndex = None + self.metadata = metadata def get_modality_names(self) -> List[str]: """ Extracts the individual unimodal modalities for a given transformed modality. 
""" - return [modality.name for modality in ModalityType if modality in self.type] + return [modality.name for modality in ModalityType if modality in self.modality_type] + + + def update_metadata(self): + md_copy = self.metadata + self.metadata = {} + for i, (md_k, md_v) in enumerate(md_copy.items()): + updated_md = self.modality_type.update_metadata(md_v, self.data[i]) + self.metadata[md_k] = updated_md + + + def window(self, windowSize, aggregationFunction, fieldName): + pass + \ No newline at end of file diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py index 4e17b1e4975..e13395045f2 100644 --- a/src/main/python/systemds/scuro/modality/transformed.py +++ b/src/main/python/systemds/scuro/modality/transformed.py @@ -26,16 +26,20 @@ class TransformedModality(Modality): - def __init__(self, modality_type, transformation): + def __init__(self, modality_type, transformation, metadata): """ Parent class of the different Modalities (unimodal & multimodal) :param modality_type: Type of the original modality(ies) :param transformation: Representation to be applied on the modality """ - super().__init__(modality_type) + super().__init__(modality_type, metadata) self.transformation = transformation self.data = [] + def copy_from_instance(self): + return type(self)(self.modality_type, self.transformation, self.metadata) + + def combine(self, other, fusion_method): """ Combines two or more modalities with each other using a dedicated fusion method @@ -43,7 +47,8 @@ def combine(self, other, fusion_method): :param fusion_method: The fusion method to be used to combine modalities """ fused_modality = TransformedModality( - reduce(or_, (o.type for o in other), self.type), fusion_method + reduce(or_, (o.modality_type for o in other), self.modality_type), + fusion_method, self.metadata ) modalities = [self] modalities.extend(other) diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py index 7da2744d0b3..0dbacccef5a 100644 --- a/src/main/python/systemds/scuro/modality/type.py +++ b/src/main/python/systemds/scuro/modality/type.py @@ -19,21 +19,27 @@ # # ------------------------------------------------------------- from enum import Flag, auto +from systemds.scuro.utils.schema_helpers import ( + calculate_new_frequency, + create_timestamps, +) +# TODO: needs a way to define if data comes from a dataset with multiple instances or is like a streaming scenario where we only have one instance +# right now it is a list of instances (if only one instance the list would contain only a single item) class ModalitySchemas: TEXT_SCHEMA = {"type": "string", "length": "int"} AUDIO_SCHEMA = { "timestamp": "array", - "type": "float32", + "data_layout": {"type": "?", "representation": "?"}, "sample_rate": "integer", "length": "integer", } VIDEO_SCHEMA = { "timestamp": "array", - "type": "object", + "data_layout": {"type": "?", "representation": "?"}, "fps": "integer", "length": "integer", "width": "integer", @@ -41,6 +47,8 @@ class ModalitySchemas: "num_channels": "integer", } + _metadata_handlers = {} + @classmethod def get(cls, name): return getattr(cls, f"{name}_SCHEMA", None) @@ -49,6 +57,39 @@ def get(cls, name): def add_schema(cls, name, schema): setattr(cls, f"{name}_SCHEMA", schema) + @classmethod + def register_metadata_handler(cls, name): + def decorator(metadata_handler): + cls._metadata_handlers[name] = metadata_handler + return metadata_handler + + return decorator + + @classmethod 
+ def update_metadata(cls, name, md, data): + mdHandler = cls._metadata_handlers.get(name) + if mdHandler: + return mdHandler(md, data) + + def extract_data(self, data, index): + if self.get("data_layout").get("representation") == "list_array": + return data[index] + else: + return data[index] + + +@ModalitySchemas.register_metadata_handler("AUDIO") +def handle_audio_metadata(md, data): + new_frequency = calculate_new_frequency(len(data), md["length"], md["sample_rate"]) + md.update( + { + "length": len(data), + "sample_rate": new_frequency, + "timestamp": create_timestamps(new_frequency, len(data)), + } + ) + return md + class ModalityType(Flag): TEXT = auto() @@ -57,3 +98,6 @@ class ModalityType(Flag): def get_schema(self): return ModalitySchemas.get(self.name) + + def update_metadata(self, md, data): + return ModalitySchemas.update_metadata(self.name, md, data) diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py index 84fbf537649..4fcf091afea 100644 --- a/src/main/python/systemds/scuro/modality/unimodal_modality.py +++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py @@ -37,25 +37,42 @@ def __init__(self, data_loader: BaseLoader, modality_type: ModalityType): :param data_loader: Defines how the raw data should be loaded :param modality_type: Type of the modality """ - super().__init__(modality_type) + super().__init__(modality_type, None) self.data_loader = data_loader - self.join_modality = None + + def copy_from_instance(self): + return type(self)(self.data_loader, self.modality_type) def extract_raw_data(self): """ Uses the data loader to read the raw data from a specified location and stores the data in the data location. """ - self.data = self.data_loader.load() + self.data, self.metadata = self.data_loader.load() - def apply_representation(self, representation): - new_modality = TransformedModality(self.type, representation) + def join(self, other, join_condition): + if isinstance(other, UnimodalModality): + self.data_loader.update_chunk_sizes(other.data_loader) + + joined_modality = JoinedModality( + reduce(or_, [other.modality_type], self.modality_type), + self, + other, + join_condition, + self.data_loader.chunk_size is not None + ) + + return joined_modality + + # TODO: add aggregation method like in join + def apply_representation(self, representation, aggregation): + new_modality = TransformedModality(self.modality_type, representation, self.data_loader.metadata) new_modality.data = [] - if self.data_loader.get_chunk_size(): + if self.data_loader.chunk_size: while ( - self.data_loader.get_next_chunk_number() - < self.data_loader.get_num_total_chunks() + self.data_loader.next_chunk + < self.data_loader.num_chunks ): self.extract_raw_data() new_modality.data.extend(representation.transform(self.data)) @@ -63,18 +80,6 @@ def apply_representation(self, representation): if not self.data: self.extract_raw_data() new_modality.data = representation.transform(self.data) - + + new_modality.update_metadata() return new_modality - - def join(self, other, join_condition): - joined_modality = JoinedModality( - reduce(or_, other.type, self.type), self, other, join_condition - ) - - if ( - not self.data_loader.get_chunk_size() - and not other.data_loader.get_chunk_size() - ): - joined_modality.execute() - - return joined_modality diff --git a/src/main/python/systemds/scuro/representations/aggregate.py b/src/main/python/systemds/scuro/representations/aggregate.py new file mode 100644 index 
00000000000..7c8d1c68d12 --- /dev/null +++ b/src/main/python/systemds/scuro/representations/aggregate.py @@ -0,0 +1,51 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- +import numpy as np + +from systemds.scuro.modality.modality import Modality + + +# TODO: make this a Representation and add a fusion method that fuses two modalities with each other + + +class Aggregation: + def __init__(self, aggregation_function, field_name): + self.aggregation_function = aggregation_function + self.field_name = field_name + + def aggregate(self, modality): + aggregated_modality = Modality(modality.modality_type, modality.metadata) + aggregated_modality.data = [] + for i, instance in enumerate(modality.data): + aggregated_modality.data.append([]) + for j, entry in enumerate(instance): + if self.aggregation_function == "sum": + aggregated_modality.data[i].append(np.sum(entry, axis=0)) + elif self.aggregation_function == "mean": + aggregated_modality.data[i].append(np.mean(entry, axis=0)) + elif self.aggregation_function == "min": + aggregated_modality.data[i].append(np.min(entry, axis=0)) + elif self.aggregation_function == "max": + aggregated_modality.data[i].append(np.max(entry, axis=0)) + else: + raise ValueError("Invalid aggregation function") + + return aggregated_modality diff --git a/src/main/python/systemds/scuro/representations/lstm.py b/src/main/python/systemds/scuro/representations/lstm.py index 649b81117b2..6f06e762a56 100644 --- a/src/main/python/systemds/scuro/representations/lstm.py +++ b/src/main/python/systemds/scuro/representations/lstm.py @@ -46,11 +46,11 @@ def transform(self, modalities: List[Modality]): result = np.zeros((size, 0)) for modality in modalities: - if modality.type in self.unimodal_embeddings.keys(): - out = self.unimodal_embeddings.get(modality.type) + if modality.modality_type in self.unimodal_embeddings.keys(): + out = self.unimodal_embeddings.get(modality.modality_type) else: out = self.run_lstm(modality.data) - self.unimodal_embeddings[modality.type] = out + self.unimodal_embeddings[modality.modality_type] = out result = np.concatenate([result, out], axis=-1) diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py index 57a7fab83e2..31b7f222cad 100644 --- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py +++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py @@ -24,7 +24,7 @@ import librosa import numpy as np from systemds.scuro.representations.utils import pad_sequences - +import matplotlib.pyplot as plt from systemds.scuro.representations.unimodal import 
UnimodalRepresentation @@ -38,24 +38,34 @@ def transform(self, data): result = [] max_length = 0 for sample in data: - S = librosa.feature.melspectrogram(y=sample) + S = librosa.feature.melspectrogram( + y=sample, sr=22050 + ) S_dB = librosa.power_to_db(S, ref=np.max) if S_dB.shape[-1] > max_length: max_length = S_dB.shape[-1] - result.append(S_dB) - - r = [] - for elem in result: - d = pad_sequences(elem, maxlen=max_length, dtype="float32") - r.append(d) + result.append(S_dB.T) - np_array_r = np.array(r) if not self.avg else np.mean(np.array(r), axis=1) + # r = [] + # for elem in result: + # d = pad_sequences(elem, maxlen=max_length, dtype="float32") + # r.append(d) - if self.output_file is not None: - data = [] - for i in range(0, np_array_r.shape[0]): - data.append(np_array_r[i]) - with open(self.output_file, "wb") as file: - pickle.dump(data, file) + # np_array_r = np.array(r) if not self.avg else np.mean(np.array(r), axis=1) + # + # if self.output_file is not None: + # data = [] + # for i in range(0, np_array_r.shape[0]): + # data.append(np_array_r[i]) + # with open(self.output_file, "wb") as file: + # pickle.dump(data, file) - return np_array_r + return result + + + def plot_spectrogram(self, spectrogram): + plt.figure(figsize=(10, 4)) + librosa.display.specshow(spectrogram, x_axis='time', y_axis='mel', sr=22050, cmap='viridis') + plt.colorbar(format='%+2.0f dB') + plt.title('Mel Spectrogram') + plt.savefig('spectrogram.jpg') diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py index 1c1bfa1d5ec..de80562b16f 100644 --- a/src/main/python/systemds/scuro/representations/resnet.py +++ b/src/main/python/systemds/scuro/representations/resnet.py @@ -22,6 +22,7 @@ import h5py +from systemds.scuro.modality.modality import Modality from systemds.scuro.representations.unimodal import UnimodalRepresentation from typing import Callable, Dict, Tuple, Any import torch.utils.data @@ -30,8 +31,10 @@ import torchvision.transforms as transforms import numpy as np -DEVICE = "cpu" - +if torch.backends.mps.is_available(): + DEVICE = torch.device("mps") +else: + DEVICE = torch.device("cpu") class ResNet(UnimodalRepresentation): def __init__(self, layer="avgpool", output_file=None): @@ -40,7 +43,7 @@ def __init__(self, layer="avgpool", output_file=None): self.output_file = output_file self.layer_name = layer - def transform(self, data): + def transform(self, modality): resnet = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to(DEVICE) resnet.eval() @@ -60,7 +63,7 @@ def transform(self, data): ] ) - dataset = ResNetDataset(data, t) + dataset = ResNetDataset(modality.data, t) embeddings = {} class Identity(torch.nn.Module): @@ -88,7 +91,7 @@ def hook( for instance in torch.utils.data.DataLoader(dataset): video_id = instance["id"][0] - frames = instance["frames"][0].to(DEVICE) + frames = instance["data"][0].to(DEVICE) embeddings[video_id] = [] batch_size = 64 @@ -99,30 +102,36 @@ def hook( _ = resnet(frame_batch) values = res5c_output - - if self.layer_name == "avgpool" or self.layer_name == "maxpool": - embeddings[video_id].extend( - torch.flatten(values, 1).detach().cpu().numpy() - ) - - else: - pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1)) - - embeddings[video_id].extend( - torch.flatten(pooled, 1).detach().cpu().numpy() - ) - + # if self.layer_name == "avgpool" or self.layer_name == "maxpool": + # embeddings[video_id].extend( + # torch.flatten(values, 1).detach().cpu().numpy() + # ) + # + # else: + 
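+                    # Note: with the layer check above commented out, every
+                    # hooked output is adaptively average-pooled to 1x1 before
+                    # flattening; this is a no-op for the avgpool layer's
+                    # (N, 2048, 1, 1) output and pools conv feature maps.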
pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1)) + + embeddings[video_id].extend( + torch.flatten(pooled, 1).detach().cpu().numpy() + ) + + # TODO: this functionality could be used for operator reuse if the data stays the same if self.output_file is not None: with h5py.File(self.output_file, "w") as hdf: for key, value in embeddings.items(): hdf.create_dataset(key, data=value) - emb = [] + # emb = [] + + # TODO: this should be moved out to a windowing function + # for video in embeddings.values(): + # emb.append(np.array(video).mean(axis=0).tolist()) - for video in embeddings.values(): - emb.append(np.array(video).mean(axis=0).tolist()) + transformed_modality = Modality(modality.modality_type, modality.metadata) + transformed_modality.data = list(embeddings.values()) + transformed_modality.schema["data_layout"]["representation"] = "list_of_lists_of_numpy_array" # TODO: create infer data_layout method in modality + transformed_modality.schema["data_layout"]["type"] = transformed_modality.data[0][0].dtype # TODO: create infer data_layout method in modality - return np.array(emb) + return transformed_modality class ResNetDataset(torch.utils.data.Dataset): @@ -131,12 +140,17 @@ def __init__(self, data: str, tf: Callable = None): self.tf = tf def __getitem__(self, index) -> Dict[str, object]: - video = self.data[index] - frames = torch.empty((len(video), 3, 224, 224)) - - for i, frame in enumerate(video): - frames[i] = self.tf(frame) - return {"id": index, "frames": frames} + data = self.data[index] + output = torch.empty((len(data), 3, 224, 224)) + + for i, d in enumerate(data): + if data[0].ndim < 3: + d = torch.tensor(d) + d = d.repeat(3, 1, 1) + + output[i] = self.tf(d) + + return {"id": index, "data": output} def __len__(self) -> int: return len(self.data) diff --git a/src/main/python/systemds/scuro/representations/window.py b/src/main/python/systemds/scuro/representations/window.py new file mode 100644 index 00000000000..b589c8664b5 --- /dev/null +++ b/src/main/python/systemds/scuro/representations/window.py @@ -0,0 +1,46 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
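The windowing added below reduces a variable-length sequence of feature vectors to `ceil(n / window_size)` aggregated vectors. A standalone sketch of the mean case (the array shape and window size are illustrative):

```python
import math
import numpy as np

data = np.arange(12, dtype=float).reshape(6, 2)  # 6 feature vectors of size 2
window_size = 4
num_windows = math.ceil(len(data) / window_size)
windowed = [
    data[i * window_size : (i + 1) * window_size].mean(axis=0)
    for i in range(num_windows)
]
assert len(windowed) == 2  # the last window covers the remaining 2 rows
```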
+# +# ------------------------------------------------------------- +import numpy as np +import math +from systemds.scuro import TransformedModality +from systemds.scuro.representations.aggregate import Aggregation + + +class WindowAggregation: + def __init__(self, window_size, aggregation_function): + self.window_size = window_size + self.aggregation_function = aggregation_function + + def window(self, modality): + # data is a 2d array + transformed_modality = TransformedModality(modality.modality_type, "window", modality.metadata) + for instance in modality.data: + window_length = math.ceil(len(instance) / self.window_size) + result = [[] for _ in range(0, window_length)] + # if modality.schema["data_layout"]["representation"] == "list_of_lists_of_numpy_array": + data = np.stack(instance) + for i in range(0, window_length): + result[i] = np.mean(data[i * self.window_size: i * self.window_size + self.window_size], axis=0) # TODO: add actual aggregation function here + + transformed_modality.data.append(result) + + return transformed_modality + \ No newline at end of file diff --git a/src/main/python/systemds/scuro/utils/join_condition.py b/src/main/python/systemds/scuro/utils/schema_helpers.py similarity index 57% rename from src/main/python/systemds/scuro/utils/join_condition.py rename to src/main/python/systemds/scuro/utils/schema_helpers.py index 62c8a4d0623..dfad21012cd 100644 --- a/src/main/python/systemds/scuro/utils/join_condition.py +++ b/src/main/python/systemds/scuro/utils/schema_helpers.py @@ -18,11 +18,26 @@ # under the License. # # ------------------------------------------------------------- +import math +import numpy as np -class JoinCondition: - def __init__(self, field_1, field_2, join_type, alignment=None): - self.field_1 = field_1 - self.field_2 = field_2 - self.join_type = join_type - self.alignment = alignment +def create_timestamps(frequency, sample_length, start_datetime=None): + start_time = ( + start_datetime + if start_datetime is not None + else np.datetime64("1970-01-01T00:00:00.000000") + ) + time_increment = 1 / frequency + time_increments_array = np.arange(sample_length) * np.timedelta64( + int(time_increment * 1e6) + ) + timestamps = start_time + time_increments_array + + return timestamps.astype(np.int64) + + +def calculate_new_frequency(new_length, old_length, old_frequency): + duration = old_length / old_frequency + new_frequency = new_length / duration + return math.floor(new_frequency) diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py index 6856ee70442..03bdb243d15 100644 --- a/src/main/python/tests/scuro/data_generator.py +++ b/src/main/python/tests/scuro/data_generator.py @@ -33,7 +33,7 @@ def __init__(self, modalities, path, balanced=True): self.balanced = balanced for modality in modalities: - mod_path = f"{self.path}/{modality.type.name}/" + mod_path = f"{self.path}/{modality.modality_type.name}/" os.mkdir(mod_path) modality.file_path = mod_path self.labels = [] @@ -69,11 +69,11 @@ def create_multimodal_data(self, num_instances, duration=2, seed=42): speed_slow += 1 for modality in self.modalities: - if modality.type == ModalityType.VIDEO: + if modality.modality_type == ModalityType.VIDEO: self.__create_video_data(idx, duration, 30, speed_factor) - if modality.type == ModalityType.AUDIO: + if modality.modality_type == ModalityType.AUDIO: self.__create_audio_data(idx, duration, speed_factor) - if modality.type == ModalityType.TEXT: + if modality.modality_type == ModalityType.TEXT: 
self.__create_text_data(idx, speed_factor) np.save(f"{self.path}/labels.npy", np.array(self.labels)) diff --git a/src/main/python/tests/scuro/test_data_loaders.py b/src/main/python/tests/scuro/test_data_loaders.py index 55704b8d8af..acd48113762 100644 --- a/src/main/python/tests/scuro/test_data_loaders.py +++ b/src/main/python/tests/scuro/test_data_loaders.py @@ -88,7 +88,7 @@ def test_load_audio_data_from_file(self): ).apply_representation(MelSpectrogram()) for i in range(0, self.num_instances): - assert round(sum(self.audio_ref.data[i]), 4) == round(sum(audio.data[i]), 4) + assert round(sum(sum(self.audio_ref.data[i])), 4) == round(sum(sum(audio.data[i])), 4) def test_load_video_data_from_file(self): video_data_loader = VideoLoader(self.video_path, self.indizes) diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py index d0d7ef50770..88e063eef63 100644 --- a/src/main/python/tests/scuro/test_dr_search.py +++ b/src/main/python/tests/scuro/test_dr_search.py @@ -119,6 +119,7 @@ def setUpClass(cls): cls.data_generator = TestDataGenerator([video, audio, text], cls.test_file_path) cls.data_generator.create_multimodal_data(cls.num_instances) + #TODO: adapt the representation so they return non aggregated values. Apply windowing operation instead cls.bert = text.apply_representation(Bert()) cls.mel_spe = audio.apply_representation(MelSpectrogram()) cls.resnet = video.apply_representation(ResNet()) From 67966d063b9b67c3fa9439766b3049013e529509 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 29 Jan 2025 16:38:38 +0100 Subject: [PATCH 04/16] adapt existing representations to handle context and add tests --- .../python/systemds/scuro/modality/joined.py | 37 ++++-- .../scuro/modality/unimodal_modality.py | 14 +- .../systemds/scuro/representations/bert.py | 31 ++--- .../systemds/scuro/representations/bow.py | 11 +- .../scuro/representations/mel_spectrogram.py | 34 ++--- .../systemds/scuro/representations/tfidf.py | 12 +- .../scuro/representations/word2vec.py | 13 +- src/main/python/tests/scuro/data_generator.py | 50 +++++++- .../python/tests/scuro/test_data_loaders.py | 46 ++----- src/main/python/tests/scuro/test_dr_search.py | 46 ++----- .../tests/scuro/test_multimodal_join.py | 103 +++++++++++++++ .../scuro/test_unimodal_representations.py | 120 ++++++++++++++++++ 12 files changed, 375 insertions(+), 142 deletions(-) create mode 100644 src/main/python/tests/scuro/test_multimodal_join.py create mode 100644 src/main/python/tests/scuro/test_unimodal_representations.py diff --git a/src/main/python/systemds/scuro/modality/joined.py b/src/main/python/systemds/scuro/modality/joined.py index aeccdd3d321..5cb3e0f6de8 100644 --- a/src/main/python/systemds/scuro/modality/joined.py +++ b/src/main/python/systemds/scuro/modality/joined.py @@ -119,6 +119,11 @@ def apply_representation(self, representation, aggregation): self.right_modality.extract_raw_data() self.execute() + left_transformed, right_transformed = self._apply_representation(representation) + left_transformed.update_metadata() + right_transformed.update_metadata() + return JoinedTransformedModality(left_transformed, right_transformed, f'joined_{representation.name}') + def aggregate(self, aggregation_function, field_name): # TODO: use the filed name to extract data entries from modalities self.aggregation = Aggregation(aggregation_function, field_name) @@ -149,12 +154,12 @@ def _apply_representation_chunked( new_left= TransformedModality( self.left_modality.modality_type, 
representation, - self.left_modality.metadata, + {}, ) new_right = TransformedModality( self.right_modality.modality_type, representation, - self.right_modality.metadata, + {}, ) while ( chunk_modality.data_loader.next_chunk @@ -170,17 +175,29 @@ def _apply_representation_chunked( chunk_modality.extract_raw_data() self.execute(starting_idx) - left_transformed = representation.transform(self.left_modality) - left_aggregated = self.aggregation.window(left_transformed) + + left_transformed, right_transformed = self._apply_representation(representation) new_left.data.extend( - left_aggregated.data + left_transformed.data ) - - right_transformed = representation.transform(self.joined_right) - right_aggregated = self.aggregation.window(right_transformed) + new_left.metadata.update(left_transformed.metadata) new_right.data.extend( - right_aggregated.data + right_transformed.data ) - + new_right.metadata.update(right_transformed.metadata) + + new_left.update_metadata() + new_right.update_metadata() return JoinedTransformedModality(new_left, new_right, f'joined_{representation.name}') + + def _apply_representation(self, representation): + left_transformed = representation.transform(self.left_modality) + if self.aggregation: + left_transformed = self.aggregation.window(left_transformed) + + right_transformed = representation.transform(self.joined_right) + if self.aggregation: + right_transformed = self.aggregation.window(right_transformed) + + return left_transformed, right_transformed \ No newline at end of file diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py index 4fcf091afea..60d9ad004fe 100644 --- a/src/main/python/systemds/scuro/modality/unimodal_modality.py +++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py @@ -64,8 +64,8 @@ def join(self, other, join_condition): return joined_modality - # TODO: add aggregation method like in join - def apply_representation(self, representation, aggregation): + # TODO: maybe this can be made generic so it can be used in the join class as well + def apply_representation(self, representation, aggregation=None): new_modality = TransformedModality(self.modality_type, representation, self.data_loader.metadata) new_modality.data = [] @@ -75,11 +75,17 @@ def apply_representation(self, representation, aggregation): < self.data_loader.num_chunks ): self.extract_raw_data() - new_modality.data.extend(representation.transform(self.data)) + transformed_chunk = representation.transform(self) + if aggregation: + transformed_chunk = aggregation.window(transformed_chunk) + new_modality.data.extend(transformed_chunk.data) else: if not self.data: self.extract_raw_data() - new_modality.data = representation.transform(self.data) + new_modality = representation.transform(self) + if aggregation: + new_modality = aggregation.window(new_modality) + new_modality.update_metadata() return new_modality diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py index 0fcf1e8d280..08cb85e7395 100644 --- a/src/main/python/systemds/scuro/representations/bert.py +++ b/src/main/python/systemds/scuro/representations/bert.py @@ -21,6 +21,7 @@ import numpy as np +from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation import torch from transformers import BertTokenizer, BertModel @@ -28,30 +29,27 @@ class Bert(UnimodalRepresentation): - def 
__init__(self, avg_layers=None, output_file=None): + def __init__(self, output_file=None): super().__init__("Bert") - self.avg_layers = avg_layers self.output_file = output_file - def transform(self, data): - + def transform(self, modality): + transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata) model_name = "bert-base-uncased" tokenizer = BertTokenizer.from_pretrained( model_name, clean_up_tokenization_spaces=True ) - if self.avg_layers is not None: - model = BertModel.from_pretrained(model_name, output_hidden_states=True) - else: - model = BertModel.from_pretrained(model_name) + model = BertModel.from_pretrained(model_name) - embeddings = self.create_embeddings(data, model, tokenizer) + embeddings = self.create_embeddings(modality.data, model, tokenizer) if self.output_file is not None: save_embeddings(embeddings, self.output_file) - - return embeddings + + transformed_modality.data = embeddings + return transformed_modality def create_embeddings(self, data, model, tokenizer): embeddings = [] @@ -60,16 +58,9 @@ def create_embeddings(self, data, model, tokenizer): with torch.no_grad(): outputs = model(**inputs) - - if self.avg_layers is not None: - cls_embedding = [ - outputs.hidden_states[i][:, 0, :] - for i in range(-self.avg_layers, 0) - ] - cls_embedding = torch.mean(torch.stack(cls_embedding), dim=0).numpy() - else: + cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy() - embeddings.append(cls_embedding) + embeddings.append(cls_embedding) embeddings = np.array(embeddings) return embeddings.reshape((embeddings.shape[0], embeddings.shape[-1])) diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py index bd54654a5cb..52863aaae3e 100644 --- a/src/main/python/systemds/scuro/representations/bow.py +++ b/src/main/python/systemds/scuro/representations/bow.py @@ -21,6 +21,7 @@ from sklearn.feature_extraction.text import CountVectorizer +from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation from systemds.scuro.representations.utils import save_embeddings @@ -32,14 +33,16 @@ def __init__(self, ngram_range, min_df, output_file=None): self.min_df = min_df self.output_file = output_file - def transform(self, data): + def transform(self, modality): + transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata) vectorizer = CountVectorizer( ngram_range=(1, self.ngram_range), min_df=self.min_df ) - X = vectorizer.fit_transform(data).toarray() + X = vectorizer.fit_transform(modality.data).toarray() if self.output_file is not None: save_embeddings(X, self.output_file) - - return X + + transformed_modality.data = X + return transformed_modality diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py index 31b7f222cad..3ac026374d7 100644 --- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py +++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py @@ -18,26 +18,23 @@ # under the License. 
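# An illustrative sketch of what the MelSpectrogram change below produces:
# each waveform becomes a time-major (frames x n_mels) matrix instead of a
# padded or averaged vector, so it can be windowed later. Assumes librosa's
# default n_mels=128 and the 22050 Hz rate used elsewhere in this patch set.
import librosa
import numpy as np

y = np.random.default_rng(0).standard_normal(22050).astype(np.float32)  # ~1 s
S = librosa.feature.melspectrogram(y=y, sr=22050)  # (n_mels, frames)
S_dB = librosa.power_to_db(S, ref=np.max)  # log-scaled decibels
print(S_dB.T.shape)  # time-major, e.g. (44, 128)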
# # ------------------------------------------------------------- - -import pickle - import librosa import numpy as np -from systemds.scuro.representations.utils import pad_sequences + +from systemds.scuro.modality.transformed import TransformedModality import matplotlib.pyplot as plt from systemds.scuro.representations.unimodal import UnimodalRepresentation class MelSpectrogram(UnimodalRepresentation): - def __init__(self, avg=True, output_file=None): + def __init__(self): super().__init__("MelSpectrogram") - self.avg = avg - self.output_file = output_file - def transform(self, data): + def transform(self, modality): + transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata) result = [] max_length = 0 - for sample in data: + for sample in modality.data: S = librosa.feature.melspectrogram( y=sample, sr=22050 ) @@ -45,22 +42,9 @@ def transform(self, data): if S_dB.shape[-1] > max_length: max_length = S_dB.shape[-1] result.append(S_dB.T) - - # r = [] - # for elem in result: - # d = pad_sequences(elem, maxlen=max_length, dtype="float32") - # r.append(d) - - # np_array_r = np.array(r) if not self.avg else np.mean(np.array(r), axis=1) - # - # if self.output_file is not None: - # data = [] - # for i in range(0, np_array_r.shape[0]): - # data.append(np_array_r[i]) - # with open(self.output_file, "wb") as file: - # pickle.dump(data, file) - - return result + + transformed_modality.data = result + return transformed_modality def plot_spectrogram(self, spectrogram): diff --git a/src/main/python/systemds/scuro/representations/tfidf.py b/src/main/python/systemds/scuro/representations/tfidf.py index 4849aba1360..0d149f30a79 100644 --- a/src/main/python/systemds/scuro/representations/tfidf.py +++ b/src/main/python/systemds/scuro/representations/tfidf.py @@ -20,7 +20,7 @@ # ------------------------------------------------------------- from sklearn.feature_extraction.text import TfidfVectorizer - +from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation from systemds.scuro.representations.utils import read_data_from_file, save_embeddings @@ -31,13 +31,15 @@ def __init__(self, min_df, output_file=None): self.min_df = min_df self.output_file = output_file - def transform(self, data): + def transform(self, modality): + transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata) vectorizer = TfidfVectorizer(min_df=self.min_df) - X = vectorizer.fit_transform(data) + X = vectorizer.fit_transform(modality.data) X = X.toarray() if self.output_file is not None: save_embeddings(X, self.output_file) - - return X + + transformed_modality.data = X + return transformed_modality diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py index 209091648d5..a460e918bfe 100644 --- a/src/main/python/systemds/scuro/representations/word2vec.py +++ b/src/main/python/systemds/scuro/representations/word2vec.py @@ -19,7 +19,7 @@ # # ------------------------------------------------------------- import numpy as np - +from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation from systemds.scuro.representations.utils import save_embeddings from gensim.models import Word2Vec @@ -43,8 +43,9 @@ def __init__(self, vector_size, min_count, window, output_file=None): self.window = window self.output_file = output_file - def 
transform(self, data):
-        t = [word_tokenize(s.lower()) for s in data]
+    def transform(self, modality):
+        transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata)
+        t = [word_tokenize(s.lower()) for s in modality.data]
         model = Word2Vec(
             sentences=t,
             vector_size=self.vector_size,
@@ -52,11 +53,11 @@ def transform(self, data):
             min_count=self.min_count,
         )
         embeddings = []
-        for sentences in data:
+        for sentences in modality.data:
             tokens = word_tokenize(sentences.lower())
             embeddings.append(get_embedding(tokens, model))

         if self.output_file is not None:
             save_embeddings(np.array(embeddings), self.output_file)
-
-        return np.array(embeddings)
+        transformed_modality.data = np.array(embeddings)
+        return transformed_modality
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index 03bdb243d15..bce71ebefaf 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -18,17 +18,55 @@
 # under the License.
 #
 # -------------------------------------------------------------
+import shutil
+
 import cv2
 import numpy as np
 from scipy.io.wavfile import write
 import random
 import os
+
+from systemds.scuro import VideoLoader, AudioLoader, TextLoader, UnimodalModality
 from systemds.scuro.modality.type import ModalityType


+def setup_data(modalities, num_instances, path):
+    if os.path.isdir(path):
+        shutil.rmtree(path)
+
+    os.makedirs(path)
+
+    indizes = [str(i) for i in range(0, num_instances)]
+
+    modalities_to_create = []
+    for modality in modalities:
+        mod_path = path + "/" + modality.name + "/"
+
+        if modality == ModalityType.VIDEO:
+            data_loader = VideoLoader(mod_path, indizes)
+        elif modality == ModalityType.AUDIO:
+            data_loader = AudioLoader(mod_path, indizes)
+        elif modality == ModalityType.TEXT:
+            data_loader = TextLoader(mod_path, indizes)
+        else:
+            raise ValueError("Modality not supported in DataGenerator")
+
+        modalities_to_create.append(UnimodalModality(data_loader, modality))
+
+    data_generator = TestDataGenerator(modalities_to_create, path)
+    data_generator.create_multimodal_data(num_instances)
+    return data_generator
+
+
 class TestDataGenerator:
     def __init__(self, modalities, path, balanced=True):
+        self.modalities = modalities
+        self.modalities_by_type = {}
+        for modality in modalities:
+            self.modalities_by_type[modality.modality_type] = modality
+
+        self._indices = None
         self.path = path
         self.balanced = balanced

@@ -38,10 +76,20 @@ def __init__(self, modalities, path, balanced=True):
             modality.file_path = mod_path
         self.labels = []
         self.label_path = f"{path}/labels.npy"
-
+
+    def get_modality_path(self, modality_type):
+        return self.modalities_by_type[modality_type].data_loader.source_path
+
+    @property
+    def indices(self):
+        if self._indices is None:
+            raise ValueError("No indices available, please call setup_data first")
+        return self._indices
+
     def create_multimodal_data(self, num_instances, duration=2, seed=42):
         speed_fast = 0
         speed_slow = 0
+        self._indices = [str(i) for i in range(0, num_instances)]
         for idx in range(num_instances):
             np.random.seed(seed)
             if self.balanced:
diff --git a/src/main/python/tests/scuro/test_data_loaders.py b/src/main/python/tests/scuro/test_data_loaders.py
index acd48113762..a8058c1391b 100644
--- a/src/main/python/tests/scuro/test_data_loaders.py
+++ b/src/main/python/tests/scuro/test_data_loaders.py
@@ -26,7 +26,7 @@
 from systemds.scuro.representations.bert import Bert
 from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
 from 
systemds.scuro.representations.resnet import ResNet -from tests.scuro.data_generator import TestDataGenerator +from tests.scuro.data_generator import setup_data from systemds.scuro.dataloader.audio_loader import AudioLoader from systemds.scuro.dataloader.video_loader import VideoLoader @@ -42,39 +42,19 @@ class TestDataLoaders(unittest.TestCase): video = None data_generator = None num_instances = 0 - indizes = [] @classmethod def setUpClass(cls): cls.test_file_path = "test_data" - - if os.path.isdir(cls.test_file_path): - shutil.rmtree(cls.test_file_path) - - os.makedirs(f"{cls.test_file_path}/embeddings") - cls.num_instances = 2 - cls.indizes = [str(i) for i in range(0, cls.num_instances)] - - cls.video_path = cls.test_file_path + "/" + ModalityType.VIDEO.name + "/" - cls.audio_path = cls.test_file_path + "/" + ModalityType.AUDIO.name + "/" - cls.text_path = cls.test_file_path + "/" + ModalityType.TEXT.name + "/" - - video_data_loader = VideoLoader(cls.video_path, cls.indizes) - audio_data_loader = AudioLoader(cls.audio_path, cls.indizes) - text_data_loader = TextLoader(cls.text_path, cls.indizes) - - # Load modalities (audio, video, text) - video = UnimodalModality(video_data_loader, ModalityType.VIDEO) - audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO) - text = UnimodalModality(text_data_loader, ModalityType.TEXT) - - cls.mods = [video, audio, text] - cls.data_generator = TestDataGenerator(cls.mods, cls.test_file_path) - cls.data_generator.create_multimodal_data(cls.num_instances) - cls.text_ref = text.apply_representation(Bert()) - cls.audio_ref = audio.apply_representation(MelSpectrogram()) - cls.video_ref = video.apply_representation(ResNet()) + cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) + + os.makedirs(f"{cls.test_file_path}/embeddings") + + cls.text_ref = cls.data_generator.modalities_by_type[ModalityType.TEXT].apply_representation(Bert()) + cls.audio_ref = cls.data_generator.modalities_by_type[ModalityType.AUDIO].apply_representation(MelSpectrogram()) + cls.video_ref = cls.data_generator.modalities_by_type[ModalityType.VIDEO].apply_representation(ResNet()) @classmethod def tearDownClass(cls): @@ -82,7 +62,7 @@ def tearDownClass(cls): shutil.rmtree(cls.test_file_path) def test_load_audio_data_from_file(self): - audio_data_loader = AudioLoader(self.audio_path, self.indizes) + audio_data_loader = AudioLoader(self.data_generator.get_modality_path(ModalityType.AUDIO), self.data_generator.indices) audio = UnimodalModality( audio_data_loader, ModalityType.AUDIO ).apply_representation(MelSpectrogram()) @@ -91,16 +71,16 @@ def test_load_audio_data_from_file(self): assert round(sum(sum(self.audio_ref.data[i])), 4) == round(sum(sum(audio.data[i])), 4) def test_load_video_data_from_file(self): - video_data_loader = VideoLoader(self.video_path, self.indizes) + video_data_loader = VideoLoader(self.data_generator.get_modality_path(ModalityType.VIDEO), self.data_generator.indices) video = UnimodalModality( video_data_loader, ModalityType.VIDEO ).apply_representation(ResNet()) for i in range(0, self.num_instances): - assert round(sum(self.video_ref.data[i]), 4) == round(sum(video.data[i]), 4) + assert round(sum(sum(self.video_ref.data[i])), 4) == round(sum(sum(video.data[i])), 4) def test_load_text_data_from_file(self): - text_data_loader = TextLoader(self.text_path, self.indizes) + text_data_loader = TextLoader(self.data_generator.get_modality_path(ModalityType.TEXT), 
self.data_generator.indices) text = UnimodalModality( text_data_loader, ModalityType.TEXT ).apply_representation(Bert()) diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py index 88e063eef63..eda23348404 100644 --- a/src/main/python/tests/scuro/test_dr_search.py +++ b/src/main/python/tests/scuro/test_dr_search.py @@ -25,14 +25,10 @@ import numpy as np from sklearn import svm from sklearn.metrics import classification_report -from sklearn.model_selection import train_test_split, KFold +from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler -from systemds.scuro.modality.unimodal_modality import UnimodalModality from systemds.scuro.modality.type import ModalityType -from systemds.scuro.dataloader.text_loader import TextLoader -from systemds.scuro.dataloader.audio_loader import AudioLoader -from systemds.scuro.dataloader.video_loader import VideoLoader from systemds.scuro.aligner.dr_search import DRSearch from systemds.scuro.aligner.task import Task from systemds.scuro.models.model import Model @@ -45,7 +41,7 @@ from systemds.scuro.representations.multiplication import Multiplication from systemds.scuro.representations.resnet import ResNet from systemds.scuro.representations.sum import Sum -from tests.scuro.data_generator import TestDataGenerator +from tests.scuro.data_generator import setup_data import warnings @@ -89,52 +85,34 @@ class TestDataLoaders(unittest.TestCase): video = None data_generator = None num_instances = 0 - indizes = [] representations = None @classmethod def setUpClass(cls): cls.test_file_path = "test_data_dr_search" - - if os.path.isdir(cls.test_file_path): - shutil.rmtree(cls.test_file_path) - - os.makedirs(f"{cls.test_file_path}/embeddings") - cls.num_instances = 8 - cls.indizes = [str(i) for i in range(0, cls.num_instances)] - - video_data_loader = VideoLoader( - cls.test_file_path + "/" + ModalityType.VIDEO.name + "/", cls.indizes - ) - audio_data_loader = AudioLoader( - cls.test_file_path + "/" + ModalityType.AUDIO.name + "/", cls.indizes - ) - text_data_loader = TextLoader( - cls.test_file_path + "/" + ModalityType.TEXT.name + "/", cls.indizes - ) - video = UnimodalModality(video_data_loader, ModalityType.VIDEO) - audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO) - text = UnimodalModality(text_data_loader, ModalityType.TEXT) - cls.data_generator = TestDataGenerator([video, audio, text], cls.test_file_path) - cls.data_generator.create_multimodal_data(cls.num_instances) + modalities = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + + cls.data_generator = setup_data(modalities, cls.num_instances, cls.test_file_path) + os.makedirs(f"{cls.test_file_path}/embeddings") #TODO: adapt the representation so they return non aggregated values. 
Apply windowing operation instead - cls.bert = text.apply_representation(Bert()) - cls.mel_spe = audio.apply_representation(MelSpectrogram()) - cls.resnet = video.apply_representation(ResNet()) + + cls.bert = cls.data_generator.modalities_by_type[ModalityType.TEXT].apply_representation(Bert()) + cls.mel_spe = cls.data_generator.modalities_by_type[ModalityType.AUDIO].apply_representation(MelSpectrogram()) + cls.resnet = cls.data_generator.modalities_by_type[ModalityType.VIDEO].apply_representation(ResNet()) cls.mods = [cls.bert, cls.mel_spe, cls.resnet] split = train_test_split( - cls.indizes, cls.data_generator.labels, test_size=0.2, random_state=42 + cls.data_generator.indices, cls.data_generator.labels, test_size=0.2, random_state=42 ) cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ int(i) for i in split[1] ] for m in cls.mods: - m.data = scale_data(m.data, [int(i) for i in cls.train_indizes]) + m.data = scale_data(m.data, cls.train_indizes) cls.representations = [ Concatenation(), diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py new file mode 100644 index 00000000000..a21895c98f0 --- /dev/null +++ b/src/main/python/tests/scuro/test_multimodal_join.py @@ -0,0 +1,103 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
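# An illustrative sketch of the "<" timestamp join that the tests below
# exercise: right-hand timestamps earlier than the *next* left-hand timestamp
# are grouped with the current left-hand frame, mirroring what
# JoinedModality.execute does with the loaders' metadata. The timestamps here
# are made up for the example.
import numpy as np

left_ts = np.array([0, 10, 20])  # e.g. video frame timestamps
right_ts = np.array([0, 4, 8, 12, 16, 24])  # e.g. audio sample timestamps
next_left = np.append(left_ts[1:], np.iinfo(np.int64).max)
groups = [
    right_ts[(right_ts >= lo) & (right_ts < hi)]
    for lo, hi in zip(left_ts, next_left)
]
print(groups)  # [array([0, 4, 8]), array([12, 16]), array([24])]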
+# + +# Test edge cases: unequal number of audio-video timestamps (should still work and add the average over all audio/video samples) + + +import os +import shutil +import unittest + +from systemds.scuro.modality.joined import JoinCondition +from systemds.scuro.representations.window import WindowAggregation +from systemds.scuro.modality.unimodal_modality import UnimodalModality +from systemds.scuro.representations.mel_spectrogram import MelSpectrogram +from systemds.scuro.representations.resnet import ResNet +from tests.scuro.data_generator import setup_data + +from systemds.scuro.dataloader.audio_loader import AudioLoader +from systemds.scuro.dataloader.video_loader import VideoLoader +from systemds.scuro.modality.type import ModalityType + + +class TestUnimodalRepresentations(unittest.TestCase): + test_file_path = None + mods = None + text = None + audio = None + video = None + data_generator = None + num_instances = 0 + indizes = [] + + @classmethod + def setUpClass(cls): + cls.test_file_path = "join_test_data" + cls.num_instances = 4 + cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO] + + cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) + + @classmethod + def tearDownClass(cls): + print("Cleaning up test data") + shutil.rmtree(cls.test_file_path) + + def test_video_audio_join(self): + self._execute_av_join() + + def test_chunked_video_audio_join(self): + self._execute_av_join(2) + + def test_video_chunked_audio_join(self): + self._execute_av_join(None, 2) + + def test_chunked_video_chunked_audio_join(self): + self._execute_av_join(2, 2) + + def _execute_av_join(self, l_chunk_size=None, r_chunk_size=None): + window_size = 2 + video_data_loader = VideoLoader( + self.data_generator.get_modality_path(ModalityType.VIDEO), self.data_generator.indices, chunk_size=l_chunk_size + ) + video = UnimodalModality(video_data_loader, ModalityType.VIDEO) + + audio_data_loader = AudioLoader(self.data_generator.get_modality_path(ModalityType.AUDIO), self.data_generator.indices, r_chunk_size) + audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO) + + mel_audio = audio.apply_representation(MelSpectrogram()) + + resnet_modality = ( + video.join(mel_audio, JoinCondition("timestamp", "timestamp", "<")) + .apply_representation( + ResNet(layer="layer1.0.conv2"), + WindowAggregation(window_size=window_size, aggregation_function="mean"), + ) + .combine("concat") + ) + + assert resnet_modality.left_modality is not None + assert resnet_modality.right_modality is not None + assert len(resnet_modality.left_modality.data) == self.num_instances + assert len(resnet_modality.right_modality.data) == self.num_instances + assert resnet_modality.data is not None + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py new file mode 100644 index 00000000000..aea37eb93b0 --- /dev/null +++ b/src/main/python/tests/scuro/test_unimodal_representations.py @@ -0,0 +1,120 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +import os +import shutil +import unittest + +from systemds.scuro.representations.bow import BoW +from systemds.scuro.representations.word2vec import W2V +from systemds.scuro.representations.tfidf import TfIdf +from systemds.scuro.modality.unimodal_modality import UnimodalModality +from systemds.scuro.representations.bert import Bert +from systemds.scuro.representations.mel_spectrogram import MelSpectrogram +from systemds.scuro.representations.resnet import ResNet +from tests.scuro.data_generator import setup_data + +from systemds.scuro.dataloader.audio_loader import AudioLoader +from systemds.scuro.dataloader.video_loader import VideoLoader +from systemds.scuro.dataloader.text_loader import TextLoader +from systemds.scuro.modality.type import ModalityType + + +class TestUnimodalRepresentations(unittest.TestCase): + test_file_path = None + mods = None + text = None + audio = None + video = None + data_generator = None + num_instances = 0 + indizes = [] + + @classmethod + def setUpClass(cls): + cls.test_file_path = "unimodal_test_data" + + cls.num_instances = 4 + cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + + cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) + os.makedirs(f"{cls.test_file_path}/embeddings") + + @classmethod + def tearDownClass(cls): + print("Cleaning up test data") + shutil.rmtree(cls.test_file_path) + + def test_audio_representations(self): + audio_representations = [MelSpectrogram()] # TODO: add FFT, TFN, 1DCNN + audio_data_loader = AudioLoader( + self.data_generator.get_modality_path(ModalityType.AUDIO), + self.data_generator.indices, + ) + audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO) + + for representation in audio_representations: + r = audio.apply_representation(representation) + assert r.data is not None + assert len(r.data) == self.num_instances + + def test_video_representations(self): + video_representations = [ResNet()] # Todo: add other video representations + video_data_loader = VideoLoader( + self.data_generator.get_modality_path(ModalityType.VIDEO), + self.data_generator.indices, + ) + video = UnimodalModality(video_data_loader, ModalityType.VIDEO) + for representation in video_representations: + r = video.apply_representation(representation) + assert r.data is not None + assert len(r.data) == self.num_instances + + def test_text_representations(self): + # TODO: check params fro BOW, W2V, TfIdf + test_representations = [BoW(2, 2), W2V(5, 2, 2), TfIdf(2), Bert()] + text_data_loader = TextLoader( + self.data_generator.get_modality_path(ModalityType.TEXT), + self.data_generator.indices, + ) + text = UnimodalModality(text_data_loader, ModalityType.TEXT) + + for representation in test_representations: + r = text.apply_representation(representation) + assert r.data is not None + assert len(r.data) == self.num_instances + + def test_chunked_video_representations(self): + video_representations = [ResNet()] + video_data_loader = VideoLoader( + self.data_generator.get_modality_path(ModalityType.VIDEO), + 
self.data_generator.indices, + chunk_size=2, + ) + video = UnimodalModality(video_data_loader, ModalityType.VIDEO) + for representation in video_representations: + r = video.apply_representation(representation) + assert r.data is not None + assert len(r.data) == self.num_instances + + +if __name__ == "__main__": + unittest.main() From 57217c4241cd10b181a0f5954c6a80d23f3df7b7 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 5 Feb 2025 10:20:12 +0100 Subject: [PATCH 05/16] add join for transformed modalities --- .../systemds/scuro/dataloader/audio_loader.py | 1 + .../systemds/scuro/dataloader/base_loader.py | 4 +- .../python/systemds/scuro/modality/joined.py | 213 ++++++++++++------ .../scuro/modality/joined_transformed.py | 16 +- .../systemds/scuro/modality/modality.py | 70 +++++- .../systemds/scuro/modality/transformed.py | 43 +++- .../python/systemds/scuro/modality/type.py | 15 +- .../scuro/modality/unimodal_modality.py | 40 ++-- .../systemds/scuro/representations/bert.py | 8 +- .../systemds/scuro/representations/bow.py | 6 +- .../scuro/representations/mel_spectrogram.py | 23 +- .../systemds/scuro/representations/resnet.py | 97 ++++---- .../systemds/scuro/representations/tfidf.py | 6 +- .../systemds/scuro/representations/window.py | 23 +- .../scuro/representations/word2vec.py | 4 +- .../python/systemds/scuro/utils/__init__.py | 2 +- .../systemds/scuro/utils/schema_helpers.py | 2 +- src/main/python/tests/scuro/data_generator.py | 26 +-- .../python/tests/scuro/test_data_loaders.py | 39 +++- src/main/python/tests/scuro/test_dr_search.py | 33 ++- .../tests/scuro/test_multimodal_join.py | 70 ++++-- .../scuro/test_unimodal_representations.py | 2 +- 22 files changed, 518 insertions(+), 225 deletions(-) diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py index b86d8a28763..f7319fe1912 100644 --- a/src/main/python/systemds/scuro/dataloader/audio_loader.py +++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py @@ -24,6 +24,7 @@ from systemds.scuro.dataloader.base_loader import BaseLoader from systemds.scuro.utils.schema_helpers import create_timestamps + class AudioLoader(BaseLoader): def __init__( self, diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py index 33d4e4920e1..5cdf63f584c 100644 --- a/src/main/python/systemds/scuro/dataloader/base_loader.py +++ b/src/main/python/systemds/scuro/dataloader/base_loader.py @@ -55,11 +55,11 @@ def chunk_size(self): def chunk_size(self, value): self._chunk_size = value self._num_chunks = int(len(self.indices) / self._chunk_size) - + @property def num_chunks(self): return self._num_chunks - + @property def next_chunk(self): return self._next_chunk diff --git a/src/main/python/systemds/scuro/modality/joined.py b/src/main/python/systemds/scuro/modality/joined.py index 5cb3e0f6de8..08d9a1b7ae7 100644 --- a/src/main/python/systemds/scuro/modality/joined.py +++ b/src/main/python/systemds/scuro/modality/joined.py @@ -24,8 +24,8 @@ from systemds.scuro.modality.joined_transformed import JoinedTransformedModality from systemds.scuro.modality.modality import Modality -from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.aggregate import Aggregation +from systemds.scuro.representations.utils import pad_sequences class JoinCondition: @@ -56,17 +56,28 @@ def __init__( self.left_modality = left_modality self.right_modality = right_modality 
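        # Descriptive note: `condition` names the metadata fields and the
        # comparison operator ("<" or "==") that execute() joins on, while
        # `chunk_left` (set below) records whether the left-hand side is a
        # unimodal modality whose data loader streams its files in chunks.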
self.condition = join_condition - self.chunked_execution = chunked_execution # TODO: maybe move this into parent class + self.chunked_execution = ( + chunked_execution # TODO: maybe move this into parent class + ) self.left_type = type(left_modality) self.right_type = type(right_modality) - if self.chunked_execution: + self.chunk_left = False + if self.chunked_execution and self.left_type.__name__.__contains__("Unimodal"): self.chunk_left = left_modality.data_loader.chunk_size is not None - def execute(self, right_starting_idx=0): + def execute(self, starting_idx=0): self.joined_right = self.right_modality.copy_from_instance() - for i, element in enumerate(self.left_modality.data): - idx_1 = list(self.left_modality.metadata.values())[i + right_starting_idx][ + start, end = 0, len(self.left_modality.data) + if self.chunked_execution and not self.chunk_left: + start = starting_idx + end = ( + self.right_modality.data_loader.chunk_size + * self.right_modality.data_loader.next_chunk + ) + + for i in range(start, end): + idx_1 = list(self.left_modality.metadata.values())[i + starting_idx][ self.condition.leftField ] if ( @@ -76,7 +87,10 @@ def execute(self, right_starting_idx=0): nextIdx[:-1] = idx_1[1:] nextIdx[-1] = sys.maxsize - idx_2 = list(self.right_modality.metadata.values())[i + right_starting_idx][ + if self.chunk_left: + i = i + starting_idx + + idx_2 = list(self.right_modality.metadata.values())[i][ self.condition.rightField ] self.joined_right.data.append([]) @@ -87,25 +101,53 @@ def execute(self, right_starting_idx=0): # video: list of lists of numpy array # audio: list of numpy array for j in range(0, len(idx_1)): - self.joined_right.data[i].append([]) - other = np.array([]) + self.joined_right.data[i - starting_idx].append([]) + right = np.array([]) if self.condition.join_type == "<": while c < len(idx_2) and idx_2[c] < nextIdx[j]: - if other.size == 0: - other = self.right_modality.data[i + right_starting_idx][c][np.newaxis, :] + if right.size == 0: + right = self.right_modality.data[i][c] + if right.ndim == 1: + right = right[np.newaxis, :] else: - other = np.concatenate([other, self.right_modality.data[i + right_starting_idx][c][np.newaxis, :]], axis=0) - # other.append(self.right_modality.data[i][c]) + if len(self.right_modality.data) < i: + print(f"i:{i}") + print(f"starting_index:{starting_idx}") + print( + f"right mod length:{len(self.right_modality.data)}" + ) + print(f"left mod length:{len(self.left_modality.data)}") + + if self.right_modality.data[i][c].ndim == 1: + right = np.concatenate( + [ + right, + self.right_modality.data[i][c][np.newaxis, :], + ], + axis=0, + ) + else: + right = np.concatenate( + [right, self.right_modality.data[i][c]], + axis=0, + ) c = c + 1 else: while c < len(idx_2) and idx_2[c] <= idx_1[j]: if idx_2[c] == idx_1[j]: - other.append(self.right_modality.data[i + right_starting_idx][c]) + right.append(self.right_modality.data[i][c]) c = c + 1 - - if len(other) == 0: # Audio and video length sometimes do not match so we add the average all audio samples for this specific frame - other = np.mean(self.right_modality.data[i + right_starting_idx], axis=0)[np.newaxis,:] # TODO: check correct loading for all data layouts, this is similar to missing data, add a different operation for htis - self.joined_right.data[i][j] = other + + if ( + len(right) == 0 + ): # Audio and video length sometimes do not match so we add the average all audio samples for this specific frame + right = np.mean(self.right_modality.data[i][c - 1 : c], axis=0) + if right.ndim == 
1: + right = right[ + np.newaxis, : + ] # TODO: check correct loading for all data layouts, this is similar to missing data, add a different operation for this + + self.joined_right.data[i - starting_idx][j] = right def apply_representation(self, representation, aggregation): self.aggregation = aggregation @@ -119,20 +161,62 @@ def apply_representation(self, representation, aggregation): self.right_modality.extract_raw_data() self.execute() - left_transformed, right_transformed = self._apply_representation(representation) + left_transformed = self._apply_representation( + self.left_modality, representation + ) + right_transformed = self._apply_representation( + self.joined_right, representation + ) left_transformed.update_metadata() right_transformed.update_metadata() - return JoinedTransformedModality(left_transformed, right_transformed, f'joined_{representation.name}') - - - def aggregate(self, aggregation_function, field_name): # TODO: use the filed name to extract data entries from modalities + return JoinedTransformedModality( + left_transformed, right_transformed, f"joined_{representation.name}" + ) + + def aggregate( + self, aggregation_function, field_name + ): # TODO: use the filed name to extract data entries from modalities self.aggregation = Aggregation(aggregation_function, field_name) - + if not self.chunked_execution and self.joined_right: return self.aggregation.aggregate(self.joined_right) - + + return self + + def combine(self, fusion_method): + """ + Combines two or more modalities with each other using a dedicated fusion method + :param other: The modality to be combined + :param fusion_method: The fusion method to be used to combine modalities + """ + modalities = [self.left_modality, self.right_modality] + self.data = [] + reshape = False + if self.left_modality.get_data_shape() != self.joined_right.get_data_shape(): + reshape = True + for i in range(0, len(self.left_modality.data)): + self.data.append([]) + for j in range(0, len(self.left_modality.data[i])): + self.data[i].append([]) + if reshape: + self.joined_right.data[i][j] = self.joined_right.data[i][j].reshape( + self.left_modality.get_data_shape() + ) + fused = np.concatenate( + [self.left_modality.data[i][j], self.joined_right.data[i][j]], + axis=0, + ) + self.data[i][j] = fused + # self.data = fusion_method.transform(modalities) + + for i, instance in enumerate( + self.data + ): # TODO: only if the layout is list_of_lists_of_numpy_array + r = [] + [r.extend(l) for l in instance] + self.data[i] = np.array(r) + self.data = pad_sequences(self.data) return self - def _handle_chunked_execution(self, representation): if self.left_type == self.right_type: @@ -143,61 +227,58 @@ def _handle_chunked_execution(self, representation): return self._apply_representation_chunked( self.left_modality, self.right_modality, False, representation ) - else: + else: # TODO: refactor this approach (it is changing the way the modalities are joined) return self._apply_representation_chunked( self.right_modality, self.left_modality, False, representation ) def _apply_representation_chunked( - self, chunk_modality, other_modality, chunk_other, representation + self, left_modality, right_modality, chunk_right, representation ): - new_left= TransformedModality( - self.left_modality.modality_type, - representation, - {}, - ) - new_right = TransformedModality( - self.right_modality.modality_type, - representation, - {}, - ) + new_left = Modality(left_modality.modality_type, {}) + new_right = Modality(right_modality.modality_type, {}) + + 
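        # Sketch of the loop below: advance the left loader one chunk at a
        # time, re-run the join for that chunk via execute(starting_idx), then
        # transform (and, if requested, window) both sides, accumulating data
        # and metadata into the fresh `new_left`/`new_right` built above.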
transform_right = True while ( - chunk_modality.data_loader.next_chunk - < chunk_modality.data_loader.num_chunks + left_modality.data_loader.next_chunk < left_modality.data_loader.num_chunks ): - print(chunk_modality.data_loader.next_chunk - ) - if chunk_other: - other_modality.extract_raw_data() + print(left_modality.data_loader.next_chunk) + if chunk_right: + right_modality.extract_raw_data() starting_idx = 0 else: - starting_idx = chunk_modality.data_loader.next_chunk * chunk_modality.data_loader.chunk_size - chunk_modality.extract_raw_data() + starting_idx = ( + left_modality.data_loader.next_chunk + * left_modality.data_loader.chunk_size + ) + left_modality.extract_raw_data() self.execute(starting_idx) - - left_transformed, right_transformed = self._apply_representation(representation) - new_left.data.extend( - left_transformed.data - ) - new_left.metadata.update(left_transformed.metadata) - new_right.data.extend( - right_transformed.data + + right_transformed = self._apply_representation( + self.joined_right, representation ) + new_right.data.extend(right_transformed.data) new_right.metadata.update(right_transformed.metadata) - + + left_transformed = self._apply_representation(left_modality, representation) + new_left.data.extend(left_transformed.data) + new_left.metadata.update(left_transformed.metadata) + new_left.update_metadata() new_right.update_metadata() - return JoinedTransformedModality(new_left, new_right, f'joined_{representation.name}') - + return JoinedTransformedModality( + new_left, new_right, f"joined_{representation.name}" + ) - def _apply_representation(self, representation): - left_transformed = representation.transform(self.left_modality) - if self.aggregation: - left_transformed = self.aggregation.window(left_transformed) - - right_transformed = representation.transform(self.joined_right) + def _apply_representation(self, modality, representation): + transformed = representation.transform(modality) if self.aggregation: - right_transformed = self.aggregation.window(right_transformed) - - return left_transformed, right_transformed \ No newline at end of file + aggregated_data_left = self.aggregation.window(transformed) + transformed = Modality( + transformed.modality_type, + transformed.metadata, + ) + transformed.data = aggregated_data_left + + return transformed diff --git a/src/main/python/systemds/scuro/modality/joined_transformed.py b/src/main/python/systemds/scuro/modality/joined_transformed.py index 558b0e3760e..e2b53671aa8 100644 --- a/src/main/python/systemds/scuro/modality/joined_transformed.py +++ b/src/main/python/systemds/scuro/modality/joined_transformed.py @@ -26,6 +26,7 @@ from systemds.scuro.modality.modality import Modality from systemds.scuro.representations.utils import pad_sequences + class JoinedTransformedModality(Modality): def __init__(self, left_modality, right_modality, transformation): @@ -33,7 +34,9 @@ def __init__(self, left_modality, right_modality, transformation): Parent class of the different Modalities (unimodal & multimodal) :param transformation: Representation to be applied on the modality """ - super().__init__(reduce(or_, [left_modality.modality_type], right_modality.modality_type)) + super().__init__( + reduce(or_, [left_modality.modality_type], right_modality.modality_type) + ) self.transformation = transformation self.left_modality = left_modality self.right_modality = right_modality @@ -50,11 +53,16 @@ def combine(self, fusion_method): self.data.append([]) for j in range(0, len(self.left_modality.data[i])): 
self.data[i].append([]) - fused = np.concatenate([self.left_modality.data[i][j], self.right_modality.data[i][j]], axis=0) + fused = np.concatenate( + [self.left_modality.data[i][j], self.right_modality.data[i][j]], + axis=0, + ) self.data[i][j] = fused # self.data = fusion_method.transform(modalities) - - for i, instance in enumerate(self.data): # TODO: only if the layout is list_of_lists_of_numpy_array + + for i, instance in enumerate( + self.data + ): # TODO: only if the layout is list_of_lists_of_numpy_array r = [] [r.extend(l) for l in instance] self.data[i] = np.array(r) diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py index 28d27b04144..cce26eee014 100644 --- a/src/main/python/systemds/scuro/modality/modality.py +++ b/src/main/python/systemds/scuro/modality/modality.py @@ -34,7 +34,7 @@ def __init__(self, modalityType: ModalityType, metadata=None): """ self.modality_type = modalityType self.schema = modalityType.get_schema() - self.data = None + self.data = [] self.data_type = None self.cost = None self.shape = None @@ -45,17 +45,69 @@ def get_modality_names(self) -> List[str]: """ Extracts the individual unimodal modalities for a given transformed modality. """ - return [modality.name for modality in ModalityType if modality in self.modality_type] - - + return [ + modality.name for modality in ModalityType if modality in self.modality_type + ] + + def copy_from_instance(self): + return type(self)(self.modality_type, self.metadata) + def update_metadata(self): md_copy = self.metadata self.metadata = {} for i, (md_k, md_v) in enumerate(md_copy.items()): updated_md = self.modality_type.update_metadata(md_v, self.data[i]) self.metadata[md_k] = updated_md - - - def window(self, windowSize, aggregationFunction, fieldName): - pass - \ No newline at end of file + + def get_metadata_at_position(self, position: int): + return self.metadata[self.dataIndex][position] + + def flatten(self): + for num_instance, instance in enumerate(self.data): + if type(instance) is np.ndarray: + self.data[num_instance] = instance.flatten() + elif type(instance) is list: + self.data[num_instance] = np.array( + [item for sublist in instance for item in sublist] + ) + + self.data = np.array(self.data) + return self + + def get_data_layout(self): + if not self.data: + return self.data + + if isinstance(self.data[0], list): + return "list_of_lists_of_numpy_array" + elif isinstance(self.data[0], np.ndarray): + return "list_of_numpy_array" + + def get_data_shape(self): + layout = self.get_data_layout() + if not layout: + return None + + if layout == "list_of_lists_of_numpy_array": + return self.data[0][0].shape + elif layout == "list_of_numpy_array": + return self.data[0].shape + + def get_data_dtype(self): + layout = self.get_data_layout() + if not layout: + return None + + if layout == "list_of_lists_of_numpy_array": + return self.data[0][0].dtype + elif layout == "list_of_numpy_array": + return self.data[0].dtype + + def update_data_layout(self): + if not self.data: + return + + self.schema["data_layout"]["representation"] = self.get_data_layout() + + self.shape = self.get_data_shape() + self.schema["data_layout"]["type"] = self.get_data_dtype() diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py index e13395045f2..64bfba0819f 100644 --- a/src/main/python/systemds/scuro/modality/transformed.py +++ b/src/main/python/systemds/scuro/modality/transformed.py @@ -21,7 +21,9 @@ 
from functools import reduce from operator import or_ +from systemds.scuro.modality.joined import JoinedModality from systemds.scuro.modality.modality import Modality +from systemds.scuro.representations.window import WindowAggregation class TransformedModality(Modality): @@ -39,6 +41,44 @@ def __init__(self, modality_type, transformation, metadata): def copy_from_instance(self): return type(self)(self.modality_type, self.transformation, self.metadata) + def join(self, right, join_condition): + chunked_execution = False + if type(right).__name__.__contains__("Unimodal"): + if right.data_loader.chunk_size: + chunked_execution = True + elif right.data is None or len(right.data) == 0: + right.extract_raw_data() + + joined_modality = JoinedModality( + reduce(or_, [right.modality_type], self.modality_type), + self, + right, + join_condition, + chunked_execution, + ) + + if not chunked_execution: + joined_modality.execute(0) + + return joined_modality + + def window(self, windowSize, aggregationFunction, fieldName=None): + transformed_modality = TransformedModality( + self.modality_type, "window", self.metadata + ) + w = WindowAggregation(windowSize, aggregationFunction) + transformed_modality.data = w.window(self) + + return transformed_modality + + def apply_representation(self, representation, aggregation): + new_modality = representation.transform(self) + + if aggregation: + new_modality.data = aggregation.window(new_modality) + + new_modality.update_metadata() + return new_modality def combine(self, other, fusion_method): """ @@ -48,7 +88,8 @@ def combine(self, other, fusion_method): """ fused_modality = TransformedModality( reduce(or_, (o.modality_type for o in other), self.modality_type), - fusion_method, self.metadata + fusion_method, + self.metadata, ) modalities = [self] modalities.extend(other) diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py index 0dbacccef5a..197ad23c540 100644 --- a/src/main/python/systemds/scuro/modality/type.py +++ b/src/main/python/systemds/scuro/modality/type.py @@ -70,7 +70,7 @@ def update_metadata(cls, name, md, data): mdHandler = cls._metadata_handlers.get(name) if mdHandler: return mdHandler(md, data) - + def extract_data(self, data, index): if self.get("data_layout").get("representation") == "list_array": return data[index] @@ -91,6 +91,19 @@ def handle_audio_metadata(md, data): return md +@ModalitySchemas.register_metadata_handler("VIDEO") +def handle_video_metadata(md, data): + new_frequency = calculate_new_frequency(len(data), md["length"], md["fps"]) + md.update( + { + "length": len(data), + "fps": new_frequency, + "timestamp": create_timestamps(new_frequency, len(data)), + } + ) + return md + + class ModalityType(Flag): TEXT = auto() AUDIO = auto() diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py index 60d9ad004fe..ae33b6605ba 100644 --- a/src/main/python/systemds/scuro/modality/unimodal_modality.py +++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py @@ -23,8 +23,8 @@ from systemds.scuro.dataloader.base_loader import BaseLoader -from systemds.scuro.modality.joined import JoinedModality from systemds.scuro.modality.modality import Modality +from systemds.scuro.modality.joined import JoinedModality from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.modality.type import ModalityType @@ -39,9 +39,20 @@ def __init__(self, data_loader: BaseLoader, 
         """
         super().__init__(modality_type, None)
         self.data_loader = data_loader
-
+
     def copy_from_instance(self):
-        return type(self)(self.data_loader, self.modality_type)
+        new_instance = type(self)(self.data_loader, self.modality_type)
+        if self.metadata:
+            new_instance.metadata = self.metadata.copy()
+        return new_instance
+
+    def get_metadata_at_position(self, position: int):
+        if self.data_loader.chunk_size:
+            return self.metadata[
+                self.data_loader.chunk_size * self.data_loader.next_chunk + position
+            ]
+
+        return self.metadata[self.dataIndex][position]
 
     def extract_raw_data(self):
         """
@@ -53,39 +64,38 @@ def extract_raw_data(self):
 
     def join(self, other, join_condition):
         if isinstance(other, UnimodalModality):
             self.data_loader.update_chunk_sizes(other.data_loader)
-
+
         joined_modality = JoinedModality(
             reduce(or_, [other.modality_type], self.modality_type),
             self,
             other,
             join_condition,
-            self.data_loader.chunk_size is not None
+            self.data_loader.chunk_size is not None,
         )
 
         return joined_modality
 
-    # TODO: maybe this can be made generic so it can be used in the join class as well
     def apply_representation(self, representation, aggregation=None):
-        new_modality = TransformedModality(self.modality_type, representation, self.data_loader.metadata)
+        new_modality = TransformedModality(
+            self.modality_type, representation.name, self.data_loader.metadata.copy()
+        )
         new_modality.data = []
         if self.data_loader.chunk_size:
-            while (
-                self.data_loader.next_chunk
-                < self.data_loader.num_chunks
-            ):
+            while self.data_loader.next_chunk < self.data_loader.num_chunks:
                 self.extract_raw_data()
                 transformed_chunk = representation.transform(self)
                 if aggregation:
-                    transformed_chunk = aggregation.window(transformed_chunk)
+                    transformed_chunk.data = aggregation.window(transformed_chunk)
                 new_modality.data.extend(transformed_chunk.data)
+                new_modality.metadata.update(transformed_chunk.metadata)
         else:
             if not self.data:
                 self.extract_raw_data()
             new_modality = representation.transform(self)
-
+
             if aggregation:
-                new_modality = aggregation.window(new_modality)
-
+                new_modality.data = aggregation.window(new_modality)
+
+        new_modality.update_metadata()
         return new_modality
diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py
index 08cb85e7395..bfaaa22642a 100644
--- a/src/main/python/systemds/scuro/representations/bert.py
+++ b/src/main/python/systemds/scuro/representations/bert.py
@@ -35,7 +35,9 @@ def __init__(self, output_file=None):
         self.output_file = output_file
 
     def transform(self, modality):
-        transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata)
+        transformed_modality = TransformedModality(
+            modality.modality_type, self, modality.metadata
+        )
         model_name = "bert-base-uncased"
         tokenizer = BertTokenizer.from_pretrained(
             model_name, clean_up_tokenization_spaces=True
@@ -47,7 +49,7 @@ def transform(self, modality):
 
         if self.output_file is not None:
             save_embeddings(embeddings, self.output_file)
-
+
         transformed_modality.data = embeddings
         return transformed_modality
 
@@ -58,7 +60,7 @@ def create_embeddings(self, data, model, tokenizer):
 
             with torch.no_grad():
                 outputs = model(**inputs)
-
+
             cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
             embeddings.append(cls_embedding)
 
diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py
index 52863aaae3e..f16f6ec04d8 100644
--- a/src/main/python/systemds/scuro/representations/bow.py
+++ b/src/main/python/systemds/scuro/representations/bow.py
@@ -34,7 +34,9 @@ def __init__(self, ngram_range, min_df, output_file=None):
         self.output_file = output_file
 
     def transform(self, modality):
-        transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata)
+        transformed_modality = TransformedModality(
+            modality.modality_type, self, modality.metadata
+        )
         vectorizer = CountVectorizer(
             ngram_range=(1, self.ngram_range), min_df=self.min_df
         )
@@ -43,6 +45,6 @@ def transform(self, modality):
 
         if self.output_file is not None:
             save_embeddings(X, self.output_file)
-
+
         transformed_modality.data = X
         return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
index 3ac026374d7..73da83b74b9 100644
--- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
@@ -31,25 +31,26 @@ def __init__(self):
         super().__init__("MelSpectrogram")
 
     def transform(self, modality):
-        transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata)
+        transformed_modality = TransformedModality(
+            modality.modality_type, self, modality.metadata
+        )
         result = []
         max_length = 0
         for sample in modality.data:
-            S = librosa.feature.melspectrogram(
-                y=sample, sr=22050
-            )
+            S = librosa.feature.melspectrogram(y=sample, sr=22050)
             S_dB = librosa.power_to_db(S, ref=np.max)
             if S_dB.shape[-1] > max_length:
                 max_length = S_dB.shape[-1]
             result.append(S_dB.T)
-
+
         transformed_modality.data = result
         return transformed_modality
-
-
+
     def plot_spectrogram(self, spectrogram):
         plt.figure(figsize=(10, 4))
-        librosa.display.specshow(spectrogram, x_axis='time', y_axis='mel', sr=22050, cmap='viridis')
-        plt.colorbar(format='%+2.0f dB')
-        plt.title('Mel Spectrogram')
-        plt.savefig('spectrogram.jpg')
+        librosa.display.specshow(
+            spectrogram, x_axis="time", y_axis="mel", sr=22050, cmap="viridis"
+        )
+        plt.colorbar(format="%+2.0f dB")
+        plt.title("Mel Spectrogram")
+        plt.savefig("spectrogram.jpg")
diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py
index de80562b16f..2b80436aa8f 100644
--- a/src/main/python/systemds/scuro/representations/resnet.py
+++ b/src/main/python/systemds/scuro/representations/resnet.py
@@ -22,7 +22,7 @@
 
 import h5py
 
-from systemds.scuro.modality.modality import Modality
+from systemds.scuro.modality.transformed import TransformedModality
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 from typing import Callable, Dict, Tuple, Any
 import torch.utils.data
@@ -33,23 +33,49 @@
 
 if torch.backends.mps.is_available():
     DEVICE = torch.device("mps")
+elif torch.backends.cudnn.is_available():
+    DEVICE = torch.device("cuda")
 else:
     DEVICE = torch.device("cpu")
 
+
 class ResNet(UnimodalRepresentation):
-    def __init__(self, layer="avgpool", output_file=None):
+    def __init__(self, layer="avgpool", model_name="ResNet18", output_file=None):
         super().__init__("ResNet")
 
         self.output_file = output_file
         self.layer_name = layer
+        self.model = model_name
+        self.model.eval()
+        for param in self.model.parameters():
+            param.requires_grad = False
 
-    def transform(self, modality):
+        class Identity(torch.nn.Module):
+            def forward(self, input_: torch.Tensor) -> torch.Tensor:
+                return input_
 
-        resnet = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to(DEVICE)
-        resnet.eval()
+        self.model.fc = Identity()
+
+    @property
+    def model(self):
+        return self._model
+
+    @model.setter
+    def model(self, model):
+        if model == "ResNet18":
+            self._model = models.resnet18(pretrained=True).to(DEVICE)
+        elif model == "ResNet34":
+            self._model = models.resnet34(pretrained=True).to(DEVICE)
+        elif model == "ResNet50":
+            self._model = models.resnet50(pretrained=True).to(DEVICE)
+        elif model == "ResNet101":
+            self._model = models.resnet101(pretrained=True).to(DEVICE)
+        elif model == "ResNet152":
+            self._model = models.resnet152(pretrained=True).to(DEVICE)
+        else:
+            raise NotImplementedError
 
-        for param in resnet.parameters():
-            param.requires_grad = False
+    def transform(self, modality):
 
         t = transforms.Compose(
             [
@@ -66,12 +92,6 @@ def transform(self, modality):
         dataset = ResNetDataset(modality.data, t)
         embeddings = {}
 
-        class Identity(torch.nn.Module):
-            def forward(self, input_: torch.Tensor) -> torch.Tensor:
-                return input_
-
-        resnet.fc = Identity()
-
         res5c_output = None
 
         def get_features(name_):
@@ -84,7 +104,7 @@ def hook(
             return hook
 
         if self.layer_name:
-            for name, layer in resnet.named_modules():
+            for name, layer in self.model.named_modules():
                 if name == self.layer_name:
                     layer.register_forward_hook(get_features(name))
                     break
@@ -100,36 +120,19 @@ def hook(
                 frame_ids_range = range(start_index, end_index)
                 frame_batch = frames[frame_ids_range]
 
-                _ = resnet(frame_batch)
+                _ = self.model(frame_batch)
 
                 values = res5c_output
 
-                # if self.layer_name == "avgpool" or self.layer_name == "maxpool":
-                #     embeddings[video_id].extend(
-                #         torch.flatten(values, 1).detach().cpu().numpy()
-                #     )
-                #
-                # else:
                 pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1))
 
                 embeddings[video_id].extend(
                     torch.flatten(pooled, 1).detach().cpu().numpy()
                 )
 
-        # TODO: this functionality could be used for operator reuse if the data stays the same
-        if self.output_file is not None:
-            with h5py.File(self.output_file, "w") as hdf:
-                for key, value in embeddings.items():
-                    hdf.create_dataset(key, data=value)
-
-        # emb = []
-
-        # TODO: this should be moved out to a windowing function
-        # for video in embeddings.values():
-        #     emb.append(np.array(video).mean(axis=0).tolist())
-
-        transformed_modality = Modality(modality.modality_type, modality.metadata)
+        transformed_modality = TransformedModality(
+            modality.modality_type, "resnet", modality.metadata
+        )
         transformed_modality.data = list(embeddings.values())
-        transformed_modality.schema["data_layout"]["representation"] = "list_of_lists_of_numpy_array" # TODO: create infer data_layout method in modality
-        transformed_modality.schema["data_layout"]["type"] = transformed_modality.data[0][0].dtype # TODO: create infer data_layout method in modality
+        transformed_modality.update_data_layout()
 
         return transformed_modality
 
@@ -141,14 +144,20 @@ def __init__(self, data: str, tf: Callable = None):
 
     def __getitem__(self, index) -> Dict[str, object]:
         data = self.data[index]
-        output = torch.empty((len(data), 3, 224, 224))
-
-        for i, d in enumerate(data):
-            if data[0].ndim < 3:
-                d = torch.tensor(d)
-                d = d.repeat(3, 1, 1)
-
-            output[i] = self.tf(d)
+        if type(data) is np.ndarray:
+            output = torch.empty((1, 3, 224, 224))
+            d = torch.tensor(data)
+            d = d.repeat(3, 1, 1)
+            output[0] = self.tf(d)
+        else:
+            output = torch.empty((len(data), 3, 224, 224))
+
+            for i, d in enumerate(data):
+                if data[0].ndim < 3:
+                    d = torch.tensor(d)
+                    d = d.repeat(3, 1, 1)
+
+                output[i] = self.tf(d)
 
         return {"id": index, "data": output}
diff --git a/src/main/python/systemds/scuro/representations/tfidf.py b/src/main/python/systemds/scuro/representations/tfidf.py
index 0d149f30a79..02cfb927c71 100644
--- a/src/main/python/systemds/scuro/representations/tfidf.py
+++ b/src/main/python/systemds/scuro/representations/tfidf.py
@@ -32,7 +32,9 @@ def __init__(self, min_df, output_file=None):
         self.output_file = output_file
 
     def transform(self, modality):
-        transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata)
+        transformed_modality = TransformedModality(
+            modality.modality_type, self, modality.metadata
+        )
         vectorizer = TfidfVectorizer(min_df=self.min_df)
 
         X = vectorizer.fit_transform(modality.data)
@@ -40,6 +42,6 @@ def transform(self, modality):
 
         if self.output_file is not None:
             save_embeddings(X, self.output_file)
-
+
         transformed_modality.data = X
         return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/window.py b/src/main/python/systemds/scuro/representations/window.py
index b589c8664b5..af0301d0e3b 100644
--- a/src/main/python/systemds/scuro/representations/window.py
+++ b/src/main/python/systemds/scuro/representations/window.py
@@ -20,27 +20,30 @@
 # -------------------------------------------------------------
 import numpy as np
 import math
-from systemds.scuro import TransformedModality
-from systemds.scuro.representations.aggregate import Aggregation
 
+# TODO: move this into the aggregation class and add an aggregate() and a window(window_size) function there so they can use the same functionality.
 class WindowAggregation:
     def __init__(self, window_size, aggregation_function):
         self.window_size = window_size
         self.aggregation_function = aggregation_function
-
+
     def window(self, modality):
         # data is a 2d array
-        transformed_modality = TransformedModality(modality.modality_type, "window", modality.metadata)
+        windowed_data = []
 
         for instance in modality.data:
             window_length = math.ceil(len(instance) / self.window_size)
             result = [[] for _ in range(0, window_length)]
             # if modality.schema["data_layout"]["representation"] == "list_of_lists_of_numpy_array":
             data = np.stack(instance)
             for i in range(0, window_length):
-                result[i] = np.mean(data[i * self.window_size: i * self.window_size + self.window_size], axis=0) # TODO: add actual aggregation function here
-
-            transformed_modality.data.append(result)
-
-        return transformed_modality
-
\ No newline at end of file
+                result[i] = np.mean(
+                    data[
+                        i * self.window_size : i * self.window_size + self.window_size
+                    ],
+                    axis=0,
+                )  # TODO: add actual aggregation function here
+
+            windowed_data.append(result)
+
+        return windowed_data
diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py
index a460e918bfe..51729d635fa 100644
--- a/src/main/python/systemds/scuro/representations/word2vec.py
+++ b/src/main/python/systemds/scuro/representations/word2vec.py
@@ -44,7 +44,9 @@ def __init__(self, vector_size, min_count, window, output_file=None):
         self.output_file = output_file
 
     def transform(self, modality):
-        transformed_modality = TransformedModality(modality.modality_type, self, modality.metadata)
+        transformed_modality = TransformedModality(
+            modality.modality_type, self, modality.metadata
+        )
         t = [word_tokenize(s.lower()) for s in modality.data]
         model = Word2Vec(
             sentences=t,
diff --git a/src/main/python/systemds/scuro/utils/__init__.py b/src/main/python/systemds/scuro/utils/__init__.py
index 0a47bfff92c..e66abb4646f 100644
--- a/src/main/python/systemds/scuro/utils/__init__.py
+++ b/src/main/python/systemds/scuro/utils/__init__.py
@@ -17,4 +17,4 @@
 # specific language governing permissions and limitations
 # under the License.
 #
-# -------------------------------------------------------------
\ No newline at end of file
+# -------------------------------------------------------------
diff --git a/src/main/python/systemds/scuro/utils/schema_helpers.py b/src/main/python/systemds/scuro/utils/schema_helpers.py
index dfad21012cd..a88e81f7161 100644
--- a/src/main/python/systemds/scuro/utils/schema_helpers.py
+++ b/src/main/python/systemds/scuro/utils/schema_helpers.py
@@ -40,4 +40,4 @@ def create_timestamps(frequency, sample_length, start_datetime=None):
 def calculate_new_frequency(new_length, old_length, old_frequency):
     duration = old_length / old_frequency
     new_frequency = new_length / duration
-    return math.floor(new_frequency)
+    return new_frequency
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index bce71ebefaf..ec0783df9cb 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -33,15 +33,15 @@ def setup_data(modalities, num_instances, path):
 
     if os.path.isdir(path):
         shutil.rmtree(path)
-
+
     os.makedirs(path)
-
+
     indizes = [str(i) for i in range(0, num_instances)]
-
+
     modalities_to_create = []
     for modality in modalities:
         mod_path = path + "/" + modality.name + "/"
-
+
         if modality == ModalityType.VIDEO:
             data_loader = VideoLoader(mod_path, indizes)
         elif modality == ModalityType.AUDIO:
@@ -49,10 +49,10 @@ def setup_data(modalities, num_instances, path):
         elif modality == ModalityType.TEXT:
             data_loader = TextLoader(mod_path, indizes)
         else:
-            raise 'Modality not supported in DataGenerator'
-
+            raise ValueError("Modality not supported in DataGenerator")
+
         modalities_to_create.append(UnimodalModality(data_loader, modality))
-
+
     data_generator = TestDataGenerator(modalities_to_create, path)
     data_generator.create_multimodal_data(num_instances)
     return data_generator
@@ -60,12 +60,12 @@
 
 class TestDataGenerator:
     def __init__(self, modalities, path, balanced=True):
-
+
         self.modalities = modalities
         self.modalities_by_type = {}
         for modality in modalities:
             self.modalities_by_type[modality.modality_type] = modality
-
+
         self._indices = None
         self.path = path
         self.balanced = balanced
@@ -76,16 +76,16 @@ def __init__(self, modalities, path, balanced=True):
             modality.file_path = mod_path
         self.labels = []
         self.label_path = f"{path}/labels.npy"
-
+
     def get_modality_path(self, modality_type):
        return self.modalities_by_type[modality_type].data_loader.source_path
-
+
     @property
     def indices(self):
         if self._indices is None:
-            raise 'No indices available, please call setup_data first'
+            raise ValueError("No indices available, please call setup_data first")
         return self._indices
-
+
     def create_multimodal_data(self, num_instances, duration=2, seed=42):
         speed_fast = 0
         speed_slow = 0
diff --git a/src/main/python/tests/scuro/test_data_loaders.py b/src/main/python/tests/scuro/test_data_loaders.py
index a8058c1391b..4ca77b205d0 100644
--- a/src/main/python/tests/scuro/test_data_loaders.py
+++ b/src/main/python/tests/scuro/test_data_loaders.py
@@ -49,12 +49,18 @@ def setUpClass(cls):
         cls.num_instances = 2
         cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
         cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path)
-
+
         os.makedirs(f"{cls.test_file_path}/embeddings")
-
-        cls.text_ref = cls.data_generator.modalities_by_type[ModalityType.TEXT].apply_representation(Bert())
-        cls.audio_ref = cls.data_generator.modalities_by_type[ModalityType.AUDIO].apply_representation(MelSpectrogram())
-        cls.video_ref = cls.data_generator.modalities_by_type[ModalityType.VIDEO].apply_representation(ResNet())
+
+        cls.text_ref = cls.data_generator.modalities_by_type[
+            ModalityType.TEXT
+        ].apply_representation(Bert())
+        cls.audio_ref = cls.data_generator.modalities_by_type[
+            ModalityType.AUDIO
+        ].apply_representation(MelSpectrogram())
+        cls.video_ref = cls.data_generator.modalities_by_type[
+            ModalityType.VIDEO
+        ].apply_representation(ResNet())
 
     @classmethod
     def tearDownClass(cls):
@@ -62,25 +68,38 @@ def tearDownClass(cls):
         shutil.rmtree(cls.test_file_path)
 
     def test_load_audio_data_from_file(self):
-        audio_data_loader = AudioLoader(self.data_generator.get_modality_path(ModalityType.AUDIO), self.data_generator.indices)
+        audio_data_loader = AudioLoader(
+            self.data_generator.get_modality_path(ModalityType.AUDIO),
+            self.data_generator.indices,
+        )
         audio = UnimodalModality(
             audio_data_loader, ModalityType.AUDIO
         ).apply_representation(MelSpectrogram())
 
         for i in range(0, self.num_instances):
-            assert round(sum(sum(self.audio_ref.data[i])), 4) == round(sum(sum(audio.data[i])), 4)
+            assert round(sum(sum(self.audio_ref.data[i])), 4) == round(
+                sum(sum(audio.data[i])), 4
+            )
 
     def test_load_video_data_from_file(self):
-        video_data_loader = VideoLoader(self.data_generator.get_modality_path(ModalityType.VIDEO), self.data_generator.indices)
+        video_data_loader = VideoLoader(
+            self.data_generator.get_modality_path(ModalityType.VIDEO),
+            self.data_generator.indices,
+        )
         video = UnimodalModality(
             video_data_loader, ModalityType.VIDEO
         ).apply_representation(ResNet())
 
         for i in range(0, self.num_instances):
-            assert round(sum(sum(self.video_ref.data[i])), 4) == round(sum(sum(video.data[i])), 4)
+            assert round(sum(sum(self.video_ref.data[i])), 4) == round(
+                sum(sum(video.data[i])), 4
+            )
 
     def test_load_text_data_from_file(self):
-        text_data_loader = TextLoader(self.data_generator.get_modality_path(ModalityType.TEXT), self.data_generator.indices)
+        text_data_loader = TextLoader(
+            self.data_generator.get_modality_path(ModalityType.TEXT),
+            self.data_generator.indices,
+        )
         text = UnimodalModality(
             text_data_loader, ModalityType.TEXT
         ).apply_representation(Bert())
diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py
index eda23348404..0e9b01557d0 100644
--- a/src/main/python/tests/scuro/test_dr_search.py
+++ b/src/main/python/tests/scuro/test_dr_search.py
@@ -92,20 +92,35 @@ def setUpClass(cls):
         cls.test_file_path = "test_data_dr_search"
         cls.num_instances = 8
         modalities = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
-
-        cls.data_generator = setup_data(modalities, cls.num_instances, cls.test_file_path)
+
+        cls.data_generator = setup_data(
+            modalities, cls.num_instances, cls.test_file_path
+        )
 
         os.makedirs(f"{cls.test_file_path}/embeddings")
 
-        #TODO: adapt the representation so they return non aggregated values. Apply windowing operation instead
-
-        cls.bert = cls.data_generator.modalities_by_type[ModalityType.TEXT].apply_representation(Bert())
-        cls.mel_spe = cls.data_generator.modalities_by_type[ModalityType.AUDIO].apply_representation(MelSpectrogram())
-        cls.resnet = cls.data_generator.modalities_by_type[ModalityType.VIDEO].apply_representation(ResNet())
+        # TODO: adapt the representation so they return non aggregated values. Apply windowing operation instead
+        cls.bert = cls.data_generator.modalities_by_type[
+            ModalityType.TEXT
+        ].apply_representation(Bert())
+        cls.mel_spe = (
+            cls.data_generator.modalities_by_type[ModalityType.AUDIO]
+            .apply_representation(MelSpectrogram())
+            .flatten()
+        )
+        cls.resnet = (
+            cls.data_generator.modalities_by_type[ModalityType.VIDEO]
+            .apply_representation(ResNet())
+            .window(10, "avg")
+            .flatten()
+        )
 
         cls.mods = [cls.bert, cls.mel_spe, cls.resnet]
         split = train_test_split(
-            cls.data_generator.indices, cls.data_generator.labels, test_size=0.2, random_state=42
+            cls.data_generator.indices,
+            cls.data_generator.labels,
+            test_size=0.2,
+            random_state=42,
         )
         cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [
             int(i) for i in split[1]
@@ -117,7 +132,7 @@ def setUpClass(cls):
         cls.representations = [
             Concatenation(),
             Average(),
-            RowMax(),
+            RowMax(100),
             Multiplication(),
             Sum(),
             LSTM(width=256, depth=3),
diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py
index a21895c98f0..c48f5f56b2a 100644
--- a/src/main/python/tests/scuro/test_multimodal_join.py
+++ b/src/main/python/tests/scuro/test_multimodal_join.py
@@ -18,10 +18,8 @@
 # under the License.
 #
 
-# Test edge cases: unequal number of audio-video timestamps (should still work and add the average over all audio/video samples)
+# TODO: Test edge cases: unequal number of audio-video timestamps (should still work and add the average over all audio/video samples)
 
-
-import os
 import shutil
 import unittest
 
@@ -37,7 +35,7 @@
 from systemds.scuro.modality.type import ModalityType
 
 
-class TestUnimodalRepresentations(unittest.TestCase):
+class TestMultimodalJoin(unittest.TestCase):
     test_file_path = None
     mods = None
     text = None
@@ -52,7 +50,7 @@ def setUpClass(cls):
         cls.test_file_path = "join_test_data"
         cls.num_instances = 4
         cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO]
-
+
         cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path)
 
     @classmethod
@@ -61,33 +59,64 @@ def tearDownClass(cls):
         shutil.rmtree(cls.test_file_path)
 
     def test_video_audio_join(self):
-        self._execute_av_join()
+        self._execute_va_join()
 
     def test_chunked_video_audio_join(self):
-        self._execute_av_join(2)
-
+        self._execute_va_join(2)
+
     def test_video_chunked_audio_join(self):
-        self._execute_av_join(None, 2)
+        self._execute_va_join(None, 2)
 
     def test_chunked_video_chunked_audio_join(self):
-        self._execute_av_join(2, 2)
+        self._execute_va_join(2, 2)
+
+    def test_audio_video_join(self):
+        # Audio has a much higher frequency than video, hence we would need to
+        # duplicate or interpolate frames to match them to the audio frequency
+        self._execute_av_join()
+
+    # TODO
+    # def test_chunked_audio_video_join(self):
+    #     self._execute_av_join(2)
+
+    # TODO
+    # def test_chunked_audio_chunked_video_join(self):
+    #     self._execute_av_join(2, 2)
+
+    def _execute_va_join(self, l_chunk_size=None, r_chunk_size=None):
+        video, audio = self._prepare_data(l_chunk_size, r_chunk_size)
+        self._join(video, audio, 2)
 
     def _execute_av_join(self, l_chunk_size=None, r_chunk_size=None):
-        window_size = 2
+        video, audio = self._prepare_data(l_chunk_size, r_chunk_size)
+        self._join(audio, video, 2)
+
+    def _prepare_data(self, l_chunk_size=None, r_chunk_size=None):
         video_data_loader = VideoLoader(
-            self.data_generator.get_modality_path(ModalityType.VIDEO), self.data_generator.indices, chunk_size=l_chunk_size
+            self.data_generator.get_modality_path(ModalityType.VIDEO),
+            self.data_generator.indices,
+            chunk_size=l_chunk_size,
         )
         video = UnimodalModality(video_data_loader, ModalityType.VIDEO)
-
-        audio_data_loader = AudioLoader(self.data_generator.get_modality_path(ModalityType.AUDIO), self.data_generator.indices, r_chunk_size)
+
+        audio_data_loader = AudioLoader(
+            self.data_generator.get_modality_path(ModalityType.AUDIO),
+            self.data_generator.indices,
+            r_chunk_size,
+        )
        audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO)
-
+
         mel_audio = audio.apply_representation(MelSpectrogram())
-
+
+        return video, mel_audio
+
+    def _join(self, left_modality, right_modality, window_size):
         resnet_modality = (
-            video.join(mel_audio, JoinCondition("timestamp", "timestamp", "<"))
+            left_modality.join(
+                right_modality, JoinCondition("timestamp", "timestamp", "<")
+            )
             .apply_representation(
-                ResNet(layer="layer1.0.conv2"),
+                ResNet(layer="layer1.0.conv2", model_name="ResNet50"),
                 WindowAggregation(window_size=window_size, aggregation_function="mean"),
             )
             .combine("concat")
@@ -99,5 +128,8 @@ def _execute_av_join(self, l_chunk_size=None, r_chunk_size=None):
         assert len(resnet_modality.right_modality.data) == self.num_instances
         assert resnet_modality.data is not None
 
+        return resnet_modality
+
+
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py
index aea37eb93b0..d566830697f 100644
--- a/src/main/python/tests/scuro/test_unimodal_representations.py
+++ b/src/main/python/tests/scuro/test_unimodal_representations.py
@@ -101,7 +101,7 @@ def test_text_representations(self):
             r = text.apply_representation(representation)
             assert r.data is not None
             assert len(r.data) == self.num_instances
-
+
     def test_chunked_video_representations(self):
         video_representations = [ResNet()]
         video_data_loader = VideoLoader(

From 000441df3c7a06fd25e0406ffa2e18091c73c0c1 Mon Sep 17 00:00:00 2001
From: Christina Dionysio
Date: Wed, 5 Feb 2025 10:22:09 +0100
Subject: [PATCH 06/16] check python tests

---
 .github/workflows/python.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 9f39f07ecb7..c1dac76e640 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -31,7 +31,7 @@ on:
       - 'src/assembly/**'
       - 'dev/**'
     branches:
-      - main
+      - scuro_join
   pull_request:
     paths-ignore:
       - 'docs/**'

From c6f8ca60ab4bcc7005e5f009c3d6a9b330ac688a Mon Sep 17 00:00:00 2001
From: Christina Dionysio
Date: Wed, 5 Feb 2025 10:32:30 +0100
Subject: [PATCH 07/16] remove plot

---
 .../scuro/representations/mel_spectrogram.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
index 73da83b74b9..05b7f37a6ca 100644
--- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
@@ -22,7 +22,7 @@
 import numpy as np
 
 from systemds.scuro.modality.transformed import TransformedModality
-import matplotlib.pyplot as plt
+# import matplotlib.pyplot as plt
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 
@@ -46,11 +46,11 @@ def transform(self, modality):
         transformed_modality.data = result
         return transformed_modality
 
-    def plot_spectrogram(self, spectrogram):
-        plt.figure(figsize=(10, 4))
-        librosa.display.specshow(
-            spectrogram, x_axis="time", y_axis="mel", sr=22050, cmap="viridis"
-        )
-        plt.colorbar(format="%+2.0f dB")
-        plt.title("Mel Spectrogram")
-        plt.savefig("spectrogram.jpg")
+    # def plot_spectrogram(self, spectrogram):
+    #     plt.figure(figsize=(10, 4))
+    #     librosa.display.specshow(
+    #         spectrogram, x_axis="time", y_axis="mel", sr=22050, cmap="viridis"
+    #     )
+    #     plt.colorbar(format="%+2.0f dB")
+    #     plt.title("Mel Spectrogram")
+    #     plt.savefig("spectrogram.jpg")

From 884aba17e6d31d995c86426d52060284139253b8 Mon Sep 17 00:00:00 2001
From: Christina Dionysio
Date: Wed, 5 Feb 2025 10:36:08 +0100
Subject: [PATCH 08/16] check package versions

---
 src/main/python/systemds/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/main/python/systemds/__init__.py b/src/main/python/systemds/__init__.py
index a618ff6e9dd..f9bc19ff817 100644
--- a/src/main/python/systemds/__init__.py
+++ b/src/main/python/systemds/__init__.py
@@ -40,6 +40,8 @@ def check_package_version(package_name, required_version):
     try:
+        print(f"Checking package version for {package_name}...")
+        print(f"Required version: {required_version} - actual version: {version(package_name)}")
         return version(package_name) >= required_version
     except PackageNotFoundError:
         return False

From 0e8ba13a08243dc36ddfabe2f361aab38c3ec0cf Mon Sep 17 00:00:00 2001
From: Christina Dionysio
Date: Wed, 5 Feb 2025 10:49:07 +0100
Subject: [PATCH 09/16] check if cuda is available

---
 .github/workflows/python.yml                              | 5 +++--
 src/main/python/systemds/__init__.py                      | 4 ++--
 src/main/python/systemds/scuro/representations/resnet.py  | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index c1dac76e640..9f372695ed9 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -116,8 +116,9 @@ jobs:
             h5py \
             nltk \
             gensim \
-            black
-
+            black \
+            opt-einsum
+
       - name: Build Python Package
         run: |
           cd src/main/python
diff --git a/src/main/python/systemds/__init__.py b/src/main/python/systemds/__init__.py
index f9bc19ff817..609ec571c9f 100644
--- a/src/main/python/systemds/__init__.py
+++ b/src/main/python/systemds/__init__.py
@@ -26,8 +26,8 @@
 __all__ = ["context", "operator", "examples"]
 
 required_packages = [
-    ("torch", "2.5.1"),
-    ("torchvision", "0.20.1"),
+    ("torch", "2.4.1"),
+    ("torchvision", "0.19.1"),
     ("librosa", "0.10.2"),
     ("opencv-python", "4.10.0.84"),
     ("opt-einsum", "3.3.0"),
     ("h5py", "3.11.0"),
     ("transformers", "4.46.3"),
diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py
index 2b80436aa8f..eaa3d99e479 100644
--- a/src/main/python/systemds/scuro/representations/resnet.py
+++ b/src/main/python/systemds/scuro/representations/resnet.py
@@ -33,7 +33,7 @@
 
 if torch.backends.mps.is_available():
     DEVICE = torch.device("mps")
-elif torch.backends.cudnn.is_available():
+elif torch.cuda.is_available():
     DEVICE = torch.device("cuda")
 else:
     DEVICE = torch.device("cpu")

From eccf7922e428d53b474b2830ca26dbe1e7455883 Mon Sep 17 00:00:00 2001
From: Christina Dionysio
Date: Wed, 5 Feb 2025 12:05:38 +0100
Subject: [PATCH 10/16] remove nltk

---
 .github/workflows/python.yml                                | 1 -
 src/main/python/systemds/__init__.py                        | 3 ---
 src/main/python/systemds/scuro/representations/glove.py     | 5 +++--
 src/main/python/systemds/scuro/representations/word2vec.py  | 6 +++---
 4 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 9f372695ed9..c8ee2521046 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -114,7 +114,6 @@ jobs:
             torch \
             librosa \
             h5py \
-            nltk \
             gensim \
             black \
             opt-einsum
diff --git a/src/main/python/systemds/__init__.py b/src/main/python/systemds/__init__.py
index 609ec571c9f..443b5d23d90 100644
--- a/src/main/python/systemds/__init__.py
+++ b/src/main/python/systemds/__init__.py
@@ -33,15 +33,12 @@
     ("opt-einsum", "3.3.0"),
     ("h5py", "3.11.0"),
     ("transformers", "4.46.3"),
-    ("nltk", "3.9.1"),
     ("gensim", "4.3.3"),
 ]
 
 
 def check_package_version(package_name, required_version):
     try:
-        print(f"Checking package version for {package_name}...")
-        print(f"Required version: {required_version} - actual version: {version(package_name)}")
         return version(package_name) >= required_version
     except PackageNotFoundError:
         return False
diff --git a/src/main/python/systemds/scuro/representations/glove.py b/src/main/python/systemds/scuro/representations/glove.py
index cf13c717d2f..767fc8d375e 100644
--- a/src/main/python/systemds/scuro/representations/glove.py
+++ b/src/main/python/systemds/scuro/representations/glove.py
@@ -19,7 +19,8 @@
 #
 # -------------------------------------------------------------
 import numpy as np
-from nltk import word_tokenize
+from gensim.utils import tokenize
+
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
 
@@ -47,7 +48,7 @@ def transform(self, data):
 
         embeddings = []
         for sentences in data:
-            tokens = word_tokenize(sentences.lower())
+            tokens = list(tokenize(sentences.lower()))
             embeddings.append(
                 np.mean(
                     [
diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py
index 51729d635fa..b68a9fd3eb4 100644
--- a/src/main/python/systemds/scuro/representations/word2vec.py
+++ b/src/main/python/systemds/scuro/representations/word2vec.py
@@ -23,7 +23,7 @@
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 from systemds.scuro.representations.utils import save_embeddings
 from gensim.models import Word2Vec
-from nltk.tokenize import word_tokenize
+from gensim.utils import tokenize
 
 
 def get_embedding(sentence, model):
@@ -47,7 +47,7 @@ def transform(self, modality):
         transformed_modality = TransformedModality(
             modality.modality_type, self, modality.metadata
         )
-        t = [word_tokenize(s.lower()) for s in modality.data]
+        t = [list(tokenize(s.lower())) for s in modality.data]
         model = Word2Vec(
             sentences=t,
             vector_size=self.vector_size,
@@ -56,7 +56,7 @@ def transform(self, modality):
         )
         embeddings = []
         for sentences in modality.data:
-            tokens = word_tokenize(sentences.lower())
+            tokens = list(tokenize(sentences.lower()))
             embeddings.append(get_embedding(tokens, model))
 
         if self.output_file is not None:

From 19190aa1b70f3012276239e0371e82fc00021a9a Mon Sep 17 00:00:00 2001
From: Christina Dionysio
Date: Wed, 5 Feb 2025 12:06:57 +0100
Subject: [PATCH 11/16] remove prints

---
 src/main/python/systemds/scuro/modality/joined.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/main/python/systemds/scuro/modality/joined.py b/src/main/python/systemds/scuro/modality/joined.py
index 08d9a1b7ae7..acdf4fb94f1 100644
--- a/src/main/python/systemds/scuro/modality/joined.py
+++ b/src/main/python/systemds/scuro/modality/joined.py
@@ -110,14 +110,6 @@ def execute(self, starting_idx=0):
                         if right.ndim == 1:
                             right = right[np.newaxis, :]
                     else:
-                        if len(self.right_modality.data) < i:
-                            print(f"i:{i}")
-                            print(f"starting_index:{starting_idx}")
-                            print(
f"right mod length:{len(self.right_modality.data)}" - ) - print(f"left mod length:{len(self.left_modality.data)}") - if self.right_modality.data[i][c].ndim == 1: right = np.concatenate( [ @@ -238,11 +230,9 @@ def _apply_representation_chunked( new_left = Modality(left_modality.modality_type, {}) new_right = Modality(right_modality.modality_type, {}) - transform_right = True while ( left_modality.data_loader.next_chunk < left_modality.data_loader.num_chunks ): - print(left_modality.data_loader.next_chunk) if chunk_right: right_modality.extract_raw_data() starting_idx = 0 From 425ce2c39040c72642798a42665df35df77c5490 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 5 Feb 2025 12:56:54 +0100 Subject: [PATCH 12/16] add resnet weights --- .../python/systemds/scuro/representations/resnet.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py index eaa3d99e479..3f91c09bf8d 100644 --- a/src/main/python/systemds/scuro/representations/resnet.py +++ b/src/main/python/systemds/scuro/representations/resnet.py @@ -63,15 +63,15 @@ def model(self): @model.setter def model(self, model): if model == "ResNet18": - self._model = models.resnet18(pretrained=True).to(DEVICE) + self._model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT).to(DEVICE) elif model == "ResNet34": - self._model = models.resnet34(pretrained=True).to(DEVICE) + self._model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT).to(DEVICE) elif model == "ResNet50": - self._model = models.resnet50(pretrained=True).to(DEVICE) + self._model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT).to(DEVICE) elif model == "ResNet101": - self._model = models.resnet101(pretrained=True).to(DEVICE) + self._model = models.resnet101(weights=models.ResNet101_Weights.DEFAULT).to(DEVICE) elif model == "ResNet152": - self._model = models.resnet152(pretrained=True).to(DEVICE) + self._model = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to(DEVICE) else: raise NotImplementedError From 5e7a678b4e303bb7d245a99cd28a8671a90b0dac Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 5 Feb 2025 12:58:25 +0100 Subject: [PATCH 13/16] formatting --- .../scuro/representations/mel_spectrogram.py | 1 + .../systemds/scuro/representations/resnet.py | 20 ++++++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py index 05b7f37a6ca..483ea181b8f 100644 --- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py +++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py @@ -22,6 +22,7 @@ import numpy as np from systemds.scuro.modality.transformed import TransformedModality + # import matplotlib.pyplot as plt from systemds.scuro.representations.unimodal import UnimodalRepresentation diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py index 3f91c09bf8d..3a73e702a95 100644 --- a/src/main/python/systemds/scuro/representations/resnet.py +++ b/src/main/python/systemds/scuro/representations/resnet.py @@ -63,15 +63,25 @@ def model(self): @model.setter def model(self, model): if model == "ResNet18": - self._model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT).to(DEVICE) + self._model = 
models.resnet18(weights=models.ResNet18_Weights.DEFAULT).to( + DEVICE + ) elif model == "ResNet34": - self._model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT).to(DEVICE) + self._model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT).to( + DEVICE + ) elif model == "ResNet50": - self._model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT).to(DEVICE) + self._model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT).to( + DEVICE + ) elif model == "ResNet101": - self._model = models.resnet101(weights=models.ResNet101_Weights.DEFAULT).to(DEVICE) + self._model = models.resnet101(weights=models.ResNet101_Weights.DEFAULT).to( + DEVICE + ) elif model == "ResNet152": - self._model = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to(DEVICE) + self._model = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to( + DEVICE + ) else: raise NotImplementedError From 57b98c500aced4074df1d3ced04f2414ea7a1595 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 5 Feb 2025 13:14:49 +0100 Subject: [PATCH 14/16] increase number of instances created in drsearch test --- src/main/python/systemds/scuro/representations/resnet.py | 3 --- src/main/python/tests/scuro/test_dr_search.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py index 3a73e702a95..ff63e6766b6 100644 --- a/src/main/python/systemds/scuro/representations/resnet.py +++ b/src/main/python/systemds/scuro/representations/resnet.py @@ -19,9 +19,6 @@ # # ------------------------------------------------------------- - -import h5py - from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation from typing import Callable, Dict, Tuple, Any diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py index 0e9b01557d0..f2ba9d2d790 100644 --- a/src/main/python/tests/scuro/test_dr_search.py +++ b/src/main/python/tests/scuro/test_dr_search.py @@ -90,7 +90,7 @@ class TestDataLoaders(unittest.TestCase): @classmethod def setUpClass(cls): cls.test_file_path = "test_data_dr_search" - cls.num_instances = 8 + cls.num_instances = 20 modalities = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] cls.data_generator = setup_data( From 81342aa78e2516110218e67d04f34599c707178e Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 5 Feb 2025 13:33:41 +0100 Subject: [PATCH 15/16] reset branch in workflow --- .github/workflows/python.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index c8ee2521046..54da49f8fb5 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -31,7 +31,7 @@ on: - 'src/assembly/**' - 'dev/**' branches: - - scuro_join + - main pull_request: paths-ignore: - 'docs/**' From 9aab9914a7291f8810c58caefef6e8e13c69f303 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Fri, 7 Feb 2025 16:01:14 +0100 Subject: [PATCH 16/16] remove version update in init --- src/main/python/systemds/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/python/systemds/__init__.py b/src/main/python/systemds/__init__.py index 443b5d23d90..a618ff6e9dd 100644 --- a/src/main/python/systemds/__init__.py +++ b/src/main/python/systemds/__init__.py @@ -26,13 +26,14 @@ __all__ = ["context", "operator", 
"examples"] required_packages = [ - ("torch", "2.4.1"), - ("torchvision", "0.19.1"), + ("torch", "2.5.1"), + ("torchvision", "0.20.1"), ("librosa", "0.10.2"), ("opencv-python", "4.10.0.84"), ("opt-einsum", "3.3.0"), ("h5py", "3.11.0"), ("transformers", "4.46.3"), + ("nltk", "3.9.1"), ("gensim", "4.3.3"), ]