From ed1d9889280affb6a5d09a721b25c467bc34c7d3 Mon Sep 17 00:00:00 2001
From: Yihan Zhao <yihan@marqo.ai>
Date: Tue, 10 Dec 2024 12:20:36 +1100
Subject: [PATCH 1/5] Fix a bug where attributes_to_retrieve does not work for
 non-string fields in unstructured indexes created with Marqo 2.13 [Patching
 2.14] (#1064)

---
 .../semi_structured_document.py               |   5 +-
 .../semi_structured_vespa_index.py            |  11 +-
 .../unstructured_document.py                  |  17 ++-
 src/marqo/tensor_search/tensor_search.py      |   9 +-
 src/marqo/version.py                          |   2 +-
 .../test_semi_structured_vespa_index.py       | 111 ++++++++++++++++
 .../integ_tests/test_get_documents_by_ids.py  | 118 +++++++++++++++---
 .../test_search_semi_structured.py            | 109 +++++++++++-----
 .../integ_tests/test_search_structured.py     | 104 +++++++++------
 .../integ_tests/test_search_unstructured.py   | 109 +++++++++++-----
 10 files changed, 467 insertions(+), 128 deletions(-)
 create mode 100644 tests/core/semi_structured_vespa_index/test_semi_structured_vespa_index.py

diff --git a/src/marqo/core/semi_structured_vespa_index/semi_structured_document.py b/src/marqo/core/semi_structured_vespa_index/semi_structured_document.py
index bd85cf37e..1540f7d21 100644
--- a/src/marqo/core/semi_structured_vespa_index/semi_structured_document.py
+++ b/src/marqo/core/semi_structured_vespa_index/semi_structured_document.py
@@ -168,9 +168,12 @@ def to_marqo_document(self, marqo_index: SemiStructuredMarqoIndex) -> Dict[str,
                 marqo_document[key] = []
             marqo_document[key].append(value)
 
-        # marqo_document.update(self.fixed_fields.short_string_fields)
+        # Add int and float fields back
+        # Please note that int-map and float-map fields are flattened in the result. The correct behaviour is to convert
+        # them back to the format when they are indexed. We will keep the behaviour as is to avoid breaking changes.
         marqo_document.update(self.fixed_fields.int_fields)
         marqo_document.update(self.fixed_fields.float_fields)
+
         marqo_document.update({k: bool(v) for k, v in self.fixed_fields.bool_fields.items()})
         marqo_document[index_constants.MARQO_DOC_ID] = self.fixed_fields.marqo__id
 
diff --git a/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_index.py b/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_index.py
index f231dff18..e28624959 100644
--- a/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_index.py
+++ b/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_index.py
@@ -44,14 +44,23 @@ def to_marqo_document(self, vespa_document: Dict[str, Any], return_highlights: b
     def to_vespa_query(self, marqo_query: MarqoQuery) -> Dict[str, Any]:
         # Verify attributes to retrieve, if defined
         if marqo_query.attributes_to_retrieve is not None:
+            if len(marqo_query.attributes_to_retrieve) > 0:
+                # Retrieve static fields content to extract non-string values from combined fields
+                marqo_query.attributes_to_retrieve.extend([
+                    common.STRING_ARRAY,
+                    common.INT_FIELDS,
+                    common.FLOAT_FIELDS,
+                    common.BOOL_FIELDS,
+                ])
+
             marqo_query.attributes_to_retrieve.append(common.VESPA_FIELD_ID)
+
             # add chunk field names for tensor fields
             marqo_query.attributes_to_retrieve.extend(
                 [self.get_marqo_index().tensor_field_map[att].chunk_field_name
                  for att in marqo_query.attributes_to_retrieve
                  if att in self.get_marqo_index().tensor_field_map]
             )
-
         # Hybrid must be checked first since it is a subclass of Tensor and Lexical
         if isinstance(marqo_query, MarqoHybridQuery):
             return StructuredVespaIndex._to_vespa_hybrid_query(self, marqo_query)
diff --git a/src/marqo/core/unstructured_vespa_index/unstructured_document.py b/src/marqo/core/unstructured_vespa_index/unstructured_document.py
index 121b5ed69..6e0f29755 100644
--- a/src/marqo/core/unstructured_vespa_index/unstructured_document.py
+++ b/src/marqo/core/unstructured_vespa_index/unstructured_document.py
@@ -77,9 +77,9 @@ def from_vespa_document(cls, document: Dict) -> "UnstructuredVespaDocument":
             if unstructured_common.VESPA_DOC_HYBRID_RAW_TENSOR_SCORE in fields else None
 
         return cls(id=document[cls._VESPA_DOC_ID],
-                    raw_tensor_score = raw_tensor_score,
-                    raw_lexical_score = raw_lexical_score,
-                    fields=UnstructuredVespaDocumentFields(**fields))
+                   raw_tensor_score=raw_tensor_score,
+                   raw_lexical_score=raw_lexical_score,
+                   fields=UnstructuredVespaDocumentFields(**fields))
 
     @classmethod
     def from_marqo_document(cls, document: Dict, filter_string_max_length: int) -> "UnstructuredVespaDocument":
@@ -87,8 +87,9 @@ def from_marqo_document(cls, document: Dict, filter_string_max_length: int) -> "
         add_documents"""
 
         if index_constants.MARQO_DOC_ID not in document:
-            raise MarqoDocumentParsingError(f"Unstructured Marqo document does not have a {index_constants.MARQO_DOC_ID} field. "
-                             f"This should be assigned for a valid document")
+            raise MarqoDocumentParsingError(f"Unstructured Marqo document does not have a "
+                                            f"{index_constants.MARQO_DOC_ID} field. "
+                                            f"This should be assigned for a valid document")
 
         doc_id = document[index_constants.MARQO_DOC_ID]
         instance = cls(id=doc_id, fields=UnstructuredVespaDocumentFields(marqo__id=doc_id))
@@ -123,7 +124,8 @@ def from_marqo_document(cls, document: Dict, filter_string_max_length: int) -> "
                         instance.fields.score_modifiers_fields[f"{key}.{k}"] = v
             else:
                 raise MarqoDocumentParsingError(f"In document {doc_id}, field {key} has an "
-                                 f"unsupported type {type(value)} which has not been validated in advance.")
+                                                f"unsupported type {type(value)} which has not been "
+                                                f"validated in advance.")
 
         instance.fields.vespa_multimodal_params = document.get(unstructured_common.MARQO_DOC_MULTIMODAL_PARAMS, {})
         instance.fields.vespa_embeddings = document.get(index_constants.MARQO_DOC_EMBEDDINGS, {})
@@ -152,8 +154,11 @@ def to_marqo_document(self, return_highlights: bool = False) -> Dict[str, Any]:
             marqo_document[key].append(value)
 
         # Add int and float fields back
+        # Please note that int-map and float-map fields are flattened in the result. The correct behaviour is to convert
+        # them back to the format when they are indexed. We will keep the behaviour as is to avoid breaking changes.
         marqo_document.update(self.fields.int_fields)
         marqo_document.update(self.fields.float_fields)
+
         marqo_document.update({k: bool(v) for k, v in self.fields.bool_fields.items()})
         marqo_document[index_constants.MARQO_DOC_ID] = self.fields.marqo__id
 
diff --git a/src/marqo/tensor_search/tensor_search.py b/src/marqo/tensor_search/tensor_search.py
index a3a762b57..94c35e407 100644
--- a/src/marqo/tensor_search/tensor_search.py
+++ b/src/marqo/tensor_search/tensor_search.py
@@ -1782,7 +1782,8 @@ def gather_documents_from_response(response: QueryResult, marqo_index: MarqoInde
         marqo_doc = vespa_index.to_marqo_document(doc.dict(), return_highlights=highlights)
         marqo_doc['_score'] = doc.relevance
 
-        if marqo_index.type == IndexType.Unstructured and attributes_to_retrieve is not None:
+        if (marqo_index.type in [IndexType.Unstructured, IndexType.SemiStructured] and
+                attributes_to_retrieve is not None):
             # For an unstructured index, we do the attributes_to_retrieve after search
             marqo_doc = unstructured_index_attributes_to_retrieve(marqo_doc, attributes_to_retrieve)
 
@@ -1798,7 +1799,11 @@ def unstructured_index_attributes_to_retrieve(marqo_doc: Dict[str, Any], attribu
     str, Any]:
     # attributes_to_retrieve should already be validated at the start of search
     attributes_to_retrieve = list(set(attributes_to_retrieve).union({"_id", "_score", "_highlights"}))
-    return {k: v for k, v in marqo_doc.items() if k in attributes_to_retrieve}
+    return {k: v for k, v in marqo_doc.items() if k in attributes_to_retrieve or
+            # Please note that numeric map fields are flattened for unstructured or semi-structured indexes.
+            # Therefore, when filtering on attributes_to_retrieve, we need to also include flattened map fields
+            # with the specified attributes as prefixes. We keep this behaviour only for compatibility reasons.
+            any([k.startswith(attribute + ".") for attribute in attributes_to_retrieve])}
 
 
 def assign_query_to_vector_job(
diff --git a/src/marqo/version.py b/src/marqo/version.py
index 7e9e8e952..12f91034e 100644
--- a/src/marqo/version.py
+++ b/src/marqo/version.py
@@ -1,4 +1,4 @@
-__version__ = "2.14.0"
+__version__ = "2.14.1"
 
 def get_version() -> str:
     return f"{__version__}"
diff --git a/tests/core/semi_structured_vespa_index/test_semi_structured_vespa_index.py b/tests/core/semi_structured_vespa_index/test_semi_structured_vespa_index.py
new file mode 100644
index 000000000..0198e3770
--- /dev/null
+++ b/tests/core/semi_structured_vespa_index/test_semi_structured_vespa_index.py
@@ -0,0 +1,111 @@
+import re
+import time
+import unittest
+from typing import List, Set
+
+from marqo import version
+from marqo.core.models import MarqoTensorQuery, MarqoLexicalQuery
+from marqo.core.models.marqo_index import SemiStructuredMarqoIndex, Model, TextPreProcessing, TextSplitMethod, \
+    ImagePreProcessing, HnswConfig, VectorNumericType, DistanceMetric, Field, FieldType, FieldFeature, TensorField
+from marqo.core.semi_structured_vespa_index.common import STRING_ARRAY, BOOL_FIELDS, INT_FIELDS, FLOAT_FIELDS, \
+    VESPA_FIELD_ID
+from marqo.core.semi_structured_vespa_index.semi_structured_vespa_index import SemiStructuredVespaIndex
+from marqo.core.semi_structured_vespa_index.semi_structured_vespa_schema import SemiStructuredVespaSchema
+
+
+class TestSemiStructuredVespaIndexToVespaQuery(unittest.TestCase):
+
+    def test_to_vespa_query_should_include_static_fields_when_attributes_to_retrieve_is_not_empty(self):
+        marqo_index = self._semi_structured_marqo_index(name='index1', lexical_field_names=['title'],
+                                                        tensor_field_names=['title'])
+        vespa_index = SemiStructuredVespaIndex(marqo_index)
+
+        for marqo_query in [
+            MarqoTensorQuery(index_name=marqo_index.name, limit=10, offset=0,
+                             attributes_to_retrieve=['title'], vector_query=[1.0] * 10),
+            MarqoLexicalQuery(index_name=marqo_index.name, limit=10, offset=0,
+                              attributes_to_retrieve=['title'], and_phrases=['hello'], or_phrases=['world']),
+            # MarqoHybridSearch yql is just a placeholder and is generated in customer search component.
+        ]:
+            with self.subTest(test_query=marqo_query):
+                query = vespa_index.to_vespa_query(marqo_query)
+                fields = self._extract_fields_from_yql(query['yql'])
+
+                self.assertSetEqual({VESPA_FIELD_ID, 'title', 'marqo__chunks_title', STRING_ARRAY,
+                                     BOOL_FIELDS, INT_FIELDS, FLOAT_FIELDS}, fields)
+
+    def test_to_vespa_query_should_not_include_static_fields_when_attributes_to_retrieve_is_empty(self):
+        marqo_index = self._semi_structured_marqo_index(name='index1', lexical_field_names=['title'],
+                                                        tensor_field_names=['title'])
+        vespa_index = SemiStructuredVespaIndex(marqo_index)
+
+        for marqo_query in [
+            MarqoTensorQuery(index_name=marqo_index.name, limit=10, offset=0,
+                             attributes_to_retrieve=[], vector_query=[1.0] * 10),
+            MarqoLexicalQuery(index_name=marqo_index.name, limit=10, offset=0,
+                              attributes_to_retrieve=[], and_phrases=['hello'], or_phrases=['world']),
+            # MarqoHybridSearch yql is just a placeholder and is generated in customer search component.
+        ]:
+            with self.subTest(test_query=marqo_query):
+                query = vespa_index.to_vespa_query(marqo_query)
+                fields = self._extract_fields_from_yql(query['yql'])
+
+                self.assertSetEqual({VESPA_FIELD_ID}, fields)
+
+    def _extract_fields_from_yql(self, yql: str) -> Set[str]:
+        # Define the regex pattern to extract fields from the SELECT clause
+        pattern = r"select\s+(.*?)\s+from"
+
+        # Search for the fields between SELECT and FROM
+        match = re.search(pattern, yql, re.IGNORECASE)
+
+        if match:
+            # Extract the fields and split them by commas, trimming any extra spaces
+            fields = match.group(1).split(',')
+            return set([field.strip() for field in fields])
+        else:
+            raise ValueError("No fields found in the query.")
+
+    def _semi_structured_marqo_index(self, name='index_name',
+                                     lexical_field_names: List[str] = [],
+                                     tensor_field_names: List[str] = []):
+        return SemiStructuredMarqoIndex(
+            name=name,
+            schema_name=name,
+            model=Model(name='hf/all_datasets_v4_MiniLM-L6'),
+            normalize_embeddings=True,
+            text_preprocessing=TextPreProcessing(
+                split_length=2,
+                split_overlap=0,
+                split_method=TextSplitMethod.Sentence
+            ),
+            image_preprocessing=ImagePreProcessing(
+                patch_method=None
+            ),
+            distance_metric=DistanceMetric.Angular,
+            vector_numeric_type=VectorNumericType.Float,
+            hnsw_config=HnswConfig(
+                ef_construction=128,
+                m=16
+            ),
+            marqo_version=version.get_version(),
+            created_at=time.time(),
+            updated_at=time.time(),
+            treat_urls_and_pointers_as_images=True,
+            treat_urls_and_pointers_as_media=True,
+            filter_string_max_length=100,
+            lexical_fields=[
+                Field(name=field_name, type=FieldType.Text,
+                      features=[FieldFeature.LexicalSearch],
+                      lexical_field_name=f'{SemiStructuredVespaSchema.FIELD_INDEX_PREFIX}{field_name}')
+                for field_name in lexical_field_names
+            ],
+            tensor_fields=[
+                TensorField(
+                    name=field_name,
+                    chunk_field_name=f'{SemiStructuredVespaSchema.FIELD_CHUNKS_PREFIX}{field_name}',
+                    embeddings_field_name=f'{SemiStructuredVespaSchema.FIELD_EMBEDDING_PREFIX}{field_name}',
+                )
+                for field_name in tensor_field_names
+            ]
+        )
\ No newline at end of file
diff --git a/tests/tensor_search/integ_tests/test_get_documents_by_ids.py b/tests/tensor_search/integ_tests/test_get_documents_by_ids.py
index e8937be9c..48d3460b6 100644
--- a/tests/tensor_search/integ_tests/test_get_documents_by_ids.py
+++ b/tests/tensor_search/integ_tests/test_get_documents_by_ids.py
@@ -1,24 +1,18 @@
 import functools
 import os
-import pprint
-import unittest
 import uuid
 from unittest import mock
+from unittest.mock import patch
 
-from marqo.api.exceptions import IndexNotFoundError
 from marqo.api.exceptions import (
-    InvalidDocumentIdError, InvalidArgError,
+    InvalidArgError,
     IllegalRequestedDocCount
 )
+from marqo.core.models.marqo_index import *
+from marqo.core.models.marqo_index_request import FieldRequest
 from marqo.tensor_search import enums
 from marqo.tensor_search import tensor_search
-from marqo.core.models.add_docs_params import AddDocsParams
 from tests.marqo_test import MarqoTestCase
-from marqo.core.models.marqo_index import *
-from marqo.core.models.marqo_index_request import FieldRequest
-from unittest.mock import patch
-import os
-import pprint
 from tests.utils.transition import *
 
 
@@ -33,15 +27,37 @@ def setUpClass(cls) -> None:
             fields=[
                 FieldRequest(name='title1', type=FieldType.Text),
                 FieldRequest(name='desc2', type=FieldType.Text),
+                FieldRequest(name='int_field', type=FieldType.Int),
+                FieldRequest(name='int_array_field', type=FieldType.ArrayInt),
+                FieldRequest(name='int_map_field', type=FieldType.MapInt),
+                FieldRequest(name='float_field', type=FieldType.Float),
+                FieldRequest(name='float_array_field', type=FieldType.ArrayFloat),
+                FieldRequest(name='float_map_field', type=FieldType.MapFloat),
+                FieldRequest(name='long_field', type=FieldType.Long),
+                FieldRequest(name='long_array_field', type=FieldType.ArrayLong),
+                FieldRequest(name='long_map_field', type=FieldType.MapLong),
+                FieldRequest(name='double_field', type=FieldType.Double),
+                FieldRequest(name='double_array_field', type=FieldType.ArrayDouble),
+                FieldRequest(name='double_map_field', type=FieldType.MapDouble),
+                FieldRequest(name='string_array_field', type=FieldType.ArrayText),
+                FieldRequest(name='bool_field', type=FieldType.Bool),
+                FieldRequest(name='custom_vector_field', type=FieldType.CustomVector),
             ],
-            tensor_fields=["title1", "desc2"]
+            tensor_fields=["title1", "desc2", "custom_vector_field"]
+        )
+        unstructured_text_index_with_random_model_request = cls.unstructured_marqo_index_request(
+            model=Model(name='random'),
+            marqo_version='2.12.0'
+        )
+        semi_structured_text_index_with_random_model_request = cls.unstructured_marqo_index_request(
+            model=Model(name='random'),
         )
-        unstructured_text_index_with_random_model_request = cls.unstructured_marqo_index_request(model=Model(name='random'))
 
         # List of indexes to loop through per test. Test itself should extract index name.
         cls.indexes = cls.create_indexes([
             structured_text_index_with_random_model_request,
-            unstructured_text_index_with_random_model_request
+            unstructured_text_index_with_random_model_request,
+            semi_structured_text_index_with_random_model_request
         ])
 
     def setUp(self) -> None:
@@ -52,17 +68,67 @@ def setUp(self) -> None:
     def tearDown(self) -> None:
         self.device_patcher.stop()
 
-    def test_get_documents_by_ids(self):
+    def test_get_documents_by_ids_structured(self):
+        index = self.indexes[0]
+
+        docs = [
+            {"_id": "1", "title1": "content 1",
+             "int_field": 1, "int_array_field": [1, 2], "int_map_field": {"a": 1},
+             "float_field": 2.9, "float_array_field": [1.0, 2.0], "float_map_field": {"b": 2.9},
+             "long_field": 10, "long_array_field": [10, 20], "long_map_field": {"a": 10},
+             "double_field": 3.9, "double_array_field": [3.0, 5.0], "double_map_field": {"b": 5.9},
+             "bool_field": True, "string_array_field": ["a", "b", "c"]},
+            {"_id": "2", "title1": "content 2", "custom_vector_field": {"content": "a", "vector": [1.0] * 384}},
+            {"_id": "3", "title1": "content 3"}
+        ]
+
+        self.add_documents(
+            config=self.config,
+            add_docs_params=AddDocsParams(
+                index_name=index.name, docs=docs, device="cpu"
+            )
+        )
+        res = tensor_search.get_documents_by_ids(
+            config=self.config, index_name=index.name, document_ids=['1', '2', '3'],
+            show_vectors=True).dict(exclude_none=True, by_alias=True)
+
+        # Check that the documents are found and have the correct content
+        for i in range(3):
+            self.assertEqual(res['results'][i]['_found'], True)
+
+            for field_name, value in res['results'][i].items():
+                if field_name in [enums.TensorField.tensor_facets, "_found"]:
+                    # ignore meta fields
+                    continue
+                if field_name == "custom_vector_field":
+                    expected_value = docs[i]["custom_vector_field"]["content"]
+                else:
+                    expected_value = docs[i][field_name]
+
+                self.assertEqual(expected_value, value)
+
+            self.assertIn(enums.TensorField.tensor_facets, res['results'][i])
+            self.assertIn(enums.TensorField.embedding, res['results'][i][enums.TensorField.tensor_facets][0])
+
+    def test_get_documents_by_ids_unstructured(self):
         for index in self.indexes:
+            if index.type == IndexType.Structured:
+                continue
+
             with self.subTest(f"Index type: {index.type}. Index name: {index.name}"):
                 docs = [
-                    {"_id": "1", "title1": "content 1"}, {"_id": "2", "title1": "content 2"},
+                    {"_id": "1", "title1": "content 1", "int_field": 1, "int_map_field": {"a": 1}, "float_field": 2.9,
+                     "float_map_field": {"b": 2.9}, "bool_field": True, "string_array_field": ["a", "b", "c"]},
+                    {"_id": "2", "title1": "content 2", "custom_vector_field": {"content": "a", "vector": [1.0] * 384}},
                     {"_id": "3", "title1": "content 3"}
                 ]
                 self.add_documents(
                     config=self.config,
-                    add_docs_params=AddDocsParams(index_name=index.name, docs=docs, device="cpu",
-                    tensor_fields=["title1", "desc2"] if isinstance(index, UnstructuredMarqoIndex) else None)
+                    add_docs_params=AddDocsParams(
+                        index_name=index.name, docs=docs, device="cpu",
+                        tensor_fields=["title1", "desc2", "custom_vector_field"],
+                        mappings={"custom_vector_field": {"type": "custom_vector"}}
+                    )
                 )
                 res = tensor_search.get_documents_by_ids(
                     config=self.config, index_name=index.name, document_ids=['1', '2', '3'],
@@ -71,8 +137,22 @@ def test_get_documents_by_ids(self):
                 # Check that the documents are found and have the correct content
                 for i in range(3):
                     self.assertEqual(res['results'][i]['_found'], True)
-                    self.assertEqual(res['results'][i]['_id'], docs[i]['_id'])
-                    self.assertEqual(res['results'][i]['title1'], docs[i]['title1'])
+
+                    for field_name, value in res['results'][i].items():
+                        if field_name in [enums.TensorField.tensor_facets, "_found"]:
+                            # ignore meta fields
+                            continue
+                        if field_name == "custom_vector_field":
+                            expected_value = docs[i]["custom_vector_field"]["content"]
+                        elif '.' in field_name:
+                            # unstructured and semi-structured indexes have all map fields flattened
+                            map_field_name, key = field_name.split('.', 1)
+                            expected_value = docs[i][map_field_name][key]
+                        else:
+                            expected_value = docs[i][field_name]
+
+                        self.assertEqual(expected_value, value)
+
                     self.assertIn(enums.TensorField.tensor_facets, res['results'][i])
                     self.assertIn(enums.TensorField.embedding, res['results'][i][enums.TensorField.tensor_facets][0])
 
diff --git a/tests/tensor_search/integ_tests/test_search_semi_structured.py b/tests/tensor_search/integ_tests/test_search_semi_structured.py
index 21de21ed6..d80c615bb 100644
--- a/tests/tensor_search/integ_tests/test_search_semi_structured.py
+++ b/tests/tensor_search/integ_tests/test_search_semi_structured.py
@@ -828,50 +828,97 @@ def test_filter_on_id_and_more(self):
                     self.assertEqual(set(expected_ids), set([hit["_id"] for hit in res["hits"]]))
 
     def test_attributes_to_retrieve(self):
-        docs = [
-            {
-                "field_1": "Exact match hehehe",
-                "field_2": "baaadd",
-                "random_field": "res res res",
-                "random_lala": "res res res haha",
-                "marqomarqo": "check check haha",
+        doc = {
+            "short_string_field": "Exact match hehehe",
+            "long_string_field": "This is a very long string." * 10,
+            "int_field": 123,
+            "float_field": 123.0,
+            "string_array": ["123", "123"],
+            "int_map": {"a": 1, "b": 2},
+            "float_map": {"c": 1.0, "d": 2.0},
+            "bool_field": True,
+            "bool_field2": False,
+            "custom_vector_field": {
+                "content": "abcd",
+                "vector": [1.0] * 384
             }
-        ]
-
-        test_inputs = (
-            (["void_field"], {"_id", "_score", "_highlights"}),
-            ([], {"_id", "_score", "_highlights"}),
-            (["field_1"], {"field_1", "_id", "_score", "_highlights"}),
-            (["field_1", "field_2"], {"field_1", "field_2", "_id", "_score", "_highlights"}),
-            (["field_1", "random_field"], {"field_1", "random_field", "_id", "_score", "_highlights"}),
-            (["field_1", "random_field", "random_lala"],
-             {"field_1", "random_field", "random_lala", "_id", "_score", "_highlights"}),
-            (["field_1", "random_field", "random_lala", "marqomarqo"],
-             {"field_1", "random_field", "random_lala", "marqomarqo", "_id", "_score", "_highlights"}),
-            (["field_1", "field_2", "random_field", "random_lala", "marqomarqo"],
-             {"field_1", "field_2", "random_field", "random_lala", "marqomarqo", "_id", "_score", "_highlights"}),
-            (None, {"field_1", "field_2", "random_field", "random_lala", "marqomarqo", "_id", "_score", "_highlights"}),
-        )
+        }
 
         self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
-                docs=docs,
-                tensor_fields=["field_1", "field_2"]
+                docs=[doc],
+                tensor_fields=["short_string_field" , "custom_vector_field", "multimodal_combo_field"],
+                mappings={
+                    "custom_vector_field": {"type": "custom_vector"},
+                    "multimodal_combo_field": {
+                        "type": "multimodal_combination",
+                        "weights": {"short_string_field": 1.0, "long_string_field": 2.0}
+                    }
+                }
             )
         )
 
+        # meta fields are always returned
+        meta_fields = {"_id", "_score", "_highlights"}
+        non_map_fields = {"short_string_field", "long_string_field", "int_field", "float_field", "bool_field",
+                          "bool_field2", "string_array", "custom_vector_field"}
+        map_fields_flattened = {"int_map.a", "int_map.b", "float_map.c", "float_map.d"}
+
+        test_cases = (
+            # attributes_to_retrieve, expected result excluding meta_fields
+            ([], set()),  # no field is selected
+            (["non_existent_field"], set()),  # non_existent field is provided
+            (["multimodal_combo_field"], set()),  # multimodal_combination fields cannot be selected
+
+            # None or all fields
+            (None, non_map_fields | map_fields_flattened),  # not provided
+            (list(doc.keys()), non_map_fields | map_fields_flattened),  # all fields are selected
+
+            # one field
+            (["short_string_field"], {"short_string_field"}),
+            (["long_string_field"], {"long_string_field"}),
+            (["int_field"], {"int_field"}),
+            (["float_field"], {"float_field"}),
+            (["string_array"], {"string_array"}),
+            (["int_map"], {"int_map.a", "int_map.b"}),
+            (["float_map"], {"float_map.c", "float_map.d"}),
+            (["bool_field"], {"bool_field"}),
+            (["custom_vector_field"], {"custom_vector_field"}),
+            # combination of short and long string fields
+            (["short_string_field", "long_string_field"], {"short_string_field", "long_string_field"}),
+            # combination of int and int map fields
+            (["int_field", "int_map"], {"int_field", "int_map.a", "int_map.b"}),
+            # combination of fload and fload map fields
+            (["float_field", "float_map"], {"float_field", "float_map.c", "float_map.d"}),
+            # multiple boolean fields
+            (["bool_field", "bool_field2"], {"bool_field", "bool_field2"}),
+            # combination of all types of fields include non-existent fields
+            (["short_string_field", "long_string_field", "int_map", "bool_field", "multimodal_combo_field",
+              "string_array", "custom_vector_field"],
+             {"short_string_field", "long_string_field", "int_map.a", "int_map.b", "bool_field", "string_array",
+              "custom_vector_field"})
+        )
+
         for search_method in [SearchMethod.LEXICAL, SearchMethod.TENSOR]:
-            for searchable_attributes, expected_fields in test_inputs:
-                with self.subTest(
-                        f"search_method = {search_method}, searchable_attributes={searchable_attributes}, expected_fields = {expected_fields}"):
+            for attributes_to_retrieve, expected_fields in test_cases:
+                with self.subTest(f"search_method = {search_method}, attributes_to_retrieve={attributes_to_retrieve}, "
+                                  f"expected_fields = {expected_fields}"):
+
                     res = tensor_search.search(
                         config=self.config, index_name=self.default_text_index, text="Exact match hehehe",
-                        attributes_to_retrieve=searchable_attributes, search_method=search_method
+                        attributes_to_retrieve=attributes_to_retrieve, search_method=search_method
                     )
-
-                    self.assertEqual(expected_fields, set(res["hits"][0].keys()))
+                    self.assertSetEqual(expected_fields | meta_fields, set(res["hits"][0].keys()))
+                    for attribute in expected_fields:
+                        if attribute == "custom_vector_field":
+                            self.assertEqual(res["hits"][0][attribute], doc[attribute]["content"])
+                        elif '.' in attribute:
+                            map_field_name, key = attribute.split('.', maxsplit=1)
+                            self.assertEqual(res["hits"][0][attribute], doc[map_field_name][key])
+                        else:
+                            self.assertEqual(res["hits"][0][attribute], doc[attribute])
 
     def test_limit_results(self):
         """"""
diff --git a/tests/tensor_search/integ_tests/test_search_structured.py b/tests/tensor_search/integ_tests/test_search_structured.py
index e7bb30313..d65eaf7d9 100644
--- a/tests/tensor_search/integ_tests/test_search_structured.py
+++ b/tests/tensor_search/integ_tests/test_search_structured.py
@@ -77,10 +77,19 @@ def setUpClass(cls) -> None:
                              features=[FieldFeature.ScoreModifier]),
                 FieldRequest(name="score_mods_long", type=FieldType.Long,
                              features=[FieldFeature.ScoreModifier]),
+                FieldRequest(name="int_array", type=FieldType.ArrayInt),
+                FieldRequest(name="float_array", type=FieldType.ArrayFloat),
+                FieldRequest(name="long_array", type=FieldType.ArrayLong),
+                FieldRequest(name="double_array", type=FieldType.ArrayDouble),
+                FieldRequest(name="multimodal_combo_field", type=FieldType.MultimodalCombination, dependent_fields={
+                    "text_field_1": 1.0, "text_field_2": 2.0
+                }),
+                FieldRequest(name="custom_vector_field", type=FieldType.CustomVector)
             ],
 
             tensor_fields=["text_field_1", "text_field_2", "text_field_3",
-                           "text_field_4", "text_field_5", "text_field_6"]
+                           "text_field_4", "text_field_5", "text_field_6",
+                           "multimodal_combo_field", "custom_vector_field"]
         )
         default_text_index_encoded_name = cls.structured_marqo_index_request(
             name='a-b_' + str(uuid.uuid4()).replace('-', ''),
@@ -659,53 +668,76 @@ def test_lexical_filtering(self):
     #                 self.assertEqual(expected_ids, [hit["_id"] for hit in res["hits"]])
 
     def test_attributes_to_retrieve(self):
-        docs = [
-            {
-                "text_field_1": "Exact match hehehe",
-                "text_field_2": "baaadd",
-                "text_field_3": "res res res",
-                "text_field_4": "res res res haha",
-                "text_field_5": "check check haha",
-            }
-        ]
 
-        test_inputs = (
-            ([], {"_id", "_score", "_highlights"}),
-            (["text_field_1"], {"text_field_1", "_id", "_score", "_highlights"}),
-            (["text_field_1", "text_field_2"], {"text_field_1", "text_field_2", "_id", "_score", "_highlights"}),
-            (["text_field_1", "text_field_3"], {"text_field_1", "text_field_3", "_id", "_score", "_highlights"}),
-            (["text_field_1", "text_field_3", "text_field_4"],
-             {"text_field_1", "text_field_3", "text_field_4", "_id", "_score", "_highlights"}),
-            (["text_field_1", "text_field_3", "text_field_4", "text_field_5"],
-             {"text_field_1", "text_field_3", "text_field_4", "text_field_5", "_id", "_score", "_highlights"}),
-            (["text_field_1", "text_field_2", "text_field_3", "text_field_4", "text_field_5"],
-             {"text_field_1", "text_field_2", "text_field_3", "text_field_4", "text_field_5", "_id", "_score",
-              "_highlights"}),
-            # TODO Fix this subtest
-            # Not running this test case until we solve the bool issue
-            # (None, {"text_field_1", "text_field_2", "text_field_3", "text_field_4", "text_field_5", "_id", "_score",
-            #         "_highlights"}),
-        )
+        doc = {
+            "text_field_1": "Exact match hehehe",
+            "text_field_2": "This is a very long string." * 10,
+            "int_field_1": 123,
+            "float_field_1": 123.0,
+            "long_field_1": 1234,
+            "double_field_1": 1234.0,
+            "list_field_1": ["123", "123"],
+            "map_score_mods_int": {"a": 1, "b": 2},
+            "map_score_mods_float": {"c": 1.0, "d": 2.0},
+            "map_score_mods_long": {"a": 1, "b": 2},
+            "map_score_mods_double": {"c": 1.0, "d": 2.0},
+            "bool_field_1": True,
+            "bool_field_2": False,
+            "int_array": [1, 2, 3],
+            "long_array": [1, 2, 3, 4, 5],
+            "float_array": [1.0, 2.0, 3.0],
+            "double_array": [3.09, 2.09, 3.12],
+            "custom_vector_field": {
+                "content": "abcd",
+                "vector": [1.0] * 384
+            },
+        }
 
-        self.add_documents(
+        res = self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
-                docs=docs,
-            )
+                docs=[doc],
+                mappings={
+                    "multimodal_combo_field": {
+                        "type": "multimodal_combination",
+                        "weights": {"text_field_1": 2.0, "text_field_2": 3.0}
+                    }
+                }
+            ),
+        )
+
+        print(res)
+
+        # meta fields are always returned
+        meta_fields = {"_id", "_score", "_highlights"}
+
+        test_cases = ((
+            # attributes_to_retrieve, expected result excluding meta_fields
+            ([], set()),  # no field is selected
+            (["multimodal_combo_field"], set()),  # multimodal_combination fields cannot be selected
+            (None, doc.keys()),  # not provided
+            (list(doc.keys()), doc.keys()),  # all fields are selected
         )
+            + tuple([([field], {field}) for field in doc.keys()])  # one field
+            + tuple((random_fields, set(random_fields)) for random_fields in
+                    [random.sample(doc.keys(), random.randint(2, len(doc))) for _ in range(10)]))  # random n(>1) fields, 10 times
 
         for search_method in [SearchMethod.LEXICAL, SearchMethod.TENSOR]:
-            for attributes_to_retrieve, expected_fields in test_inputs:
-                with self.subTest(
-                        f"search_method = {search_method}, attributes_to_retrieve={attributes_to_retrieve},"
-                        f" expected_fields = {expected_fields}"):
+            for attributes_to_retrieve, expected_fields in test_cases:
+                with self.subTest(f"search_method = {search_method}, attributes_to_retrieve={attributes_to_retrieve}, "
+                                  f"expected_fields = {expected_fields}"):
+
                     res = tensor_search.search(
                         config=self.config, index_name=self.default_text_index, text="Exact match hehehe",
                         attributes_to_retrieve=attributes_to_retrieve, search_method=search_method
                     )
-
-                    self.assertEqual(expected_fields, set(res["hits"][0].keys()))
+                    self.assertSetEqual(expected_fields | meta_fields, set(res["hits"][0].keys()))
+                    for attribute in expected_fields:
+                        if attribute == "custom_vector_field":
+                            self.assertEqual(res["hits"][0][attribute], doc[attribute]["content"])
+                        else:
+                            self.assertEqual(res["hits"][0][attribute], doc[attribute])
 
     def test_limit_results(self):
         vocab_source = "https://www.mit.edu/~ecprice/wordlist.10000"
diff --git a/tests/tensor_search/integ_tests/test_search_unstructured.py b/tests/tensor_search/integ_tests/test_search_unstructured.py
index 4837bc863..2adad56a1 100644
--- a/tests/tensor_search/integ_tests/test_search_unstructured.py
+++ b/tests/tensor_search/integ_tests/test_search_unstructured.py
@@ -843,50 +843,97 @@ def test_filter_on_id_and_more(self):
                     self.assertEqual(set(expected_ids), set([hit["_id"] for hit in res["hits"]]))
 
     def test_attributes_to_retrieve(self):
-        docs = [
-            {
-                "field_1": "Exact match hehehe",
-                "field_2": "baaadd",
-                "random_field": "res res res",
-                "random_lala": "res res res haha",
-                "marqomarqo": "check check haha",
+        doc = {
+            "short_string_field": "Exact match hehehe",
+            "long_string_field": "This is a very long string." * 10,
+            "int_field": 123,
+            "float_field": 123.0,
+            "string_array": ["123", "123"],
+            "int_map": {"a": 1, "b": 2},
+            "float_map": {"c": 1.0, "d": 2.0},
+            "bool_field": True,
+            "bool_field2": False,
+            "custom_vector_field": {
+                "content": "abcd",
+                "vector": [1.0] * 384
             }
-        ]
-
-        test_inputs = (
-            (["void_field"], {"_id", "_score", "_highlights"}),
-            ([], {"_id", "_score", "_highlights"}),
-            (["field_1"], {"field_1", "_id", "_score", "_highlights"}),
-            (["field_1", "field_2"], {"field_1", "field_2", "_id", "_score", "_highlights"}),
-            (["field_1", "random_field"], {"field_1", "random_field", "_id", "_score", "_highlights"}),
-            (["field_1", "random_field", "random_lala"],
-             {"field_1", "random_field", "random_lala", "_id", "_score", "_highlights"}),
-            (["field_1", "random_field", "random_lala", "marqomarqo"],
-             {"field_1", "random_field", "random_lala", "marqomarqo", "_id", "_score", "_highlights"}),
-            (["field_1", "field_2", "random_field", "random_lala", "marqomarqo"],
-             {"field_1", "field_2", "random_field", "random_lala", "marqomarqo", "_id", "_score", "_highlights"}),
-            (None, {"field_1", "field_2", "random_field", "random_lala", "marqomarqo", "_id", "_score", "_highlights"}),
-        )
+        }
 
         self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
-                docs=docs,
-                tensor_fields=["field_1", "field_2"]
+                docs=[doc],
+                tensor_fields=["short_string_field", "custom_vector_field", "multimodal_combo_field"],
+                mappings={
+                    "custom_vector_field": {"type": "custom_vector"},
+                    "multimodal_combo_field": {
+                        "type": "multimodal_combination",
+                        "weights": {"short_string_field": 1.0, "long_string_field": 2.0}
+                    }
+                }
             )
         )
 
+        # meta fields are always returned
+        meta_fields = {"_id", "_score", "_highlights"}
+        non_map_fields = {"short_string_field", "long_string_field", "int_field", "float_field", "bool_field",
+                          "bool_field2", "string_array", "custom_vector_field"}
+        map_fields_flattened = {"int_map.a", "int_map.b", "float_map.c", "float_map.d"}
+
+        test_cases = (
+            # attributes_to_retrieve, expected result excluding meta_fields
+            ([], set()),  # no field is selected
+            (["non_existent_field"], set()),  # non_existent field is provided
+            (["multimodal_combo_field"], set()),  # multimodal_combination fields cannot be selected
+
+            # None or all fields
+            (None, non_map_fields | map_fields_flattened),  # not provided
+            (list(doc.keys()), non_map_fields | map_fields_flattened),  # all fields are selected
+
+            # one field
+            (["short_string_field"], {"short_string_field"}),
+            (["long_string_field"], {"long_string_field"}),
+            (["int_field"], {"int_field"}),
+            (["float_field"], {"float_field"}),
+            (["string_array"], {"string_array"}),
+            (["int_map"], {"int_map.a", "int_map.b"}),
+            (["float_map"], {"float_map.c", "float_map.d"}),
+            (["bool_field"], {"bool_field"}),
+            (["custom_vector_field"], {"custom_vector_field"}),
+            # combination of short and long string fields
+            (["short_string_field", "long_string_field"], {"short_string_field", "long_string_field"}),
+            # combination of int and int map fields
+            (["int_field", "int_map"], {"int_field", "int_map.a", "int_map.b"}),
+            # combination of fload and fload map fields
+            (["float_field", "float_map"], {"float_field", "float_map.c", "float_map.d"}),
+            # multiple boolean fields
+            (["bool_field", "bool_field2"], {"bool_field", "bool_field2"}),
+            # combination of all types of fields include non-existent fields
+            (["short_string_field", "long_string_field", "int_map", "bool_field", "multimodal_combo_field",
+              "string_array", "custom_vector_field"],
+             {"short_string_field", "long_string_field", "int_map.a", "int_map.b", "bool_field", "string_array",
+              "custom_vector_field"})
+        )
+
         for search_method in [SearchMethod.LEXICAL, SearchMethod.TENSOR]:
-            for searchable_attributes, expected_fields in test_inputs:
-                with self.subTest(
-                        f"search_method = {search_method}, searchable_attributes={searchable_attributes}, expected_fields = {expected_fields}"):
+            for attributes_to_retrieve, expected_fields in test_cases:
+                with self.subTest(f"search_method = {search_method}, attributes_to_retrieve={attributes_to_retrieve}, "
+                                  f"expected_fields = {expected_fields}"):
+
                     res = tensor_search.search(
                         config=self.config, index_name=self.default_text_index, text="Exact match hehehe",
-                        attributes_to_retrieve=searchable_attributes, search_method=search_method
+                        attributes_to_retrieve=attributes_to_retrieve, search_method=search_method
                     )
-
-                    self.assertEqual(expected_fields, set(res["hits"][0].keys()))
+                    self.assertSetEqual(expected_fields | meta_fields, set(res["hits"][0].keys()))
+                    for attribute in expected_fields:
+                        if attribute == "custom_vector_field":
+                            self.assertEqual(res["hits"][0][attribute], doc[attribute]["content"])
+                        elif '.' in attribute:
+                            map_field_name, key = attribute.split('.', maxsplit=1)
+                            self.assertEqual(res["hits"][0][attribute], doc[map_field_name][key])
+                        else:
+                            self.assertEqual(res["hits"][0][attribute], doc[attribute])
 
     def test_limit_results(self):
         """"""

From a6a5678845ddd6aadcb0907bea74a71a831ef271 Mon Sep 17 00:00:00 2001
From: Li Wan <li@marqo.ai>
Date: Tue, 10 Dec 2024 16:19:14 +1100
Subject: [PATCH 2/5] Add hf_transfer to dependencies (#1066)

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 8e87ed048..47a1299f7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,7 +6,7 @@ COPY vespa .
 RUN mvn clean package
 
 # Stage 2: Base image for Python setup
-FROM marqoai/marqo-base:46 as base_image
+FROM marqoai/marqo-base:47 as base_image
 
 # Allow mounting volume containing data and configs for vespa
 VOLUME /opt/vespa/var

From 3315ac6eb7d5c64f65bb970fc08686bbdc559dc3 Mon Sep 17 00:00:00 2001
From: Yihan Zhao <yihan@marqo.ai>
Date: Mon, 16 Dec 2024 12:59:47 +1100
Subject: [PATCH 3/5] Ensure the OOM error is captured in cuda health check
 (#1068)

---
 src/marqo/core/inference/device_manager.py  | 22 +++++++++++++------
 tests/core/inference/test_device_manager.py | 24 +++++++++++++++------
 2 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/src/marqo/core/inference/device_manager.py b/src/marqo/core/inference/device_manager.py
index 7fd345e57..a284baa87 100644
--- a/src/marqo/core/inference/device_manager.py
+++ b/src/marqo/core/inference/device_manager.py
@@ -82,20 +82,30 @@ def cuda_device_health_check(self) -> None:
 
         oom_errors = []
         for device in self.cuda_devices:
+            memory_stats = None
             try:
                 cuda_device = torch.device(f'cuda:{device.id}')
                 memory_stats = torch.cuda.memory_stats(cuda_device)
                 logger.debug(f'CUDA device {device.full_name} with total memory {device.total_memory}. '
                              f'Memory stats: {str(memory_stats)}')
 
-                torch.randn(3, device=cuda_device)
+                # Marqo usually allocates 20MiB cuda memory at a time when processing media files. When OOM happens,
+                # there are usually a few MiB free in reserved memory. To consistently trigger OOM from this health
+                # check request, we try to create a tensor that won't fit in the reserved memory to always trigger
+                # another allocation. Please note this is not leaky since we don't store the reference to this tensor.
+                # Once it's out of scope, the memory it takes will be returned to reserved space. Our assumption is
+                # that this method will not be called too often or with high concurrency (only used by liveness check
+                # for now which run once every few seconds).
+                tensor_size = int(20 * 1024 * 1024 / 4)  # 20MiB, float32 (4 bytes)
+                torch.randn(tensor_size, device=cuda_device)
             except RuntimeError as e:
                 if 'out of memory' in str(e).lower():
-                    logger.error(f'CUDA device {device.full_name} is out of memory. Total memory: {device.total_memory}. '
-                                 f'Memory stats: {str(memory_stats)}')
-                    allocated_mem = memory_stats.get("allocated.all.current", None) if memory_stats else None
-                    oom_errors.append(f'CUDA device {device.full_name} is out of memory:'
-                                      f' ({allocated_mem}/{device.total_memory})')
+                    logger.error(f'CUDA device {device.full_name} is out of memory. Original error: {str(e)}. '
+                                 f'Memory stats: {str(memory_stats)}.')
+                    allocated_mem = memory_stats.get("allocated_bytes.all.current", None) if memory_stats else None
+                    reserved_mem = memory_stats.get("reserved_bytes.all.current", None) if memory_stats else None
+                    oom_errors.append(f'CUDA device {device.full_name} is out of memory (reserved: {reserved_mem}, '
+                                      f'allocated: {allocated_mem}, total: {device.total_memory})')
                 else:
                     # Log out a warning message when encounter other transient errors.
                     logger.error(f'Encountered issue inspecting CUDA device {device.full_name}: {str(e)}')
diff --git a/tests/core/inference/test_device_manager.py b/tests/core/inference/test_device_manager.py
index 2688e82a8..d009ccac5 100644
--- a/tests/core/inference/test_device_manager.py
+++ b/tests/core/inference/test_device_manager.py
@@ -31,6 +31,12 @@ def _device_manager_with_multiple_cuda_devices(self, total_memory: int = 1_000_0
 
             return DeviceManager()
 
+    def _mem_stats(self, allocated: int, reserved: int) -> OrderedDict:
+        return OrderedDict({
+            "allocated_bytes.all.current": allocated,
+            "reserved_bytes.all.current": reserved
+        })
+
     def test_init_with_cpu(self):
         device_manager = self._device_manager_without_cuda()
 
@@ -79,22 +85,24 @@ def test_cuda_health_check_should_fail_when_cuda_device_is_out_of_memory(self):
 
         with mock.patch("torch.cuda.is_available", return_value=True), \
                 mock.patch("torch.randn", side_effect=RuntimeError("CUDA error: out of memory")), \
-                mock.patch("torch.cuda.memory_stats", return_value=OrderedDict({"allocated.all.current": 900_000})):
+                mock.patch("torch.cuda.memory_stats", return_value=self._mem_stats(900_000, 950_000)):
             with self.assertRaises(CudaOutOfMemoryError) as err:
                 device_manager.cuda_device_health_check()
 
-            self.assertEqual(str(err.exception), "CUDA device cuda:0(Tesla T4) is out of memory: (900000/1000000)")
+            self.assertEqual(str(err.exception), "CUDA device cuda:0(Tesla T4) is out of memory "
+                                                 "(reserved: 950000, allocated: 900000, total: 1000000)")
 
     def test_cuda_health_check_should_fail_when_any_cuda_device_is_out_of_memory(self):
         device_manager = self._device_manager_with_multiple_cuda_devices(total_memory=1_000_000)
 
         with mock.patch("torch.cuda.is_available", return_value=True), \
                 mock.patch("torch.randn", side_effect=[torch.tensor([1, 2, 3]), RuntimeError("CUDA error: out of memory")]), \
-                mock.patch("torch.cuda.memory_stats", return_value=OrderedDict({"allocated.all.current": 900_000})):
+                mock.patch("torch.cuda.memory_stats", return_value=self._mem_stats(900_000, 950_000)):
             with self.assertRaises(CudaOutOfMemoryError) as err:
                 device_manager.cuda_device_health_check()
 
-            self.assertEqual(str(err.exception), "CUDA device cuda:1(Tesla H200) is out of memory: (900000/1000000)")
+            self.assertEqual(str(err.exception), "CUDA device cuda:1(Tesla H200) is out of memory "
+                                                 "(reserved: 950000, allocated: 900000, total: 1000000)")
 
     def test_cuda_health_check_should_check_if_all_cuda_devices_are_out_of_memory(self):
         device_manager = self._device_manager_with_multiple_cuda_devices(total_memory=1_000_000)
@@ -102,12 +110,14 @@ def test_cuda_health_check_should_check_if_all_cuda_devices_are_out_of_memory(se
         with mock.patch("torch.cuda.is_available", return_value=True), \
                 mock.patch("torch.randn",
                            side_effect=[RuntimeError("CUDA error: out of memory"), RuntimeError("CUDA error: out of memory")]), \
-                mock.patch("torch.cuda.memory_stats", return_value=OrderedDict({"allocated.all.current": 900_000})):
+                mock.patch("torch.cuda.memory_stats", return_value=self._mem_stats(900_000, 950_000)):
             with self.assertRaises(CudaOutOfMemoryError) as err:
                 device_manager.cuda_device_health_check()
 
-            self.assertEqual(str(err.exception), "CUDA device cuda:0(Tesla T4) is out of memory: (900000/1000000);"
-                                                 "CUDA device cuda:1(Tesla H200) is out of memory: (900000/1000000)")
+            self.assertEqual(str(err.exception), "CUDA device cuda:0(Tesla T4) is out of memory "
+                                                 "(reserved: 950000, allocated: 900000, total: 1000000);"
+                                                 "CUDA device cuda:1(Tesla H200) is out of memory "
+                                                 "(reserved: 950000, allocated: 900000, total: 1000000)")
 
     def test_cuda_health_check_should_pass_and_log_error_message_when_cuda_calls_encounter_issue_other_than_oom(self):
         device_manager = self._device_manager_with_multiple_cuda_devices()

From 0e45ef09184bc887818a1991f916271306751c23 Mon Sep 17 00:00:00 2001
From: Yihan Zhao <yihan@marqo.ai>
Date: Fri, 20 Dec 2024 12:58:22 +1100
Subject: [PATCH 4/5] (Release 2.14) Preserving document-process node in
 services.xml when bootstrapping Vespa app (#1079)

---
 .../vespa_application_package.py              |  2 +-
 .../index_management/test_services_xml.py     | 65 +++++++++++++++++++
 2 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/src/marqo/core/index_management/vespa_application_package.py b/src/marqo/core/index_management/vespa_application_package.py
index 275b669f0..e91e1fec1 100644
--- a/src/marqo/core/index_management/vespa_application_package.py
+++ b/src/marqo/core/index_management/vespa_application_package.py
@@ -86,7 +86,7 @@ def _cleanup_container_config(self):
         """
         container_element = self._ensure_only_one('container')
         for child in container_element.findall('*'):
-            if child.tag in ['document-api', 'search']:
+            if child.tag in ['document-api', 'document-processing', 'search']:
                 child.clear()
             elif child.tag != 'nodes':
                 container_element.remove(child)
diff --git a/tests/core/index_management/test_services_xml.py b/tests/core/index_management/test_services_xml.py
index 8fa1dcf70..85b2c7964 100644
--- a/tests/core/index_management/test_services_xml.py
+++ b/tests/core/index_management/test_services_xml.py
@@ -200,6 +200,71 @@ def test_config_components(self):
                 """
         self._assertStringsEqualIgnoringWhitespace(expected_xml, service_xml.to_xml())
 
+    def test_config_components_should_preserve_document_processing_nodes(self):
+        """
+        This test case tests that document-processing node in the `container` section is preserved when configuring
+        components since it is usually referenced in the `content` section, and we do not change content section.
+        """
+        xml = """<?xml version="1.0" encoding="utf-8" ?>
+                            <services version="1.0" xmlns:deploy="vespa" xmlns:preprocess="properties">
+                                <container id="default" version="1.0">
+                                    <document-api/>
+                                    <document-processing/>
+                                    <search/>
+                                    <nodes>
+                                        <node hostalias="node1"/>
+                                    </nodes>
+                                </container>
+                                <content id="content_default" version="1.0">
+                                    <documents>
+                                        <document type="marqo__existing_00index" mode="index"/>
+                                        <document-processing cluster="default"/>
+                                    </documents>
+                                </content>
+                            </services>
+                        """
+
+        service_xml = ServicesXml(xml)
+        service_xml.config_components()
+
+        # removed all random element and custom components from container element
+        # added searcher, handler and other custom components to container element
+        # kept nodes in container as is
+        # kept content element as is
+        expected_xml = """<?xml version="1.0" encoding="utf-8" ?>
+                            <services version="1.0" xmlns:deploy="vespa" xmlns:preprocess="properties">
+                                <container id="default" version="1.0">
+                                    <document-api/>
+                                    <document-processing/>
+                                    <search>
+                                        <chain id="marqo" inherits="vespa">
+                                            <searcher id="ai.marqo.search.HybridSearcher" bundle="marqo-custom-searchers"/>
+                                        </chain>
+                                    </search>
+                                    <nodes>
+                                        <node hostalias="node1"/>
+                                    </nodes>
+                                    <handler id="ai.marqo.index.IndexSettingRequestHandler" bundle="marqo-custom-searchers">
+                                        <binding>http://*/index-settings/*</binding>
+                                        <binding>http://*/index-settings</binding>
+                                    </handler>
+                                    <component id="ai.marqo.index.IndexSettings" bundle="marqo-custom-searchers">
+                                        <config name="ai.marqo.index.index-settings">
+                                            <indexSettingsFile>marqo_index_settings.json</indexSettingsFile>
+                                            <indexSettingsHistoryFile>marqo_index_settings_history.json</indexSettingsHistoryFile>
+                                        </config>
+                                    </component>
+                                </container>
+                                <content id="content_default" version="1.0">
+                                    <documents>
+                                        <document type="marqo__existing_00index" mode="index"/>
+                                        <document-processing cluster="default"/>
+                                    </documents>
+                                </content>
+                            </services>
+                        """
+        self._assertStringsEqualIgnoringWhitespace(expected_xml, service_xml.to_xml())
+
     def _assertStringsEqualIgnoringWhitespace(self, s1: str, s2: str):
         """Custom assertion to compare strings ignoring whitespace."""
 

From bb41f163ea9b5e54fb97371ab6649b1481d3fa50 Mon Sep 17 00:00:00 2001
From: Aditya Bharadwaj <adityabharadwaj198@gmail.com>
Date: Tue, 24 Dec 2024 12:24:48 +1100
Subject: [PATCH 5/5] (releases/2.14) Make compatibility tests run on releases
 branch and make them run on all previous patch versions (#1076)

---
 ...wards_compatibility_marqo_orchestrator.yml |  3 ++-
 .../scripts/generate_versions.py              | 19 +++++++++++--------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/backwards_compatibility_marqo_orchestrator.yml b/.github/workflows/backwards_compatibility_marqo_orchestrator.yml
index e5f63d4ac..ffa65565d 100644
--- a/.github/workflows/backwards_compatibility_marqo_orchestrator.yml
+++ b/.github/workflows/backwards_compatibility_marqo_orchestrator.yml
@@ -5,6 +5,7 @@ on:
   push:
     branches:
       - mainline
+      - releases/*
     paths-ignore:
       - '**.md'
   workflow_dispatch:
@@ -19,7 +20,7 @@ on:
 
 # Setting MAX_VERSIONS_TO_TEST, this can be a configurable value or if no input is provided, it can be a default value.
 env:
-  MAX_VERSIONS_TO_TEST: ${{ github.event.inputs.max_versions_to_test || 3 }}
+  MAX_VERSIONS_TO_TEST: ${{ github.event.inputs.max_versions_to_test || 5 }}
 
 jobs:
   check-if-image-exists:
diff --git a/tests/backwards_compatibility_tests/scripts/generate_versions.py b/tests/backwards_compatibility_tests/scripts/generate_versions.py
index 83614749a..d6a31275c 100644
--- a/tests/backwards_compatibility_tests/scripts/generate_versions.py
+++ b/tests/backwards_compatibility_tests/scripts/generate_versions.py
@@ -4,17 +4,17 @@
 import subprocess
 import sys
 
-def generate_versions(to_version: str, num_versions: int = 3) -> list:
+def generate_versions(to_version: str, num_minor_versions_to_test: int = 3) -> list:
     """
     Generate a list of previous versions based on the target version.
 
     This function generates a list of previous versions for a given target version.
-    It includes the previous patch version of the same minor version if applicable,
-    and the latest patch versions for preceding minor versions.
+    It includes all the previous patch versions of the same minor version if applicable,
+    and the latest patch versions for preceding minor versions of up to num_minor_versions_to_test.
 
     Args:
         to_version (str): The target version to generate previous versions for.
-        num_versions (int): The number of previous versions to generate. Defaults to 3.
+        num_minor_versions_to_test (int): The number of previous minor versions to generate. Defaults to 3.
 
     Returns:
         list: A list of previous versions as strings.
@@ -24,13 +24,16 @@ def generate_versions(to_version: str, num_versions: int = 3) -> list:
 
     # If this is a patch release, add the previous patch version of the same minor version
     if target_version.patch > 0:
-        prev_patch_version = f"{target_version.major}.{target_version.minor}.{target_version.patch - 1}"
-        versions.append(prev_patch_version)
+        versions.extend(
+            f"{target_version.major}.{target_version.minor}.{i}"
+            for i in range(target_version.patch - 1, -1, -1)
+        )
 
     # Gather the latest patch version for each preceding minor version
     minor = target_version.minor - 1
-    while len(versions) < num_versions and minor >= 0:
-        # Get all tags for the given minor version, sort, and pick the latest patch
+    for _ in range(num_minor_versions_to_test):
+        if minor < 0:
+            break
         tags = subprocess.check_output(
             ["git", "tag", "--list", f"{target_version.major}.{minor}.*"],
             text=True