From ed1d9889280affb6a5d09a721b25c467bc34c7d3 Mon Sep 17 00:00:00 2001 From: Yihan Zhao Date: Tue, 10 Dec 2024 12:20:36 +1100 Subject: [PATCH 1/5] Fix a bug where attributes_to_retrieve does not work for non-string fields in unstructured indexes created with Marqo 2.13 [Patching 2.14] (#1064) --- .../semi_structured_document.py | 5 +- .../semi_structured_vespa_index.py | 11 +- .../unstructured_document.py | 17 ++- src/marqo/tensor_search/tensor_search.py | 9 +- src/marqo/version.py | 2 +- .../test_semi_structured_vespa_index.py | 111 ++++++++++++++++ .../integ_tests/test_get_documents_by_ids.py | 118 +++++++++++++++--- .../test_search_semi_structured.py | 109 +++++++++++----- .../integ_tests/test_search_structured.py | 104 +++++++++------ .../integ_tests/test_search_unstructured.py | 109 +++++++++++----- 10 files changed, 467 insertions(+), 128 deletions(-) create mode 100644 tests/core/semi_structured_vespa_index/test_semi_structured_vespa_index.py diff --git a/src/marqo/core/semi_structured_vespa_index/semi_structured_document.py b/src/marqo/core/semi_structured_vespa_index/semi_structured_document.py index bd85cf37e..1540f7d21 100644 --- a/src/marqo/core/semi_structured_vespa_index/semi_structured_document.py +++ b/src/marqo/core/semi_structured_vespa_index/semi_structured_document.py @@ -168,9 +168,12 @@ def to_marqo_document(self, marqo_index: SemiStructuredMarqoIndex) -> Dict[str, marqo_document[key] = [] marqo_document[key].append(value) - # marqo_document.update(self.fixed_fields.short_string_fields) + # Add int and float fields back + # Please note that int-map and float-map fields are flattened in the result. The correct behaviour is to convert + # them back to the format when they are indexed. We will keep the behaviour as is to avoid breaking changes. marqo_document.update(self.fixed_fields.int_fields) marqo_document.update(self.fixed_fields.float_fields) + marqo_document.update({k: bool(v) for k, v in self.fixed_fields.bool_fields.items()}) marqo_document[index_constants.MARQO_DOC_ID] = self.fixed_fields.marqo__id diff --git a/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_index.py b/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_index.py index f231dff18..e28624959 100644 --- a/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_index.py +++ b/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_index.py @@ -44,14 +44,23 @@ def to_marqo_document(self, vespa_document: Dict[str, Any], return_highlights: b def to_vespa_query(self, marqo_query: MarqoQuery) -> Dict[str, Any]: # Verify attributes to retrieve, if defined if marqo_query.attributes_to_retrieve is not None: + if len(marqo_query.attributes_to_retrieve) > 0: + # Retrieve static fields content to extract non-string values from combined fields + marqo_query.attributes_to_retrieve.extend([ + common.STRING_ARRAY, + common.INT_FIELDS, + common.FLOAT_FIELDS, + common.BOOL_FIELDS, + ]) + marqo_query.attributes_to_retrieve.append(common.VESPA_FIELD_ID) + # add chunk field names for tensor fields marqo_query.attributes_to_retrieve.extend( [self.get_marqo_index().tensor_field_map[att].chunk_field_name for att in marqo_query.attributes_to_retrieve if att in self.get_marqo_index().tensor_field_map] ) - # Hybrid must be checked first since it is a subclass of Tensor and Lexical if isinstance(marqo_query, MarqoHybridQuery): return StructuredVespaIndex._to_vespa_hybrid_query(self, marqo_query) diff --git a/src/marqo/core/unstructured_vespa_index/unstructured_document.py b/src/marqo/core/unstructured_vespa_index/unstructured_document.py index 121b5ed69..6e0f29755 100644 --- a/src/marqo/core/unstructured_vespa_index/unstructured_document.py +++ b/src/marqo/core/unstructured_vespa_index/unstructured_document.py @@ -77,9 +77,9 @@ def from_vespa_document(cls, document: Dict) -> "UnstructuredVespaDocument": if unstructured_common.VESPA_DOC_HYBRID_RAW_TENSOR_SCORE in fields else None return cls(id=document[cls._VESPA_DOC_ID], - raw_tensor_score = raw_tensor_score, - raw_lexical_score = raw_lexical_score, - fields=UnstructuredVespaDocumentFields(**fields)) + raw_tensor_score=raw_tensor_score, + raw_lexical_score=raw_lexical_score, + fields=UnstructuredVespaDocumentFields(**fields)) @classmethod def from_marqo_document(cls, document: Dict, filter_string_max_length: int) -> "UnstructuredVespaDocument": @@ -87,8 +87,9 @@ def from_marqo_document(cls, document: Dict, filter_string_max_length: int) -> " add_documents""" if index_constants.MARQO_DOC_ID not in document: - raise MarqoDocumentParsingError(f"Unstructured Marqo document does not have a {index_constants.MARQO_DOC_ID} field. " - f"This should be assigned for a valid document") + raise MarqoDocumentParsingError(f"Unstructured Marqo document does not have a " + f"{index_constants.MARQO_DOC_ID} field. " + f"This should be assigned for a valid document") doc_id = document[index_constants.MARQO_DOC_ID] instance = cls(id=doc_id, fields=UnstructuredVespaDocumentFields(marqo__id=doc_id)) @@ -123,7 +124,8 @@ def from_marqo_document(cls, document: Dict, filter_string_max_length: int) -> " instance.fields.score_modifiers_fields[f"{key}.{k}"] = v else: raise MarqoDocumentParsingError(f"In document {doc_id}, field {key} has an " - f"unsupported type {type(value)} which has not been validated in advance.") + f"unsupported type {type(value)} which has not been " + f"validated in advance.") instance.fields.vespa_multimodal_params = document.get(unstructured_common.MARQO_DOC_MULTIMODAL_PARAMS, {}) instance.fields.vespa_embeddings = document.get(index_constants.MARQO_DOC_EMBEDDINGS, {}) @@ -152,8 +154,11 @@ def to_marqo_document(self, return_highlights: bool = False) -> Dict[str, Any]: marqo_document[key].append(value) # Add int and float fields back + # Please note that int-map and float-map fields are flattened in the result. The correct behaviour is to convert + # them back to the format when they are indexed. We will keep the behaviour as is to avoid breaking changes. marqo_document.update(self.fields.int_fields) marqo_document.update(self.fields.float_fields) + marqo_document.update({k: bool(v) for k, v in self.fields.bool_fields.items()}) marqo_document[index_constants.MARQO_DOC_ID] = self.fields.marqo__id diff --git a/src/marqo/tensor_search/tensor_search.py b/src/marqo/tensor_search/tensor_search.py index a3a762b57..94c35e407 100644 --- a/src/marqo/tensor_search/tensor_search.py +++ b/src/marqo/tensor_search/tensor_search.py @@ -1782,7 +1782,8 @@ def gather_documents_from_response(response: QueryResult, marqo_index: MarqoInde marqo_doc = vespa_index.to_marqo_document(doc.dict(), return_highlights=highlights) marqo_doc['_score'] = doc.relevance - if marqo_index.type == IndexType.Unstructured and attributes_to_retrieve is not None: + if (marqo_index.type in [IndexType.Unstructured, IndexType.SemiStructured] and + attributes_to_retrieve is not None): # For an unstructured index, we do the attributes_to_retrieve after search marqo_doc = unstructured_index_attributes_to_retrieve(marqo_doc, attributes_to_retrieve) @@ -1798,7 +1799,11 @@ def unstructured_index_attributes_to_retrieve(marqo_doc: Dict[str, Any], attribu str, Any]: # attributes_to_retrieve should already be validated at the start of search attributes_to_retrieve = list(set(attributes_to_retrieve).union({"_id", "_score", "_highlights"})) - return {k: v for k, v in marqo_doc.items() if k in attributes_to_retrieve} + return {k: v for k, v in marqo_doc.items() if k in attributes_to_retrieve or + # Please note that numeric map fields are flattened for unstructured or semi-structured indexes. + # Therefore, when filtering on attributes_to_retrieve, we need to also include flattened map fields + # with the specified attributes as prefixes. We keep this behaviour only for compatibility reasons. + any([k.startswith(attribute + ".") for attribute in attributes_to_retrieve])} def assign_query_to_vector_job( diff --git a/src/marqo/version.py b/src/marqo/version.py index 7e9e8e952..12f91034e 100644 --- a/src/marqo/version.py +++ b/src/marqo/version.py @@ -1,4 +1,4 @@ -__version__ = "2.14.0" +__version__ = "2.14.1" def get_version() -> str: return f"{__version__}" diff --git a/tests/core/semi_structured_vespa_index/test_semi_structured_vespa_index.py b/tests/core/semi_structured_vespa_index/test_semi_structured_vespa_index.py new file mode 100644 index 000000000..0198e3770 --- /dev/null +++ b/tests/core/semi_structured_vespa_index/test_semi_structured_vespa_index.py @@ -0,0 +1,111 @@ +import re +import time +import unittest +from typing import List, Set + +from marqo import version +from marqo.core.models import MarqoTensorQuery, MarqoLexicalQuery +from marqo.core.models.marqo_index import SemiStructuredMarqoIndex, Model, TextPreProcessing, TextSplitMethod, \ + ImagePreProcessing, HnswConfig, VectorNumericType, DistanceMetric, Field, FieldType, FieldFeature, TensorField +from marqo.core.semi_structured_vespa_index.common import STRING_ARRAY, BOOL_FIELDS, INT_FIELDS, FLOAT_FIELDS, \ + VESPA_FIELD_ID +from marqo.core.semi_structured_vespa_index.semi_structured_vespa_index import SemiStructuredVespaIndex +from marqo.core.semi_structured_vespa_index.semi_structured_vespa_schema import SemiStructuredVespaSchema + + +class TestSemiStructuredVespaIndexToVespaQuery(unittest.TestCase): + + def test_to_vespa_query_should_include_static_fields_when_attributes_to_retrieve_is_not_empty(self): + marqo_index = self._semi_structured_marqo_index(name='index1', lexical_field_names=['title'], + tensor_field_names=['title']) + vespa_index = SemiStructuredVespaIndex(marqo_index) + + for marqo_query in [ + MarqoTensorQuery(index_name=marqo_index.name, limit=10, offset=0, + attributes_to_retrieve=['title'], vector_query=[1.0] * 10), + MarqoLexicalQuery(index_name=marqo_index.name, limit=10, offset=0, + attributes_to_retrieve=['title'], and_phrases=['hello'], or_phrases=['world']), + # MarqoHybridSearch yql is just a placeholder and is generated in customer search component. + ]: + with self.subTest(test_query=marqo_query): + query = vespa_index.to_vespa_query(marqo_query) + fields = self._extract_fields_from_yql(query['yql']) + + self.assertSetEqual({VESPA_FIELD_ID, 'title', 'marqo__chunks_title', STRING_ARRAY, + BOOL_FIELDS, INT_FIELDS, FLOAT_FIELDS}, fields) + + def test_to_vespa_query_should_not_include_static_fields_when_attributes_to_retrieve_is_empty(self): + marqo_index = self._semi_structured_marqo_index(name='index1', lexical_field_names=['title'], + tensor_field_names=['title']) + vespa_index = SemiStructuredVespaIndex(marqo_index) + + for marqo_query in [ + MarqoTensorQuery(index_name=marqo_index.name, limit=10, offset=0, + attributes_to_retrieve=[], vector_query=[1.0] * 10), + MarqoLexicalQuery(index_name=marqo_index.name, limit=10, offset=0, + attributes_to_retrieve=[], and_phrases=['hello'], or_phrases=['world']), + # MarqoHybridSearch yql is just a placeholder and is generated in customer search component. + ]: + with self.subTest(test_query=marqo_query): + query = vespa_index.to_vespa_query(marqo_query) + fields = self._extract_fields_from_yql(query['yql']) + + self.assertSetEqual({VESPA_FIELD_ID}, fields) + + def _extract_fields_from_yql(self, yql: str) -> Set[str]: + # Define the regex pattern to extract fields from the SELECT clause + pattern = r"select\s+(.*?)\s+from" + + # Search for the fields between SELECT and FROM + match = re.search(pattern, yql, re.IGNORECASE) + + if match: + # Extract the fields and split them by commas, trimming any extra spaces + fields = match.group(1).split(',') + return set([field.strip() for field in fields]) + else: + raise ValueError("No fields found in the query.") + + def _semi_structured_marqo_index(self, name='index_name', + lexical_field_names: List[str] = [], + tensor_field_names: List[str] = []): + return SemiStructuredMarqoIndex( + name=name, + schema_name=name, + model=Model(name='hf/all_datasets_v4_MiniLM-L6'), + normalize_embeddings=True, + text_preprocessing=TextPreProcessing( + split_length=2, + split_overlap=0, + split_method=TextSplitMethod.Sentence + ), + image_preprocessing=ImagePreProcessing( + patch_method=None + ), + distance_metric=DistanceMetric.Angular, + vector_numeric_type=VectorNumericType.Float, + hnsw_config=HnswConfig( + ef_construction=128, + m=16 + ), + marqo_version=version.get_version(), + created_at=time.time(), + updated_at=time.time(), + treat_urls_and_pointers_as_images=True, + treat_urls_and_pointers_as_media=True, + filter_string_max_length=100, + lexical_fields=[ + Field(name=field_name, type=FieldType.Text, + features=[FieldFeature.LexicalSearch], + lexical_field_name=f'{SemiStructuredVespaSchema.FIELD_INDEX_PREFIX}{field_name}') + for field_name in lexical_field_names + ], + tensor_fields=[ + TensorField( + name=field_name, + chunk_field_name=f'{SemiStructuredVespaSchema.FIELD_CHUNKS_PREFIX}{field_name}', + embeddings_field_name=f'{SemiStructuredVespaSchema.FIELD_EMBEDDING_PREFIX}{field_name}', + ) + for field_name in tensor_field_names + ] + ) \ No newline at end of file diff --git a/tests/tensor_search/integ_tests/test_get_documents_by_ids.py b/tests/tensor_search/integ_tests/test_get_documents_by_ids.py index e8937be9c..48d3460b6 100644 --- a/tests/tensor_search/integ_tests/test_get_documents_by_ids.py +++ b/tests/tensor_search/integ_tests/test_get_documents_by_ids.py @@ -1,24 +1,18 @@ import functools import os -import pprint -import unittest import uuid from unittest import mock +from unittest.mock import patch -from marqo.api.exceptions import IndexNotFoundError from marqo.api.exceptions import ( - InvalidDocumentIdError, InvalidArgError, + InvalidArgError, IllegalRequestedDocCount ) +from marqo.core.models.marqo_index import * +from marqo.core.models.marqo_index_request import FieldRequest from marqo.tensor_search import enums from marqo.tensor_search import tensor_search -from marqo.core.models.add_docs_params import AddDocsParams from tests.marqo_test import MarqoTestCase -from marqo.core.models.marqo_index import * -from marqo.core.models.marqo_index_request import FieldRequest -from unittest.mock import patch -import os -import pprint from tests.utils.transition import * @@ -33,15 +27,37 @@ def setUpClass(cls) -> None: fields=[ FieldRequest(name='title1', type=FieldType.Text), FieldRequest(name='desc2', type=FieldType.Text), + FieldRequest(name='int_field', type=FieldType.Int), + FieldRequest(name='int_array_field', type=FieldType.ArrayInt), + FieldRequest(name='int_map_field', type=FieldType.MapInt), + FieldRequest(name='float_field', type=FieldType.Float), + FieldRequest(name='float_array_field', type=FieldType.ArrayFloat), + FieldRequest(name='float_map_field', type=FieldType.MapFloat), + FieldRequest(name='long_field', type=FieldType.Long), + FieldRequest(name='long_array_field', type=FieldType.ArrayLong), + FieldRequest(name='long_map_field', type=FieldType.MapLong), + FieldRequest(name='double_field', type=FieldType.Double), + FieldRequest(name='double_array_field', type=FieldType.ArrayDouble), + FieldRequest(name='double_map_field', type=FieldType.MapDouble), + FieldRequest(name='string_array_field', type=FieldType.ArrayText), + FieldRequest(name='bool_field', type=FieldType.Bool), + FieldRequest(name='custom_vector_field', type=FieldType.CustomVector), ], - tensor_fields=["title1", "desc2"] + tensor_fields=["title1", "desc2", "custom_vector_field"] + ) + unstructured_text_index_with_random_model_request = cls.unstructured_marqo_index_request( + model=Model(name='random'), + marqo_version='2.12.0' + ) + semi_structured_text_index_with_random_model_request = cls.unstructured_marqo_index_request( + model=Model(name='random'), ) - unstructured_text_index_with_random_model_request = cls.unstructured_marqo_index_request(model=Model(name='random')) # List of indexes to loop through per test. Test itself should extract index name. cls.indexes = cls.create_indexes([ structured_text_index_with_random_model_request, - unstructured_text_index_with_random_model_request + unstructured_text_index_with_random_model_request, + semi_structured_text_index_with_random_model_request ]) def setUp(self) -> None: @@ -52,17 +68,67 @@ def setUp(self) -> None: def tearDown(self) -> None: self.device_patcher.stop() - def test_get_documents_by_ids(self): + def test_get_documents_by_ids_structured(self): + index = self.indexes[0] + + docs = [ + {"_id": "1", "title1": "content 1", + "int_field": 1, "int_array_field": [1, 2], "int_map_field": {"a": 1}, + "float_field": 2.9, "float_array_field": [1.0, 2.0], "float_map_field": {"b": 2.9}, + "long_field": 10, "long_array_field": [10, 20], "long_map_field": {"a": 10}, + "double_field": 3.9, "double_array_field": [3.0, 5.0], "double_map_field": {"b": 5.9}, + "bool_field": True, "string_array_field": ["a", "b", "c"]}, + {"_id": "2", "title1": "content 2", "custom_vector_field": {"content": "a", "vector": [1.0] * 384}}, + {"_id": "3", "title1": "content 3"} + ] + + self.add_documents( + config=self.config, + add_docs_params=AddDocsParams( + index_name=index.name, docs=docs, device="cpu" + ) + ) + res = tensor_search.get_documents_by_ids( + config=self.config, index_name=index.name, document_ids=['1', '2', '3'], + show_vectors=True).dict(exclude_none=True, by_alias=True) + + # Check that the documents are found and have the correct content + for i in range(3): + self.assertEqual(res['results'][i]['_found'], True) + + for field_name, value in res['results'][i].items(): + if field_name in [enums.TensorField.tensor_facets, "_found"]: + # ignore meta fields + continue + if field_name == "custom_vector_field": + expected_value = docs[i]["custom_vector_field"]["content"] + else: + expected_value = docs[i][field_name] + + self.assertEqual(expected_value, value) + + self.assertIn(enums.TensorField.tensor_facets, res['results'][i]) + self.assertIn(enums.TensorField.embedding, res['results'][i][enums.TensorField.tensor_facets][0]) + + def test_get_documents_by_ids_unstructured(self): for index in self.indexes: + if index.type == IndexType.Structured: + continue + with self.subTest(f"Index type: {index.type}. Index name: {index.name}"): docs = [ - {"_id": "1", "title1": "content 1"}, {"_id": "2", "title1": "content 2"}, + {"_id": "1", "title1": "content 1", "int_field": 1, "int_map_field": {"a": 1}, "float_field": 2.9, + "float_map_field": {"b": 2.9}, "bool_field": True, "string_array_field": ["a", "b", "c"]}, + {"_id": "2", "title1": "content 2", "custom_vector_field": {"content": "a", "vector": [1.0] * 384}}, {"_id": "3", "title1": "content 3"} ] self.add_documents( config=self.config, - add_docs_params=AddDocsParams(index_name=index.name, docs=docs, device="cpu", - tensor_fields=["title1", "desc2"] if isinstance(index, UnstructuredMarqoIndex) else None) + add_docs_params=AddDocsParams( + index_name=index.name, docs=docs, device="cpu", + tensor_fields=["title1", "desc2", "custom_vector_field"], + mappings={"custom_vector_field": {"type": "custom_vector"}} + ) ) res = tensor_search.get_documents_by_ids( config=self.config, index_name=index.name, document_ids=['1', '2', '3'], @@ -71,8 +137,22 @@ def test_get_documents_by_ids(self): # Check that the documents are found and have the correct content for i in range(3): self.assertEqual(res['results'][i]['_found'], True) - self.assertEqual(res['results'][i]['_id'], docs[i]['_id']) - self.assertEqual(res['results'][i]['title1'], docs[i]['title1']) + + for field_name, value in res['results'][i].items(): + if field_name in [enums.TensorField.tensor_facets, "_found"]: + # ignore meta fields + continue + if field_name == "custom_vector_field": + expected_value = docs[i]["custom_vector_field"]["content"] + elif '.' in field_name: + # unstructured and semi-structured indexes have all map fields flattened + map_field_name, key = field_name.split('.', 1) + expected_value = docs[i][map_field_name][key] + else: + expected_value = docs[i][field_name] + + self.assertEqual(expected_value, value) + self.assertIn(enums.TensorField.tensor_facets, res['results'][i]) self.assertIn(enums.TensorField.embedding, res['results'][i][enums.TensorField.tensor_facets][0]) diff --git a/tests/tensor_search/integ_tests/test_search_semi_structured.py b/tests/tensor_search/integ_tests/test_search_semi_structured.py index 21de21ed6..d80c615bb 100644 --- a/tests/tensor_search/integ_tests/test_search_semi_structured.py +++ b/tests/tensor_search/integ_tests/test_search_semi_structured.py @@ -828,50 +828,97 @@ def test_filter_on_id_and_more(self): self.assertEqual(set(expected_ids), set([hit["_id"] for hit in res["hits"]])) def test_attributes_to_retrieve(self): - docs = [ - { - "field_1": "Exact match hehehe", - "field_2": "baaadd", - "random_field": "res res res", - "random_lala": "res res res haha", - "marqomarqo": "check check haha", + doc = { + "short_string_field": "Exact match hehehe", + "long_string_field": "This is a very long string." * 10, + "int_field": 123, + "float_field": 123.0, + "string_array": ["123", "123"], + "int_map": {"a": 1, "b": 2}, + "float_map": {"c": 1.0, "d": 2.0}, + "bool_field": True, + "bool_field2": False, + "custom_vector_field": { + "content": "abcd", + "vector": [1.0] * 384 } - ] - - test_inputs = ( - (["void_field"], {"_id", "_score", "_highlights"}), - ([], {"_id", "_score", "_highlights"}), - (["field_1"], {"field_1", "_id", "_score", "_highlights"}), - (["field_1", "field_2"], {"field_1", "field_2", "_id", "_score", "_highlights"}), - (["field_1", "random_field"], {"field_1", "random_field", "_id", "_score", "_highlights"}), - (["field_1", "random_field", "random_lala"], - {"field_1", "random_field", "random_lala", "_id", "_score", "_highlights"}), - (["field_1", "random_field", "random_lala", "marqomarqo"], - {"field_1", "random_field", "random_lala", "marqomarqo", "_id", "_score", "_highlights"}), - (["field_1", "field_2", "random_field", "random_lala", "marqomarqo"], - {"field_1", "field_2", "random_field", "random_lala", "marqomarqo", "_id", "_score", "_highlights"}), - (None, {"field_1", "field_2", "random_field", "random_lala", "marqomarqo", "_id", "_score", "_highlights"}), - ) + } self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, - docs=docs, - tensor_fields=["field_1", "field_2"] + docs=[doc], + tensor_fields=["short_string_field" , "custom_vector_field", "multimodal_combo_field"], + mappings={ + "custom_vector_field": {"type": "custom_vector"}, + "multimodal_combo_field": { + "type": "multimodal_combination", + "weights": {"short_string_field": 1.0, "long_string_field": 2.0} + } + } ) ) + # meta fields are always returned + meta_fields = {"_id", "_score", "_highlights"} + non_map_fields = {"short_string_field", "long_string_field", "int_field", "float_field", "bool_field", + "bool_field2", "string_array", "custom_vector_field"} + map_fields_flattened = {"int_map.a", "int_map.b", "float_map.c", "float_map.d"} + + test_cases = ( + # attributes_to_retrieve, expected result excluding meta_fields + ([], set()), # no field is selected + (["non_existent_field"], set()), # non_existent field is provided + (["multimodal_combo_field"], set()), # multimodal_combination fields cannot be selected + + # None or all fields + (None, non_map_fields | map_fields_flattened), # not provided + (list(doc.keys()), non_map_fields | map_fields_flattened), # all fields are selected + + # one field + (["short_string_field"], {"short_string_field"}), + (["long_string_field"], {"long_string_field"}), + (["int_field"], {"int_field"}), + (["float_field"], {"float_field"}), + (["string_array"], {"string_array"}), + (["int_map"], {"int_map.a", "int_map.b"}), + (["float_map"], {"float_map.c", "float_map.d"}), + (["bool_field"], {"bool_field"}), + (["custom_vector_field"], {"custom_vector_field"}), + # combination of short and long string fields + (["short_string_field", "long_string_field"], {"short_string_field", "long_string_field"}), + # combination of int and int map fields + (["int_field", "int_map"], {"int_field", "int_map.a", "int_map.b"}), + # combination of fload and fload map fields + (["float_field", "float_map"], {"float_field", "float_map.c", "float_map.d"}), + # multiple boolean fields + (["bool_field", "bool_field2"], {"bool_field", "bool_field2"}), + # combination of all types of fields include non-existent fields + (["short_string_field", "long_string_field", "int_map", "bool_field", "multimodal_combo_field", + "string_array", "custom_vector_field"], + {"short_string_field", "long_string_field", "int_map.a", "int_map.b", "bool_field", "string_array", + "custom_vector_field"}) + ) + for search_method in [SearchMethod.LEXICAL, SearchMethod.TENSOR]: - for searchable_attributes, expected_fields in test_inputs: - with self.subTest( - f"search_method = {search_method}, searchable_attributes={searchable_attributes}, expected_fields = {expected_fields}"): + for attributes_to_retrieve, expected_fields in test_cases: + with self.subTest(f"search_method = {search_method}, attributes_to_retrieve={attributes_to_retrieve}, " + f"expected_fields = {expected_fields}"): + res = tensor_search.search( config=self.config, index_name=self.default_text_index, text="Exact match hehehe", - attributes_to_retrieve=searchable_attributes, search_method=search_method + attributes_to_retrieve=attributes_to_retrieve, search_method=search_method ) - - self.assertEqual(expected_fields, set(res["hits"][0].keys())) + self.assertSetEqual(expected_fields | meta_fields, set(res["hits"][0].keys())) + for attribute in expected_fields: + if attribute == "custom_vector_field": + self.assertEqual(res["hits"][0][attribute], doc[attribute]["content"]) + elif '.' in attribute: + map_field_name, key = attribute.split('.', maxsplit=1) + self.assertEqual(res["hits"][0][attribute], doc[map_field_name][key]) + else: + self.assertEqual(res["hits"][0][attribute], doc[attribute]) def test_limit_results(self): """""" diff --git a/tests/tensor_search/integ_tests/test_search_structured.py b/tests/tensor_search/integ_tests/test_search_structured.py index e7bb30313..d65eaf7d9 100644 --- a/tests/tensor_search/integ_tests/test_search_structured.py +++ b/tests/tensor_search/integ_tests/test_search_structured.py @@ -77,10 +77,19 @@ def setUpClass(cls) -> None: features=[FieldFeature.ScoreModifier]), FieldRequest(name="score_mods_long", type=FieldType.Long, features=[FieldFeature.ScoreModifier]), + FieldRequest(name="int_array", type=FieldType.ArrayInt), + FieldRequest(name="float_array", type=FieldType.ArrayFloat), + FieldRequest(name="long_array", type=FieldType.ArrayLong), + FieldRequest(name="double_array", type=FieldType.ArrayDouble), + FieldRequest(name="multimodal_combo_field", type=FieldType.MultimodalCombination, dependent_fields={ + "text_field_1": 1.0, "text_field_2": 2.0 + }), + FieldRequest(name="custom_vector_field", type=FieldType.CustomVector) ], tensor_fields=["text_field_1", "text_field_2", "text_field_3", - "text_field_4", "text_field_5", "text_field_6"] + "text_field_4", "text_field_5", "text_field_6", + "multimodal_combo_field", "custom_vector_field"] ) default_text_index_encoded_name = cls.structured_marqo_index_request( name='a-b_' + str(uuid.uuid4()).replace('-', ''), @@ -659,53 +668,76 @@ def test_lexical_filtering(self): # self.assertEqual(expected_ids, [hit["_id"] for hit in res["hits"]]) def test_attributes_to_retrieve(self): - docs = [ - { - "text_field_1": "Exact match hehehe", - "text_field_2": "baaadd", - "text_field_3": "res res res", - "text_field_4": "res res res haha", - "text_field_5": "check check haha", - } - ] - test_inputs = ( - ([], {"_id", "_score", "_highlights"}), - (["text_field_1"], {"text_field_1", "_id", "_score", "_highlights"}), - (["text_field_1", "text_field_2"], {"text_field_1", "text_field_2", "_id", "_score", "_highlights"}), - (["text_field_1", "text_field_3"], {"text_field_1", "text_field_3", "_id", "_score", "_highlights"}), - (["text_field_1", "text_field_3", "text_field_4"], - {"text_field_1", "text_field_3", "text_field_4", "_id", "_score", "_highlights"}), - (["text_field_1", "text_field_3", "text_field_4", "text_field_5"], - {"text_field_1", "text_field_3", "text_field_4", "text_field_5", "_id", "_score", "_highlights"}), - (["text_field_1", "text_field_2", "text_field_3", "text_field_4", "text_field_5"], - {"text_field_1", "text_field_2", "text_field_3", "text_field_4", "text_field_5", "_id", "_score", - "_highlights"}), - # TODO Fix this subtest - # Not running this test case until we solve the bool issue - # (None, {"text_field_1", "text_field_2", "text_field_3", "text_field_4", "text_field_5", "_id", "_score", - # "_highlights"}), - ) + doc = { + "text_field_1": "Exact match hehehe", + "text_field_2": "This is a very long string." * 10, + "int_field_1": 123, + "float_field_1": 123.0, + "long_field_1": 1234, + "double_field_1": 1234.0, + "list_field_1": ["123", "123"], + "map_score_mods_int": {"a": 1, "b": 2}, + "map_score_mods_float": {"c": 1.0, "d": 2.0}, + "map_score_mods_long": {"a": 1, "b": 2}, + "map_score_mods_double": {"c": 1.0, "d": 2.0}, + "bool_field_1": True, + "bool_field_2": False, + "int_array": [1, 2, 3], + "long_array": [1, 2, 3, 4, 5], + "float_array": [1.0, 2.0, 3.0], + "double_array": [3.09, 2.09, 3.12], + "custom_vector_field": { + "content": "abcd", + "vector": [1.0] * 384 + }, + } - self.add_documents( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, - docs=docs, - ) + docs=[doc], + mappings={ + "multimodal_combo_field": { + "type": "multimodal_combination", + "weights": {"text_field_1": 2.0, "text_field_2": 3.0} + } + } + ), + ) + + print(res) + + # meta fields are always returned + meta_fields = {"_id", "_score", "_highlights"} + + test_cases = (( + # attributes_to_retrieve, expected result excluding meta_fields + ([], set()), # no field is selected + (["multimodal_combo_field"], set()), # multimodal_combination fields cannot be selected + (None, doc.keys()), # not provided + (list(doc.keys()), doc.keys()), # all fields are selected ) + + tuple([([field], {field}) for field in doc.keys()]) # one field + + tuple((random_fields, set(random_fields)) for random_fields in + [random.sample(doc.keys(), random.randint(2, len(doc))) for _ in range(10)])) # random n(>1) fields, 10 times for search_method in [SearchMethod.LEXICAL, SearchMethod.TENSOR]: - for attributes_to_retrieve, expected_fields in test_inputs: - with self.subTest( - f"search_method = {search_method}, attributes_to_retrieve={attributes_to_retrieve}," - f" expected_fields = {expected_fields}"): + for attributes_to_retrieve, expected_fields in test_cases: + with self.subTest(f"search_method = {search_method}, attributes_to_retrieve={attributes_to_retrieve}, " + f"expected_fields = {expected_fields}"): + res = tensor_search.search( config=self.config, index_name=self.default_text_index, text="Exact match hehehe", attributes_to_retrieve=attributes_to_retrieve, search_method=search_method ) - - self.assertEqual(expected_fields, set(res["hits"][0].keys())) + self.assertSetEqual(expected_fields | meta_fields, set(res["hits"][0].keys())) + for attribute in expected_fields: + if attribute == "custom_vector_field": + self.assertEqual(res["hits"][0][attribute], doc[attribute]["content"]) + else: + self.assertEqual(res["hits"][0][attribute], doc[attribute]) def test_limit_results(self): vocab_source = "https://www.mit.edu/~ecprice/wordlist.10000" diff --git a/tests/tensor_search/integ_tests/test_search_unstructured.py b/tests/tensor_search/integ_tests/test_search_unstructured.py index 4837bc863..2adad56a1 100644 --- a/tests/tensor_search/integ_tests/test_search_unstructured.py +++ b/tests/tensor_search/integ_tests/test_search_unstructured.py @@ -843,50 +843,97 @@ def test_filter_on_id_and_more(self): self.assertEqual(set(expected_ids), set([hit["_id"] for hit in res["hits"]])) def test_attributes_to_retrieve(self): - docs = [ - { - "field_1": "Exact match hehehe", - "field_2": "baaadd", - "random_field": "res res res", - "random_lala": "res res res haha", - "marqomarqo": "check check haha", + doc = { + "short_string_field": "Exact match hehehe", + "long_string_field": "This is a very long string." * 10, + "int_field": 123, + "float_field": 123.0, + "string_array": ["123", "123"], + "int_map": {"a": 1, "b": 2}, + "float_map": {"c": 1.0, "d": 2.0}, + "bool_field": True, + "bool_field2": False, + "custom_vector_field": { + "content": "abcd", + "vector": [1.0] * 384 } - ] - - test_inputs = ( - (["void_field"], {"_id", "_score", "_highlights"}), - ([], {"_id", "_score", "_highlights"}), - (["field_1"], {"field_1", "_id", "_score", "_highlights"}), - (["field_1", "field_2"], {"field_1", "field_2", "_id", "_score", "_highlights"}), - (["field_1", "random_field"], {"field_1", "random_field", "_id", "_score", "_highlights"}), - (["field_1", "random_field", "random_lala"], - {"field_1", "random_field", "random_lala", "_id", "_score", "_highlights"}), - (["field_1", "random_field", "random_lala", "marqomarqo"], - {"field_1", "random_field", "random_lala", "marqomarqo", "_id", "_score", "_highlights"}), - (["field_1", "field_2", "random_field", "random_lala", "marqomarqo"], - {"field_1", "field_2", "random_field", "random_lala", "marqomarqo", "_id", "_score", "_highlights"}), - (None, {"field_1", "field_2", "random_field", "random_lala", "marqomarqo", "_id", "_score", "_highlights"}), - ) + } self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, - docs=docs, - tensor_fields=["field_1", "field_2"] + docs=[doc], + tensor_fields=["short_string_field", "custom_vector_field", "multimodal_combo_field"], + mappings={ + "custom_vector_field": {"type": "custom_vector"}, + "multimodal_combo_field": { + "type": "multimodal_combination", + "weights": {"short_string_field": 1.0, "long_string_field": 2.0} + } + } ) ) + # meta fields are always returned + meta_fields = {"_id", "_score", "_highlights"} + non_map_fields = {"short_string_field", "long_string_field", "int_field", "float_field", "bool_field", + "bool_field2", "string_array", "custom_vector_field"} + map_fields_flattened = {"int_map.a", "int_map.b", "float_map.c", "float_map.d"} + + test_cases = ( + # attributes_to_retrieve, expected result excluding meta_fields + ([], set()), # no field is selected + (["non_existent_field"], set()), # non_existent field is provided + (["multimodal_combo_field"], set()), # multimodal_combination fields cannot be selected + + # None or all fields + (None, non_map_fields | map_fields_flattened), # not provided + (list(doc.keys()), non_map_fields | map_fields_flattened), # all fields are selected + + # one field + (["short_string_field"], {"short_string_field"}), + (["long_string_field"], {"long_string_field"}), + (["int_field"], {"int_field"}), + (["float_field"], {"float_field"}), + (["string_array"], {"string_array"}), + (["int_map"], {"int_map.a", "int_map.b"}), + (["float_map"], {"float_map.c", "float_map.d"}), + (["bool_field"], {"bool_field"}), + (["custom_vector_field"], {"custom_vector_field"}), + # combination of short and long string fields + (["short_string_field", "long_string_field"], {"short_string_field", "long_string_field"}), + # combination of int and int map fields + (["int_field", "int_map"], {"int_field", "int_map.a", "int_map.b"}), + # combination of fload and fload map fields + (["float_field", "float_map"], {"float_field", "float_map.c", "float_map.d"}), + # multiple boolean fields + (["bool_field", "bool_field2"], {"bool_field", "bool_field2"}), + # combination of all types of fields include non-existent fields + (["short_string_field", "long_string_field", "int_map", "bool_field", "multimodal_combo_field", + "string_array", "custom_vector_field"], + {"short_string_field", "long_string_field", "int_map.a", "int_map.b", "bool_field", "string_array", + "custom_vector_field"}) + ) + for search_method in [SearchMethod.LEXICAL, SearchMethod.TENSOR]: - for searchable_attributes, expected_fields in test_inputs: - with self.subTest( - f"search_method = {search_method}, searchable_attributes={searchable_attributes}, expected_fields = {expected_fields}"): + for attributes_to_retrieve, expected_fields in test_cases: + with self.subTest(f"search_method = {search_method}, attributes_to_retrieve={attributes_to_retrieve}, " + f"expected_fields = {expected_fields}"): + res = tensor_search.search( config=self.config, index_name=self.default_text_index, text="Exact match hehehe", - attributes_to_retrieve=searchable_attributes, search_method=search_method + attributes_to_retrieve=attributes_to_retrieve, search_method=search_method ) - - self.assertEqual(expected_fields, set(res["hits"][0].keys())) + self.assertSetEqual(expected_fields | meta_fields, set(res["hits"][0].keys())) + for attribute in expected_fields: + if attribute == "custom_vector_field": + self.assertEqual(res["hits"][0][attribute], doc[attribute]["content"]) + elif '.' in attribute: + map_field_name, key = attribute.split('.', maxsplit=1) + self.assertEqual(res["hits"][0][attribute], doc[map_field_name][key]) + else: + self.assertEqual(res["hits"][0][attribute], doc[attribute]) def test_limit_results(self): """""" From a6a5678845ddd6aadcb0907bea74a71a831ef271 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Tue, 10 Dec 2024 16:19:14 +1100 Subject: [PATCH 2/5] Add hf_transfer to dependencies (#1066) --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 8e87ed048..47a1299f7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,7 +6,7 @@ COPY vespa . RUN mvn clean package # Stage 2: Base image for Python setup -FROM marqoai/marqo-base:46 as base_image +FROM marqoai/marqo-base:47 as base_image # Allow mounting volume containing data and configs for vespa VOLUME /opt/vespa/var From 3315ac6eb7d5c64f65bb970fc08686bbdc559dc3 Mon Sep 17 00:00:00 2001 From: Yihan Zhao Date: Mon, 16 Dec 2024 12:59:47 +1100 Subject: [PATCH 3/5] Ensure the OOM error is captured in cuda health check (#1068) --- src/marqo/core/inference/device_manager.py | 22 +++++++++++++------ tests/core/inference/test_device_manager.py | 24 +++++++++++++++------ 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/marqo/core/inference/device_manager.py b/src/marqo/core/inference/device_manager.py index 7fd345e57..a284baa87 100644 --- a/src/marqo/core/inference/device_manager.py +++ b/src/marqo/core/inference/device_manager.py @@ -82,20 +82,30 @@ def cuda_device_health_check(self) -> None: oom_errors = [] for device in self.cuda_devices: + memory_stats = None try: cuda_device = torch.device(f'cuda:{device.id}') memory_stats = torch.cuda.memory_stats(cuda_device) logger.debug(f'CUDA device {device.full_name} with total memory {device.total_memory}. ' f'Memory stats: {str(memory_stats)}') - torch.randn(3, device=cuda_device) + # Marqo usually allocates 20MiB cuda memory at a time when processing media files. When OOM happens, + # there are usually a few MiB free in reserved memory. To consistently trigger OOM from this health + # check request, we try to create a tensor that won't fit in the reserved memory to always trigger + # another allocation. Please note this is not leaky since we don't store the reference to this tensor. + # Once it's out of scope, the memory it takes will be returned to reserved space. Our assumption is + # that this method will not be called too often or with high concurrency (only used by liveness check + # for now which run once every few seconds). + tensor_size = int(20 * 1024 * 1024 / 4) # 20MiB, float32 (4 bytes) + torch.randn(tensor_size, device=cuda_device) except RuntimeError as e: if 'out of memory' in str(e).lower(): - logger.error(f'CUDA device {device.full_name} is out of memory. Total memory: {device.total_memory}. ' - f'Memory stats: {str(memory_stats)}') - allocated_mem = memory_stats.get("allocated.all.current", None) if memory_stats else None - oom_errors.append(f'CUDA device {device.full_name} is out of memory:' - f' ({allocated_mem}/{device.total_memory})') + logger.error(f'CUDA device {device.full_name} is out of memory. Original error: {str(e)}. ' + f'Memory stats: {str(memory_stats)}.') + allocated_mem = memory_stats.get("allocated_bytes.all.current", None) if memory_stats else None + reserved_mem = memory_stats.get("reserved_bytes.all.current", None) if memory_stats else None + oom_errors.append(f'CUDA device {device.full_name} is out of memory (reserved: {reserved_mem}, ' + f'allocated: {allocated_mem}, total: {device.total_memory})') else: # Log out a warning message when encounter other transient errors. logger.error(f'Encountered issue inspecting CUDA device {device.full_name}: {str(e)}') diff --git a/tests/core/inference/test_device_manager.py b/tests/core/inference/test_device_manager.py index 2688e82a8..d009ccac5 100644 --- a/tests/core/inference/test_device_manager.py +++ b/tests/core/inference/test_device_manager.py @@ -31,6 +31,12 @@ def _device_manager_with_multiple_cuda_devices(self, total_memory: int = 1_000_0 return DeviceManager() + def _mem_stats(self, allocated: int, reserved: int) -> OrderedDict: + return OrderedDict({ + "allocated_bytes.all.current": allocated, + "reserved_bytes.all.current": reserved + }) + def test_init_with_cpu(self): device_manager = self._device_manager_without_cuda() @@ -79,22 +85,24 @@ def test_cuda_health_check_should_fail_when_cuda_device_is_out_of_memory(self): with mock.patch("torch.cuda.is_available", return_value=True), \ mock.patch("torch.randn", side_effect=RuntimeError("CUDA error: out of memory")), \ - mock.patch("torch.cuda.memory_stats", return_value=OrderedDict({"allocated.all.current": 900_000})): + mock.patch("torch.cuda.memory_stats", return_value=self._mem_stats(900_000, 950_000)): with self.assertRaises(CudaOutOfMemoryError) as err: device_manager.cuda_device_health_check() - self.assertEqual(str(err.exception), "CUDA device cuda:0(Tesla T4) is out of memory: (900000/1000000)") + self.assertEqual(str(err.exception), "CUDA device cuda:0(Tesla T4) is out of memory " + "(reserved: 950000, allocated: 900000, total: 1000000)") def test_cuda_health_check_should_fail_when_any_cuda_device_is_out_of_memory(self): device_manager = self._device_manager_with_multiple_cuda_devices(total_memory=1_000_000) with mock.patch("torch.cuda.is_available", return_value=True), \ mock.patch("torch.randn", side_effect=[torch.tensor([1, 2, 3]), RuntimeError("CUDA error: out of memory")]), \ - mock.patch("torch.cuda.memory_stats", return_value=OrderedDict({"allocated.all.current": 900_000})): + mock.patch("torch.cuda.memory_stats", return_value=self._mem_stats(900_000, 950_000)): with self.assertRaises(CudaOutOfMemoryError) as err: device_manager.cuda_device_health_check() - self.assertEqual(str(err.exception), "CUDA device cuda:1(Tesla H200) is out of memory: (900000/1000000)") + self.assertEqual(str(err.exception), "CUDA device cuda:1(Tesla H200) is out of memory " + "(reserved: 950000, allocated: 900000, total: 1000000)") def test_cuda_health_check_should_check_if_all_cuda_devices_are_out_of_memory(self): device_manager = self._device_manager_with_multiple_cuda_devices(total_memory=1_000_000) @@ -102,12 +110,14 @@ def test_cuda_health_check_should_check_if_all_cuda_devices_are_out_of_memory(se with mock.patch("torch.cuda.is_available", return_value=True), \ mock.patch("torch.randn", side_effect=[RuntimeError("CUDA error: out of memory"), RuntimeError("CUDA error: out of memory")]), \ - mock.patch("torch.cuda.memory_stats", return_value=OrderedDict({"allocated.all.current": 900_000})): + mock.patch("torch.cuda.memory_stats", return_value=self._mem_stats(900_000, 950_000)): with self.assertRaises(CudaOutOfMemoryError) as err: device_manager.cuda_device_health_check() - self.assertEqual(str(err.exception), "CUDA device cuda:0(Tesla T4) is out of memory: (900000/1000000);" - "CUDA device cuda:1(Tesla H200) is out of memory: (900000/1000000)") + self.assertEqual(str(err.exception), "CUDA device cuda:0(Tesla T4) is out of memory " + "(reserved: 950000, allocated: 900000, total: 1000000);" + "CUDA device cuda:1(Tesla H200) is out of memory " + "(reserved: 950000, allocated: 900000, total: 1000000)") def test_cuda_health_check_should_pass_and_log_error_message_when_cuda_calls_encounter_issue_other_than_oom(self): device_manager = self._device_manager_with_multiple_cuda_devices() From 0e45ef09184bc887818a1991f916271306751c23 Mon Sep 17 00:00:00 2001 From: Yihan Zhao Date: Fri, 20 Dec 2024 12:58:22 +1100 Subject: [PATCH 4/5] (Release 2.14) Preserving document-process node in services.xml when bootstrapping Vespa app (#1079) --- .../vespa_application_package.py | 2 +- .../index_management/test_services_xml.py | 65 +++++++++++++++++++ 2 files changed, 66 insertions(+), 1 deletion(-) diff --git a/src/marqo/core/index_management/vespa_application_package.py b/src/marqo/core/index_management/vespa_application_package.py index 275b669f0..e91e1fec1 100644 --- a/src/marqo/core/index_management/vespa_application_package.py +++ b/src/marqo/core/index_management/vespa_application_package.py @@ -86,7 +86,7 @@ def _cleanup_container_config(self): """ container_element = self._ensure_only_one('container') for child in container_element.findall('*'): - if child.tag in ['document-api', 'search']: + if child.tag in ['document-api', 'document-processing', 'search']: child.clear() elif child.tag != 'nodes': container_element.remove(child) diff --git a/tests/core/index_management/test_services_xml.py b/tests/core/index_management/test_services_xml.py index 8fa1dcf70..85b2c7964 100644 --- a/tests/core/index_management/test_services_xml.py +++ b/tests/core/index_management/test_services_xml.py @@ -200,6 +200,71 @@ def test_config_components(self): """ self._assertStringsEqualIgnoringWhitespace(expected_xml, service_xml.to_xml()) + def test_config_components_should_preserve_document_processing_nodes(self): + """ + This test case tests that document-processing node in the `container` section is preserved when configuring + components since it is usually referenced in the `content` section, and we do not change content section. + """ + xml = """ + + + + + + + + + + + + + + + + + """ + + service_xml = ServicesXml(xml) + service_xml.config_components() + + # removed all random element and custom components from container element + # added searcher, handler and other custom components to container element + # kept nodes in container as is + # kept content element as is + expected_xml = """ + + + + + + + + + + + + + + http://*/index-settings/* + http://*/index-settings + + + + marqo_index_settings.json + marqo_index_settings_history.json + + + + + + + + + + + """ + self._assertStringsEqualIgnoringWhitespace(expected_xml, service_xml.to_xml()) + def _assertStringsEqualIgnoringWhitespace(self, s1: str, s2: str): """Custom assertion to compare strings ignoring whitespace.""" From bb41f163ea9b5e54fb97371ab6649b1481d3fa50 Mon Sep 17 00:00:00 2001 From: Aditya Bharadwaj Date: Tue, 24 Dec 2024 12:24:48 +1100 Subject: [PATCH 5/5] (releases/2.14) Make compatibility tests run on releases branch and make them run on all previous patch versions (#1076) --- ...wards_compatibility_marqo_orchestrator.yml | 3 ++- .../scripts/generate_versions.py | 19 +++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/.github/workflows/backwards_compatibility_marqo_orchestrator.yml b/.github/workflows/backwards_compatibility_marqo_orchestrator.yml index e5f63d4ac..ffa65565d 100644 --- a/.github/workflows/backwards_compatibility_marqo_orchestrator.yml +++ b/.github/workflows/backwards_compatibility_marqo_orchestrator.yml @@ -5,6 +5,7 @@ on: push: branches: - mainline + - releases/* paths-ignore: - '**.md' workflow_dispatch: @@ -19,7 +20,7 @@ on: # Setting MAX_VERSIONS_TO_TEST, this can be a configurable value or if no input is provided, it can be a default value. env: - MAX_VERSIONS_TO_TEST: ${{ github.event.inputs.max_versions_to_test || 3 }} + MAX_VERSIONS_TO_TEST: ${{ github.event.inputs.max_versions_to_test || 5 }} jobs: check-if-image-exists: diff --git a/tests/backwards_compatibility_tests/scripts/generate_versions.py b/tests/backwards_compatibility_tests/scripts/generate_versions.py index 83614749a..d6a31275c 100644 --- a/tests/backwards_compatibility_tests/scripts/generate_versions.py +++ b/tests/backwards_compatibility_tests/scripts/generate_versions.py @@ -4,17 +4,17 @@ import subprocess import sys -def generate_versions(to_version: str, num_versions: int = 3) -> list: +def generate_versions(to_version: str, num_minor_versions_to_test: int = 3) -> list: """ Generate a list of previous versions based on the target version. This function generates a list of previous versions for a given target version. - It includes the previous patch version of the same minor version if applicable, - and the latest patch versions for preceding minor versions. + It includes all the previous patch versions of the same minor version if applicable, + and the latest patch versions for preceding minor versions of up to num_minor_versions_to_test. Args: to_version (str): The target version to generate previous versions for. - num_versions (int): The number of previous versions to generate. Defaults to 3. + num_minor_versions_to_test (int): The number of previous minor versions to generate. Defaults to 3. Returns: list: A list of previous versions as strings. @@ -24,13 +24,16 @@ def generate_versions(to_version: str, num_versions: int = 3) -> list: # If this is a patch release, add the previous patch version of the same minor version if target_version.patch > 0: - prev_patch_version = f"{target_version.major}.{target_version.minor}.{target_version.patch - 1}" - versions.append(prev_patch_version) + versions.extend( + f"{target_version.major}.{target_version.minor}.{i}" + for i in range(target_version.patch - 1, -1, -1) + ) # Gather the latest patch version for each preceding minor version minor = target_version.minor - 1 - while len(versions) < num_versions and minor >= 0: - # Get all tags for the given minor version, sort, and pick the latest patch + for _ in range(num_minor_versions_to_test): + if minor < 0: + break tags = subprocess.check_output( ["git", "tag", "--list", f"{target_version.major}.{minor}.*"], text=True