Skip to content

Commit

Permalink
Fix unstructured document parsing when a field value contains "::" (#732)
Browse files Browse the repository at this point in the history
  • Loading branch information
wanliAlex authored Jan 24, 2024
1 parent d20d6f0 commit da275df
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def extract_highlights(self) -> List[Dict[str, str]]:
["cells"].keys())[0])
except KeyError:
raise VespaDocumentParsingError("No match features found in the document")
field_name, content = self.vespa_chunks[chunk_index].split("::", 2)
field_name, content = self.vespa_chunks[chunk_index].split("::", 1)
return [{field_name: content}]


Expand Down Expand Up @@ -118,7 +118,7 @@ def to_marqo_document(self, return_highlights: bool = False) -> Dict[str, Any]:
marqo_document.update(self.fields.long_string_fields)
# Reconstruct string arrays
for string_array in self.fields.string_arrays:
key, value = string_array.split("::", 2)
key, value = string_array.split("::", 1)
if key not in marqo_document:
marqo_document[key] = []
marqo_document[key].append(value)
Expand Down Expand Up @@ -146,7 +146,7 @@ def to_marqo_document(self, return_highlights: bool = False) -> Dict[str, Any]:
for chunk, embedding in zip(self.fields.vespa_chunks, embeddings_list):
if "::" not in chunk:
raise VespaDocumentParsingError(f"Chunk {chunk} does not have a field_name::content format")
field_name, content = chunk.split("::", 2)
field_name, content = chunk.split("::", 1)
if field_name not in marqo_document[index_constants.MARQO_DOC_TENSORS]:
marqo_document[index_constants.MARQO_DOC_TENSORS][field_name] = dict()
marqo_document[index_constants.MARQO_DOC_TENSORS][field_name][index_constants.MARQO_DOC_CHUNKS]\
Expand Down
22 changes: 22 additions & 0 deletions tests/tensor_search/integ_tests/test_search_unstructured.py
Original file line number Diff line number Diff line change
Expand Up @@ -1175,3 +1175,25 @@ def test_tensor_search_highlights_format(self):
self.assertTrue(isinstance(hit["_highlights"], list))
self.assertEqual(1, len(hit["_highlights"])) # We only have 1 highlight now
self.assertTrue(isinstance(hit["_highlights"][0], dict))

def test_search_with_content_double_colon(self):
    """Tensor search succeeds when a tensor field's content contains "::".

    Indexes a single document whose text value begins with "::" and checks
    that a tensor search returns exactly that one document.
    """
    add_docs_params = AddDocsParams(
        index_name=self.default_text_index,
        docs=[
            {"_id": "1", "text_field": "::my_text"},  # This should work properly
        ],
        tensor_fields=["text_field"],
    )
    tensor_search.add_documents(config=self.config, add_docs_params=add_docs_params)

    hits = tensor_search.search(
        text="some text",
        index_name=self.default_text_index,
        config=self.config,
        search_method=SearchMethod.TENSOR,
    )['hits']

    # Exactly one document was indexed, so exactly that document must come back.
    self.assertEqual(1, len(hits))
    self.assertEqual("1", hits[0]['_id'])

0 comments on commit da275df

Please sign in to comment.