From ecb76142f7b7fd95bf228332e3c197d04d99003c Mon Sep 17 00:00:00 2001 From: Erwin Pe Date: Mon, 3 Feb 2025 09:02:51 -0500 Subject: [PATCH] MONGOCRYPT-762 Generate text search token sets from StrEncode output (#946) --- ...mc-fle2-insert-update-payload-private-v2.h | 3 +- src/mc-fle2-insert-update-payload-v2.c | 8 + src/mongocrypt-buffer-private.h | 6 + src/mongocrypt-buffer.c | 29 +- src/mongocrypt-marking.c | 207 +++++++++---- test/test-mongocrypt-buffer.c | 25 ++ test/test-mongocrypt-marking.c | 276 ++++++++++++++++-- 7 files changed, 458 insertions(+), 96 deletions(-) diff --git a/src/mc-fle2-insert-update-payload-private-v2.h b/src/mc-fle2-insert-update-payload-private-v2.h index 138b10e86..fe1c7476a 100644 --- a/src/mc-fle2-insert-update-payload-private-v2.h +++ b/src/mc-fle2-insert-update-payload-private-v2.h @@ -33,7 +33,8 @@ _mongocrypt_buffer_t encryptedTokens; \ } mc_Text##Type##TokenSet_t; \ void mc_Text##Type##TokenSet_init(mc_Text##Type##TokenSet_t *); \ - void mc_Text##Type##TokenSet_cleanup(mc_Text##Type##TokenSet_t *) + void mc_Text##Type##TokenSet_cleanup(mc_Text##Type##TokenSet_t *); \ + void mc_Text##Type##TokenSet_shallow_copy(const mc_Text##Type##TokenSet_t *src, mc_Text##Type##TokenSet_t *dest) DEF_TEXT_SEARCH_TOKEN_SET(Exact); DEF_TEXT_SEARCH_TOKEN_SET(Substring); diff --git a/src/mc-fle2-insert-update-payload-v2.c b/src/mc-fle2-insert-update-payload-v2.c index f379f3952..9f181673b 100644 --- a/src/mc-fle2-insert-update-payload-v2.c +++ b/src/mc-fle2-insert-update-payload-v2.c @@ -32,6 +32,14 @@ _mongocrypt_buffer_cleanup(&ts->escDerivedToken); \ _mongocrypt_buffer_cleanup(&ts->serverDerivedFromDataToken); \ _mongocrypt_buffer_cleanup(&ts->encryptedTokens); \ + } \ + void mc_Text##Type##TokenSet_shallow_copy(const mc_Text##Type##TokenSet_t *src, mc_Text##Type##TokenSet_t *dst) { \ + BSON_ASSERT_PARAM(src); \ + BSON_ASSERT_PARAM(dst); \ + _mongocrypt_buffer_set_to(&src->edcDerivedToken, &dst->edcDerivedToken); \ + _mongocrypt_buffer_set_to(&src->escDerivedToken, &dst->escDerivedToken); \ + _mongocrypt_buffer_set_to(&src->serverDerivedFromDataToken, &dst->serverDerivedFromDataToken); \ + _mongocrypt_buffer_set_to(&src->encryptedTokens, &dst->encryptedTokens); \ } DEF_TEXT_SEARCH_TOKEN_SET_INIT_CLEANUP(Exact) diff --git a/src/mongocrypt-buffer-private.h b/src/mongocrypt-buffer-private.h index 18a604777..127b91846 100644 --- a/src/mongocrypt-buffer-private.h +++ b/src/mongocrypt-buffer-private.h @@ -162,4 +162,10 @@ bool _mongocrypt_buffer_from_subrange(_mongocrypt_buffer_t *out, uint32_t offset, uint32_t len) MONGOCRYPT_WARN_UNUSED_RESULT; +/* _mongocrypt_buffer_copy_from_string_as_bson_value initializes @out, wraps the provided string + * into a BSON value, and copies the BSON value to @out. No BSON validation is performed on @str. + * Caller must call _mongocrypt_buffer_cleanup. + */ +void _mongocrypt_buffer_copy_from_string_as_bson_value(_mongocrypt_buffer_t *out, const char *str, int len); + #endif /* MONGOCRYPT_BUFFER_H */ diff --git a/src/mongocrypt-buffer.c b/src/mongocrypt-buffer.c index fb872d5ce..257c6a351 100644 --- a/src/mongocrypt-buffer.c +++ b/src/mongocrypt-buffer.c @@ -317,7 +317,10 @@ bool _mongocrypt_buffer_to_bson_value(_mongocrypt_buffer_t *plaintext, uint8_t t return ret; } -void _mongocrypt_buffer_from_iter(_mongocrypt_buffer_t *plaintext, bson_iter_t *iter) { +static void _mongocrypt_buffer_copy_as_bson_value(_mongocrypt_buffer_t *plaintext, + bool (*append_func)(bson_t *bson, const void *data, int len), + const void *data, + int len) { bson_t wrapper = BSON_INITIALIZER; int32_t offset = INT32_LEN /* skips document size */ + TYPE_LEN /* element type */ @@ -326,13 +329,14 @@ void _mongocrypt_buffer_from_iter(_mongocrypt_buffer_t *plaintext, bson_iter_t * uint8_t *wrapper_data; BSON_ASSERT_PARAM(plaintext); - BSON_ASSERT_PARAM(iter); + BSON_ASSERT_PARAM(append_func); /* It is not straightforward to transform a bson_value_t to a string of * bytes. As a workaround, we wrap the value in a bson document with an empty * key, then use the raw buffer from inside the new bson_t, skipping the * length and type header information and the key name. */ - bson_append_iter(&wrapper, "", 0, iter); + append_func(&wrapper, data, len); + wrapper_data = ((uint8_t *)bson_get_data(&wrapper)); BSON_ASSERT(wrapper.len >= (uint32_t)offset + NULL_BYTE_LEN); plaintext->len = wrapper.len - (uint32_t)offset - NULL_BYTE_LEN; /* the final null byte */ @@ -345,6 +349,25 @@ void _mongocrypt_buffer_from_iter(_mongocrypt_buffer_t *plaintext, bson_iter_t * bson_destroy(&wrapper); } +static bool _append_iter(bson_t *bson, const void *iter, int len) { + return bson_append_iter(bson, "", 0, (const bson_iter_t *)iter); +} + +static bool _append_utf8(bson_t *bson, const void *str, int len) { + return bson_append_utf8(bson, "", 0, (const char *)str, len); +} + +void _mongocrypt_buffer_copy_from_string_as_bson_value(_mongocrypt_buffer_t *plaintext, const char *str, int len) { + BSON_ASSERT_PARAM(str); + BSON_ASSERT(len >= 0); + _mongocrypt_buffer_copy_as_bson_value(plaintext, _append_utf8, str, len); +} + +void _mongocrypt_buffer_from_iter(_mongocrypt_buffer_t *plaintext, bson_iter_t *iter) { + BSON_ASSERT_PARAM(iter); + _mongocrypt_buffer_copy_as_bson_value(plaintext, _append_iter, iter, 0); +} + bool _mongocrypt_buffer_from_uuid_iter(_mongocrypt_buffer_t *buf, bson_iter_t *iter) { const uint8_t *data; bson_subtype_t subtype; diff --git a/src/mongocrypt-marking.c b/src/mongocrypt-marking.c index ba8ebbb43..50025638f 100644 --- a/src/mongocrypt-marking.c +++ b/src/mongocrypt-marking.c @@ -29,6 +29,8 @@ #include "mc-range-edge-generation-private.h" #include "mc-range-encoding-private.h" #include "mc-range-mincover-private.h" +#include "mc-str-encode-string-sets-private.h" +#include "mc-text-search-str-encode-private.h" #include "mc-tokens-private.h" #include "mongocrypt-buffer-private.h" #include "mongocrypt-ciphertext-private.h" @@ -1126,26 +1128,22 @@ static bool _fle2_generate_TextSearchTokenSets(_mongocrypt_key_broker_t *kb, mc_FLE2InsertUpdatePayloadV2_t *payload, const _mongocrypt_buffer_t *indexKeyId, const mc_FLE2TextSearchInsertSpec_t *spec, - const _mongocrypt_buffer_t *value, int64_t contentionFactor, mongocrypt_status_t *status) { BSON_ASSERT_PARAM(kb); BSON_ASSERT_PARAM(payload); BSON_ASSERT_PARAM(indexKeyId); BSON_ASSERT_PARAM(spec); - BSON_ASSERT_PARAM(value); _mongocrypt_crypto_t *crypto = kb->crypt->crypto; mc_TextSearchTokenSets_t *tsts = &payload->textSearchTokenSets.tsts; _FLE2EncryptedPayloadCommon_t common = {{0}}; bool res = false; - // TODO MONGOCRYPT-759 implement case folding; for now let foldedValue be a copy of value. - _mongocrypt_buffer_t foldedValue = {0}; - _mongocrypt_buffer_init(&foldedValue); - _mongocrypt_buffer_copy_to(value, &foldedValue); - - // TODO MONGOCRYPT-762 do StrEncode here to get substring sets to encode + mc_str_encode_sets_t *encodeSets = mc_text_search_str_encode(spec, status); + if (!encodeSets) { + goto fail; + } // Start the token derivations if (!_get_tokenKey(kb, indexKeyId, &common.tokenKey, status)) { @@ -1164,72 +1162,154 @@ static bool _fle2_generate_TextSearchTokenSets(_mongocrypt_key_broker_t *kb, goto fail; } - if (!_fle2_generate_TextExactTokenSet(kb, - &tsts->exact, - &foldedValue, - contentionFactor, - common.collectionsLevel1Token, - common.serverTokenDerivationLevel1Token, - status)) { - goto fail; + // Generate exact token set singleton + { + _mongocrypt_buffer_t asBsonValue; + _mongocrypt_buffer_init(&asBsonValue); + BSON_ASSERT(encodeSets->exact.len < INT_MAX); + _mongocrypt_buffer_copy_from_string_as_bson_value(&asBsonValue, + (const char *)encodeSets->exact.data, + (int)encodeSets->exact.len); + if (!_fle2_generate_TextExactTokenSet(kb, + &tsts->exact, + &asBsonValue, + contentionFactor, + common.collectionsLevel1Token, + common.serverTokenDerivationLevel1Token, + status)) { + _mongocrypt_buffer_cleanup(&asBsonValue); + goto fail; + } + _mongocrypt_buffer_cleanup(&asBsonValue); } - if (spec->substr.set) { - // TODO MONGOCRYPT-762 iterate on StrEncode substrings set - mc_TextSubstringTokenSet_t substrSet = {{0}}; - mc_TextSubstringTokenSet_init(&substrSet); + const char *substring; + uint32_t bytelen; + uint32_t appendCount; - if (!_fle2_generate_TextSubstringTokenSet(kb, - &substrSet, - &foldedValue, - contentionFactor, - common.collectionsLevel1Token, - common.serverTokenDerivationLevel1Token, - status)) { - mc_TextSubstringTokenSet_cleanup(&substrSet); - goto fail; + // Generate array of substring token sets + if (encodeSets->substring_set) { + mc_substring_set_iter_t set_itr; + mc_substring_set_iter_init(&set_itr, encodeSets->substring_set); + + while (mc_substring_set_iter_next(&set_itr, &substring, &bytelen, &appendCount)) { + BSON_ASSERT(appendCount > 0); + BSON_ASSERT(bytelen < INT_MAX); + + mc_TextSubstringTokenSet_t tset = {{0}}; + + _mongocrypt_buffer_t asBsonValue; + _mongocrypt_buffer_init(&asBsonValue); + _mongocrypt_buffer_copy_from_string_as_bson_value(&asBsonValue, substring, (int)bytelen); + + if (!_fle2_generate_TextSubstringTokenSet(kb, + &tset, + &asBsonValue, + contentionFactor, + common.collectionsLevel1Token, + common.serverTokenDerivationLevel1Token, + status)) { + _mongocrypt_buffer_cleanup(&asBsonValue); + mc_TextSubstringTokenSet_cleanup(&tset); + goto fail; + } + _mongocrypt_buffer_cleanup(&asBsonValue); + + if (appendCount > 1) { + mc_TextSubstringTokenSet_t tset_copy; + mc_TextSubstringTokenSet_shallow_copy(&tset, &tset_copy); + for (; appendCount > 1; appendCount--) { + _mc_array_append_val(&tsts->substringArray, tset_copy); + } + } + _mc_array_append_val(&tsts->substringArray, tset); // array now owns tset } - _mc_array_append_val(&tsts->substringArray, substrSet); - } - if (spec->suffix.set) { - // TODO MONGOCRYPT-762 iterate on StrEncode suffixes set - mc_TextSuffixTokenSet_t suffixSet = {{0}}; - mc_TextSuffixTokenSet_init(&suffixSet); - - if (!_fle2_generate_TextSuffixTokenSet(kb, - &suffixSet, - &foldedValue, - contentionFactor, - common.collectionsLevel1Token, - common.serverTokenDerivationLevel1Token, - status)) { - mc_TextSuffixTokenSet_cleanup(&suffixSet); - goto fail; + } + + // Generate array of suffix token sets + if (encodeSets->suffix_set) { + mc_affix_set_iter_t set_itr; + mc_affix_set_iter_init(&set_itr, encodeSets->suffix_set); + + while (mc_affix_set_iter_next(&set_itr, &substring, &bytelen, &appendCount)) { + BSON_ASSERT(appendCount > 0); + BSON_ASSERT(bytelen < INT_MAX); + + mc_TextSuffixTokenSet_t tset = {{0}}; + mc_TextSuffixTokenSet_init(&tset); + + _mongocrypt_buffer_t asBsonValue; + _mongocrypt_buffer_init(&asBsonValue); + _mongocrypt_buffer_copy_from_string_as_bson_value(&asBsonValue, substring, (int)bytelen); + + if (!_fle2_generate_TextSuffixTokenSet(kb, + &tset, + &asBsonValue, + contentionFactor, + common.collectionsLevel1Token, + common.serverTokenDerivationLevel1Token, + status)) { + _mongocrypt_buffer_cleanup(&asBsonValue); + mc_TextSuffixTokenSet_cleanup(&tset); + goto fail; + } + _mongocrypt_buffer_cleanup(&asBsonValue); + + if (appendCount > 1) { + mc_TextSuffixTokenSet_t tset_copy; + mc_TextSuffixTokenSet_shallow_copy(&tset, &tset_copy); + for (; appendCount > 1; appendCount--) { + _mc_array_append_val(&tsts->suffixArray, tset_copy); + } + } + _mc_array_append_val(&tsts->suffixArray, tset); // array now owns tset } - _mc_array_append_val(&tsts->suffixArray, suffixSet); - } - if (spec->prefix.set) { - // TODO MONGOCRYPT-762 iterate on StrEncode suffixes set - mc_TextPrefixTokenSet_t prefixSet = {{0}}; - mc_TextPrefixTokenSet_init(&prefixSet); - - if (!_fle2_generate_TextPrefixTokenSet(kb, - &prefixSet, - &foldedValue, - contentionFactor, - common.collectionsLevel1Token, - common.serverTokenDerivationLevel1Token, - status)) { - mc_TextPrefixTokenSet_cleanup(&prefixSet); - goto fail; + } + + // Generate array of prefix token sets + if (encodeSets->prefix_set) { + mc_affix_set_iter_t set_itr; + mc_affix_set_iter_init(&set_itr, encodeSets->prefix_set); + + while (mc_affix_set_iter_next(&set_itr, &substring, &bytelen, &appendCount)) { + BSON_ASSERT(appendCount > 0); + BSON_ASSERT(bytelen < INT_MAX); + + mc_TextPrefixTokenSet_t tset = {{0}}; + mc_TextPrefixTokenSet_init(&tset); + + _mongocrypt_buffer_t asBsonValue; + _mongocrypt_buffer_init(&asBsonValue); + _mongocrypt_buffer_copy_from_string_as_bson_value(&asBsonValue, substring, (int)bytelen); + + if (!_fle2_generate_TextPrefixTokenSet(kb, + &tset, + &asBsonValue, + contentionFactor, + common.collectionsLevel1Token, + common.serverTokenDerivationLevel1Token, + status)) { + _mongocrypt_buffer_cleanup(&asBsonValue); + mc_TextPrefixTokenSet_cleanup(&tset); + goto fail; + } + _mongocrypt_buffer_cleanup(&asBsonValue); + + if (appendCount > 1) { + mc_TextPrefixTokenSet_t tset_copy; + mc_TextPrefixTokenSet_shallow_copy(&tset, &tset_copy); + for (; appendCount > 1; appendCount--) { + _mc_array_append_val(&tsts->prefixArray, tset_copy); // array now owns tset_copy + } + } + _mc_array_append_val(&tsts->prefixArray, tset); // moves ownership of tset } - _mc_array_append_val(&tsts->prefixArray, prefixSet); } payload->textSearchTokenSets.set = true; res = true; fail: _FLE2EncryptedPayloadCommon_cleanup(&common); - _mongocrypt_buffer_cleanup(&foldedValue); + mc_str_encode_sets_destroy(encodeSets); return res; } @@ -1350,7 +1430,6 @@ static bool _mongocrypt_fle2_placeholder_to_insert_update_ciphertextForTextSearc &payload, &placeholder->index_key_id, &insertSpec, - &value, payload.contentionFactor, status)) { goto fail; diff --git a/test/test-mongocrypt-buffer.c b/test/test-mongocrypt-buffer.c index bb1bf441b..1945be85d 100644 --- a/test/test-mongocrypt-buffer.c +++ b/test/test-mongocrypt-buffer.c @@ -16,6 +16,7 @@ #include +#include "mongocrypt-buffer-private.h" #include "test-mongocrypt-assert.h" #include "test-mongocrypt.h" @@ -232,6 +233,29 @@ static void _test_mongocrypt_buffer_from_subrange(_mongocrypt_tester_t *tester) _mongocrypt_buffer_cleanup(&input); } +static void _test_mongocrypt_buffer_copy_from_string_as_bson_value(_mongocrypt_tester_t *tester) { + _mongocrypt_buffer_t buf; + _mongocrypt_buffer_t expectedLenBuf; + const char *data = "foobar"; + + // expect output to contain 4-byte length + data + null string terminator + size_t expectedLen = sizeof(int32_t) + strlen(data) + sizeof(uint8_t); + _mongocrypt_buffer_copy_from_hex(&expectedLenBuf, "07000000"); + + _mongocrypt_buffer_copy_from_string_as_bson_value(&buf, data, (int)strlen(data)); + ASSERT(buf.len == expectedLen); + + // check 4-byte length + ASSERT_CMPBYTES(expectedLenBuf.data, expectedLenBuf.len, buf.data, expectedLenBuf.len); + // check data + null byte + ASSERT_CMPBYTES((const uint8_t *)data, + strlen(data) + 1, + buf.data + expectedLenBuf.len, + buf.len - expectedLenBuf.len); + _mongocrypt_buffer_cleanup(&buf); + _mongocrypt_buffer_cleanup(&expectedLenBuf); +} + void _mongocrypt_tester_install_buffer(_mongocrypt_tester_t *tester) { INSTALL_TEST(_test_mongocrypt_buffer_from_iter); INSTALL_TEST(_test_mongocrypt_buffer_copy_from_data_and_size); @@ -239,4 +263,5 @@ void _mongocrypt_tester_install_buffer(_mongocrypt_tester_t *tester) { INSTALL_TEST(_test_mongocrypt_buffer_steal_from_string); INSTALL_TEST(_test_mongocrypt_buffer_copy_from_uint64_le); INSTALL_TEST(_test_mongocrypt_buffer_from_subrange); + INSTALL_TEST(_test_mongocrypt_buffer_copy_from_string_as_bson_value); } diff --git a/test/test-mongocrypt-marking.c b/test/test-mongocrypt-marking.c index ab472198c..7d4ab28c6 100644 --- a/test/test-mongocrypt-marking.c +++ b/test/test-mongocrypt-marking.c @@ -26,6 +26,7 @@ #include "test-mongocrypt-assert.h" #include "test-mongocrypt.h" #include +#include /* Create a basis marking buffer with valid values for the given fields. */ static void _make_marking(bson_t *bson, _mongocrypt_buffer_t *buf) { @@ -1215,16 +1216,16 @@ static void validate_text_search_token_set_common(bson_iter_t *iter_at_token_set mongocrypt_binary_t encrypted_token_bin; validate_and_get_bindata(&ts_bson, "d", BSON_SUBTYPE_BINARY, &token_bin); - ASSERT(token_bin.len == MONGOCRYPT_HMAC_SHA256_LEN); + ASSERT_CMPUINT32(token_bin.len, ==, MONGOCRYPT_HMAC_SHA256_LEN); validate_and_get_bindata(&ts_bson, "l", BSON_SUBTYPE_BINARY, &token_bin); - ASSERT(token_bin.len == MONGOCRYPT_HMAC_SHA256_LEN); + ASSERT_CMPUINT32(token_bin.len, ==, MONGOCRYPT_HMAC_SHA256_LEN); validate_and_get_bindata(&ts_bson, "s", BSON_SUBTYPE_BINARY, &esc_token_bin); - ASSERT(esc_token_bin.len == MONGOCRYPT_HMAC_SHA256_LEN); + ASSERT_CMPUINT32(esc_token_bin.len, ==, MONGOCRYPT_HMAC_SHA256_LEN); validate_and_get_bindata(&ts_bson, "p", BSON_SUBTYPE_BINARY, &encrypted_token_bin); - ASSERT(encrypted_token_bin.len == (16 + MONGOCRYPT_HMAC_SHA256_LEN)); + ASSERT_CMPUINT32(encrypted_token_bin.len, ==, (16 + MONGOCRYPT_HMAC_SHA256_LEN)); // validate crypto of p validate_encrypted_token(crypt, &encrypted_token_bin, &esc_token_bin, false, NULL); @@ -1251,11 +1252,11 @@ static size_t validate_text_search_token_set_array_common(bson_iter_t *iter_at_a return count; } -typedef enum { - TEXTSEARCH_SPEC_HAS_SUBSTRING = 1 << 0, - TEXTSEARCH_SPEC_HAS_SUFFIX = 1 << 1, - TEXTSEARCH_SPEC_HAS_PREFIX = 1 << 2, -} text_search_spec_query_type_flags; +typedef struct { + size_t substrings; + size_t suffixes; + size_t prefixes; +} text_search_expected_token_counts; // Assert that the fields in a insert/update payload V2 for text search match our expectations. // Specifically, checks that the length of these fields, and the values of deterministic fields, @@ -1266,27 +1267,28 @@ static void validate_text_search_ciphertext(_mongocrypt_tester_t *tester, const char *text_value, mongocrypt_fle2_placeholder_type_t type, uint64_t contention_max, - text_search_spec_query_type_flags flags) { + text_search_expected_token_counts expected_tag_counts) { bson_t iup_bson; bson_iter_t iter; ASSERT(_mongocrypt_buffer_to_bson(&ciphertext->data, &iup_bson)); mc_ServerDataEncryptionLevel1Token_t *sdel1Token = getSDEL1Token(crypt); + const mongocrypt_binary_t *keyId = TEST_BIN(16); // don't free! if (type == MONGOCRYPT_FLE2_PLACEHOLDER_TYPE_INSERT) { - ASSERT(ciphertext->blob_subtype == MC_SUBTYPE_FLE2InsertUpdatePayloadV2); - ASSERT(ciphertext->original_bson_type == 0); // unset - ASSERT(ciphertext->key_id.len == 0); // unset + ASSERT_CMPUINT8(ciphertext->blob_subtype, ==, MC_SUBTYPE_FLE2InsertUpdatePayloadV2); + ASSERT_CMPUINT8(ciphertext->original_bson_type, ==, 0); // unset + ASSERT_CMPUINT32(ciphertext->key_id.len, ==, 0); // unset iupv2_fields_common res = validate_iupv2_common(&iup_bson); // validate u, t, k have correct values - ASSERT(memcmp(res.u.data, (TEST_BIN(16))->data, 16) == 0); - ASSERT(res.t == BSON_TYPE_UTF8); - ASSERT(res.k <= contention_max); + ASSERT_CMPBYTES(keyId->data, keyId->len, res.u.data, res.u.len); + ASSERT_CMPUINT32(res.t, ==, BSON_TYPE_UTF8); + ASSERT_CMPUINT64(res.k, <=, contention_max); // validate e is ServerDataEncryptionLevel1Token = HMAC(RootKey, 3) - ASSERT(res.e.len == mc_ServerDataEncryptionLevel1Token_get(sdel1Token)->len); + ASSERT_CMPUINT32(res.e.len, ==, mc_ServerDataEncryptionLevel1Token_get(sdel1Token)->len); ASSERT(memcmp(res.e.data, mc_ServerDataEncryptionLevel1Token_get(sdel1Token)->data, res.e.len) == 0); // validate crypto of p @@ -1299,8 +1301,8 @@ static void validate_text_search_ciphertext(_mongocrypt_tester_t *tester, const _mongocrypt_value_encryption_algorithm_t *fle2alg = _mcFLE2v2AEADAlgorithm(); // assert first 16 bytes == userKeyId == indexKeyId - ASSERT(res.v.len > 16); - ASSERT(memcmp(res.v.data, res.u.data, res.u.len) == 0); + ASSERT_CMPUINT32(res.v.len, >, 16); + ASSERT_CMPBYTES(keyId->data, keyId->len, res.v.data, 16); _mongocrypt_buffer_t key, aad, ctext, ptext; _mongocrypt_buffer_init_size(&key, MONGOCRYPT_KEY_LEN); @@ -1314,9 +1316,9 @@ static void validate_text_search_ciphertext(_mongocrypt_tester_t *tester, ASSERT_OK_STATUS(fle2alg->do_decrypt(crypt->crypto, &aad, &key, &ctext, &ptext, &pbytes, status), status); // BSON strings have 5 (4 for size + 1 null terminator) bytes of overhead - ASSERT(pbytes >= 5); - ASSERT(strlen(text_value) == pbytes - 5); - ASSERT(0 == strcmp(text_value, (char *)(ptext.data + 4))); + ASSERT_CMPUINT32(pbytes, >=, 5); + ASSERT_CMPSIZE_T(strlen(text_value), ==, (pbytes - 5)); + ASSERT_STREQUAL(text_value, ((char *)(ptext.data + 4))); _mongocrypt_buffer_cleanup(&ptext); _mongocrypt_buffer_cleanup(&ctext); @@ -1344,21 +1346,107 @@ static void validate_text_search_ciphertext(_mongocrypt_tester_t *tester, size_t tscount = 0; ASSERT(bson_iter_init_find(&b_iter, &b_bson, "s")); tscount = validate_text_search_token_set_array_common(&b_iter, crypt); - ASSERT((tscount > 0) == !!(flags & TEXTSEARCH_SPEC_HAS_SUBSTRING)); + ASSERT_CMPSIZE_T(expected_tag_counts.substrings, ==, tscount); ASSERT(bson_iter_init_find(&b_iter, &b_bson, "u")); tscount = validate_text_search_token_set_array_common(&b_iter, crypt); - ASSERT((tscount > 0) == !!(flags & TEXTSEARCH_SPEC_HAS_SUFFIX)); + ASSERT_CMPSIZE_T(expected_tag_counts.suffixes, ==, tscount); ASSERT(bson_iter_init_find(&b_iter, &b_bson, "p")); tscount = validate_text_search_token_set_array_common(&b_iter, crypt); - ASSERT((tscount > 0) == !!(flags & TEXTSEARCH_SPEC_HAS_PREFIX)); + ASSERT_CMPSIZE_T(expected_tag_counts.prefixes, ==, tscount); } mc_ServerDataEncryptionLevel1Token_destroy(sdel1Token); bson_destroy(&iup_bson); } +static size_t calculate_expected_substring_tag_count(size_t beta, size_t mlen, size_t ub, size_t lb) { + ASSERT_CMPSIZE_T(beta, <=, (SIZE_MAX - 15)); + ASSERT_CMPSIZE_T(lb, <=, ub); + ASSERT_CMPSIZE_T(mlen, >=, ub); + + size_t cbclen = 16 * ((beta + 15) / 16); + if (beta > mlen || lb > cbclen) { + return 0; + } + size_t maxkgram1 = 0; + size_t maxkgram2 = 0; + for (size_t j = lb; j <= ub; j++) { + maxkgram1 += (mlen - j + 1); + } + for (size_t j = lb; j <= BSON_MIN(ub, cbclen); j++) { + maxkgram2 += (cbclen - j + 1); + } + return BSON_MIN(maxkgram1, maxkgram2); // msize +} + +static size_t calculate_expected_nfix_tag_count(size_t beta, size_t ub, size_t lb) { + ASSERT_CMPSIZE_T(beta, <=, (SIZE_MAX - 15)); + ASSERT_CMPSIZE_T(lb, <=, ub); + size_t cbclen = 16 * ((beta + 15) / 16); + if (lb > cbclen) { + return 0; + } + return BSON_MIN(ub, cbclen) - lb + 1; +} + +// Runs _mongocrypt_marking_to_ciphertext to compute the ciphertext for the given marking. +static bool test_text_search_insert_marking_to_ciphertext(_mongocrypt_tester_t *tester, + mongocrypt_t *crypt, + _mongocrypt_ciphertext_t *out, + const char *test_string, + int test_string_len, + int mlen, + mongocrypt_status_t *status) { + ASSERT_CMPINT(mlen, >, 0); + + mongocrypt_ctx_t *ctx = mongocrypt_ctx_new(crypt); + // Set up encryption environment + ASSERT_OK(mongocrypt_ctx_encrypt_init(ctx, "test", -1, TEST_FILE("./test/example/cmd.json")), ctx); + // Add a test key + _mongocrypt_buffer_t keyId; + _mongocrypt_buffer_from_binary(&keyId, TEST_BIN(16)); + keyId.subtype = BSON_SUBTYPE_UUID; + _mongocrypt_key_broker_add_test_key(&ctx->kb, &keyId); + + _mongocrypt_buffer_t marking_buf; + _mongocrypt_marking_t marking; + + bson_t *marking_bson = bson_new(); + BSON_APPEND_INT32(marking_bson, "t", 1); + BSON_APPEND_INT32(marking_bson, "a", 4); + BSON_APPEND_INT64(marking_bson, "cm", 2); + bson_t text_spec; + BSON_APPEND_DOCUMENT_BEGIN(marking_bson, "v", &text_spec); + bson_append_utf8(&text_spec, "v", 1, test_string, test_string_len); + BSON_APPEND_BOOL(&text_spec, "casef", false); + BSON_APPEND_BOOL(&text_spec, "diacf", false); + bson_t subspec; + BSON_APPEND_DOCUMENT_BEGIN(&text_spec, "substr", &subspec); + BSON_APPEND_INT32(&subspec, "mlen", mlen); + BSON_APPEND_INT32(&subspec, "ub", 1); + BSON_APPEND_INT32(&subspec, "lb", 1); + ASSERT(bson_append_document_end(&text_spec, &subspec)); + ASSERT(bson_append_document_end(marking_bson, &text_spec)); + + // Add key identifier info to the marking + BSON_APPEND_BINARY(marking_bson, "ki", BSON_SUBTYPE_UUID, (TEST_BIN(16))->data, 16); + BSON_APPEND_BINARY(marking_bson, "ku", BSON_SUBTYPE_UUID, (TEST_BIN(16))->data, 16); + _make_marking(marking_bson, &marking_buf); + // Use FLE2 as the subtype (default is FLE1) + marking_buf.data[0] = MC_SUBTYPE_FLE2EncryptionPlaceholder; + _parse_ok(&marking_buf, &marking); + + bool result = _mongocrypt_marking_to_ciphertext((void *)&ctx->kb, &marking, out, status); + + _mongocrypt_buffer_cleanup(&marking_buf); + bson_destroy(marking_bson); + _mongocrypt_marking_cleanup(&marking); + mongocrypt_ctx_destroy(ctx); + return result; +} + static void test_mc_marking_to_ciphertext_fle2_text_search(_mongocrypt_tester_t *tester) { if (!_aes_ctr_is_supported_by_os) { TEST_PRINTF("Common Crypto with no CTR support detected. Skipping."); @@ -1382,6 +1470,8 @@ static void test_mc_marking_to_ciphertext_fle2_text_search(_mongocrypt_tester_t _mongocrypt_ciphertext_t ciphertext; _mongocrypt_ciphertext_init(&ciphertext); mongocrypt_t *crypt = _mongocrypt_tester_mongocrypt(TESTER_MONGOCRYPT_DEFAULT); + text_search_expected_token_counts counts = {0}; + counts.substrings = calculate_expected_substring_tag_count(6, 1000, 100, 10); get_ciphertext_from_marking_json(tester, crypt, markingJSON, &ciphertext); validate_text_search_ciphertext(tester, @@ -1390,7 +1480,7 @@ static void test_mc_marking_to_ciphertext_fle2_text_search(_mongocrypt_tester_t "foobar", MONGOCRYPT_FLE2_PLACEHOLDER_TYPE_INSERT, 2, - TEXTSEARCH_SPEC_HAS_SUBSTRING); + counts); mongocrypt_destroy(crypt); _mongocrypt_ciphertext_cleanup(&ciphertext); @@ -1413,6 +1503,8 @@ static void test_mc_marking_to_ciphertext_fle2_text_search(_mongocrypt_tester_t _mongocrypt_ciphertext_t ciphertext; _mongocrypt_ciphertext_init(&ciphertext); mongocrypt_t *crypt = _mongocrypt_tester_mongocrypt(TESTER_MONGOCRYPT_DEFAULT); + text_search_expected_token_counts counts = {0}; + counts.suffixes = counts.prefixes = calculate_expected_nfix_tag_count(6, 100, 10); get_ciphertext_from_marking_json(tester, crypt, markingJSON, &ciphertext); validate_text_search_ciphertext(tester, @@ -1421,7 +1513,7 @@ static void test_mc_marking_to_ciphertext_fle2_text_search(_mongocrypt_tester_t "foobar", MONGOCRYPT_FLE2_PLACEHOLDER_TYPE_INSERT, 2, - TEXTSEARCH_SPEC_HAS_SUFFIX | TEXTSEARCH_SPEC_HAS_PREFIX); + counts); mongocrypt_destroy(crypt); _mongocrypt_ciphertext_cleanup(&ciphertext); @@ -1443,6 +1535,10 @@ static void test_mc_marking_to_ciphertext_fle2_text_search(_mongocrypt_tester_t _mongocrypt_ciphertext_t ciphertext; _mongocrypt_ciphertext_init(&ciphertext); mongocrypt_t *crypt = _mongocrypt_tester_mongocrypt(TESTER_MONGOCRYPT_DEFAULT); + text_search_expected_token_counts counts = {0}; + + // beta is 1 for empty strings + counts.prefixes = calculate_expected_nfix_tag_count(1, 100, 10); get_ciphertext_from_marking_json(tester, crypt, markingJSON, &ciphertext); validate_text_search_ciphertext(tester, @@ -1451,8 +1547,132 @@ static void test_mc_marking_to_ciphertext_fle2_text_search(_mongocrypt_tester_t "", MONGOCRYPT_FLE2_PLACEHOLDER_TYPE_INSERT, 2, - TEXTSEARCH_SPEC_HAS_PREFIX); + counts); + mongocrypt_destroy(crypt); + _mongocrypt_ciphertext_cleanup(&ciphertext); + } + // Test string cbc-padded length is less than lb (ie. substring/suffix/prefix tag sets will be + // empty) + { + const char *markingJSON = RAW_STRING({ + 't' : 1, + 'a' : 4, + 'v' : { + 'v' : "foobar", + 'casef' : false, + 'diacf' : false, + 'substr' : + {'mlen' : {'$numberInt' : '1000'}, 'ub' : {'$numberInt' : '100'}, 'lb' : {'$numberInt' : '20'}}, + 'prefix' : {'ub' : {'$numberInt' : '100'}, 'lb' : {'$numberInt' : '20'}}, + 'suffix' : {'ub' : {'$numberInt' : '100'}, 'lb' : {'$numberInt' : '20'}} + }, + 'cm' : {'$numberLong' : '2'} + }); + _mongocrypt_ciphertext_t ciphertext; + _mongocrypt_ciphertext_init(&ciphertext); + mongocrypt_t *crypt = _mongocrypt_tester_mongocrypt(TESTER_MONGOCRYPT_DEFAULT); + text_search_expected_token_counts counts = {0}; + + get_ciphertext_from_marking_json(tester, crypt, markingJSON, &ciphertext); + validate_text_search_ciphertext(tester, + &ciphertext, + crypt, + "foobar", + MONGOCRYPT_FLE2_PLACEHOLDER_TYPE_INSERT, + 2, + counts); + mongocrypt_destroy(crypt); + _mongocrypt_ciphertext_cleanup(&ciphertext); + } + + // Test string exceeds mlen + { + _mongocrypt_ciphertext_t ciphertext; + _mongocrypt_ciphertext_init(&ciphertext); + mongocrypt_t *crypt = _mongocrypt_tester_mongocrypt(TESTER_MONGOCRYPT_DEFAULT); + mongocrypt_status_t *status = mongocrypt_status_new(); + + ASSERT_FAILS_STATUS( + test_text_search_insert_marking_to_ciphertext(tester, crypt, &ciphertext, "foobar", 6, 3, status), + status, + "longer than the maximum length for substring indexing"); + + mongocrypt_status_destroy(status); + mongocrypt_destroy(crypt); + _mongocrypt_ciphertext_cleanup(&ciphertext); + } + + // Test string is not valid utf-8 + { + _mongocrypt_ciphertext_t ciphertext; + _mongocrypt_ciphertext_init(&ciphertext); + mongocrypt_t *crypt = _mongocrypt_tester_mongocrypt(TESTER_MONGOCRYPT_DEFAULT); + const char *expected_msg = "String passed in was not valid UTF-8"; + mongocrypt_status_t *status = mongocrypt_status_new(); + + // invalid utf-8 byte 0xff + ASSERT_FAILS_STATUS(test_text_search_insert_marking_to_ciphertext(tester, + crypt, + &ciphertext, + "foob\xffr", + 6, + INT32_MAX, + status), + status, + expected_msg); + _mongocrypt_status_reset(status); + // embedded null byte + ASSERT_FAILS_STATUS(test_text_search_insert_marking_to_ciphertext(tester, + crypt, + &ciphertext, + "foob\x00r", + 6, + INT32_MAX, + status), + status, + expected_msg); + _mongocrypt_status_reset(status); + // overlong encoding of 'a' (\x61) + ASSERT_FAILS_STATUS(test_text_search_insert_marking_to_ciphertext(tester, + crypt, + &ciphertext, + "foob\xE0\x81\xA1r", + 8, + INT32_MAX, + status), + status, + expected_msg); + + mongocrypt_status_destroy(status); + mongocrypt_destroy(crypt); + _mongocrypt_ciphertext_cleanup(&ciphertext); + } + + // test string is too large + { + _mongocrypt_ciphertext_t ciphertext; + _mongocrypt_ciphertext_init(&ciphertext); + mongocrypt_t *crypt = _mongocrypt_tester_mongocrypt(TESTER_MONGOCRYPT_DEFAULT); + const char *expected_msg = "String passed in was too long"; + mongocrypt_status_t *status = mongocrypt_status_new(); + + int len = (16 * 1024 * 1024) + 2; + char *large_str = bson_malloc(len); + memset(large_str, 'a', len); + large_str[len - 1] = '\0'; + + ASSERT_FAILS_STATUS(test_text_search_insert_marking_to_ciphertext(tester, + crypt, + &ciphertext, + large_str, + len - 1, + INT32_MAX, + status), + status, + expected_msg); + bson_free(large_str); + mongocrypt_status_destroy(status); mongocrypt_destroy(crypt); _mongocrypt_ciphertext_cleanup(&ciphertext); }