From 27cf0951591bcd4aa6b51cac6f5b200a7d27ed85 Mon Sep 17 00:00:00 2001 From: Luigi Dell'Aquila Date: Mon, 20 Jan 2025 16:02:20 +0100 Subject: [PATCH] ES|QL JOIN: non existing keys and duplicate keys (#730) --- joins/challenges/default.json | 19 ++- joins/challenges/large.json | 151 +++++++++++++++++++++ joins/challenges/small.json | 19 ++- joins/index-lookup_idx_100000_f10_x10.json | 21 +++ joins/operations/default.json | 31 +++-- joins/track.json | 18 +++ 6 files changed, 249 insertions(+), 10 deletions(-) create mode 100644 joins/challenges/large.json create mode 100644 joins/index-lookup_idx_100000_f10_x10.json diff --git a/joins/challenges/default.json b/joins/challenges/default.json index c1c0ba99..127809b9 100644 --- a/joins/challenges/default.json +++ b/joins/challenges/default.json @@ -114,6 +114,24 @@ }, {% endfor %} +{% for i in range(2, 7) %} + { + "operation": "esql_lookup_join_100k_to_{{idx_suffix[i]}}", + "tags": ["lookup", "join", "limit1000"], + "clients": 1, + "warmup-iterations": 10, + "iterations": 50 + }, +{% endfor %} + + { + "operation": "esql_lookup_join_100k_keys_x10_limit1000", + "tags": ["lookup", "join", "limit1000"], + "clients": 1, + "warmup-iterations": 5, + "iterations": 50 + }, + { "operation": "esql_lookup_join_1k_keys_where_no_match", "tags": ["lookup", "join"], @@ -130,6 +148,5 @@ "iterations": 50 } - ] } diff --git a/joins/challenges/large.json b/joins/challenges/large.json new file mode 100644 index 00000000..c271e4cf --- /dev/null +++ b/joins/challenges/large.json @@ -0,0 +1,151 @@ + { + "name": "esql-large", + "description": "Performance benchmarks for internal R&D on query languages. 
This is work in progress", + "default": false, + "schedule": [ + { + "operation": "delete-index", + "tags": ["setup"] + }, + { + "operation": { + "operation-type": "create-index", + "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { + {# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%} + {% if p_include_non_serverless_index_settings %} + "index.translog.flush_threshold_size": "4g", + {% endif %} + {%- endif -%}{# non-serverless-index-settings-marker-end #} + "index.codec": "best_compression", + "index.refresh_interval": "30s" + }{%- endif %} + }, + "tags": ["setup"] + }, + + { + "name": "check-cluster-health", + "operation": { + "operation-type": "cluster-health", + "index": "join_base_idx", + "request-params": { + "wait_for_status": "{{cluster_health | default('green')}}", + "wait_for_no_relocating_shards": "true" + }, + "retry-until-success": true + }, + "tags": ["setup"] + }, + + { + "operation": "index-base", + "clients": {{bulk_indexing_clients | default(8)}}, + "tags": ["setup"] + }, + { + "operation": "index-small-lookup-indices", + "clients": {{bulk_indexing_clients | default(8)}}, + "tags": ["setup"] + }, + { + "operation": "index-lookup-1m", + "clients": {{bulk_indexing_clients | default(8)}}, + "tags": ["setup"] + }, + { + "operation": "index-lookup-5m", + "clients": {{bulk_indexing_clients | default(8)}}, + "tags": ["setup"] + }, + { + "operation": "index-lookup-100m", + "clients": {{bulk_indexing_clients | default(8)}}, + "tags": ["setup"] + }, + { + "name": "refresh-after-index", + "operation": "refresh", + "tags": ["setup"] + }, + + +{% for i in range(idx_suffix|length) %} + { + "operation": "esql_lookup_join_{{idx_suffix[i]}}_keys_limit1", + "tags": ["lookup", "join", "limit1"], + "clients": 1, + "warmup-iterations": 10, + "iterations": 50 + }, + { + "operation": "esql_lookup_join_{{idx_suffix[i]}}_keys_limit1000", + "tags": ["lookup", "join", 
"limit1000"], + "clients": 1, + "warmup-iterations": 10, + "iterations": 50 + }, + { + "operation": "esql_lookup_join_{{idx_suffix[i]}}_keys_limit10000", + "tags": ["lookup", "join", "limit10000"], + "clients": 1, + "warmup-iterations": 10, + "iterations": 50 + }, + { + "operation": "esql_lookup_join_{{idx_suffix[i]}}_keys_keep_limit10000", + "tags": ["lookup", "join", "limit10000"], + "clients": 1, + "warmup-iterations": 10, + "iterations": 50 + }, + { + "operation": "esql_lookup_join_{{idx_suffix[i]}}_keys_sort_limit10000", + "tags": ["lookup", "join", "limit10000"], + "clients": 1, + "warmup-iterations": 5, + "iterations": 20 + }, + { + "operation": "esql_lookup_join_{{idx_suffix[i]}}_keys_where_no_match", + "tags": ["lookup", "join"], + "clients": 1, + "warmup-iterations": 5, + "iterations": 20 + }, + { + "operation": "esql_lookup_join_{{idx_suffix[i]}}_keys_where_limit1000", + "tags": ["lookup", "join", "limit1000"], + "clients": 1, + "warmup-iterations": 10, + "iterations": 50 + }, +{% endfor %} + +{% for i in range(2, 7) %} + { + "operation": "esql_lookup_join_100k_to_{{idx_suffix[i]}}", + "tags": ["lookup", "join", "limit1000"], + "clients": 1, + "warmup-iterations": 10, + "iterations": 50 + }, +{% endfor %} + + { + "operation": "esql_lookup_join_100k_keys_x10_limit1000", + "tags": ["lookup", "join", "limit1000"], + "clients": 1, + "warmup-iterations": 5, + "iterations": 50 + }, + + { + "operation": "esql_lookup_join_1k_100k_200k_500k", + "tags": ["lookup", "join", "limit1000"], + "clients": 1, + "warmup-iterations": 10, + "iterations": 50 + } + + ] + } diff --git a/joins/challenges/small.json b/joins/challenges/small.json index 8497bda4..626214b3 100644 --- a/joins/challenges/small.json +++ b/joins/challenges/small.json @@ -102,6 +102,24 @@ }, {% endfor %} +{% for i in range(2, 6) %} + { + "operation": "esql_lookup_join_100k_to_{{idx_suffix[i]}}", + "tags": ["lookup", "join", "limit1000"], + "clients": 1, + "warmup-iterations": 10, + "iterations": 50 + }, 
+{% endfor %} + + { + "operation": "esql_lookup_join_100k_keys_x10_limit1000", + "tags": ["lookup", "join", "limit1000"], + "clients": 1, + "warmup-iterations": 5, + "iterations": 50 + }, + { "operation": "esql_lookup_join_1k_keys_sort_limit10000", "tags": ["lookup", "join", "limit10000"], @@ -126,6 +144,5 @@ "iterations": 50 } - ] } diff --git a/joins/index-lookup_idx_100000_f10_x10.json b/joins/index-lookup_idx_100000_f10_x10.json new file mode 100644 index 00000000..3af4c5c9 --- /dev/null +++ b/joins/index-lookup_idx_100000_f10_x10.json @@ -0,0 +1,21 @@ +{% set p_include_non_serverless_index_settings = (include_non_serverless_index_settings | default(build_flavor != "serverless")) %} + +{ + "settings": { + "index.mode": "lookup", + "auto_expand_replicas": {{ auto_expand_replicas | default("0-all") | tojson }} + {# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%} + , "index.requests.cache.enable": false + {%- endif -%}{# non-serverless-index-settings-marker-end #} + }, + "mappings": { + "_source": { + "mode": {{ source_mode | default("stored") | tojson }} + }, + "properties": { + "key_100000": { + "type": "keyword" + } + } + } +} diff --git a/joins/operations/default.json b/joins/operations/default.json index 7e14b3f1..c1f7af1e 100644 --- a/joins/operations/default.json +++ b/joins/operations/default.json @@ -19,6 +19,13 @@ "bulk-size": {{bulk_size | default(10000)}}, "ingest-percentage": 100 }, + { + "name": "index-lookup-100k_x10", + "operation-type": "bulk", + "indices": ["lookup_idx_100000_f10_x10"], + "bulk-size": {{bulk_size | default(10000)}}, + "ingest-percentage": {{ingest_percentage | default(100)}} + }, { "name": "index-lookup-1m", "operation-type": "bulk", @@ -84,13 +91,21 @@ {% endfor %} +{% for i in range(2, 7) %} { - "name": "esql_lookup_join_1k_100k_200k_500k", - "operation-type": "esql", - "query": "FROM join_base_idx | lookup join lookup_idx_1000_f10 on key_1000 | rename 
lookup_keyword_0 as lk_1k | lookup join lookup_idx_100000_f10 on key_100000 | rename lookup_keyword_0 as lk_100k | lookup join lookup_idx_200000_f10 on key_200000 | rename lookup_keyword_0 as lk_200k | lookup join lookup_idx_500000_f10 on key_500000 | rename lookup_keyword_0 as lk_500k | keep id, key_1000, key_100000, key_200000, key_500000, lk_1k, lk_100k, lk_200k, lk_500k | limit 1000" - } - - - - + "name": "esql_lookup_join_100k_to_{{idx_suffix[i]}}", + "operation-type": "esql", + "query": "FROM join_base_idx | rename key_100000 as key_{{key_suffix[i]}}| lookup join lookup_idx_{{key_suffix[i]}}_f10 on key_{{key_suffix[i]}} | limit 1000" + }, +{% endfor %} + { + "name": "esql_lookup_join_100k_keys_x10_limit1000", + "operation-type": "esql", + "query": "FROM join_base_idx | lookup join lookup_idx_100000_f10_x10 on key_100000 | limit 1000" + }, + { + "name": "esql_lookup_join_1k_100k_200k_500k", + "operation-type": "esql", + "query": "FROM join_base_idx | lookup join lookup_idx_1000_f10 on key_1000 | rename lookup_keyword_0 as lk_1k | lookup join lookup_idx_100000_f10 on key_100000 | rename lookup_keyword_0 as lk_100k | lookup join lookup_idx_200000_f10 on key_200000 | rename lookup_keyword_0 as lk_200k | lookup join lookup_idx_500000_f10 on key_500000 | rename lookup_keyword_0 as lk_500k | keep id, key_1000, key_100000, key_200000, key_500000, lk_1k, lk_100k, lk_200k, lk_500k | limit 1000" + } diff --git a/joins/track.json b/joins/track.json index bd9498e3..bc27a452 100644 --- a/joins/track.json +++ b/joins/track.json @@ -16,6 +16,10 @@ "name": "lookup_idx_100000_f10", "body": "index-lookup_idx_100000_f10.json" }, + { + "name": "lookup_idx_100000_f10_x10", + "body": "index-lookup_idx_100000_f10_x10.json" + }, { "name": "lookup_idx_200000_f10", "body": "index-lookup_idx_200000_f10.json" @@ -66,6 +70,20 @@ } ] }, + { + "name": "lookup_idx_100000_f10_x10", + "base-url": "https://rally-tracks.elastic.co/joins", + "documents": [ + { + "target-index": 
"lookup_idx_100000_f10_x10", + "source-file": "lookup_idx_100000_f10_x10.json.bz2", + "#COMMENT": "Lookup index with 1M documents and 100k distinct keys (keywords, \"0\"..\"99999\")", + "document-count": 1000000, + "compressed-bytes": 22483295, + "uncompressed-bytes": 412777900 + } + ] + }, { "name": "lookup_idx_200000_f10", "base-url": "https://rally-tracks.elastic.co/joins",