From c9aea6d2d31538d99d35ee8a88b26695fbaef611 Mon Sep 17 00:00:00 2001 From: Pierre <105686771+pieturin@users.noreply.github.com> Date: Thu, 16 Jan 2025 15:38:15 -0800 Subject: [PATCH 01/24] Fix memory leak in forgotten node ping ext code path (#1574) When processing a cluster bus PING extension, there is a memory leak when adding a new key to the `nodes_black_list` dict. We now make sure to free the key `sds` if the dict did not take ownership of it. Signed-off-by: Pierre Turin --- src/cluster_legacy.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index a171fe3fda..3689e76d61 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -2936,6 +2936,10 @@ void clusterProcessPingExtensions(clusterMsg *hdr, clusterLink *link) { if (n && n != myself && !(nodeIsReplica(myself) && myself->replicaof == n)) { sds id = sdsnewlen(forgotten_node_ext->name, CLUSTER_NAMELEN); dictEntry *de = dictAddOrFind(server.cluster->nodes_black_list, id); + if (dictGetKey(de) != id) { + /* The dict did not take ownership of the id string, so we need to free it. */ + sdsfree(id); + } uint64_t expire = server.unixtime + ntohu64(forgotten_node_ext->ttl); dictSetUnsignedIntegerVal(de, expire); clusterDelNode(n); From 2d0b8e360847146fa43a8463773dde8125187fa7 Mon Sep 17 00:00:00 2001 From: Pierre <105686771+pieturin@users.noreply.github.com> Date: Thu, 16 Jan 2025 23:56:52 -0800 Subject: [PATCH 02/24] Update comments and log message in cluster_legacy.c (#1561) Update comments and log message in `cluster_legacy.c`. Follow-up from #1441. Signed-off-by: Pierre Turin Co-authored-by: Ping Xie Co-authored-by: Binbin --- src/cluster_legacy.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 3689e76d61..94d3532dfc 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3282,8 +3282,8 @@ int clusterProcessPacket(clusterLink *link) { /* Unable to retrieve the node's IP address from the connection. Without a * valid IP, the node becomes unusable in the cluster. This failure might be * due to the connection being closed. */ - serverLog(LL_NOTICE, "Closing link even though we received a MEET packet on it, " - "because the connection has an error"); + serverLog(LL_NOTICE, "Closing cluster link due to failure to retrieve IP from the connection, " + "possibly caused by a closed connection."); freeClusterLink(link); return 0; } @@ -3306,14 +3306,14 @@ int clusterProcessPacket(clusterLink *link) { clusterAddNode(node); clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); } else { - /* A second MEET packet was received on an existing link during the handshake process. - * This happens when the other node detects no inbound link, and re-sends a MEET packet - * before this node can respond with a PING. This MEET is a no-op. + /* A second MEET packet was received on an existing link during the handshake + * process. This happens when the other node detects no inbound link, and + * re-sends a MEET packet before this node can respond with a PING. + * This MEET is a no-op. * - * Note: Nodes in HANDSHAKE state are not fully "known" (random names), so the sender - * remains unidentified at this point. The MEET packet might be re-sent if the inbound - * connection is still unestablished by the next cron cycle. - */ + * Note: Nodes in HANDSHAKE state are not fully "known" (random names), so the + * sender remains unidentified at this point. 
+             * The MEET packet might be re-sent
+             * if the inbound connection is still unestablished by the next cron cycle. */
             debugServerAssert(link->inbound && nodeInHandshake(link->node));
         }
 
@@ -3322,16 +3322,19 @@ int clusterProcessPacket(clusterLink *link) {
          * of the message type. */
         clusterProcessGossipSection(hdr, link);
     } else if (sender->link && nodeExceedsHandshakeTimeout(sender, now)) {
-        /* The MEET packet is from a known node, after the handshake timeout, so the sender thinks that I do not
-         * know it.
-         * Free my outbound link to that node, triggering a reconnect and a PING over the new link.
-         * Once that node receives our PING, it should recognize the new connection as an inbound link from me.
-         * We should only free the outbound link if the node is known for more time than the handshake timeout,
-         * since during this time, the other side might still be trying to complete the handshake. */
+        /* The MEET packet is from a known node, after the handshake timeout, so the sender
+         * thinks that I do not know it.
+         * Free my outbound link to that node, triggering a reconnect and a PING over the
+         * new link.
+         * Once that node receives our PING, it should recognize the new connection as an
+         * inbound link from me. We should only free the outbound link if the node is known
+         * for more time than the handshake timeout, since during this time, the other side
+         * might still be trying to complete the handshake. */
 
         /* We should always receive a MEET packet on an inbound link. */
         serverAssert(link != sender->link);
-        serverLog(LL_NOTICE, "Freeing outbound link to node %.40s (%s) after receiving a MEET packet from this known node",
+        serverLog(LL_NOTICE, "Freeing outbound link to node %.40s (%s) after receiving a MEET packet "
+                             "from this known node",
                   sender->name, sender->human_nodename);
         freeClusterLink(sender->link);
     }

From b2e4155f54d335d7a6b0764ef963eaf5ef863f8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?=
Date: Sun, 19 Jan 2025 19:23:00 +0100
Subject: [PATCH 03/24] Lower latency-monitor-threshold in expire-cycle test
 case (#1584)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The test case checks for expire-cycle in LATENCY LATEST, but with the
new hash table, the expire cycle is too fast to be logged by the
latency monitor. Lower the latency monitor threshold to make it more
likely to be logged.

Fixes #1580

---------

Signed-off-by: Viktor Söderqvist
---
 tests/unit/latency-monitor.tcl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/latency-monitor.tcl b/tests/unit/latency-monitor.tcl
index e4f45389d7..cabb90d382 100644
--- a/tests/unit/latency-monitor.tcl
+++ b/tests/unit/latency-monitor.tcl
@@ -143,7 +143,7 @@ tags {"needs:debug"} {
 } ;# tag
 
 test {LATENCY of expire events are correctly collected} {
-    r config set latency-monitor-threshold 20
+    r config set latency-monitor-threshold 1
     r config set lazyfree-lazy-expire no
     r flushdb
     if {$::valgrind} {set count 100000} else {set count 1000000}

From 3032ccd48a17f9da78799dea0f8f976ee4312531 Mon Sep 17 00:00:00 2001
From: ranshid <88133677+ranshid@users.noreply.github.com>
Date: Mon, 20 Jan 2025 08:04:47 +0200
Subject: [PATCH 04/24] Change the shared format for dual channel replication
 logs (#1586)

Change the format of the dual-channel replication logs so that it does
not conflict with existing log formats, such as those produced by
modules.
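As a side note on how the new prefix works (an illustration, not part
of the patch): the macro in the `server.h` diff below relies on
compile-time concatenation of adjacent C string literals, so the prefix
adds no runtime cost.

```
/* Sketch of the expansion performed by the new macro. */
#define dualChannelServerLog(level, ...) serverLog(level, "Dual channel replication: " __VA_ARGS__)

/* A call such as:
 *     dualChannelServerLog(LL_NOTICE, "Psync established after rdb load");
 * expands to:
 *     serverLog(LL_NOTICE, "Dual channel replication: " "Psync established after rdb load");
 * and the two adjacent literals are merged into one string at compile time. */
```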
Fixes: https://github.com/valkey-io/valkey/issues/1509

Signed-off-by: Ran Shidlansik
---
 src/server.h                                   | 2 +-
 tests/integration/dual-channel-replication.tcl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/server.h b/src/server.h
index 29cbdc5c65..8f4fb6a6b4 100644
--- a/src/server.h
+++ b/src/server.h
@@ -3915,7 +3915,7 @@ void debugPauseProcess(void);
 /* dualChannelServerLog - Log messages related to dual-channel operations
  * This macro wraps the serverLog function, prepending "<Dual Channel>"
  * to the log message. */
-#define dualChannelServerLog(level, ...) serverLog(level, "<Dual Channel> " __VA_ARGS__)
+#define dualChannelServerLog(level, ...) serverLog(level, "Dual channel replication: " __VA_ARGS__)
 
 #define serverDebug(fmt, ...) printf("DEBUG %s:%d > " fmt "\n", __FILE__, __LINE__, __VA_ARGS__)
 #define serverDebugMark() printf("-- MARK %s:%d --\n", __FILE__, __LINE__)

diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl
index ec6783c1c9..dc0de5a734 100644
--- a/tests/integration/dual-channel-replication.tcl
+++ b/tests/integration/dual-channel-replication.tcl
@@ -487,7 +487,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
         }
         wait_for_value_to_propagate_to_replica $primary $replica "key1"
         # Confirm the occurrence of a race condition.
-        wait_for_log_messages -1 {"*<Dual Channel> Psync established after rdb load*"} 0 2000 1
+        wait_for_log_messages -1 {"*Dual channel replication: Psync established after rdb load*"} 0 2000 1
     }
 }

From dd92d079dcb15a984493def7f53746669e7cd693 Mon Sep 17 00:00:00 2001
From: ranshid <88133677+ranshid@users.noreply.github.com>
Date: Mon, 20 Jan 2025 20:28:45 +0200
Subject: [PATCH 05/24] Fix Protocol desync regression test (#1590)

The desync regression test was created as a regression test for the
following bug: if we embed a NUL terminator inside an inline/multi-bulk
message, we cannot use strchr to find the newline (\n) or carriage
return (\r) in the client query buffer. This can cause (for example) a
replica reading the primary stream to keep filling its query buffer
endlessly, consuming more and more memory.

To handle this risk, a check was added to verify that the inline and
multi-bulk sizes do not exceed 64K bytes in the query buffer, and a
test was added to verify this.

This PR introduces the following fixes to the desync regression test:
1. Fix the sent payload to flush 1024-byte blocks of 'A's instead of
   the literal string 'payload', which was sent by mistake.
2. Make sure that the server correctly terminates the connection with a
   protocol error right after the payload exceeds 64K, and not at some
   arbitrarily larger size.
3. Add another test input which also verifies a nested bulk with an
   embedded NUL terminator (this was not verified before).

fixes https://github.com/valkey-io/valkey/issues/1583

NOTE: Although it is possible to change the use of strchr to a "safer"
utility (e.g. memchr) which does not stop scanning at the first
occurrence of '\0', we would still like to protect against excessive
usage of the query buffer and also preserve the current behavior(?). We
will look into improving this in a follow-up issue.
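As a standalone illustration of the failure mode described above (a
minimal sketch, not valkey code): strchr treats the buffer as a C
string and stops at the first '\0', so a newline after an embedded NUL
is never found, while memchr scans an explicit length.

```
#include <stdio.h>
#include <string.h>

int main(void) {
    /* An inline/multi-bulk fragment with an embedded NUL before the newline. */
    char buf[] = {'*', '\0', 'A', '\n'};
    char *a = strchr(buf, '\n');              /* NULL: the scan stops at '\0' */
    char *b = memchr(buf, '\n', sizeof(buf)); /* finds the '\n' at index 3 */
    printf("strchr: %s, memchr: %s\n", a ? "found" : "not found", b ? "found" : "not found");
    return 0;
}
```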
--------- Signed-off-by: Ran Shidlansik Signed-off-by: ranshid <88133677+ranshid@users.noreply.github.com> --- tests/unit/protocol.tcl | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/tests/unit/protocol.tcl b/tests/unit/protocol.tcl index 0d05f6dee5..7153173e0d 100644 --- a/tests/unit/protocol.tcl +++ b/tests/unit/protocol.tcl @@ -69,7 +69,7 @@ start_server {tags {"protocol network"}} { } set c 0 - foreach seq [list "\x00" "*\x00" "$\x00"] { + foreach seq [list "\x00" "*\x00" "$\x00" "*1\r\n$\x00"] { incr c test "Protocol desync regression test #$c" { if {$::tls} { @@ -77,29 +77,31 @@ start_server {tags {"protocol network"}} { } else { set s [socket [srv 0 host] [srv 0 port]] } + fconfigure $s -blocking 0 puts -nonewline $s $seq + # PROTO_INLINE_MAX_SIZE is hardcoded in Valkey code to 64K. doing the same here + # since we would like to validate it is enforced. + set PROTO_INLINE_MAX_SIZE [expr 1024 * 64] set payload [string repeat A 1024]"\n" - set test_start [clock seconds] - set test_time_limit 30 - while 1 { + set payload_size 0 + while {$payload_size <= $PROTO_INLINE_MAX_SIZE} { if {[catch { - puts -nonewline $s payload - flush $s incr payload_size [string length $payload] + puts -nonewline $s $payload + flush $s }]} { - set retval [gets $s] - close $s + assert_morethan $payload_size $PROTO_INLINE_MAX_SIZE break - } else { - set elapsed [expr {[clock seconds]-$test_start}] - if {$elapsed > $test_time_limit} { - close $s - error "assertion:Valkey did not closed connection after protocol desync" - } } } - set retval - } {*Protocol error*} + + wait_for_condition 50 100 { + [string match {*Protocol error*} [gets $s]] + } else { + fail "expected connection to be closed on protocol error after sending $payload_size bytes" + } + close $s + } } unset c From 7fc958da52aab644daf55ba39cc9f0092b063fbd Mon Sep 17 00:00:00 2001 From: ranshid <88133677+ranshid@users.noreply.github.com> Date: Tue, 21 Jan 2025 08:57:01 +0200 Subject: [PATCH 06/24] fix test Protocol desync regression test with TLS (#1593) remove socket nonblocking and simplify the validation fixes https://github.com/valkey-io/valkey/issues/1592 Signed-off-by: ranshid --- tests/unit/protocol.tcl | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/unit/protocol.tcl b/tests/unit/protocol.tcl index 7153173e0d..5baba99f8b 100644 --- a/tests/unit/protocol.tcl +++ b/tests/unit/protocol.tcl @@ -77,7 +77,6 @@ start_server {tags {"protocol network"}} { } else { set s [socket [srv 0 host] [srv 0 port]] } - fconfigure $s -blocking 0 puts -nonewline $s $seq # PROTO_INLINE_MAX_SIZE is hardcoded in Valkey code to 64K. doing the same here # since we would like to validate it is enforced. @@ -94,12 +93,7 @@ start_server {tags {"protocol network"}} { break } } - - wait_for_condition 50 100 { - [string match {*Protocol error*} [gets $s]] - } else { - fail "expected connection to be closed on protocol error after sending $payload_size bytes" - } + assert_match {*Protocol error*} [gets $s]] close $s } } From 99ed308817286fd980a987f877ffa4df197f03a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Thu, 23 Jan 2025 11:26:54 +0100 Subject: [PATCH 07/24] Add cross-version test framework (and a simple test) (#1371) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This includes a way to run two versions of the server from the TCL test framework. It's a preparation to add more cross-version tests. 
The runtest script accepts a new parameter ./runtest --other-server-path path/to/valkey-server and a new tag "needs:other-server" for test cases and start_server. Tests with this tag are automatically skipped if `--other-server-path` is not provided. This PR adds it in a CI job with Valkey 7.2.7 by downloading a binary release. Fixes #76 --------- Signed-off-by: Viktor Söderqvist --- .github/workflows/ci.yml | 7 ++- tests/README.md | 29 +++++++----- .../integration/cross-version-replication.tcl | 34 ++++++++++++++ tests/support/server.tcl | 46 +++++++++++++------ tests/test_helper.tcl | 34 ++++++++++---- 5 files changed, 116 insertions(+), 34 deletions(-) create mode 100644 tests/integration/cross-version-replication.tcl diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index df3eaa1905..a1014faa99 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,10 +18,15 @@ jobs: # Fail build if there are warnings # build with TLS just for compilation coverage run: make -j4 all-with-unit-tests SERVER_CFLAGS='-Werror' BUILD_TLS=yes USE_FAST_FLOAT=yes + - name: install old server for compatibility testing + run: | + cd tests/tmp + wget https://download.valkey.io/releases/valkey-7.2.7-noble-x86_64.tar.gz + tar -xvf valkey-7.2.7-noble-x86_64.tar.gz - name: test run: | sudo apt-get install tcl8.6 tclx - ./runtest --verbose --tags -slow --dump-logs + ./runtest --verbose --tags -slow --dump-logs --other-server-path tests/tmp/valkey-7.2.7-noble-x86_64/bin/valkey-server - name: module api test run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs - name: validate commands.def up to date diff --git a/tests/README.md b/tests/README.md index 06756f394b..8cac6c19fa 100644 --- a/tests/README.md +++ b/tests/README.md @@ -24,16 +24,17 @@ There are additional runtime options that can further adjust the test suite to match different external server configurations. All options are listed by `./runtest --help`. The following table is just a subset of the options: -| Option | Impact | -| -------------------- | -------------------------------------------------------- | -| `--singledb` | Only use database 0, don't assume others are supported. | -| `--ignore-encoding` | Skip all checks for specific encoding. | -| `--ignore-digest` | Skip key value digest validations. | -| `--cluster-mode` | Run in strict Valkey Cluster compatibility mode. | -| `--large-memory` | Enables tests that consume more than 100MB | -| `--tls` | Run tests with TLS. See below. | -| `--tls-module` | Run tests with TLS, when TLS support is built as a module. | -| `--help` | Displays the full set of options. | +| Option | Impact | +| -------------------------- | -------------------------------------------------------- | +| `--singledb` | Only use database 0, don't assume others are supported. | +| `--ignore-encoding` | Skip all checks for specific encoding. | +| `--ignore-digest` | Skip key value digest validations. | +| `--cluster-mode` | Run in strict Valkey Cluster compatibility mode. | +| `--large-memory` | Enables tests that consume more than 100MB | +| `--tls` | Run tests with TLS. See below. | +| `--tls-module` | Run tests with TLS, when TLS support is built as a module. | +| `--other-server-path PATH` | Run compatibility tests with an other server executable. | +| `--help` | Displays the full set of options. 
| Running with TLS requires the following preparations: @@ -86,18 +87,22 @@ Tags can be applied in different context levels: The following compatibility and capability tags are currently used: | Tag | Indicates | -| --------------------- | --------- | +| ------------------------- | --------- | | `external:skip` | Not compatible with external servers. | +| `cluster` | Uses cluster with multiple nodes. | | `cluster:skip` | Not compatible with `--cluster-mode`. | | `large-memory` | Test that requires more than 100MB | +| `tls` | Uses TLS. | | `tls:skip` | Not compatible with `--tls`. | -| `needs:repl` | Uses replication and needs to be able to `SYNC` from server. | +| `ipv6` | Uses IPv6. | +| `needs:repl`, `repl` | Uses replication and needs to be able to `SYNC` from server. | | `needs:debug` | Uses the `DEBUG` command or other debugging focused commands (like `OBJECT REFCOUNT`). | | `needs:pfdebug` | Uses the `PFDEBUG` command. | | `needs:config-maxmemory` | Uses `CONFIG SET` to manipulate memory limit, eviction policies, etc. | | `needs:config-resetstat` | Uses `CONFIG RESETSTAT` to reset statistics. | | `needs:reset` | Uses `RESET` to reset client connections. | | `needs:save` | Uses `SAVE` or `BGSAVE` to create an RDB file. | +| `needs:other-server` | Requires `--other-server-path`. | When using an external server (`--host` and `--port`), filtering using the `external:skip` tags is done automatically. diff --git a/tests/integration/cross-version-replication.tcl b/tests/integration/cross-version-replication.tcl new file mode 100644 index 0000000000..6ec3d0db80 --- /dev/null +++ b/tests/integration/cross-version-replication.tcl @@ -0,0 +1,34 @@ +# Test replication from an older version primary. +# +# Use minimal.conf to make sure we don't use any configs not supported on the old version. + +proc server_name_and_version {} { + set server_name [s server_name] + if {$server_name eq {}} { + set server_name redis + } + set server_version [s "${server_name}_version"] + return "$server_name $server_version" +} + +start_server {tags {"repl needs:other-server external:skip"} start-other-server 1 config "minimal.conf"} { + set primary_name_and_version [server_name_and_version] + r set foo bar + + start_server {} { + test "Start replication from $primary_name_and_version" { + r replicaof [srv -1 host] [srv -1 port] + wait_for_sync r + # The key has been transferred. 
+ assert_equal bar [r get foo] + assert_equal up [s master_link_status] + } + + test "Replicate a SET command from $primary_name_and_version" { + r -1 set baz quux + wait_for_ofs_sync [srv 0 client] [srv -1 client] + set reply [r get baz] + assert_equal $reply quux + } + } +} diff --git a/tests/support/server.tcl b/tests/support/server.tcl index bd3135e9d9..c081e51338 100644 --- a/tests/support/server.tcl +++ b/tests/support/server.tcl @@ -2,9 +2,9 @@ set ::global_overrides {} set ::tags {} set ::valgrind_errors {} -proc start_server_error {config_file error} { +proc start_server_error {executable config_file error} { set err {} - append err "Can't start the Valkey server\n" + append err "Can't start $executable\n" append err "CONFIGURATION:\n" append err [exec cat $config_file] append err "\nERROR:\n" @@ -216,6 +216,11 @@ proc tags_acceptable {tags err_return} { return 0 } + if {$::other_server_path eq {} && [lsearch $tags "needs:other-server"] >= 0} { + set err "Other server path not provided" + return 0 + } + if {$::external && [lsearch $tags "external:skip"] >= 0} { set err "Not supported on external server" return 0 @@ -289,8 +294,8 @@ proc create_server_config_file {filename config config_lines} { close $fp } -proc spawn_server {config_file stdout stderr args} { - set cmd [list src/valkey-server $config_file] +proc spawn_server {executable config_file stdout stderr args} { + set cmd [list $executable $config_file] set args {*}$args if {[llength $args] > 0} { lappend cmd {*}$args @@ -319,7 +324,7 @@ proc spawn_server {config_file stdout stderr args} { } # Wait for actual startup, return 1 if port is busy, 0 otherwise -proc wait_server_started {config_file stdout stderr pid} { +proc wait_server_started {executable config_file stdout stderr pid} { set checkperiod 100; # Milliseconds set maxiter [expr {120*1000/$checkperiod}] ; # Wait up to 2 minutes. set port_busy 0 @@ -330,7 +335,7 @@ proc wait_server_started {config_file stdout stderr pid} { after $checkperiod incr maxiter -1 if {$maxiter == 0} { - start_server_error $config_file "No PID detected in log $stdout" + start_server_error $executable $config_file "No PID detected in log $stdout" puts "--- LOG CONTENT ---" puts [exec cat $stdout] puts "-------------------" @@ -347,7 +352,7 @@ proc wait_server_started {config_file stdout stderr pid} { # Configuration errors are unexpected, but it's helpful to fail fast # to give the feedback to the test runner. if {[regexp {FATAL CONFIG FILE ERROR} [exec cat $stderr]]} { - start_server_error $config_file "Configuration issue prevented Valkey startup" + start_server_error $executable $config_file "Configuration issue prevented Valkey startup" break } } @@ -441,6 +446,7 @@ proc start_server {options {code undefined}} { set args {} set keep_persistence false set config_lines {} + set start_other_server 0 # Wait for the server to be ready and check for server liveness/client connectivity before starting the test. 
set wait_ready true @@ -448,6 +454,9 @@ proc start_server {options {code undefined}} { # parse options foreach {option value} $options { switch $option { + "start-other-server" { + set start_other_server $value ; # boolean, 0 or 1 + } "config" { set baseconfig $value } @@ -498,6 +507,15 @@ proc start_server {options {code undefined}} { return } + if {$start_other_server} { + set executable $::other_server_path + if {![file executable $executable]} { + error "File not found or not executable: $executable" + } + } else { + set executable "src/valkey-server" + } + set data [split [exec cat "tests/assets/$baseconfig"] "\n"] set config {} if {$::tls} { @@ -588,15 +606,15 @@ proc start_server {options {code undefined}} { set server_started 0 while {$server_started == 0} { if {$::verbose} { - puts -nonewline "=== ($tags) Starting server ${::host}:${port} " + puts -nonewline "=== ($tags) Starting server on ${::host}:${port} " } send_data_packet $::test_server_fd "server-spawning" "port $port" - set pid [spawn_server $config_file $stdout $stderr $args] + set pid [spawn_server $executable $config_file $stdout $stderr $args] # check that the server actually started - set port_busy [wait_server_started $config_file $stdout $stderr $pid] + set port_busy [wait_server_started $executable $config_file $stdout $stderr $pid] # Sometimes we have to try a different port, even if we checked # for availability. Other test clients may grab the port before we @@ -634,7 +652,7 @@ proc start_server {options {code undefined}} { if {!$serverisup} { set err {} append err [exec cat $stdout] "\n" [exec cat $stderr] - start_server_error $config_file $err + start_server_error $executable $config_file $err return } set server_started 1 @@ -647,6 +665,7 @@ proc start_server {options {code undefined}} { if {[dict exists $config $port_param]} { set port [dict get $config $port_param] } # setup config dict + dict set srv "executable" $executable dict set srv "config_file" $config_file dict set srv "config" $config dict set srv "pid" $pid @@ -801,12 +820,13 @@ proc restart_server {level wait_ready rotate_logs {reconnect 1} {shutdown sigter close $fd } + set executable [dict get $srv "executable"] set config_file [dict get $srv "config_file"] - set pid [spawn_server $config_file $stdout $stderr {}] + set pid [spawn_server $executable $config_file $stdout $stderr {}] # check that the server actually started - wait_server_started $config_file $stdout $stderr $pid + wait_server_started $executable $config_file $stdout $stderr $pid # update the pid in the servers list dict set srv "pid" $pid diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 54bb923674..662449134a 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -69,6 +69,7 @@ set ::single_tests {} set ::run_solo_tests {} set ::skip_till "" set ::external 0; # If "1" this means, we are running against external instance +set ::other_server_path {}; # Used in upgrade and inter-version tests set ::file ""; # If set, runs only the tests in this comma separated list set ::curfile ""; # Hold the filename of the current suite set ::accurate 0; # If true runs fuzz tests with more iterations @@ -568,26 +569,36 @@ proc send_data_packet {fd status data {elapsed 0}} { proc print_help_screen {} { puts [join { - "--cluster Run the cluster tests, by default cluster tests run along with all tests." - "--moduleapi Run the module API tests, this option should only be used in runtest-moduleapi which will build the test module." 
+ # This is for terminal output, so assume default term width of 80 columns. -----| + "--cluster Run the cluster tests, by default cluster tests run along" + " with all tests." + "--moduleapi Run the module API tests, this option should only be used in" + " runtest-moduleapi which will build the test module." "--valgrind Run the test over valgrind." "--durable suppress test crashes and keep running" "--stack-logging Enable OSX leaks/malloc stack logging." "--accurate Run slow randomized tests for more iterations." "--quiet Don't show individual tests." - "--single Just execute the specified unit (see next option). This option can be repeated." + "--single Just execute the specified unit (see next option). This" + " option can be repeated." "--verbose Increases verbosity." "--list-tests List all the available test units." - "--only Just execute the specified test by test name or tests that match regexp (if starts with '/'). This option can be repeated." + "--only Just execute the specified test by test name or tests that" + " match regexp (if starts with '/'). This option" + " can be repeated." "--skip-till Skip all units until (and including) the specified one." "--skipunit Skip one unit." "--clients Number of test clients (default 16)." "--timeout Test timeout in seconds (default 20 min)." "--force-failure Force the execution of a test that always fails." "--config Extra config file argument." - "--skipfile Name of a file containing test names or regexp patterns (if starts with '/') that should be skipped (one per line). This option can be repeated." - "--skiptest Test name or regexp pattern (if starts with '/') to skip. This option can be repeated." - "--tags Run only tests having specified tags or not having '-' prefixed tags." + "--skipfile Name of a file containing test names or regexp patterns (if" + " starts with '/') that should be skipped (one per" + " line). This option can be repeated." + "--skiptest Test name or regexp pattern (if starts with '/') to" + " skip. This option can be repeated." + "--tags Run only tests having specified tags or not having '-'" + " prefixed tags." "--dont-clean Don't delete valkey log files after the run." "--dont-pre-clean Don't delete existing valkey log files before the run." "--no-latency Skip latency measurements and validation by some tests." @@ -595,13 +606,17 @@ proc print_help_screen {} { "--stop Blocks once the first test fails." "--loop Execute the specified set of tests forever." "--loops Execute the specified set of tests several times." - "--wait-server Wait after server is started (so that you can attach a debugger)." + "--wait-server Wait after server is started (so that you can attach a" + " debugger)." "--dump-logs Dump server log on test failure." "--io-threads Run tests with IO threads." "--tls Run tests in TLS mode." "--tls-module Run tests in TLS mode with Valkey module." "--host Run tests against an external host." "--port TCP port to use against external host." + "--other-server-path " + " Path to another version of valkey-server, used for inter-" + " version compatibility testing." "--baseport Initial port number for spawned valkey servers." "--portcount Port range for spawned valkey servers." "--singledb Use a single database, avoid SELECT." 
@@ -676,6 +691,9 @@ for {set j 0} {$j < [llength $argv]} {incr j} {
     } elseif {$opt eq {--port}} {
         set ::port $arg
         incr j
+    } elseif {$opt eq {--other-server-path}} {
+        set ::other_server_path $arg
+        incr j
     } elseif {$opt eq {--baseport}} {
         set ::baseport $arg
         incr j

From f2510783f9ed70172bd1a2ab9faa40afc99d54d5 Mon Sep 17 00:00:00 2001
From: Nadav Gigi <95503908+NadavGigi@users.noreply.github.com>
Date: Thu, 23 Jan 2025 13:17:20 +0200
Subject: [PATCH 08/24] Accelerate hash table iterator with value prefetching
 (#1568)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR builds upon the [previous entry prefetching
optimization](https://github.com/valkey-io/valkey/pull/1501) to further
enhance performance by implementing value prefetching for hashtable
iterators.

## Implementation

Modified `hashtableInitIterator` to accept a new flags parameter,
allowing control over iterator behavior. Implemented conditional value
prefetching within `hashtableNext` based on the new
`HASHTABLE_ITER_PREFETCH_VALUES` flag. When the flag is set,
`hashtableNext` now calls `prefetchBucketValues` at the start of each
new bucket, preemptively loading the values of filled entries into the
CPU cache. The actual prefetching of values is performed using
type-specific callback functions implemented in `server.c`:

- For `robj`, the `hashtableObjectPrefetchValue` callback is used to
prefetch the value if it is not embedded.

This implementation is specifically focused on main database iterations
at this stage. Applying it to hashtables that hold other object types
should not be problematic, but its performance benefits for those cases
will need to be proven through testing and benchmarking.

## Performance

### Setup:
- 64-core Graviton 3 Amazon EC2 instance.
- 50 million keys with different value sizes.
- Running the valkey server over a RAM file system.
- CRC checksum and compression off.

### Action
- save command.

### Results
The results regarding the duration of the "save" command were taken
from the "info all" command.
```
+--------------------+------------------+------------------+
| Prefetching        | Value size (byte)| Time (seconds)   |
+--------------------+------------------+------------------+
| No                 | 100              | 20.112279        |
| Yes                | 100              | 12.758519        |
| No                 | 40               | 16.945366        |
| Yes                | 40               | 10.902022        |
| No                 | 20               | 9.817000         |
| Yes                | 20               | 9.626821         |
| No                 | 10               | 9.71510          |
| Yes                | 10               | 9.510565         |
+--------------------+------------------+------------------+
```
The results largely align with our expectations, showing significant
improvements for larger values (100 bytes and 40 bytes) that are stored
outside the robj. For smaller values (20 bytes and 10 bytes) that are
embedded within the robj, we see almost no improvement, which is as
expected. However, the small improvement observed even for these
embedded values is somewhat surprising. Given that we are not actively
prefetching these embedded values, this minor performance gain was not
anticipated.
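For reference, a minimal usage sketch of the new iterator API (the
type, function, and flag names are taken from this patch; the loop body
is illustrative):

```
/* Iterate a hashtable safely while prefetching values; this mirrors how
 * rdbSaveDb() and rewriteAppendOnlyFileRio() use the API in this patch. */
hashtableIterator iter;
hashtableInitIterator(&iter, ht, HASHTABLE_ITER_SAFE | HASHTABLE_ITER_PREFETCH_VALUES);
void *entry;
while (hashtableNext(&iter, &entry)) {
    /* By the time an entry is returned, its value (e.g. a non-embedded
     * robj string) has likely been prefetched into the CPU cache. */
    process(entry); /* 'process' is a placeholder for the caller's work. */
}
hashtableResetIterator(&iter);
```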
perf record on save command **without** value prefetching: ``` --99.98%--rdbSaveDb | |--91.38%--rdbSaveKeyValuePair | | | |--42.72%--rdbSaveRawString | | | | | |--26.69%--rdbWriteRaw | | | | | | | --25.75%--rioFileWrite.lto_priv.0 | | | | | --15.41%--rdbSaveLen | | | | | |--7.58%--rdbWriteRaw | | | | | | | --7.08%--rioFileWrite.lto_priv.0 | | | | | | | --6.54%--_IO_fwrite | | | | | | | | --7.42%--rdbWriteRaw.constprop.1 | | | | | --7.18%--rioFileWrite.lto_priv.0 | | | | | --6.73%--_IO_fwrite | | | | | |--40.44%--rdbSaveStringObject | | | --7.62%--rdbSaveObjectType | | | --7.39%--rdbWriteRaw.constprop.1 | | | --7.04%--rioFileWrite.lto_priv.0 | | | --6.59%--_IO_fwrite | | --7.33%--hashtableNext.constprop.1 | --6.28%--prefetchNextBucketEntries.lto_priv.0 ``` perf record on save command **with** value prefetching: ``` rdbSaveRio | --99.93%--rdbSaveDb | |--79.81%--rdbSaveKeyValuePair | | | |--66.79%--rdbSaveRawString | | | | | |--42.31%--rdbWriteRaw | | | | | | | --40.74%--rioFileWrite.lto_priv.0 | | | | | --23.37%--rdbSaveLen | | | | | |--11.78%--rdbWriteRaw | | | | | | | --11.03%--rioFileWrite.lto_priv.0 | | | | | | | --10.30%--_IO_fwrite | | | | | | | | | --10.98%--rdbWriteRaw.constprop.1 | | | | | --10.44%--rioFileWrite.lto_priv.0 | | | | | --9.74%--_IO_fwrite | | | | | | |--11.33%--rdbSaveObjectType | | | | | --10.96%--rdbWriteRaw.constprop.1 | | | | | --10.51%--rioFileWrite.lto_priv.0 | | | | | --9.75%--_IO_fwrite | | | | | | --0.77%--rdbSaveStringObject | --18.39%--hashtableNext | |--10.04%--hashtableObjectPrefetchValue | --6.06%--prefetchNextBucketEntries ``` Conclusions: The prefetching strategy appears to be working as intended, shifting the performance bottleneck from data access to I/O operations. The significant reduction in rdbSaveStringObject time suggests that string objects(which are the values) are being accessed more efficiently. Signed-off-by: NadavGigi --- src/acl.c | 8 +-- src/aof.c | 4 +- src/cluster.c | 2 +- src/cluster_legacy.c | 2 +- src/db.c | 4 +- src/debug.c | 4 +- src/hashtable.c | 110 ++++++++++++++++++++++++-------------- src/hashtable.h | 13 +++-- src/kvstore.c | 22 +++----- src/kvstore.h | 5 +- src/latency.c | 4 +- src/module.c | 4 +- src/object.c | 8 +-- src/pubsub.c | 4 +- src/rdb.c | 6 +-- src/server.c | 29 ++++++---- src/sort.c | 2 +- src/t_hash.c | 2 +- src/t_set.c | 4 +- src/t_zset.c | 6 +-- src/unit/test_hashtable.c | 6 +-- src/unit/test_kvstore.c | 8 +-- 22 files changed, 147 insertions(+), 110 deletions(-) diff --git a/src/acl.c b/src/acl.c index 184fa54116..807ef744d2 100644 --- a/src/acl.c +++ b/src/acl.c @@ -655,7 +655,7 @@ void ACLChangeSelectorPerm(aclSelector *selector, struct serverCommand *cmd, int ACLResetFirstArgsForCommand(selector, id); if (cmd->subcommands_ht) { hashtableIterator iter; - hashtableInitSafeIterator(&iter, cmd->subcommands_ht); + hashtableInitIterator(&iter, cmd->subcommands_ht, HASHTABLE_ITER_SAFE); void *next; while (hashtableNext(&iter, &next)) { struct serverCommand *sub = next; @@ -673,7 +673,7 @@ void ACLChangeSelectorPerm(aclSelector *selector, struct serverCommand *cmd, int * found and the operation was performed. 
*/ void ACLSetSelectorCommandBitsForCategory(hashtable *commands, aclSelector *selector, uint64_t cflag, int value) { hashtableIterator iter; - hashtableInitIterator(&iter, commands); + hashtableInitIterator(&iter, commands, 0); void *next; while (hashtableNext(&iter, &next)) { struct serverCommand *cmd = next; @@ -741,7 +741,7 @@ void ACLCountCategoryBitsForCommands(hashtable *commands, unsigned long *off, uint64_t cflag) { hashtableIterator iter; - hashtableInitIterator(&iter, commands); + hashtableInitIterator(&iter, commands, 0); void *next; while (hashtableNext(&iter, &next)) { struct serverCommand *cmd = next; @@ -2765,7 +2765,7 @@ sds getAclErrorMessage(int acl_res, user *user, struct serverCommand *cmd, sds e /* ACL CAT category */ void aclCatWithFlags(client *c, hashtable *commands, uint64_t cflag, int *arraylen) { hashtableIterator iter; - hashtableInitIterator(&iter, commands); + hashtableInitIterator(&iter, commands, 0); void *next; while (hashtableNext(&iter, &next)) { struct serverCommand *cmd = next; diff --git a/src/aof.c b/src/aof.c index 024cdb2771..c6828d4b6e 100644 --- a/src/aof.c +++ b/src/aof.c @@ -1891,7 +1891,7 @@ int rewriteSortedSetObject(rio *r, robj *key, robj *o) { } else if (o->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = o->ptr; hashtableIterator iter; - hashtableInitIterator(&iter, zs->ht); + hashtableInitIterator(&iter, zs->ht, 0); void *next; while (hashtableNext(&iter, &next)) { zskiplistNode *node = next; @@ -2217,7 +2217,7 @@ int rewriteAppendOnlyFileRio(rio *aof) { if (rioWrite(aof, selectcmd, sizeof(selectcmd) - 1) == 0) goto werr; if (rioWriteBulkLongLong(aof, j) == 0) goto werr; - kvs_it = kvstoreIteratorInit(db->keys); + kvs_it = kvstoreIteratorInit(db->keys, HASHTABLE_ITER_SAFE | HASHTABLE_ITER_PREFETCH_VALUES); /* Iterate this DB writing every entry */ void *next; while (kvstoreIteratorNext(kvs_it, &next)) { diff --git a/src/cluster.c b/src/cluster.c index 309279e0be..cedcd9ecb1 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -910,7 +910,7 @@ void clusterCommand(client *c) { unsigned int numkeys = maxkeys > keys_in_slot ? keys_in_slot : maxkeys; addReplyArrayLen(c, numkeys); kvstoreHashtableIterator *kvs_di = NULL; - kvs_di = kvstoreGetHashtableIterator(server.db->keys, slot); + kvs_di = kvstoreGetHashtableIterator(server.db->keys, slot, 0); for (unsigned int i = 0; i < numkeys; i++) { void *next; serverAssert(kvstoreHashtableIteratorNext(kvs_di, &next)); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 94d3532dfc..5e976d3060 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -6347,7 +6347,7 @@ unsigned int delKeysInSlot(unsigned int hashslot) { kvstoreHashtableIterator *kvs_di = NULL; void *next; - kvs_di = kvstoreGetHashtableSafeIterator(server.db->keys, hashslot); + kvs_di = kvstoreGetHashtableIterator(server.db->keys, hashslot, HASHTABLE_ITER_SAFE); while (kvstoreHashtableIteratorNext(kvs_di, &next)) { robj *valkey = next; enterExecutionUnit(1, 0); diff --git a/src/db.c b/src/db.c index 535d493954..f2a000030b 100644 --- a/src/db.c +++ b/src/db.c @@ -895,9 +895,9 @@ void keysCommand(client *c) { kvstoreHashtableIterator *kvs_di = NULL; kvstoreIterator *kvs_it = NULL; if (pslot != -1) { - kvs_di = kvstoreGetHashtableSafeIterator(c->db->keys, pslot); + kvs_di = kvstoreGetHashtableIterator(c->db->keys, pslot, HASHTABLE_ITER_SAFE); } else { - kvs_it = kvstoreIteratorInit(c->db->keys); + kvs_it = kvstoreIteratorInit(c->db->keys, HASHTABLE_ITER_SAFE); } void *next; while (kvs_di ? 
kvstoreHashtableIteratorNext(kvs_di, &next) : kvstoreIteratorNext(kvs_it, &next)) { diff --git a/src/debug.c b/src/debug.c index 915e0c264d..b7f8df04fa 100644 --- a/src/debug.c +++ b/src/debug.c @@ -207,7 +207,7 @@ void xorObjectDigest(serverDb *db, robj *keyobj, unsigned char *digest, robj *o) } else if (o->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = o->ptr; hashtableIterator iter; - hashtableInitIterator(&iter, zs->ht); + hashtableInitIterator(&iter, zs->ht, 0); void *next; while (hashtableNext(&iter, &next)) { @@ -291,7 +291,7 @@ void computeDatasetDigest(unsigned char *final) { for (int j = 0; j < server.dbnum; j++) { serverDb *db = server.db + j; if (kvstoreSize(db->keys) == 0) continue; - kvstoreIterator *kvs_it = kvstoreIteratorInit(db->keys); + kvstoreIterator *kvs_it = kvstoreIteratorInit(db->keys, HASHTABLE_ITER_SAFE | HASHTABLE_ITER_PREFETCH_VALUES); /* hash the DB id, so the same dataset moved in a different DB will lead to a different digest */ aux = htonl(j); diff --git a/src/hashtable.c b/src/hashtable.c index 3f1eff19c1..23097eb246 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -300,7 +300,7 @@ typedef struct { long index; uint16_t pos_in_bucket; uint8_t table; - uint8_t safe; + uint8_t flags; union { /* Unsafe iterator fingerprint for misuse detection. */ uint64_t fingerprint; @@ -936,6 +936,7 @@ static inline incrementalFind *incrementalFindFromOpaque(hashtableIncrementalFin /* Prefetches all filled entries in the given bucket to optimize future memory access. */ static void prefetchBucketEntries(bucket *b) { + if (!b->presence) return; for (int pos = 0; pos < numBucketPositions(b); pos++) { if (isPositionFilled(b, pos)) { valkey_prefetch(b->entries[pos]); @@ -979,6 +980,26 @@ static void prefetchNextBucketEntries(iter *iter, bucket *current_bucket) { } } +/* Prefetches the values associated with the entries in the given bucket by + * calling the entryPrefetchValue callback in the hashtableType */ +static void prefetchBucketValues(bucket *b, hashtable *ht) { + if (!b->presence) return; + assert(ht->type->entryPrefetchValue != NULL); + for (int pos = 0; pos < numBucketPositions(b); pos++) { + if (isPositionFilled(b, pos)) { + ht->type->entryPrefetchValue(b->entries[pos]); + } + } +} + +static inline int isSafe(iter *iter) { + return (iter->flags & HASHTABLE_ITER_SAFE); +} + +static inline int shouldPrefetchValues(iter *iter) { + return (iter->flags & HASHTABLE_ITER_PREFETCH_VALUES); +} + /* --- API functions --- */ /* Allocates and initializes a new hashtable specified by the given type. */ @@ -1792,31 +1813,32 @@ size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction f /* --- Iterator --- */ -/* Initialize a iterator, that is not allowed to insert, delete or even lookup - * entries in the hashtable, because such operations can trigger incremental - * rehashing which moves entries around and confuses the iterator. Only - * hashtableNext is allowed. Each entry is returned exactly once. Call - * hashtableResetIterator when you are done. See also - * hashtableInitSafeIterator. */ -void hashtableInitIterator(hashtableIterator *iterator, hashtable *ht) { - iter *iter; - iter = iteratorFromOpaque(iterator); - iter->hashtable = ht; - iter->table = 0; - iter->index = -1; - iter->safe = 0; -} - -/* Initialize a safe iterator, which is allowed to modify the hash table while - * iterating. It pauses incremental rehashing to prevent entries from moving - * around. Call hashtableNext to fetch each entry. 
You must call - * hashtableResetIterator when you are done with a safe iterator. +/* Initialize an iterator for a hashtable. * - * It's allowed to insert and replace entries. Deleting entries is only allowed - * for the entry that was just returned by hashtableNext. Deleting other entries - * is possible, but doing so can cause internal fragmentation, so don't. + * The 'flags' argument can be used to tweak the behaviour. It's a bitwise-or + * (zero means no flags) of the following: + * + * - HASHTABLE_ITER_SAFE: Use a safe iterator that can handle + * modifications to the hash table while iterating. + * - HASHTABLE_ITER_PREFETCH_VALUES: Enables prefetching of entries values, + * which can improve performance in some scenarios. Because the hashtable is generic and + * doesn't care which object we store, the callback entryPrefetchValue must be set to help + * us prefetch necessary fields of specific object types stored in the hashtable. + * + * For a non-safe iterator (default, when HASHTABLE_ITER_SAFE is not set): + * It is not allowed to insert, delete or even lookup entries in the hashtable, + * because such operations can trigger incremental rehashing which moves entries + * around and confuses the iterator. Only hashtableNext is allowed. Each entry + * is returned exactly once. + * + * For a safe iterator (when HASHTABLE_ITER_SAFE is set): + * It is allowed to modify the hash table while iterating. It pauses incremental + * rehashing to prevent entries from moving around. It's allowed to insert and + * replace entries. Deleting entries is only allowed for the entry that was just + * returned by hashtableNext. Deleting other entries is possible, but doing so + * can cause internal fragmentation, so don't. * - * Guarantees: + * Guarantees for safe iterators: * * - Entries that are in the hash table for the entire iteration are returned * exactly once. @@ -1829,18 +1851,31 @@ void hashtableInitIterator(hashtableIterator *iterator, hashtable *ht) { * * - Entries that are inserted during the iteration may or may not be returned * by the iterator. + * + * Call hashtableNext to fetch each entry. You must call hashtableResetIterator + * when you are done with the iterator. */ -void hashtableInitSafeIterator(hashtableIterator *iterator, hashtable *ht) { - hashtableInitIterator(iterator, ht); +void hashtableInitIterator(hashtableIterator *iterator, hashtable *ht, uint8_t flags) { + iter *iter; + iter = iteratorFromOpaque(iterator); + iter->hashtable = ht; + iter->table = 0; + iter->index = -1; + iter->flags = flags; +} + +/* Reinitializes the iterator for the provided hashtable while + * preserving the flags from its previous initialization. */ +void hashtableReinitIterator(hashtableIterator *iterator, hashtable *ht) { iter *iter = iteratorFromOpaque(iterator); - iter->safe = 1; + hashtableInitIterator(iterator, ht, iter->flags); } /* Resets a stack-allocated iterator. */ void hashtableResetIterator(hashtableIterator *iterator) { iter *iter = iteratorFromOpaque(iterator); if (!(iter->index == -1 && iter->table == 0)) { - if (iter->safe) { + if (isSafe(iter)) { hashtableResumeRehashing(iter->hashtable); assert(iter->hashtable->pause_rehash >= 0); } else { @@ -1850,21 +1885,13 @@ void hashtableResetIterator(hashtableIterator *iterator) { } /* Allocates and initializes an iterator. 
*/ -hashtableIterator *hashtableCreateIterator(hashtable *ht) { +hashtableIterator *hashtableCreateIterator(hashtable *ht, uint8_t flags) { iter *iter = zmalloc(sizeof(*iter)); hashtableIterator *opaque = iteratorToOpaque(iter); - hashtableInitIterator(opaque, ht); + hashtableInitIterator(opaque, ht, flags); return opaque; } -/* Allocates and initializes a safe iterator. */ -hashtableIterator *hashtableCreateSafeIterator(hashtable *ht) { - hashtableIterator *iterator = hashtableCreateIterator(ht); - iter *iter = iteratorFromOpaque(iterator); - iter->safe = 1; - return iterator; -} - /* Resets and frees the memory of an allocated iterator, i.e. one created using * hashtableCreate(Safe)Iterator. */ void hashtableReleaseIterator(hashtableIterator *iterator) { @@ -1880,7 +1907,7 @@ int hashtableNext(hashtableIterator *iterator, void **elemptr) { while (1) { if (iter->index == -1 && iter->table == 0) { /* It's the first call to next. */ - if (iter->safe) { + if (isSafe(iter)) { hashtablePauseRehashing(iter->hashtable); iter->last_seen_size = iter->hashtable->used[iter->table]; } else { @@ -1907,7 +1934,7 @@ int hashtableNext(hashtableIterator *iterator, void **elemptr) { iter->bucket = getChildBucket(iter->bucket); } else if (iter->pos_in_bucket >= ENTRIES_PER_BUCKET) { /* Bucket index done. */ - if (iter->safe) { + if (isSafe(iter)) { /* If entries in this bucket chain have been deleted, * they've left empty spaces in the buckets. The chain is * not automatically compacted when rehashing is paused. If @@ -1936,6 +1963,9 @@ int hashtableNext(hashtableIterator *iterator, void **elemptr) { } bucket *b = iter->bucket; if (iter->pos_in_bucket == 0) { + if (shouldPrefetchValues(iter)) { + prefetchBucketValues(b, iter->hashtable); + } prefetchNextBucketEntries(iter, b); } if (!isPositionFilled(b, iter->pos_in_bucket)) { diff --git a/src/hashtable.h b/src/hashtable.h index 4291cf5a5d..67e8a139f8 100644 --- a/src/hashtable.h +++ b/src/hashtable.h @@ -60,6 +60,8 @@ typedef struct { /* Callback to free an entry when it's overwritten or deleted. * Optional. */ void (*entryDestructor)(void *entry); + /* Callback to prefetch the value associated with a hashtable entry. */ + void (*entryPrefetchValue)(const void *entry); /* Callback to control when resizing should be allowed. */ int (*resizeAllowed)(size_t moreMem, double usedRatio); /* Invoked at the start of rehashing. 
*/ @@ -91,6 +93,10 @@ typedef void (*hashtableScanFunction)(void *privdata, void *entry); /* Scan flags */ #define HASHTABLE_SCAN_EMIT_REF (1 << 0) +/* Iterator flags */ +#define HASHTABLE_ITER_SAFE (1 << 0) +#define HASHTABLE_ITER_PREFETCH_VALUES (1 << 1) + /* --- Prototypes --- */ /* Hash function (global seed) */ @@ -144,11 +150,10 @@ int hashtableIncrementalFindGetResult(hashtableIncrementalFindState *state, void /* Iteration & scan */ size_t hashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata); size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata, void *(*defragfn)(void *), int flags); -void hashtableInitIterator(hashtableIterator *iter, hashtable *ht); -void hashtableInitSafeIterator(hashtableIterator *iter, hashtable *ht); +void hashtableInitIterator(hashtableIterator *iter, hashtable *ht, uint8_t flags); +void hashtableReinitIterator(hashtableIterator *iterator, hashtable *ht); void hashtableResetIterator(hashtableIterator *iter); -hashtableIterator *hashtableCreateIterator(hashtable *ht); -hashtableIterator *hashtableCreateSafeIterator(hashtable *ht); +hashtableIterator *hashtableCreateIterator(hashtable *ht, uint8_t flags); void hashtableReleaseIterator(hashtableIterator *iter); int hashtableNext(hashtableIterator *iter, void **elemptr); diff --git a/src/kvstore.c b/src/kvstore.c index d6db4d3fe1..76bfb35d98 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -467,7 +467,7 @@ void kvstoreGetStats(kvstore *kvs, char *buf, size_t bufsize, int full) { hashtableStats *mainHtStats = NULL; hashtableStats *rehashHtStats = NULL; hashtable *ht; - kvstoreIterator *kvs_it = kvstoreIteratorInit(kvs); + kvstoreIterator *kvs_it = kvstoreIteratorInit(kvs, HASHTABLE_ITER_SAFE); while ((ht = kvstoreIteratorNextHashtable(kvs_it))) { hashtableStats *stats = hashtableGetStatsHt(ht, 0, full); if (!mainHtStats) { @@ -576,12 +576,12 @@ int kvstoreNumHashtables(kvstore *kvs) { /* Returns kvstore iterator that can be used to iterate through sub-hash tables. * * The caller should free the resulting kvs_it with kvstoreIteratorRelease. */ -kvstoreIterator *kvstoreIteratorInit(kvstore *kvs) { +kvstoreIterator *kvstoreIteratorInit(kvstore *kvs, uint8_t flags) { kvstoreIterator *kvs_it = zmalloc(sizeof(*kvs_it)); kvs_it->kvs = kvs; kvs_it->didx = -1; kvs_it->next_didx = kvstoreGetFirstNonEmptyHashtableIndex(kvs_it->kvs); /* Finds first non-empty hashtable index. */ - hashtableInitSafeIterator(&kvs_it->di, NULL); + hashtableInitIterator(&kvs_it->di, NULL, flags); return kvs_it; } @@ -625,7 +625,7 @@ int kvstoreIteratorNext(kvstoreIterator *kvs_it, void **next) { /* No current hashtable or reached the end of the hash table. 
*/ hashtable *ht = kvstoreIteratorNextHashtable(kvs_it); if (!ht) return 0; - hashtableInitSafeIterator(&kvs_it->di, ht); + hashtableReinitIterator(&kvs_it->di, ht); return hashtableNext(&kvs_it->di, next); } } @@ -691,23 +691,15 @@ unsigned long kvstoreHashtableSize(kvstore *kvs, int didx) { return hashtableSize(ht); } -kvstoreHashtableIterator *kvstoreGetHashtableIterator(kvstore *kvs, int didx) { +kvstoreHashtableIterator *kvstoreGetHashtableIterator(kvstore *kvs, int didx, uint8_t flags) { kvstoreHashtableIterator *kvs_di = zmalloc(sizeof(*kvs_di)); kvs_di->kvs = kvs; kvs_di->didx = didx; - hashtableInitIterator(&kvs_di->di, kvstoreGetHashtable(kvs, didx)); + hashtableInitIterator(&kvs_di->di, kvstoreGetHashtable(kvs, didx), flags); return kvs_di; } -kvstoreHashtableIterator *kvstoreGetHashtableSafeIterator(kvstore *kvs, int didx) { - kvstoreHashtableIterator *kvs_di = zmalloc(sizeof(*kvs_di)); - kvs_di->kvs = kvs; - kvs_di->didx = didx; - hashtableInitSafeIterator(&kvs_di->di, kvstoreGetHashtable(kvs, didx)); - return kvs_di; -} - -/* Free the kvs_di returned by kvstoreGetHashtableIterator and kvstoreGetHashtableSafeIterator. */ +/* Free the kvs_di returned by kvstoreGetHashtableIterator. */ void kvstoreReleaseHashtableIterator(kvstoreHashtableIterator *kvs_di) { /* The hashtable may be deleted during the iteration process, so here need to check for NULL. */ if (kvstoreGetHashtable(kvs_di->kvs, kvs_di->didx)) { diff --git a/src/kvstore.h b/src/kvstore.h index 1a8c74a6b9..d5db1a89aa 100644 --- a/src/kvstore.h +++ b/src/kvstore.h @@ -43,7 +43,7 @@ void kvstoreHashtableTrackMemUsage(hashtable *s, ssize_t delta); size_t kvstoreHashtableMetadataSize(void); /* kvstore iterator specific functions */ -kvstoreIterator *kvstoreIteratorInit(kvstore *kvs); +kvstoreIterator *kvstoreIteratorInit(kvstore *kvs, uint8_t flags); void kvstoreIteratorRelease(kvstoreIterator *kvs_it); int kvstoreIteratorGetCurrentHashtableIndex(kvstoreIterator *kvs_it); int kvstoreIteratorNext(kvstoreIterator *kvs_it, void **next); @@ -57,8 +57,7 @@ unsigned long kvstoreHashtableRehashingCount(kvstore *kvs); /* Specific hashtable access by hashtable-index */ unsigned long kvstoreHashtableSize(kvstore *kvs, int didx); -kvstoreHashtableIterator *kvstoreGetHashtableIterator(kvstore *kvs, int didx); -kvstoreHashtableIterator *kvstoreGetHashtableSafeIterator(kvstore *kvs, int didx); +kvstoreHashtableIterator *kvstoreGetHashtableIterator(kvstore *kvs, int didx, uint8_t flags); void kvstoreReleaseHashtableIterator(kvstoreHashtableIterator *kvs_id); int kvstoreHashtableIteratorNext(kvstoreHashtableIterator *kvs_di, void **next); int kvstoreHashtableRandomEntry(kvstore *kvs, int didx, void **found); diff --git a/src/latency.c b/src/latency.c index 2beb4859d1..fa448dac35 100644 --- a/src/latency.c +++ b/src/latency.c @@ -528,7 +528,7 @@ void fillCommandCDF(client *c, struct hdr_histogram *histogram) { * a per command cumulative distribution of latencies. 
*/ void latencyAllCommandsFillCDF(client *c, hashtable *commands, int *command_with_data) { hashtableIterator iter; - hashtableInitSafeIterator(&iter, commands); + hashtableInitIterator(&iter, commands, HASHTABLE_ITER_SAFE); void *next; while (hashtableNext(&iter, &next)) { struct serverCommand *cmd = next; @@ -565,7 +565,7 @@ void latencySpecificCommandsFillCDF(client *c) { if (cmd->subcommands_ht) { hashtableIterator iter; - hashtableInitSafeIterator(&iter, cmd->subcommands_ht); + hashtableInitIterator(&iter, cmd->subcommands_ht, HASHTABLE_ITER_SAFE); void *next; while (hashtableNext(&iter, &next)) { struct serverCommand *sub = next; diff --git a/src/module.c b/src/module.c index 75dcd81cd6..17ac4ddf02 100644 --- a/src/module.c +++ b/src/module.c @@ -12162,7 +12162,7 @@ int moduleFreeCommand(struct ValkeyModule *module, struct serverCommand *cmd) { if (cmd->subcommands_ht) { hashtableIterator iter; void *next; - hashtableInitSafeIterator(&iter, cmd->subcommands_ht); + hashtableInitIterator(&iter, cmd->subcommands_ht, HASHTABLE_ITER_SAFE); while (hashtableNext(&iter, &next)) { struct serverCommand *sub = next; if (moduleFreeCommand(module, sub) != C_OK) continue; @@ -12185,7 +12185,7 @@ void moduleUnregisterCommands(struct ValkeyModule *module) { /* Unregister all the commands registered by this module. */ hashtableIterator iter; void *next; - hashtableInitSafeIterator(&iter, server.commands); + hashtableInitIterator(&iter, server.commands, HASHTABLE_ITER_SAFE); while (hashtableNext(&iter, &next)) { struct serverCommand *cmd = next; if (moduleFreeCommand(module, cmd) != C_OK) continue; diff --git a/src/object.c b/src/object.c index b8200dd815..94c2985edb 100644 --- a/src/object.c +++ b/src/object.c @@ -630,7 +630,7 @@ void dismissSetObject(robj *o, size_t size_hint) { * page size, and there's a high chance we'll actually dismiss something. */ if (size_hint / hashtableSize(ht) >= server.page_size) { hashtableIterator iter; - hashtableInitIterator(&iter, ht); + hashtableInitIterator(&iter, ht, 0); void *next; while (hashtableNext(&iter, &next)) { sds item = next; @@ -682,7 +682,7 @@ void dismissHashObject(robj *o, size_t size_hint) { * a page size, and there's a high chance we'll actually dismiss something. 
*/ if (size_hint / hashtableSize(ht) >= server.page_size) { hashtableIterator iter; - hashtableInitIterator(&iter, ht); + hashtableInitIterator(&iter, ht, 0); void *next; while (hashtableNext(&iter, &next)) { dismissHashTypeEntry(next); @@ -1156,7 +1156,7 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { asize = sizeof(*o) + hashtableMemUsage(ht); hashtableIterator iter; - hashtableInitIterator(&iter, ht); + hashtableInitIterator(&iter, ht, 0); void *next; while (hashtableNext(&iter, &next) && samples < sample_size) { sds element = next; @@ -1197,7 +1197,7 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { } else if (o->encoding == OBJ_ENCODING_HASHTABLE) { hashtable *ht = o->ptr; hashtableIterator iter; - hashtableInitIterator(&iter, ht); + hashtableInitIterator(&iter, ht, 0); void *next; asize = sizeof(*o) + hashtableMemUsage(ht); diff --git a/src/pubsub.c b/src/pubsub.c index 27b5611788..be6c739e98 100644 --- a/src/pubsub.c +++ b/src/pubsub.c @@ -366,7 +366,7 @@ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype ty void pubsubShardUnsubscribeAllChannelsInSlot(unsigned int slot) { if (!kvstoreHashtableSize(server.pubsubshard_channels, slot)) return; - kvstoreHashtableIterator *kvs_di = kvstoreGetHashtableSafeIterator(server.pubsubshard_channels, slot); + kvstoreHashtableIterator *kvs_di = kvstoreGetHashtableIterator(server.pubsubshard_channels, slot, HASHTABLE_ITER_SAFE); void *element; while (kvstoreHashtableIteratorNext(kvs_di, &element)) { dict *clients = element; @@ -730,7 +730,7 @@ void channelList(client *c, sds pat, kvstore *pubsub_channels) { replylen = addReplyDeferredLen(c); for (unsigned int i = 0; i < slot_cnt; i++) { if (!kvstoreHashtableSize(pubsub_channels, i)) continue; - kvstoreHashtableIterator *kvs_di = kvstoreGetHashtableIterator(pubsub_channels, i); + kvstoreHashtableIterator *kvs_di = kvstoreGetHashtableIterator(pubsub_channels, i, 0); void *next; while (kvstoreHashtableIteratorNext(kvs_di, &next)) { dict *clients = next; diff --git a/src/rdb.c b/src/rdb.c index 0bb5d7d45d..6653e99c3a 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -887,7 +887,7 @@ ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) { nwritten += n; hashtableIterator iterator; - hashtableInitIterator(&iterator, set); + hashtableInitIterator(&iterator, set, 0); void *next; while (hashtableNext(&iterator, &next)) { sds ele = next; @@ -959,7 +959,7 @@ ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) { nwritten += n; hashtableIterator iter; - hashtableInitIterator(&iter, ht); + hashtableInitIterator(&iter, ht, 0); void *next; while (hashtableNext(&iter, &next)) { sds field = hashTypeEntryGetField(next); @@ -1349,7 +1349,7 @@ ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter) { if ((res = rdbSaveLen(rdb, expires_size)) < 0) goto werr; written += res; - kvs_it = kvstoreIteratorInit(db->keys); + kvs_it = kvstoreIteratorInit(db->keys, HASHTABLE_ITER_SAFE | HASHTABLE_ITER_PREFETCH_VALUES); int last_slot = -1; /* Iterate this DB writing every entry */ void *next; diff --git a/src/server.c b/src/server.c index 144841eff9..ab06a24470 100644 --- a/src/server.c +++ b/src/server.c @@ -577,6 +577,15 @@ const void *hashtableObjectGetKey(const void *entry) { return objectGetKey(entry); } +/* Prefetch the value if it's not embedded. 
*/ +void hashtableObjectPrefetchValue(const void *entry) { + const robj *obj = entry; + if (obj->encoding != OBJ_ENCODING_EMBSTR && + obj->encoding != OBJ_ENCODING_INT) { + valkey_prefetch(obj->ptr); + } +} + int hashtableObjKeyCompare(const void *key1, const void *key2) { const robj *o1 = key1, *o2 = key2; return hashtableSdsKeyCompare(o1->ptr, o2->ptr); @@ -589,6 +598,7 @@ void hashtableObjectDestructor(void *val) { /* Kvstore->keys, keys are sds strings, vals are Objects. */ hashtableType kvstoreKeysHashtableType = { + .entryPrefetchValue = hashtableObjectPrefetchValue, .entryGetKey = hashtableObjectGetKey, .hashFunction = hashtableSdsHash, .keyCompare = hashtableSdsKeyCompare, @@ -602,6 +612,7 @@ hashtableType kvstoreKeysHashtableType = { /* Kvstore->expires */ hashtableType kvstoreExpiresHashtableType = { + .entryPrefetchValue = hashtableObjectPrefetchValue, .entryGetKey = hashtableObjectGetKey, .hashFunction = hashtableSdsHash, .keyCompare = hashtableSdsKeyCompare, @@ -3205,7 +3216,7 @@ void populateCommandTable(void) { void resetCommandTableStats(hashtable *commands) { hashtableIterator iter; void *next; - hashtableInitSafeIterator(&iter, commands); + hashtableInitIterator(&iter, commands, HASHTABLE_ITER_SAFE); while (hashtableNext(&iter, &next)) { struct serverCommand *c = next; c->microseconds = 0; @@ -4988,7 +4999,7 @@ void addReplyCommandSubCommands(client *c, void *next; hashtableIterator iter; - hashtableInitSafeIterator(&iter, cmd->subcommands_ht); + hashtableInitIterator(&iter, cmd->subcommands_ht, HASHTABLE_ITER_SAFE); while (hashtableNext(&iter, &next)) { struct serverCommand *sub = next; if (use_map) addReplyBulkCBuffer(c, sub->fullname, sdslen(sub->fullname)); @@ -5150,7 +5161,7 @@ void commandCommand(client *c) { hashtableIterator iter; void *next; addReplyArrayLen(c, hashtableSize(server.commands)); - hashtableInitIterator(&iter, server.commands); + hashtableInitIterator(&iter, server.commands, 0); while (hashtableNext(&iter, &next)) { struct serverCommand *cmd = next; addReplyCommandInfo(c, cmd); @@ -5209,7 +5220,7 @@ int shouldFilterFromCommandList(struct serverCommand *cmd, commandListFilter *fi void commandListWithFilter(client *c, hashtable *commands, commandListFilter filter, int *numcmds) { hashtableIterator iter; void *next; - hashtableInitIterator(&iter, commands); + hashtableInitIterator(&iter, commands, 0); while (hashtableNext(&iter, &next)) { struct serverCommand *cmd = next; if (!shouldFilterFromCommandList(cmd, &filter)) { @@ -5228,7 +5239,7 @@ void commandListWithFilter(client *c, hashtable *commands, commandListFilter fil void commandListWithoutFilter(client *c, hashtable *commands, int *numcmds) { hashtableIterator iter; void *next; - hashtableInitIterator(&iter, commands); + hashtableInitIterator(&iter, commands, 0); while (hashtableNext(&iter, &next)) { struct serverCommand *cmd = next; addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname)); @@ -5290,7 +5301,7 @@ void commandInfoCommand(client *c) { hashtableIterator iter; void *next; addReplyArrayLen(c, hashtableSize(server.commands)); - hashtableInitIterator(&iter, server.commands); + hashtableInitIterator(&iter, server.commands, 0); while (hashtableNext(&iter, &next)) { struct serverCommand *cmd = next; addReplyCommandInfo(c, cmd); @@ -5312,7 +5323,7 @@ void commandDocsCommand(client *c) { hashtableIterator iter; void *next; addReplyMapLen(c, hashtableSize(server.commands)); - hashtableInitIterator(&iter, server.commands); + hashtableInitIterator(&iter, server.commands, 0); while 
(hashtableNext(&iter, &next)) { struct serverCommand *cmd = next; addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname)); @@ -5441,7 +5452,7 @@ const char *getSafeInfoString(const char *s, size_t len, char **tmp) { sds genValkeyInfoStringCommandStats(sds info, hashtable *commands) { hashtableIterator iter; void *next; - hashtableInitSafeIterator(&iter, commands); + hashtableInitIterator(&iter, commands, HASHTABLE_ITER_SAFE); while (hashtableNext(&iter, &next)) { struct serverCommand *c = next; char *tmpsafe; @@ -5478,7 +5489,7 @@ sds genValkeyInfoStringACLStats(sds info) { sds genValkeyInfoStringLatencyStats(sds info, hashtable *commands) { hashtableIterator iter; void *next; - hashtableInitSafeIterator(&iter, commands); + hashtableInitIterator(&iter, commands, HASHTABLE_ITER_SAFE); while (hashtableNext(&iter, &next)) { struct serverCommand *c = next; char *tmpsafe; diff --git a/src/sort.c b/src/sort.c index 7af96141e8..754ebef4a2 100644 --- a/src/sort.c +++ b/src/sort.c @@ -447,7 +447,7 @@ void sortCommandGeneric(client *c, int readonly) { } else if (sortval->type == OBJ_ZSET) { hashtable *ht = ((zset *)sortval->ptr)->ht; hashtableIterator iter; - hashtableInitIterator(&iter, ht); + hashtableInitIterator(&iter, ht, 0); void *next; while (hashtableNext(&iter, &next)) { zskiplistNode *node = next; diff --git a/src/t_hash.c b/src/t_hash.c index b6e6457bb6..b347ecf31f 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -382,7 +382,7 @@ void hashTypeInitIterator(robj *subject, hashTypeIterator *hi) { hi->fptr = NULL; hi->vptr = NULL; } else if (hi->encoding == OBJ_ENCODING_HASHTABLE) { - hashtableInitIterator(&hi->iter, subject->ptr); + hashtableInitIterator(&hi->iter, subject->ptr, 0); } else { serverPanic("Unknown hash encoding"); } diff --git a/src/t_set.c b/src/t_set.c index 4279baf82f..a69345de4f 100644 --- a/src/t_set.c +++ b/src/t_set.c @@ -317,7 +317,7 @@ setTypeIterator *setTypeInitIterator(robj *subject) { si->subject = subject; si->encoding = subject->encoding; if (si->encoding == OBJ_ENCODING_HASHTABLE) { - si->hashtable_iterator = hashtableCreateIterator(subject->ptr); + si->hashtable_iterator = hashtableCreateIterator(subject->ptr, 0); } else if (si->encoding == OBJ_ENCODING_INTSET) { si->ii = 0; } else if (si->encoding == OBJ_ENCODING_LISTPACK) { @@ -1179,7 +1179,7 @@ void srandmemberWithCountCommand(client *c) { /* CASE 3 & 4: send the result to the user. 
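 * (Here count equals the set cardinality, as asserted below, so a plain
 * iteration over the hashtable emits every member exactly once.)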
*/ { hashtableIterator iter; - hashtableInitIterator(&iter, ht); + hashtableInitIterator(&iter, ht, 0); addReplyArrayLen(c, count); serverAssert(count == hashtableSize(ht)); diff --git a/src/t_zset.c b/src/t_zset.c index 77c96613b7..2444f3ecd0 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -2092,7 +2092,7 @@ static void zuiInitIterator(zsetopsrc *op) { it->is.is = op->subject->ptr; it->is.ii = 0; } else if (op->encoding == OBJ_ENCODING_HASHTABLE) { - it->ht.iter = hashtableCreateIterator(op->subject->ptr); + it->ht.iter = hashtableCreateIterator(op->subject->ptr, 0); } else if (op->encoding == OBJ_ENCODING_LISTPACK) { it->lp.lp = op->subject->ptr; it->lp.p = lpFirst(it->lp.lp); @@ -2349,7 +2349,7 @@ static size_t zsetHashtableGetMaxElementLength(hashtable *ht, size_t *totallen) size_t maxelelen = 0; hashtableIterator iter; - hashtableInitIterator(&iter, ht); + hashtableInitIterator(&iter, ht, 0); void *next; while (hashtableNext(&iter, &next)) { zskiplistNode *node = next; @@ -2749,7 +2749,7 @@ static void zunionInterDiffGenericCommand(client *c, robj *dstkey, int numkeysIn /* Step 2: Create the skiplist using final score ordering */ hashtableIterator iter; - hashtableInitIterator(&iter, dstzset->ht); + hashtableInitIterator(&iter, dstzset->ht, 0); void *next; while (hashtableNext(&iter, &next)) { diff --git a/src/unit/test_hashtable.c b/src/unit/test_hashtable.c index 689440e43d..71a7dde841 100644 --- a/src/unit/test_hashtable.c +++ b/src/unit/test_hashtable.c @@ -547,7 +547,7 @@ int test_iterator(int argc, char **argv, int flags) { size_t num_returned = 0; hashtableIterator iter; void *next; - hashtableInitIterator(&iter, ht); + hashtableInitIterator(&iter, ht, 0); while (hashtableNext(&iter, &next)) { uint8_t *entry = next; num_returned++; @@ -592,7 +592,7 @@ int test_safe_iterator(int argc, char **argv, int flags) { size_t num_returned = 0; hashtableIterator iter; void *next; - hashtableInitSafeIterator(&iter, ht); + hashtableInitIterator(&iter, ht, HASHTABLE_ITER_SAFE); while (hashtableNext(&iter, &next)) { uint8_t *entry = next; size_t index = entry - entry_counts; @@ -657,7 +657,7 @@ int test_compact_bucket_chain(int argc, char **argv, int flags) { size_t num_chained_buckets = hashtableChainedBuckets(ht, 0); size_t num_returned = 0; hashtableIterator iter; - hashtableInitSafeIterator(&iter, ht); + hashtableInitIterator(&iter, ht, HASHTABLE_ITER_SAFE); void *entry; while (hashtableNext(&iter, &entry)) { /* As long as the iterator is still returning entries from the same diff --git a/src/unit/test_kvstore.c b/src/unit/test_kvstore.c index d4cc91d6d8..55b311c4ba 100644 --- a/src/unit/test_kvstore.c +++ b/src/unit/test_kvstore.c @@ -77,7 +77,7 @@ int test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable(int argc, char **arg TEST_ASSERT(kvstoreHashtableAdd(kvs1, didx, stringFromInt(i))); } - kvs_it = kvstoreIteratorInit(kvs1); + kvs_it = kvstoreIteratorInit(kvs1, HASHTABLE_ITER_SAFE); while (kvstoreIteratorNext(kvs_it, &key)) { curr_slot = kvstoreIteratorGetCurrentHashtableIndex(kvs_it); TEST_ASSERT(kvstoreHashtableDelete(kvs1, curr_slot, key)); @@ -110,7 +110,7 @@ int test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable(int argc, char **argv, TEST_ASSERT(kvstoreHashtableAdd(kvs2, didx, stringFromInt(i))); } - kvs_it = kvstoreIteratorInit(kvs2); + kvs_it = kvstoreIteratorInit(kvs2, HASHTABLE_ITER_SAFE); while (kvstoreIteratorNext(kvs_it, &key)) { curr_slot = kvstoreIteratorGetCurrentHashtableIndex(kvs_it); TEST_ASSERT(kvstoreHashtableDelete(kvs2, curr_slot, key)); @@ -146,7 +146,7 @@ 
int test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable(int argc, c
         TEST_ASSERT(kvstoreHashtableAdd(kvs1, didx, stringFromInt(i)));
     }

-    kvs_di = kvstoreGetHashtableSafeIterator(kvs1, didx);
+    kvs_di = kvstoreGetHashtableIterator(kvs1, didx, HASHTABLE_ITER_SAFE);
     while (kvstoreHashtableIteratorNext(kvs_di, &key)) {
         TEST_ASSERT(kvstoreHashtableDelete(kvs1, didx, key));
     }
@@ -177,7 +177,7 @@ int test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable(int argc, cha
         TEST_ASSERT(kvstoreHashtableAdd(kvs2, didx, stringFromInt(i)));
     }

-    kvs_di = kvstoreGetHashtableSafeIterator(kvs2, didx);
+    kvs_di = kvstoreGetHashtableIterator(kvs2, didx, HASHTABLE_ITER_SAFE);
     while (kvstoreHashtableIteratorNext(kvs_di, &key)) {
         TEST_ASSERT(kvstoreHashtableDelete(kvs2, didx, key));
     }

From 3f21705a6c51d7f412f80dd7aabd83dac45b2493 Mon Sep 17 00:00:00 2001
From: "zhaozhao.zz"
Date: Fri, 24 Jan 2025 11:41:40 +0800
Subject: [PATCH 09/24] Feature COMMANDLOG to record slow execution and large
 request/reply (#1294)

As discussed in PR #336.

We have different types of resources: CPU, memory, network, etc. The
`slowlog` can only record commands that consume a lot of CPU during the
processing phase (it does not include time spent reading from or writing
to the network); it cannot record commands that consume a lot of memory
or network bandwidth. For example:

1. Running `SET key value` with a 10-megabyte value is not recorded in
   the slowlog, since processing the SET command only inserts the value's
   pointer into the db dict. But the command consumes a large amount of
   query-buffer memory and inbound network bandwidth: at just 1000 TPS it
   generates 10 GB/s of network traffic.
2. Running `GET key` where the key's value is 10 megabytes long consumes
   a large amount of output-buffer memory and outbound network bandwidth.

This PR introduces a new command `COMMANDLOG` to log commands that consume
significant network bandwidth, covering both input and output. Users can
retrieve the results using `COMMANDLOG GET <count> large-request` and
`COMMANDLOG GET <count> large-reply`. All subcommands of `COMMANDLOG` are:

* `COMMANDLOG HELP`
* `COMMANDLOG GET <count> <type>`
* `COMMANDLOG LEN <type>`
* `COMMANDLOG RESET <type>`

The slowlog is also incorporated into the commandlog. For each of these
three types, additional configs have been added for control:

* `commandlog-request-larger-than` and `commandlog-large-request-max-len`
  set the threshold for large requests (in bytes) and the maximum number
  of commands that can be recorded.
* `commandlog-reply-larger-than` and `commandlog-large-reply-max-len`
  set the threshold for large replies (in bytes) and the maximum number
  of commands that can be recorded.
* `commandlog-execution-slower-than` and `commandlog-slow-execution-max-len`
  set the threshold for slow executions (in microseconds) and the maximum
  number of commands that can be recorded.
* Additionally, `slowlog-log-slower-than` and `slowlog-max-len` are now
  aliases for `commandlog-execution-slower-than` and
  `commandlog-slow-execution-max-len`, respectively.
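For illustration, a session exercising the large-request log might look
roughly like this (a sketch: the entry id, timestamp, client address, and
byte counts are made-up values; the six fields per entry are id, timestamp,
value, argument vector, client address, and client name, as produced by
commandlogGetReply() below):

    127.0.0.1:6379> CONFIG SET commandlog-request-larger-than 1024
    OK
    127.0.0.1:6379> SET key <a value larger than 1024 bytes>
    OK
    127.0.0.1:6379> COMMANDLOG GET 1 large-request
    1) 1) (integer) 0
       2) (integer) 1737705600
       3) (integer) 1048610
       4) 1) "SET"
          2) "key"
          3) "xxxx... (1048448 more bytes)"
       5) "127.0.0.1:51234"
       6) ""

For the slow type, the value field is the execution time in microseconds,
so `COMMANDLOG GET 10 slow` returns the same information as `SLOWLOG GET 10`.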
--------- Signed-off-by: zhaozhao.zz Co-authored-by: Madelyn Olson Co-authored-by: Ping Xie --- cmake/Modules/SourceFiles.cmake | 2 +- src/Makefile | 2 +- src/blocked.c | 16 +- src/commandlog.c | 265 +++++++++++++++++++++ src/{slowlog.h => commandlog.h} | 29 ++- src/commands.def | 154 +++++++++++- src/commands/commandlog-get.json | 85 +++++++ src/commands/commandlog-help.json | 22 ++ src/commands/commandlog-len.json | 46 ++++ src/commands/commandlog-reset.json | 43 ++++ src/commands/commandlog.json | 9 + src/commands/exec.json | 2 +- src/commands/slowlog-get.json | 5 + src/commands/slowlog-help.json | 5 + src/commands/slowlog-len.json | 5 + src/commands/slowlog-reset.json | 5 + src/commands/slowlog.json | 7 +- src/config.c | 10 +- src/latency.c | 4 +- src/module.c | 8 +- src/multi.c | 2 +- src/networking.c | 4 +- src/server.c | 33 +-- src/server.h | 28 ++- src/slowlog.c | 193 --------------- tests/unit/commandlog.tcl | 361 +++++++++++++++++++++++++++++ valkey.conf | 85 +++++-- 27 files changed, 1138 insertions(+), 292 deletions(-) create mode 100644 src/commandlog.c rename src/{slowlog.h => commandlog.h} (71%) create mode 100644 src/commands/commandlog-get.json create mode 100644 src/commands/commandlog-help.json create mode 100644 src/commands/commandlog-len.json create mode 100644 src/commands/commandlog-reset.json create mode 100644 src/commands/commandlog.json delete mode 100644 src/slowlog.c create mode 100644 tests/unit/commandlog.tcl diff --git a/cmake/Modules/SourceFiles.cmake b/cmake/Modules/SourceFiles.cmake index 80c3f0c876..e51f9b7600 100644 --- a/cmake/Modules/SourceFiles.cmake +++ b/cmake/Modules/SourceFiles.cmake @@ -47,7 +47,7 @@ set(VALKEY_SERVER_SRCS ${CMAKE_SOURCE_DIR}/src/cluster_slot_stats.c ${CMAKE_SOURCE_DIR}/src/crc16.c ${CMAKE_SOURCE_DIR}/src/endianconv.c - ${CMAKE_SOURCE_DIR}/src/slowlog.c + ${CMAKE_SOURCE_DIR}/src/commandlog.c ${CMAKE_SOURCE_DIR}/src/eval.c ${CMAKE_SOURCE_DIR}/src/bio.c ${CMAKE_SOURCE_DIR}/src/rio.c diff --git a/src/Makefile b/src/Makefile index 9e4075660d..7a951193e4 100644 --- a/src/Makefile +++ b/src/Makefile @@ -416,7 +416,7 @@ endif ENGINE_NAME=valkey SERVER_NAME=$(ENGINE_NAME)-server$(PROG_SUFFIX) ENGINE_SENTINEL_NAME=$(ENGINE_NAME)-sentinel$(PROG_SUFFIX) -ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o hashtable.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o memory_prefetch.o io_threads.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o allocator_defrag.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o rdma.o scripting_engine.o +ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o hashtable.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o memory_prefetch.o io_threads.o 
networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o commandlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o allocator_defrag.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o rdma.o scripting_engine.o ENGINE_CLI_NAME=$(ENGINE_NAME)-cli$(PROG_SUFFIX) ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o ENGINE_BENCHMARK_NAME=$(ENGINE_NAME)-benchmark$(PROG_SUFFIX) diff --git a/src/blocked.c b/src/blocked.c index d2d6a5d314..1edb7728cc 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -61,7 +61,7 @@ */ #include "server.h" -#include "slowlog.h" +#include "commandlog.h" #include "latency.h" #include "monotonic.h" #include "cluster_slot_stats.h" @@ -117,15 +117,15 @@ void blockClient(client *c, int btype) { * he will attempt to reprocess the command which will update the statistics. * However in case the client was timed out or in case of module blocked client is being unblocked * the command will not be reprocessed and we need to make stats update. - * This function will make updates to the commandstats, slot-stats, slowlog and monitors. + * This function will make updates to the commandstats, slot-stats, commandlog and monitors. * The failed_or_rejected parameter is an indication that the blocked command was either failed internally or * rejected/aborted externally. In case the command was rejected the value ERROR_COMMAND_REJECTED should be passed. * In case the command failed internally, ERROR_COMMAND_FAILED should be passed. * A value of zero indicate no error was reported after the command was unblocked */ void updateStatsOnUnblock(client *c, long blocked_us, long reply_us, int failed_or_rejected) { - const ustime_t total_cmd_duration = c->duration + blocked_us + reply_us; - c->lastcmd->microseconds += total_cmd_duration; - clusterSlotStatsAddCpuDuration(c, total_cmd_duration); + c->duration += blocked_us + reply_us; + c->lastcmd->microseconds += c->duration; + clusterSlotStatsAddCpuDuration(c, c->duration); c->lastcmd->calls++; c->commands_processed++; server.stat_numcommands++; @@ -139,9 +139,9 @@ void updateStatsOnUnblock(client *c, long blocked_us, long reply_us, int failed_ debugServerAssertWithInfo(c, NULL, 0); } if (server.latency_tracking_enabled) - updateCommandLatencyHistogram(&(c->lastcmd->latency_histogram), total_cmd_duration * 1000); - /* Log the command into the Slow log if needed. */ - slowlogPushCurrentCommand(c, c->lastcmd, total_cmd_duration); + updateCommandLatencyHistogram(&(c->lastcmd->latency_histogram), c->duration * 1000); + /* Log the command into the commandlog if needed. */ + commandlogPushCurrentCommand(c, c->lastcmd); c->duration = 0; /* Log the reply duration event. 
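 * (reply_us is in microseconds while latency samples are recorded in
 * milliseconds, hence the division by 1000.)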
*/ latencyAddSampleIfNeeded("command-unblocking", reply_us / 1000); diff --git a/src/commandlog.c b/src/commandlog.c new file mode 100644 index 0000000000..cf25cbf2c2 --- /dev/null +++ b/src/commandlog.c @@ -0,0 +1,265 @@ +/* Commandlog implements a system that is able to remember the latest N + * queries that took more than M microseconds to execute, or consumed + * too much network bandwidth and memory for input/output buffers. + * + * The execution time to reach to be logged in the slow log is set + * using the 'commandlog-execution-slower-than' config directive, that is also + * readable and writable using the CONFIG SET/GET command. + * + * Other configurations such as `commandlog-request-larger-than` and + * `commandlog-reply-larger-than` can be found with more detailed + * explanations in the config file. + * + * The command log is actually not "logged" in the server log file + * but is accessible thanks to the COMMANDLOG command. + * + * ---------------------------------------------------------------------------- + * + * Copyright Valkey Contributors. + * All rights reserved. + * SPDX-License-Identifier: BSD 3-Clause + */ + +#include "commandlog.h" +#include "script.h" + +/* Create a new commandlog entry. + * Incrementing the ref count of all the objects retained is up to + * this function. */ +static commandlogEntry *commandlogCreateEntry(client *c, robj **argv, int argc, long long value, int type) { + commandlogEntry *ce = zmalloc(sizeof(*ce)); + int j, ceargc = argc; + + if (ceargc > COMMANDLOG_ENTRY_MAX_ARGC) ceargc = COMMANDLOG_ENTRY_MAX_ARGC; + ce->argc = ceargc; + ce->argv = zmalloc(sizeof(robj *) * ceargc); + for (j = 0; j < ceargc; j++) { + /* Logging too many arguments is a useless memory waste, so we stop + * at COMMANDLOG_ENTRY_MAX_ARGC, but use the last argument to specify + * how many remaining arguments there were in the original command. */ + if (ceargc != argc && j == ceargc - 1) { + ce->argv[j] = + createObject(OBJ_STRING, sdscatprintf(sdsempty(), "... (%d more arguments)", argc - ceargc + 1)); + } else { + /* Trim too long strings as well... */ + if (argv[j]->type == OBJ_STRING && sdsEncodedObject(argv[j]) && + sdslen(argv[j]->ptr) > COMMANDLOG_ENTRY_MAX_STRING) { + sds s = sdsnewlen(argv[j]->ptr, COMMANDLOG_ENTRY_MAX_STRING); + + s = sdscatprintf(s, "... (%lu more bytes)", + (unsigned long)sdslen(argv[j]->ptr) - COMMANDLOG_ENTRY_MAX_STRING); + ce->argv[j] = createObject(OBJ_STRING, s); + } else if (argv[j]->refcount == OBJ_SHARED_REFCOUNT) { + ce->argv[j] = argv[j]; + } else { + /* Here we need to duplicate the string objects composing the + * argument vector of the command, because those may otherwise + * end shared with string objects stored into keys. Having + * shared objects between any part of the server, and the data + * structure holding the data, is a problem: FLUSHALL ASYNC + * may release the shared string object and create a race. */ + ce->argv[j] = dupStringObject(argv[j]); + } + } + } + ce->time = time(NULL); + ce->value = value; + ce->id = server.commandlog[type].entry_id++; + ce->peerid = sdsnew(getClientPeerId(c)); + ce->cname = c->name ? sdsnew(c->name->ptr) : sdsempty(); + return ce; +} + +/* Free a command log entry. The argument is void so that the prototype of this + * function matches the one of the 'free' method of adlist.c. + * + * This function will take care to release all the retained object. 
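+ * (Each argv object was retained by commandlogCreateEntry() either by
+ * duplication or by holding a shared refcount, so decrRefCount() below is
+ * the matching release for every element.)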
*/ +static void commandlogFreeEntry(void *ceptr) { + commandlogEntry *ce = ceptr; + int j; + + for (j = 0; j < ce->argc; j++) decrRefCount(ce->argv[j]); + zfree(ce->argv); + sdsfree(ce->peerid); + sdsfree(ce->cname); + zfree(ce); +} + +/* Initialize the command log. This function should be called a single time + * at server startup. */ +void commandlogInit(void) { + for (int i = 0; i < COMMANDLOG_TYPE_NUM; i++) { + server.commandlog[i].entries = listCreate(); + server.commandlog[i].entry_id = 0; + listSetFreeMethod(server.commandlog[i].entries, commandlogFreeEntry); + } +} + +/* Push a new entry into the command log. + * This function will make sure to trim the command log accordingly to the + * configured max length. */ +static void commandlogPushEntryIfNeeded(client *c, robj **argv, int argc, long long value, int type) { + if (server.commandlog[type].threshold < 0 || server.commandlog[type].max_len == 0) return; /* The corresponding commandlog disabled */ + if (value >= server.commandlog[type].threshold) + listAddNodeHead(server.commandlog[type].entries, commandlogCreateEntry(c, argv, argc, value, type)); + + /* Remove old entries if needed. */ + while (listLength(server.commandlog[type].entries) > server.commandlog[type].max_len) listDelNode(server.commandlog[type].entries, listLast(server.commandlog[type].entries)); +} + +/* Remove all the entries from the current command log of the specified type. */ +static void commandlogReset(int type) { + while (listLength(server.commandlog[type].entries) > 0) listDelNode(server.commandlog[type].entries, listLast(server.commandlog[type].entries)); +} + +/* Reply command logs to client. */ +static void commandlogGetReply(client *c, int type, long count) { + listIter li; + listNode *ln; + commandlogEntry *ce; + + if (count > (long)listLength(server.commandlog[type].entries)) { + count = listLength(server.commandlog[type].entries); + } + addReplyArrayLen(c, count); + listRewind(server.commandlog[type].entries, &li); + while (count--) { + int j; + + ln = listNext(&li); + ce = ln->value; + addReplyArrayLen(c, 6); + addReplyLongLong(c, ce->id); + addReplyLongLong(c, ce->time); + addReplyLongLong(c, ce->value); + addReplyArrayLen(c, ce->argc); + for (j = 0; j < ce->argc; j++) addReplyBulk(c, ce->argv[j]); + addReplyBulkCBuffer(c, ce->peerid, sdslen(ce->peerid)); + addReplyBulkCBuffer(c, ce->cname, sdslen(ce->cname)); + } +} + +/* Log the last command a client executed into the commandlog. */ +void commandlogPushCurrentCommand(client *c, struct serverCommand *cmd) { + /* Some commands may contain sensitive data that should not be available in the commandlog. + */ + if (cmd->flags & CMD_SKIP_COMMANDLOG) return; + + /* If command argument vector was rewritten, use the original + * arguments. */ + robj **argv = c->original_argv ? c->original_argv : c->argv; + int argc = c->original_argv ? c->original_argc : c->argc; + + /* If a script is currently running, the client passed in is a + * fake client. Or the client passed in is the original client + * if this is a EVAL or alike, doesn't matter. In this case, + * use the original client to get the client information. */ + c = scriptIsRunning() ? scriptGetCaller() : c; + + commandlogPushEntryIfNeeded(c, argv, argc, c->duration, COMMANDLOG_TYPE_SLOW); + commandlogPushEntryIfNeeded(c, argv, argc, c->net_input_bytes_curr_cmd, COMMANDLOG_TYPE_LARGE_REQUEST); + commandlogPushEntryIfNeeded(c, argv, argc, c->net_output_bytes_curr_cmd, COMMANDLOG_TYPE_LARGE_REPLY); +} + +/* The SLOWLOG command. 
Implements all the subcommands needed to handle the
+ * slow log. */
+void slowlogCommand(client *c) {
+    if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr, "help")) {
+        const char *help[] = {
+            "GET [<count>]",
+            "    Return top <count> entries from the slowlog (default: 10, -1 means all).",
+            "    Entries are made of:",
+            "    id, timestamp, time in microseconds, arguments array, client IP and port,",
+            "    client name",
+            "LEN",
+            "    Return the length of the slowlog.",
+            "RESET",
+            "    Reset the slowlog.",
+            NULL,
+        };
+        addReplyHelp(c, help);
+    } else if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr, "reset")) {
+        commandlogReset(COMMANDLOG_TYPE_SLOW);
+        addReply(c, shared.ok);
+    } else if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr, "len")) {
+        addReplyLongLong(c, listLength(server.commandlog[COMMANDLOG_TYPE_SLOW].entries));
+    } else if ((c->argc == 2 || c->argc == 3) && !strcasecmp(c->argv[1]->ptr, "get")) {
+        long count = 10;
+
+        if (c->argc == 3) {
+            /* Consume count arg. */
+            if (getRangeLongFromObjectOrReply(c, c->argv[2], -1, LONG_MAX, &count,
+                                              "count should be greater than or equal to -1") != C_OK)
+                return;
+
+            if (count == -1) {
+                /* We treat -1 as a special value, which means to get all slow logs.
+                 * Simply set count to the length of server.commandlog. */
+                count = listLength(server.commandlog[COMMANDLOG_TYPE_SLOW].entries);
+            }
+        }
+
+        commandlogGetReply(c, COMMANDLOG_TYPE_SLOW, count);
+    } else {
+        addReplySubcommandSyntaxError(c);
+    }
+}
+
+static int commandlogGetTypeOrReply(client *c, robj *o) {
+    if (!strcasecmp(o->ptr, "slow")) return COMMANDLOG_TYPE_SLOW;
+    if (!strcasecmp(o->ptr, "large-request")) return COMMANDLOG_TYPE_LARGE_REQUEST;
+    if (!strcasecmp(o->ptr, "large-reply")) return COMMANDLOG_TYPE_LARGE_REPLY;
+    addReplyError(c, "type should be one of the following: slow, large-request, large-reply");
+    return -1;
+}
+
+/* The COMMANDLOG command. Implements all the subcommands needed to handle the
+ * command log. */
+void commandlogCommand(client *c) {
+    int type;
+    if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr, "help")) {
+        const char *help[] = {
+            "GET <count> <type>",
+            "    Return top <count> entries of the specified <type> from the commandlog (-1 means all).",
+            "    Entries are made of:",
+            "    id, timestamp,",
+            "    time in microseconds for type of slow,",
+            "    or size in bytes for type of large-request,",
+            "    or size in bytes for type of large-reply,",
+            "    arguments array, client IP and port,",
+            "    client name",
+            "LEN <type>",
+            "    Return the length of the specified type of commandlog.",
+            "RESET <type>",
+            "    Reset the specified type of commandlog.",
+            NULL,
+        };
+        addReplyHelp(c, help);
+    } else if (c->argc == 3 && !strcasecmp(c->argv[1]->ptr, "reset")) {
+        if ((type = commandlogGetTypeOrReply(c, c->argv[2])) == -1) return;
+        commandlogReset(type);
+        addReply(c, shared.ok);
+    } else if (c->argc == 3 && !strcasecmp(c->argv[1]->ptr, "len")) {
+        if ((type = commandlogGetTypeOrReply(c, c->argv[2])) == -1) return;
+        addReplyLongLong(c, listLength(server.commandlog[type].entries));
+    } else if (c->argc == 4 && !strcasecmp(c->argv[1]->ptr, "get")) {
+        long count;
+
+        /* Consume count arg. */
+        if (getRangeLongFromObjectOrReply(c, c->argv[2], -1, LONG_MAX, &count,
+                                          "count should be greater than or equal to -1") != C_OK)
+            return;
+
+        if ((type = commandlogGetTypeOrReply(c, c->argv[3])) == -1) return;
+
+        if (count == -1) {
+            /* We treat -1 as a special value, which means to get all command logs.
+             * Simply set count to the length of server.commandlog.
*/ + count = listLength(server.commandlog[type].entries); + } + + commandlogGetReply(c, type, count); + } else { + addReplySubcommandSyntaxError(c); + } +} diff --git a/src/slowlog.h b/src/commandlog.h similarity index 71% rename from src/slowlog.h rename to src/commandlog.h index 12d9097ffa..825014746e 100644 --- a/src/slowlog.h +++ b/src/commandlog.h @@ -27,27 +27,26 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __SLOWLOG_H__ -#define __SLOWLOG_H__ +#ifndef __COMMANDLOG_H__ +#define __COMMANDLOG_H__ #include "server.h" -#define SLOWLOG_ENTRY_MAX_ARGC 32 -#define SLOWLOG_ENTRY_MAX_STRING 128 +#define COMMANDLOG_ENTRY_MAX_ARGC 32 +#define COMMANDLOG_ENTRY_MAX_STRING 128 -/* This structure defines an entry inside the slow log list */ -typedef struct slowlogEntry { +/* This structure defines an entry inside the command log list */ +typedef struct commandlogEntry { robj **argv; int argc; - long long id; /* Unique entry identifier. */ - long long duration; /* Time spent by the query, in microseconds. */ - time_t time; /* Unix time at which the query was executed. */ - sds cname; /* Client name. */ - sds peerid; /* Client network address. */ -} slowlogEntry; + long long id; /* Unique entry identifier. */ + long long value; /* The meaning is determined by the type of command log. */ + time_t time; /* Unix time at which the query was executed. */ + sds cname; /* Client name. */ + sds peerid; /* Client network address. */ +} commandlogEntry; /* Exported API */ -void slowlogInit(void); -void slowlogPushEntryIfNeeded(client *c, robj **argv, int argc, long long duration); +void commandlogInit(void); -#endif /* __SLOWLOG_H__ */ +#endif /* __COMMANDLOG_H__ */ diff --git a/src/commands.def b/src/commands.def index cd919a80e1..613eb16c9b 100644 --- a/src/commands.def +++ b/src/commands.def @@ -6668,6 +6668,147 @@ const char *COMMAND_Tips[] = { #define COMMAND_Keyspecs NULL #endif +/********** COMMANDLOG GET ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* COMMANDLOG GET history */ +#define COMMANDLOG_GET_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* COMMANDLOG GET tips */ +const char *COMMANDLOG_GET_Tips[] = { +"request_policy:all_nodes", +"nondeterministic_output", +}; +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* COMMANDLOG GET key specs */ +#define COMMANDLOG_GET_Keyspecs NULL +#endif + +/* COMMANDLOG GET type argument table */ +struct COMMAND_ARG COMMANDLOG_GET_type_Subargs[] = { +{MAKE_ARG("slow",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("large-request",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("large-reply",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* COMMANDLOG GET argument table */ +struct COMMAND_ARG COMMANDLOG_GET_Args[] = { +{MAKE_ARG("count",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("type",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_NONE,3,NULL),.subargs=COMMANDLOG_GET_type_Subargs}, +}; + +/********** COMMANDLOG HELP ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* COMMANDLOG HELP history */ +#define COMMANDLOG_HELP_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* COMMANDLOG HELP tips */ +#define COMMANDLOG_HELP_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* COMMANDLOG HELP key specs */ +#define COMMANDLOG_HELP_Keyspecs NULL +#endif + +/********** COMMANDLOG LEN ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* COMMANDLOG LEN history */ +#define COMMANDLOG_LEN_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* COMMANDLOG LEN tips 
*/ +const char *COMMANDLOG_LEN_Tips[] = { +"request_policy:all_nodes", +"response_policy:agg_sum", +"nondeterministic_output", +}; +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* COMMANDLOG LEN key specs */ +#define COMMANDLOG_LEN_Keyspecs NULL +#endif + +/* COMMANDLOG LEN type argument table */ +struct COMMAND_ARG COMMANDLOG_LEN_type_Subargs[] = { +{MAKE_ARG("slow",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("large-request",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("large-reply",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* COMMANDLOG LEN argument table */ +struct COMMAND_ARG COMMANDLOG_LEN_Args[] = { +{MAKE_ARG("type",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_NONE,3,NULL),.subargs=COMMANDLOG_LEN_type_Subargs}, +}; + +/********** COMMANDLOG RESET ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* COMMANDLOG RESET history */ +#define COMMANDLOG_RESET_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* COMMANDLOG RESET tips */ +const char *COMMANDLOG_RESET_Tips[] = { +"request_policy:all_nodes", +"response_policy:all_succeeded", +}; +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* COMMANDLOG RESET key specs */ +#define COMMANDLOG_RESET_Keyspecs NULL +#endif + +/* COMMANDLOG RESET type argument table */ +struct COMMAND_ARG COMMANDLOG_RESET_type_Subargs[] = { +{MAKE_ARG("slow",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("large-request",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("large-reply",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* COMMANDLOG RESET argument table */ +struct COMMAND_ARG COMMANDLOG_RESET_Args[] = { +{MAKE_ARG("type",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_NONE,3,NULL),.subargs=COMMANDLOG_RESET_type_Subargs}, +}; + +/* COMMANDLOG command table */ +struct COMMAND_STRUCT COMMANDLOG_Subcommands[] = { +{MAKE_CMD("get","Returns the specified command log's entries.","O(N) where N is the number of entries returned","8.1.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,COMMANDLOG_GET_History,0,COMMANDLOG_GET_Tips,2,commandlogCommand,4,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,COMMANDLOG_GET_Keyspecs,0,NULL,2),.args=COMMANDLOG_GET_Args}, +{MAKE_CMD("help","Show helpful text about the different subcommands","O(1)","8.1.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,COMMANDLOG_HELP_History,0,COMMANDLOG_HELP_Tips,0,commandlogCommand,2,CMD_LOADING|CMD_STALE,0,COMMANDLOG_HELP_Keyspecs,0,NULL,0)}, +{MAKE_CMD("len","Returns the number of entries in the specified type of command log.","O(1)","8.1.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,COMMANDLOG_LEN_History,0,COMMANDLOG_LEN_Tips,3,commandlogCommand,3,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,COMMANDLOG_LEN_Keyspecs,0,NULL,1),.args=COMMANDLOG_LEN_Args}, +{MAKE_CMD("reset","Clears all entries from the specified type of command log.","O(N) where N is the number of entries in the commandlog","8.1.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,COMMANDLOG_RESET_History,0,COMMANDLOG_RESET_Tips,2,commandlogCommand,3,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,COMMANDLOG_RESET_Keyspecs,0,NULL,1),.args=COMMANDLOG_RESET_Args}, +{0} +}; + +/********** COMMANDLOG ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* COMMANDLOG history */ +#define COMMANDLOG_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* COMMANDLOG tips */ +#define COMMANDLOG_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* COMMANDLOG key specs */ +#define COMMANDLOG_Keyspecs NULL +#endif + 
/********** CONFIG GET ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -7831,10 +7972,10 @@ const char *SLOWLOG_RESET_Tips[] = { /* SLOWLOG command table */ struct COMMAND_STRUCT SLOWLOG_Subcommands[] = { -{MAKE_CMD("get","Returns the slow log's entries.","O(N) where N is the number of entries returned","2.2.12",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SLOWLOG_GET_History,1,SLOWLOG_GET_Tips,2,slowlogCommand,-2,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,SLOWLOG_GET_Keyspecs,0,NULL,1),.args=SLOWLOG_GET_Args}, -{MAKE_CMD("help","Show helpful text about the different subcommands","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SLOWLOG_HELP_History,0,SLOWLOG_HELP_Tips,0,slowlogCommand,2,CMD_LOADING|CMD_STALE,0,SLOWLOG_HELP_Keyspecs,0,NULL,0)}, -{MAKE_CMD("len","Returns the number of entries in the slow log.","O(1)","2.2.12",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SLOWLOG_LEN_History,0,SLOWLOG_LEN_Tips,3,slowlogCommand,2,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,SLOWLOG_LEN_Keyspecs,0,NULL,0)}, -{MAKE_CMD("reset","Clears all entries from the slow log.","O(N) where N is the number of entries in the slowlog","2.2.12",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SLOWLOG_RESET_History,0,SLOWLOG_RESET_Tips,2,slowlogCommand,2,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,SLOWLOG_RESET_Keyspecs,0,NULL,0)}, +{MAKE_CMD("get","Returns the slow log's entries.","O(N) where N is the number of entries returned","2.2.12",CMD_DOC_DEPRECATED,"`COMMANDLOG GET SLOW`","8.1.0","server",COMMAND_GROUP_SERVER,SLOWLOG_GET_History,1,SLOWLOG_GET_Tips,2,slowlogCommand,-2,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,SLOWLOG_GET_Keyspecs,0,NULL,1),.args=SLOWLOG_GET_Args}, +{MAKE_CMD("help","Show helpful text about the different subcommands","O(1)","6.2.0",CMD_DOC_DEPRECATED,"`COMMANDLOG HELP`","8.1.0","server",COMMAND_GROUP_SERVER,SLOWLOG_HELP_History,0,SLOWLOG_HELP_Tips,0,slowlogCommand,2,CMD_LOADING|CMD_STALE,0,SLOWLOG_HELP_Keyspecs,0,NULL,0)}, +{MAKE_CMD("len","Returns the number of entries in the slow log.","O(1)","2.2.12",CMD_DOC_DEPRECATED,"`COMMANDLOG LEN SLOW`","8.1.0","server",COMMAND_GROUP_SERVER,SLOWLOG_LEN_History,0,SLOWLOG_LEN_Tips,3,slowlogCommand,2,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,SLOWLOG_LEN_Keyspecs,0,NULL,0)}, +{MAKE_CMD("reset","Clears all entries from the slow log.","O(N) where N is the number of entries in the slowlog","2.2.12",CMD_DOC_DEPRECATED,"`COMMANDLOG RESET SLOW`","8.1.0","server",COMMAND_GROUP_SERVER,SLOWLOG_RESET_History,0,SLOWLOG_RESET_Tips,2,slowlogCommand,2,CMD_ADMIN|CMD_LOADING|CMD_STALE,0,SLOWLOG_RESET_Keyspecs,0,NULL,0)}, {0} }; @@ -11043,6 +11184,7 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("bgrewriteaof","Asynchronously rewrites the append-only file to disk.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,BGREWRITEAOF_History,0,BGREWRITEAOF_Tips,0,bgrewriteaofCommand,1,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NOSCRIPT,0,BGREWRITEAOF_Keyspecs,0,NULL,0)}, {MAKE_CMD("bgsave","Asynchronously saves the database(s) to disk.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,BGSAVE_History,2,BGSAVE_Tips,0,bgsaveCommand,-1,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NOSCRIPT,0,BGSAVE_Keyspecs,0,NULL,1),.args=BGSAVE_Args}, {MAKE_CMD("command","Returns detailed information about all commands.","O(N) where N is the total number of 
commands","2.8.13",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,COMMAND_History,0,COMMAND_Tips,1,commandCommand,-1,CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,COMMAND_Keyspecs,0,NULL,0),.subcommands=COMMAND_Subcommands}, +{MAKE_CMD("commandlog","A container for command log commands.","Depends on subcommand.","8.1.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,COMMANDLOG_History,0,COMMANDLOG_Tips,0,NULL,-2,0,0,COMMANDLOG_Keyspecs,0,NULL,0),.subcommands=COMMANDLOG_Subcommands}, {MAKE_CMD("config","A container for server configuration commands.","Depends on subcommand.","2.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,CONFIG_History,0,CONFIG_Tips,0,NULL,-2,0,0,CONFIG_Keyspecs,0,NULL,0),.subcommands=CONFIG_Subcommands}, {MAKE_CMD("dbsize","Returns the number of keys in the database.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,DBSIZE_History,0,DBSIZE_Tips,2,dbsizeCommand,1,CMD_READONLY|CMD_FAST,ACL_CATEGORY_KEYSPACE,DBSIZE_Keyspecs,0,NULL,0)}, {MAKE_CMD("debug","A container for debugging commands.","Depends on subcommand.","1.0.0",CMD_DOC_SYSCMD,NULL,NULL,"server",COMMAND_GROUP_SERVER,DEBUG_History,0,DEBUG_Tips,0,debugCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_PROTECTED,0,DEBUG_Keyspecs,0,NULL,0)}, @@ -11064,7 +11206,7 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("save","Synchronously saves the database(s) to disk.","O(N) where N is the total number of keys in all databases","1.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SAVE_History,0,SAVE_Tips,0,saveCommand,1,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NOSCRIPT|CMD_NO_MULTI,0,SAVE_Keyspecs,0,NULL,0)}, {MAKE_CMD("shutdown","Synchronously saves the database(s) to disk and shuts down the server.","O(N) when saving, where N is the total number of keys in all databases when saving data, otherwise O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SHUTDOWN_History,1,SHUTDOWN_Tips,0,shutdownCommand,-1,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_NO_MULTI|CMD_SENTINEL|CMD_ALLOW_BUSY,0,SHUTDOWN_Keyspecs,0,NULL,1),.args=SHUTDOWN_Args}, {MAKE_CMD("slaveof","Sets a server as a replica of another, or promotes it to being a primary.","O(1)","1.0.0",CMD_DOC_DEPRECATED,"`REPLICAOF`","5.0.0","server",COMMAND_GROUP_SERVER,SLAVEOF_History,0,SLAVEOF_Tips,0,replicaofCommand,3,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NOSCRIPT|CMD_STALE,0,SLAVEOF_Keyspecs,0,NULL,1),.args=SLAVEOF_Args}, -{MAKE_CMD("slowlog","A container for slow log commands.","Depends on subcommand.","2.2.12",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SLOWLOG_History,0,SLOWLOG_Tips,0,NULL,-2,0,0,SLOWLOG_Keyspecs,0,NULL,0),.subcommands=SLOWLOG_Subcommands}, +{MAKE_CMD("slowlog","A container for slow log commands.","Depends on subcommand.","2.2.12",CMD_DOC_DEPRECATED,"`COMMANDLOG`","8.1.0","server",COMMAND_GROUP_SERVER,SLOWLOG_History,0,SLOWLOG_Tips,0,NULL,-2,0,0,SLOWLOG_Keyspecs,0,NULL,0),.subcommands=SLOWLOG_Subcommands}, {MAKE_CMD("swapdb","Swaps two databases.","O(N) where N is the count of clients watching or blocking on keys from both databases.","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SWAPDB_History,0,SWAPDB_Tips,0,swapdbCommand,3,CMD_WRITE|CMD_FAST,ACL_CATEGORY_KEYSPACE|ACL_CATEGORY_DANGEROUS,SWAPDB_Keyspecs,0,NULL,2),.args=SWAPDB_Args}, {MAKE_CMD("sync","An internal command used in 
replication.",NULL,"1.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SYNC_History,0,SYNC_Tips,0,syncCommand,1,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NO_MULTI|CMD_NOSCRIPT,0,SYNC_Keyspecs,0,NULL,0)}, {MAKE_CMD("time","Returns the server time.","O(1)","2.6.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,TIME_History,0,TIME_Tips,1,timeCommand,1,CMD_LOADING|CMD_STALE|CMD_FAST,0,TIME_Keyspecs,0,NULL,0)}, @@ -11163,7 +11305,7 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("substr","Returns a substring from a string value.","O(N) where N is the length of the returned string. The complexity is ultimately determined by the returned length, but because creating a substring from an existing string is very cheap, it can be considered O(1) for small strings.","1.0.0",CMD_DOC_DEPRECATED,"`GETRANGE`","2.0.0","string",COMMAND_GROUP_STRING,SUBSTR_History,0,SUBSTR_Tips,0,getrangeCommand,4,CMD_READONLY,ACL_CATEGORY_STRING,SUBSTR_Keyspecs,1,NULL,3),.args=SUBSTR_Args}, /* transactions */ {MAKE_CMD("discard","Discards a transaction.","O(N), when N is the number of queued commands","2.0.0",CMD_DOC_NONE,NULL,NULL,"transactions",COMMAND_GROUP_TRANSACTIONS,DISCARD_History,0,DISCARD_Tips,0,discardCommand,1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_ALLOW_BUSY,ACL_CATEGORY_TRANSACTION,DISCARD_Keyspecs,0,NULL,0)}, -{MAKE_CMD("exec","Executes all commands in a transaction.","Depends on commands in the transaction","1.2.0",CMD_DOC_NONE,NULL,NULL,"transactions",COMMAND_GROUP_TRANSACTIONS,EXEC_History,0,EXEC_Tips,0,execCommand,1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SKIP_SLOWLOG,ACL_CATEGORY_TRANSACTION,EXEC_Keyspecs,0,NULL,0)}, +{MAKE_CMD("exec","Executes all commands in a transaction.","Depends on commands in the transaction","1.2.0",CMD_DOC_NONE,NULL,NULL,"transactions",COMMAND_GROUP_TRANSACTIONS,EXEC_History,0,EXEC_Tips,0,execCommand,1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SKIP_COMMANDLOG,ACL_CATEGORY_TRANSACTION,EXEC_Keyspecs,0,NULL,0)}, {MAKE_CMD("multi","Starts a transaction.","O(1)","1.2.0",CMD_DOC_NONE,NULL,NULL,"transactions",COMMAND_GROUP_TRANSACTIONS,MULTI_History,0,MULTI_Tips,0,multiCommand,1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_MULTI|CMD_ALLOW_BUSY,ACL_CATEGORY_TRANSACTION,MULTI_Keyspecs,0,NULL,0)}, {MAKE_CMD("unwatch","Forgets about watched keys of a transaction.","O(1)","2.2.0",CMD_DOC_NONE,NULL,NULL,"transactions",COMMAND_GROUP_TRANSACTIONS,UNWATCH_History,0,UNWATCH_Tips,0,unwatchCommand,1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_ALLOW_BUSY,ACL_CATEGORY_TRANSACTION,UNWATCH_Keyspecs,0,NULL,0)}, {MAKE_CMD("watch","Monitors changes to keys to determine the execution of a transaction.","O(1) for every key.","2.2.0",CMD_DOC_NONE,NULL,NULL,"transactions",COMMAND_GROUP_TRANSACTIONS,WATCH_History,0,WATCH_Tips,0,watchCommand,-2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_MULTI|CMD_ALLOW_BUSY,ACL_CATEGORY_TRANSACTION,WATCH_Keyspecs,1,NULL,1),.args=WATCH_Args}, diff --git a/src/commands/commandlog-get.json b/src/commands/commandlog-get.json new file mode 100644 index 0000000000..00a5a01b0f --- /dev/null +++ b/src/commands/commandlog-get.json @@ -0,0 +1,85 @@ +{ + "GET": { + "summary": "Returns the specified command log's entries.", + "complexity": "O(N) where N is the number of entries returned", + "group": "server", + "since": "8.1.0", + "arity": 4, + "container": "COMMANDLOG", + "function": "commandlogCommand", + "command_flags": [ + "ADMIN", + "LOADING", + "STALE" + ], + "command_tips": [ + "REQUEST_POLICY:ALL_NODES", + "NONDETERMINISTIC_OUTPUT" + ], 
+ "reply_schema": { + "type": "array", + "description": "Entries from the command log in chronological order.", + "uniqueItems": true, + "items": { + "type": "array", + "minItems": 6, + "maxItems": 6, + "items": [ + { + "type": "integer", + "description": "Command log entry ID." + }, + { + "type": "integer", + "description": "The unix timestamp at which the logged command was processed.", + "minimum": 0 + }, + { + "type": "integer", + "description": "Determined by the type parameter.", + "minimum": 0 + }, + { + "type": "array", + "description": "The arguments of the command.", + "items": { + "type": "string" + } + }, + { + "type": "string", + "description": "Client IP address and port." + }, + { + "type": "string", + "description": "Client name if set via the CLIENT SETNAME command." + } + ] + } + }, + "arguments": [ + { + "name": "count", + "type": "integer" + }, + { + "name": "type", + "type": "oneof", + "arguments": [ + { + "name": "slow", + "type": "string" + }, + { + "name": "large-request", + "type": "string" + }, + { + "name": "large-reply", + "type": "string" + } + ] + } + ] + } +} diff --git a/src/commands/commandlog-help.json b/src/commands/commandlog-help.json new file mode 100644 index 0000000000..dacf7d6209 --- /dev/null +++ b/src/commands/commandlog-help.json @@ -0,0 +1,22 @@ +{ + "HELP": { + "summary": "Show helpful text about the different subcommands", + "complexity": "O(1)", + "group": "server", + "since": "8.1.0", + "arity": 2, + "container": "COMMANDLOG", + "function": "commandlogCommand", + "command_flags": [ + "LOADING", + "STALE" + ], + "reply_schema": { + "type": "array", + "description": "Helpful text about subcommands.", + "items": { + "type": "string" + } + } + } +} diff --git a/src/commands/commandlog-len.json b/src/commands/commandlog-len.json new file mode 100644 index 0000000000..4f1bb44075 --- /dev/null +++ b/src/commands/commandlog-len.json @@ -0,0 +1,46 @@ +{ + "LEN": { + "summary": "Returns the number of entries in the specified type of command log.", + "complexity": "O(1)", + "group": "server", + "since": "8.1.0", + "arity": 3, + "container": "COMMANDLOG", + "function": "commandlogCommand", + "command_flags": [ + "ADMIN", + "LOADING", + "STALE" + ], + "command_tips": [ + "REQUEST_POLICY:ALL_NODES", + "RESPONSE_POLICY:AGG_SUM", + "NONDETERMINISTIC_OUTPUT" + ], + "reply_schema": { + "type": "integer", + "description": "Number of entries in the command log.", + "minimum": 0 + }, + "arguments": [ + { + "name": "type", + "type": "oneof", + "arguments": [ + { + "name": "slow", + "type": "string" + }, + { + "name": "large-request", + "type": "string" + }, + { + "name": "large-reply", + "type": "string" + } + ] + } + ] + } +} diff --git a/src/commands/commandlog-reset.json b/src/commands/commandlog-reset.json new file mode 100644 index 0000000000..e43c4d4404 --- /dev/null +++ b/src/commands/commandlog-reset.json @@ -0,0 +1,43 @@ +{ + "RESET": { + "summary": "Clears all entries from the specified type of command log.", + "complexity": "O(N) where N is the number of entries in the commandlog", + "group": "server", + "since": "8.1.0", + "arity": 3, + "container": "COMMANDLOG", + "function": "commandlogCommand", + "command_flags": [ + "ADMIN", + "LOADING", + "STALE" + ], + "command_tips": [ + "REQUEST_POLICY:ALL_NODES", + "RESPONSE_POLICY:ALL_SUCCEEDED" + ], + "reply_schema": { + "const": "OK" + }, + "arguments": [ + { + "name": "type", + "type": "oneof", + "arguments": [ + { + "name": "slow", + "type": "string" + }, + { + "name": "large-request", + "type": "string" 
+ }, + { + "name": "large-reply", + "type": "string" + } + ] + } + ] + } +} diff --git a/src/commands/commandlog.json b/src/commands/commandlog.json new file mode 100644 index 0000000000..2ff2376436 --- /dev/null +++ b/src/commands/commandlog.json @@ -0,0 +1,9 @@ +{ + "COMMANDLOG": { + "summary": "A container for command log commands.", + "complexity": "Depends on subcommand.", + "group": "server", + "since": "8.1.0", + "arity": -2 + } +} diff --git a/src/commands/exec.json b/src/commands/exec.json index 5f03d76e08..3b1b1faff1 100644 --- a/src/commands/exec.json +++ b/src/commands/exec.json @@ -10,7 +10,7 @@ "NOSCRIPT", "LOADING", "STALE", - "SKIP_SLOWLOG" + "SKIP_COMMANDLOG" ], "acl_categories": [ "TRANSACTION" diff --git a/src/commands/slowlog-get.json b/src/commands/slowlog-get.json index ffc54b5454..3f57b87ed8 100644 --- a/src/commands/slowlog-get.json +++ b/src/commands/slowlog-get.json @@ -7,6 +7,11 @@ "arity": -2, "container": "SLOWLOG", "function": "slowlogCommand", + "deprecated_since": "8.1.0", + "replaced_by": "`COMMANDLOG GET SLOW`", + "doc_flags": [ + "DEPRECATED" + ], "history": [ [ "4.0.0", diff --git a/src/commands/slowlog-help.json b/src/commands/slowlog-help.json index dde8fd4598..1db5520e1f 100644 --- a/src/commands/slowlog-help.json +++ b/src/commands/slowlog-help.json @@ -7,6 +7,11 @@ "arity": 2, "container": "SLOWLOG", "function": "slowlogCommand", + "deprecated_since": "8.1.0", + "replaced_by": "`COMMANDLOG HELP`", + "doc_flags": [ + "DEPRECATED" + ], "command_flags": [ "LOADING", "STALE" diff --git a/src/commands/slowlog-len.json b/src/commands/slowlog-len.json index 717a8ad416..8b4e2b86c8 100644 --- a/src/commands/slowlog-len.json +++ b/src/commands/slowlog-len.json @@ -7,6 +7,11 @@ "arity": 2, "container": "SLOWLOG", "function": "slowlogCommand", + "deprecated_since": "8.1.0", + "replaced_by": "`COMMANDLOG LEN SLOW`", + "doc_flags": [ + "DEPRECATED" + ], "command_flags": [ "ADMIN", "LOADING", diff --git a/src/commands/slowlog-reset.json b/src/commands/slowlog-reset.json index cfc1e4da7f..d5fd4b02a2 100644 --- a/src/commands/slowlog-reset.json +++ b/src/commands/slowlog-reset.json @@ -7,6 +7,11 @@ "arity": 2, "container": "SLOWLOG", "function": "slowlogCommand", + "deprecated_since": "8.1.0", + "replaced_by": "`COMMANDLOG RESET SLOW`", + "doc_flags": [ + "DEPRECATED" + ], "command_flags": [ "ADMIN", "LOADING", diff --git a/src/commands/slowlog.json b/src/commands/slowlog.json index 1b9526b191..0120d55eda 100644 --- a/src/commands/slowlog.json +++ b/src/commands/slowlog.json @@ -4,6 +4,11 @@ "complexity": "Depends on subcommand.", "group": "server", "since": "2.2.12", - "arity": -2 + "arity": -2, + "deprecated_since": "8.1.0", + "replaced_by": "`COMMANDLOG`", + "doc_flags": [ + "DEPRECATED" + ] } } diff --git a/src/config.c b/src/config.c index 5b90ebbd60..6bbb82bf52 100644 --- a/src/config.c +++ b/src/config.c @@ -818,7 +818,7 @@ void configSetCommand(client *c) { /* Note: it's important we run over ALL passed configs and check if we need to call * `redactClientCommandArgument()`. This is in order to avoid anyone using this command for a - * log/slowlog/monitor/etc. displaying sensitive info. So even if we encounter an error we still continue + * log/commandlog/monitor/etc. displaying sensitive info. So even if we encounter an error we still continue * running over the remaining arguments. 
*/ if (config->flags & SENSITIVE_CONFIG) { redactClientCommandArgument(c, 2 + i * 2 + 1); @@ -3302,7 +3302,9 @@ standardConfig static_configs[] = { /* Unsigned Long configs */ createULongConfig("active-defrag-max-scan-fields", NULL, MODIFIABLE_CONFIG, 1, LONG_MAX, server.active_defrag_max_scan_fields, 1000, INTEGER_CONFIG, NULL, NULL), /* Default: keys with more than 1000 fields will be processed separately */ - createULongConfig("slowlog-max-len", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.slowlog_max_len, 128, INTEGER_CONFIG, NULL, NULL), + createULongConfig("commandlog-slow-execution-max-len", "slowlog-max-len", MODIFIABLE_CONFIG, 0, LONG_MAX, server.commandlog[COMMANDLOG_TYPE_SLOW].max_len, 128, INTEGER_CONFIG, NULL, NULL), + createULongConfig("commandlog-large-request-max-len", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.commandlog[COMMANDLOG_TYPE_LARGE_REQUEST].max_len, 128, INTEGER_CONFIG, NULL, NULL), + createULongConfig("commandlog-large-reply-max-len", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.commandlog[COMMANDLOG_TYPE_LARGE_REPLY].max_len, 128, INTEGER_CONFIG, NULL, NULL), createULongConfig("acllog-max-len", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.acllog_max_len, 128, INTEGER_CONFIG, NULL, NULL), createULongConfig("cluster-blacklist-ttl", NULL, MODIFIABLE_CONFIG, 0, ULONG_MAX, server.cluster_blacklist_ttl, 60, INTEGER_CONFIG, NULL, NULL), @@ -3310,7 +3312,9 @@ standardConfig static_configs[] = { createLongLongConfig("busy-reply-threshold", "lua-time-limit", MODIFIABLE_CONFIG, 0, LONG_MAX, server.busy_reply_threshold, 5000, INTEGER_CONFIG, NULL, NULL), /* milliseconds */ createLongLongConfig("cluster-node-timeout", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.cluster_node_timeout, 15000, INTEGER_CONFIG, NULL, NULL), createLongLongConfig("cluster-ping-interval", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, LLONG_MAX, server.cluster_ping_interval, 0, INTEGER_CONFIG, NULL, NULL), - createLongLongConfig("slowlog-log-slower-than", NULL, MODIFIABLE_CONFIG, -1, LLONG_MAX, server.slowlog_log_slower_than, 10000, INTEGER_CONFIG, NULL, NULL), + createLongLongConfig("commandlog-execution-slower-than", "slowlog-log-slower-than", MODIFIABLE_CONFIG, -1, LLONG_MAX, server.commandlog[COMMANDLOG_TYPE_SLOW].threshold, 10000, INTEGER_CONFIG, NULL, NULL), + createLongLongConfig("commandlog-request-larger-than", NULL, MODIFIABLE_CONFIG, -1, LLONG_MAX, server.commandlog[COMMANDLOG_TYPE_LARGE_REQUEST].threshold, 1024 * 1024, INTEGER_CONFIG, NULL, NULL), + createLongLongConfig("commandlog-reply-larger-than", NULL, MODIFIABLE_CONFIG, -1, LLONG_MAX, server.commandlog[COMMANDLOG_TYPE_LARGE_REPLY].threshold, 1024 * 1024, INTEGER_CONFIG, NULL, NULL), createLongLongConfig("latency-monitor-threshold", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.latency_monitor_threshold, 0, INTEGER_CONFIG, NULL, NULL), createLongLongConfig("proto-max-bulk-len", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, 1024 * 1024, LONG_MAX, server.proto_max_bulk_len, 512ll * 1024 * 1024, MEMORY_CONFIG, NULL, NULL), /* Bulk request max size */ createLongLongConfig("stream-node-max-entries", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.stream_node_max_entries, 100, INTEGER_CONFIG, NULL, NULL), diff --git a/src/latency.c b/src/latency.c index fa448dac35..7dcdaea967 100644 --- a/src/latency.c +++ b/src/latency.c @@ -266,10 +266,10 @@ sds createLatencyReport(void) { /* Potentially commands. 
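 * (If the slow commandlog is disabled, or its threshold converted to
 * milliseconds is above the latency monitor threshold, slow commands may
 * never show up here, so the report advises enabling or tuning it.)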
*/ if (!strcasecmp(event, "command")) { - if (server.slowlog_log_slower_than < 0 || server.slowlog_max_len == 0) { + if (server.commandlog[COMMANDLOG_TYPE_SLOW].threshold < 0 || server.commandlog[COMMANDLOG_TYPE_SLOW].max_len == 0) { advise_slowlog_enabled = 1; advices++; - } else if (server.slowlog_log_slower_than / 1000 > server.latency_monitor_threshold) { + } else if (server.commandlog[COMMANDLOG_TYPE_SLOW].threshold / 1000 > server.latency_monitor_threshold) { advise_slowlog_tuning = 1; advices++; } diff --git a/src/module.c b/src/module.c index 17ac4ddf02..37bd310427 100644 --- a/src/module.c +++ b/src/module.c @@ -53,7 +53,7 @@ #include "server.h" #include "cluster.h" -#include "slowlog.h" +#include "commandlog.h" #include "rdb.h" #include "monotonic.h" #include "script.h" @@ -1221,7 +1221,8 @@ int64_t commandFlagsFromString(char *s) { else if (!strcasecmp(t,"blocking")) flags |= CMD_BLOCKING; else if (!strcasecmp(t,"allow-stale")) flags |= CMD_STALE; else if (!strcasecmp(t,"no-monitor")) flags |= CMD_SKIP_MONITOR; - else if (!strcasecmp(t,"no-slowlog")) flags |= CMD_SKIP_SLOWLOG; + else if (!strcasecmp(t,"no-slowlog")) flags |= CMD_SKIP_COMMANDLOG; + else if (!strcasecmp(t,"no-commandlog")) flags |= CMD_SKIP_COMMANDLOG; else if (!strcasecmp(t,"fast")) flags |= CMD_FAST; else if (!strcasecmp(t,"no-auth")) flags |= CMD_NO_AUTH; else if (!strcasecmp(t,"may-replicate")) flags |= CMD_MAY_REPLICATE; @@ -1296,7 +1297,8 @@ ValkeyModuleCommand *moduleCreateCommandProxy(struct ValkeyModule *module, * this means. * * **"no-monitor"**: Don't propagate the command on monitor. Use this if * the command has sensitive data among the arguments. - * * **"no-slowlog"**: Don't log this command in the slowlog. Use this if + * * **"no-slowlog"**: Deprecated, please use "no-commandlog". + * * **"no-commandlog"**: Don't log this command in the commandlog. Use this if * the command has sensitive data among the arguments. * * **"fast"**: The command time complexity is not greater * than O(log(N)) where N is the size of the collection or diff --git a/src/multi.c b/src/multi.c index 9e3aad9d3c..0318c418cc 100644 --- a/src/multi.c +++ b/src/multi.c @@ -250,7 +250,7 @@ void execCommand(client *c) { c->mstate->commands[j].argv_len = c->argv_len; c->mstate->commands[j].cmd = c->cmd; - /* The original argv has already been processed for slowlog and monitor, + /* The original argv has already been processed for commandlog and monitor, * so we can safely free it before proceeding to the next command. */ freeClientOriginalArgv(c); } diff --git a/src/networking.c b/src/networking.c index 0ff0fed4c0..093d579ef4 100644 --- a/src/networking.c +++ b/src/networking.c @@ -4297,7 +4297,7 @@ void securityWarningCommand(client *c) { freeClientAsync(c); } -/* This function preserves the original command arguments for accurate slowlog recording. +/* This function preserves the original command arguments for accurate commandlog recording. * * It performs the following operations: * - Stores the initial command vector if not already saved @@ -4348,7 +4348,7 @@ static void backupAndUpdateClientArgv(client *c, int new_argc, robj **new_argv) } /* Redact a given argument to prevent it from being shown - * in the slowlog. This information is stored in the + * in the commandlog. This information is stored in the * original_argv array. 
*/ void redactClientCommandArgument(client *c, int argc) { backupAndUpdateClientArgv(c, c->argc, NULL); diff --git a/src/server.c b/src/server.c index ab06a24470..898675ed04 100644 --- a/src/server.c +++ b/src/server.c @@ -31,7 +31,7 @@ #include "monotonic.h" #include "cluster.h" #include "cluster_slot_stats.h" -#include "slowlog.h" +#include "commandlog.h" #include "bio.h" #include "latency.h" #include "mt19937-64.h" @@ -2917,7 +2917,7 @@ void initServer(void) { if (functionsInit() == C_ERR) { serverPanic("Functions initialization failed, check the server logs."); } - slowlogInit(); + commandlogInit(); latencyMonitorInit(); initSharedQueryBuf(); @@ -3487,25 +3487,6 @@ void preventCommandReplication(client *c) { c->flag.prevent_repl_prop = 1; } -/* Log the last command a client executed into the slowlog. */ -void slowlogPushCurrentCommand(client *c, struct serverCommand *cmd, ustime_t duration) { - /* Some commands may contain sensitive data that should not be available in the slowlog. */ - if (cmd->flags & CMD_SKIP_SLOWLOG) return; - - /* If command argument vector was rewritten, use the original - * arguments. */ - robj **argv = c->original_argv ? c->original_argv : c->argv; - int argc = c->original_argv ? c->original_argc : c->argc; - - /* If a script is currently running, the client passed in is a - * fake client. Or the client passed in is the original client - * if this is a EVAL or alike, doesn't matter. In this case, - * use the original client to get the client information. */ - c = scriptIsRunning() ? scriptGetCaller() : c; - - slowlogPushEntryIfNeeded(c, argv, argc, duration); -} - /* This function is called in order to update the total command histogram duration. * The latency unit is nano-seconds. * If needed it will allocate the histogram memory and trim the duration to the upper/lower tracking limits*/ @@ -3659,7 +3640,7 @@ void call(client *c, int flags) { server.executing_client = c; /* When call() is issued during loading the AOF we don't want commands called - * from module, exec or LUA to go into the slowlog or to populate statistics. */ + * from module, exec or LUA to go into the commandlog or to populate statistics. */ int update_command_stats = !isAOFLoadingContext(); /* We want to be aware of a client which is making a first time attempt to execute this command @@ -3756,9 +3737,9 @@ void call(client *c, int flags) { if (server.execution_nesting == 0) durationAddSample(EL_DURATION_TYPE_CMD, duration); } - /* Log the command into the Slow log if needed. - * If the client is blocked we will handle slowlog when it is unblocked. */ - if (update_command_stats && !c->flag.blocked) slowlogPushCurrentCommand(c, real_cmd, c->duration); + /* Log the command into the commandlog if needed. + * If the client is blocked we will handle commandlog when it is unblocked. */ + if (update_command_stats && !c->flag.blocked) commandlogPushCurrentCommand(c, real_cmd); /* Send the command to clients in MONITOR mode if applicable, * since some administrative commands are considered too dangerous to be shown. 
@@ -4725,7 +4706,7 @@ void addReplyFlagsForCommand(client *c, struct serverCommand *cmd) { {CMD_LOADING, "loading"}, {CMD_STALE, "stale"}, {CMD_SKIP_MONITOR, "skip_monitor"}, - {CMD_SKIP_SLOWLOG, "skip_slowlog"}, + {CMD_SKIP_COMMANDLOG, "skip_commandlog"}, {CMD_ASKING, "asking"}, {CMD_FAST, "fast"}, {CMD_NO_AUTH, "no_auth"}, diff --git a/src/server.h b/src/server.h index 8f4fb6a6b4..f5a4d4951b 100644 --- a/src/server.h +++ b/src/server.h @@ -230,7 +230,7 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; #define CMD_LOADING (1ULL << 9) #define CMD_STALE (1ULL << 10) #define CMD_SKIP_MONITOR (1ULL << 11) -#define CMD_SKIP_SLOWLOG (1ULL << 12) +#define CMD_SKIP_COMMANDLOG (1ULL << 12) #define CMD_ASKING (1ULL << 13) #define CMD_FAST (1ULL << 14) #define CMD_NO_AUTH (1ULL << 15) @@ -382,6 +382,22 @@ typedef enum blocking_type { buffer configuration. Just the first \ three: normal, replica, pubsub. */ +/* Type of commandlog */ +typedef enum { + COMMANDLOG_TYPE_SLOW = 0, + COMMANDLOG_TYPE_LARGE_REQUEST, + COMMANDLOG_TYPE_LARGE_REPLY, + COMMANDLOG_TYPE_NUM +} commandlog_type; + +/* Configuration and entry list of different types of command logs */ +typedef struct commandlog { + list *entries; + long long entry_id; + long long threshold; + unsigned long max_len; +} commandlog; + /* Replica replication state. Used in server.repl_state for replicas to remember * what to do next. */ typedef enum { @@ -1688,10 +1704,7 @@ struct valkeyServer { long long stat_sync_full; /* Number of full resyncs with replicas. */ long long stat_sync_partial_ok; /* Number of accepted PSYNC requests. */ long long stat_sync_partial_err; /* Number of unaccepted PSYNC requests. */ - list *slowlog; /* SLOWLOG list of commands */ - long long slowlog_entry_id; /* SLOWLOG current entry ID */ - long long slowlog_log_slower_than; /* SLOWLOG time limit (to get logged) */ - unsigned long slowlog_max_len; /* SLOWLOG max number of items logged */ + commandlog commandlog[COMMANDLOG_TYPE_NUM]; /* Logs of commands. */ struct malloc_stats cron_malloc_stats; /* sampled in serverCron(). */ long long stat_net_input_bytes; /* Bytes read from network. */ long long stat_net_output_bytes; /* Bytes written to network. */ @@ -2339,7 +2352,7 @@ typedef int serverGetKeysProc(struct serverCommand *cmd, robj **argv, int argc, * * CMD_SKIP_MONITOR: Do not automatically propagate the command on MONITOR. * - * CMD_SKIP_SLOWLOG: Do not automatically propagate the command to the slowlog. + * CMD_SKIP_COMMANDLOG: Do not automatically propagate the command to the commandlog. 
* * CMD_ASKING: Perform an implicit ASKING for this command, so the * command will be accepted in cluster mode if the slot is marked @@ -3151,7 +3164,7 @@ void forceCommandPropagation(client *c, int flags); void preventCommandPropagation(client *c); void preventCommandAOF(client *c); void preventCommandReplication(client *c); -void slowlogPushCurrentCommand(client *c, struct serverCommand *cmd, ustime_t duration); +void commandlogPushCurrentCommand(client *c, struct serverCommand *cmd); void updateCommandLatencyHistogram(struct hdr_histogram **latency_histogram, int64_t duration_hist); int prepareForShutdown(client *c, int flags); void replyToClientsBlockedOnShutdown(void); @@ -3632,6 +3645,7 @@ void bgsaveCommand(client *c); void bgrewriteaofCommand(client *c); void shutdownCommand(client *c); void slowlogCommand(client *c); +void commandlogCommand(client *c); void moveCommand(client *c); void copyCommand(client *c); void renameCommand(client *c); diff --git a/src/slowlog.c b/src/slowlog.c deleted file mode 100644 index b98a2dda43..0000000000 --- a/src/slowlog.c +++ /dev/null @@ -1,193 +0,0 @@ -/* Slowlog implements a system that is able to remember the latest N - * queries that took more than M microseconds to execute. - * - * The execution time to reach to be logged in the slow log is set - * using the 'slowlog-log-slower-than' config directive, that is also - * readable and writable using the CONFIG SET/GET command. - * - * The slow queries log is actually not "logged" in the server log file - * but is accessible thanks to the SLOWLOG command. - * - * ---------------------------------------------------------------------------- - * - * Copyright (c) 2009-2012, Redis Ltd. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "slowlog.h" - -/* Create a new slowlog entry. - * Incrementing the ref count of all the objects retained is up to - * this function. 
*/ -slowlogEntry *slowlogCreateEntry(client *c, robj **argv, int argc, long long duration) { - slowlogEntry *se = zmalloc(sizeof(*se)); - int j, slargc = argc; - - if (slargc > SLOWLOG_ENTRY_MAX_ARGC) slargc = SLOWLOG_ENTRY_MAX_ARGC; - se->argc = slargc; - se->argv = zmalloc(sizeof(robj *) * slargc); - for (j = 0; j < slargc; j++) { - /* Logging too many arguments is a useless memory waste, so we stop - * at SLOWLOG_ENTRY_MAX_ARGC, but use the last argument to specify - * how many remaining arguments there were in the original command. */ - if (slargc != argc && j == slargc - 1) { - se->argv[j] = - createObject(OBJ_STRING, sdscatprintf(sdsempty(), "... (%d more arguments)", argc - slargc + 1)); - } else { - /* Trim too long strings as well... */ - if (argv[j]->type == OBJ_STRING && sdsEncodedObject(argv[j]) && - sdslen(argv[j]->ptr) > SLOWLOG_ENTRY_MAX_STRING) { - sds s = sdsnewlen(argv[j]->ptr, SLOWLOG_ENTRY_MAX_STRING); - - s = sdscatprintf(s, "... (%lu more bytes)", - (unsigned long)sdslen(argv[j]->ptr) - SLOWLOG_ENTRY_MAX_STRING); - se->argv[j] = createObject(OBJ_STRING, s); - } else if (argv[j]->refcount == OBJ_SHARED_REFCOUNT) { - se->argv[j] = argv[j]; - } else { - /* Here we need to duplicate the string objects composing the - * argument vector of the command, because those may otherwise - * end shared with string objects stored into keys. Having - * shared objects between any part of the server, and the data - * structure holding the data, is a problem: FLUSHALL ASYNC - * may release the shared string object and create a race. */ - se->argv[j] = dupStringObject(argv[j]); - } - } - } - se->time = time(NULL); - se->duration = duration; - se->id = server.slowlog_entry_id++; - se->peerid = sdsnew(getClientPeerId(c)); - se->cname = c->name ? sdsnew(c->name->ptr) : sdsempty(); - return se; -} - -/* Free a slow log entry. The argument is void so that the prototype of this - * function matches the one of the 'free' method of adlist.c. - * - * This function will take care to release all the retained object. */ -void slowlogFreeEntry(void *septr) { - slowlogEntry *se = septr; - int j; - - for (j = 0; j < se->argc; j++) decrRefCount(se->argv[j]); - zfree(se->argv); - sdsfree(se->peerid); - sdsfree(se->cname); - zfree(se); -} - -/* Initialize the slow log. This function should be called a single time - * at server startup. */ -void slowlogInit(void) { - server.slowlog = listCreate(); - server.slowlog_entry_id = 0; - listSetFreeMethod(server.slowlog, slowlogFreeEntry); -} - -/* Push a new entry into the slow log. - * This function will make sure to trim the slow log accordingly to the - * configured max length. */ -void slowlogPushEntryIfNeeded(client *c, robj **argv, int argc, long long duration) { - if (server.slowlog_log_slower_than < 0 || server.slowlog_max_len == 0) return; /* Slowlog disabled */ - if (duration >= server.slowlog_log_slower_than) - listAddNodeHead(server.slowlog, slowlogCreateEntry(c, argv, argc, duration)); - - /* Remove old entries if needed. */ - while (listLength(server.slowlog) > server.slowlog_max_len) listDelNode(server.slowlog, listLast(server.slowlog)); -} - -/* Remove all the entries from the current slow log. */ -void slowlogReset(void) { - while (listLength(server.slowlog) > 0) listDelNode(server.slowlog, listLast(server.slowlog)); -} - -/* The SLOWLOG command. Implements all the subcommands needed to handle the - * slow log. 
 */
-void slowlogCommand(client *c) {
-    if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr, "help")) {
-        const char *help[] = {
-            "GET [<count>]",
-            "    Return top <count> entries from the slowlog (default: 10, -1 mean all).",
-            "    Entries are made of:",
-            "    id, timestamp, time in microseconds, arguments array, client IP and port,",
-            "    client name",
-            "LEN",
-            "    Return the length of the slowlog.",
-            "RESET",
-            "    Reset the slowlog.",
-            NULL,
-        };
-        addReplyHelp(c, help);
-    } else if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr, "reset")) {
-        slowlogReset();
-        addReply(c, shared.ok);
-    } else if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr, "len")) {
-        addReplyLongLong(c, listLength(server.slowlog));
-    } else if ((c->argc == 2 || c->argc == 3) && !strcasecmp(c->argv[1]->ptr, "get")) {
-        long count = 10;
-        listIter li;
-        listNode *ln;
-        slowlogEntry *se;
-
-        if (c->argc == 3) {
-            /* Consume count arg. */
-            if (getRangeLongFromObjectOrReply(c, c->argv[2], -1, LONG_MAX, &count,
-                                              "count should be greater than or equal to -1") != C_OK)
-                return;
-
-            if (count == -1) {
-                /* We treat -1 as a special value, which means to get all slow logs.
-                 * Simply set count to the length of server.slowlog.*/
-                count = listLength(server.slowlog);
-            }
-        }
-
-        if (count > (long)listLength(server.slowlog)) {
-            count = listLength(server.slowlog);
-        }
-        addReplyArrayLen(c, count);
-        listRewind(server.slowlog, &li);
-        while (count--) {
-            int j;
-
-            ln = listNext(&li);
-            se = ln->value;
-            addReplyArrayLen(c, 6);
-            addReplyLongLong(c, se->id);
-            addReplyLongLong(c, se->time);
-            addReplyLongLong(c, se->duration);
-            addReplyArrayLen(c, se->argc);
-            for (j = 0; j < se->argc; j++) addReplyBulk(c, se->argv[j]);
-            addReplyBulkCBuffer(c, se->peerid, sdslen(se->peerid));
-            addReplyBulkCBuffer(c, se->cname, sdslen(se->cname));
-        }
-    } else {
-        addReplySubcommandSyntaxError(c);
-    }
-}
diff --git a/tests/unit/commandlog.tcl b/tests/unit/commandlog.tcl
new file mode 100644
index 0000000000..39038d7950
--- /dev/null
+++ b/tests/unit/commandlog.tcl
@@ -0,0 +1,361 @@
+start_server {tags {"commandlog"} overrides {commandlog-execution-slower-than 1000000 commandlog-request-larger-than 1048576 commandlog-reply-larger-than 1048576}} {
+    test {COMMANDLOG - check that it starts with an empty log} {
+        if {$::external} {
+            r commandlog reset slow
+            r commandlog reset large-request
+            r commandlog reset large-reply
+        }
+        assert_equal [r commandlog len slow] 0
+        assert_equal [r commandlog len large-request] 0
+        assert_equal [r commandlog len large-reply] 0
+    }
+
+    test {COMMANDLOG - only logs commands exceeding the threshold} {
+        # for slow
+        r config set commandlog-execution-slower-than 100000
+        r ping
+        assert_equal [r commandlog len slow] 0
+        r debug sleep 0.2
+        assert_equal [r commandlog len slow] 1
+
+        # for large-request
+        r config set commandlog-request-larger-than 1024
+        r ping
+        assert_equal [r commandlog len large-request] 0
+        set value [string repeat A 1024]
+        r set testkey $value
+        assert_equal [r commandlog len large-request] 1
+
+        # for large-reply
+        r config set commandlog-reply-larger-than 1024
+        r ping
+        assert_equal [r commandlog len large-reply] 0
+        r get testkey
+        assert_equal [r commandlog len large-reply] 1
+    } {} {needs:debug}
+
+    test {COMMANDLOG - zero max length is correctly handled} {
+        r commandlog reset slow
+        r commandlog reset large-request
+        r commandlog reset large-reply
+        r config set commandlog-slow-execution-max-len 0
+        r config set commandlog-execution-slower-than 0
+        r config set commandlog-large-request-max-len 0
+        r config set
commandlog-request-larger-than 0 + r config set commandlog-large-reply-max-len 0 + r config set commandlog-reply-larger-than 0 + for {set i 0} {$i < 100} {incr i} { + r ping + } + assert_equal [r commandlog len slow] 0 + assert_equal [r commandlog len large-request] 0 + assert_equal [r commandlog len large-reply] 0 + } + + test {COMMANDLOG - max entries is correctly handled} { + r config set commandlog-execution-slower-than 0 + r config set commandlog-slow-execution-max-len 10 + r config set commandlog-large-request-max-len 10 + r config set commandlog-request-larger-than 0 + r config set commandlog-large-reply-max-len 10 + r config set commandlog-reply-larger-than 0 + for {set i 0} {$i < 100} {incr i} { + r ping + } + assert_equal [r commandlog len slow] 10 + assert_equal [r commandlog len large-request] 10 + assert_equal [r commandlog len large-reply] 10 + } + + test {COMMANDLOG - GET optional argument to limit output len works} { + assert_equal 5 [llength [r commandlog get 5 slow]] + assert_equal 10 [llength [r commandlog get -1 slow]] + assert_equal 10 [llength [r commandlog get 20 slow]] + + assert_equal 5 [llength [r commandlog get 5 large-request]] + assert_equal 10 [llength [r commandlog get -1 large-request]] + assert_equal 10 [llength [r commandlog get 20 large-request]] + + assert_equal 5 [llength [r commandlog get 5 large-reply]] + assert_equal 10 [llength [r commandlog get -1 large-reply]] + assert_equal 10 [llength [r commandlog get 20 large-reply]] + } + + test {COMMANDLOG - RESET subcommand works} { + r config set commandlog-execution-slower-than 100000 + r config set commandlog-request-larger-than 1024 + r config set commandlog-reply-larger-than 1024 + r commandlog reset slow + r commandlog reset large-request + r commandlog reset large-reply + assert_equal [r commandlog len slow] 0 + assert_equal [r commandlog len large-request] 0 + assert_equal [r commandlog len large-reply] 0 + } + + test {COMMANDLOG - logged entry sanity check} { + r client setname foobar + + # for slow + r debug sleep 0.2 + set e [lindex [r commandlog get -1 slow] 0] + assert_equal [llength $e] 6 + if {!$::external} { + assert_equal [lindex $e 0] 118 + } + assert_equal [expr {[lindex $e 2] > 100000}] 1 + assert_equal [lindex $e 3] {debug sleep 0.2} + assert_equal {foobar} [lindex $e 5] + + # for large-request + set value [string repeat A 1024] + r set testkey $value + set e [lindex [r commandlog get -1 large-request] 0] + assert_equal [llength $e] 6 + if {!$::external} { + assert_equal [lindex $e 0] 118 + } + assert_equal [expr {[lindex $e 2] > 1024}] 1 + assert_equal [lindex $e 3] {set testkey {AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA... 
(896 more bytes)}} + assert_equal {foobar} [lindex $e 5] + + # for large-reply + r get testkey + set e [lindex [r commandlog get -1 large-reply] 0] + assert_equal [llength $e] 6 + if {!$::external} { + assert_equal [lindex $e 0] 117 + } + assert_equal [expr {[lindex $e 2] > 1024}] 1 + assert_equal [lindex $e 3] {get testkey} + assert_equal {foobar} [lindex $e 5] + } {} {needs:debug} + + test {COMMANDLOG slow - Certain commands are omitted that contain sensitive information} { + r config set commandlog-slow-execution-max-len 100 + r config set commandlog-execution-slower-than 0 + r commandlog reset slow + catch {r acl setuser "slowlog test user" +get +set} _ + r config set primaryuser "" + r config set primaryauth "" + r config set requirepass "" + r config set tls-key-file-pass "" + r config set tls-client-key-file-pass "" + r acl setuser slowlog-test-user +get +set + r acl getuser slowlog-test-user + r acl deluser slowlog-test-user non-existing-user + r config set commandlog-execution-slower-than 0 + r config set commandlog-execution-slower-than -1 + set slowlog_resp [r commandlog get -1 slow] + + # Make sure normal configs work, but the two sensitive + # commands are omitted or redacted + assert_equal 11 [llength $slowlog_resp] + assert_equal {commandlog reset slow} [lindex [lindex $slowlog_resp 10] 3] + assert_equal {acl setuser (redacted) (redacted) (redacted)} [lindex [lindex $slowlog_resp 9] 3] + assert_equal {config set primaryuser (redacted)} [lindex [lindex $slowlog_resp 8] 3] + assert_equal {config set primaryauth (redacted)} [lindex [lindex $slowlog_resp 7] 3] + assert_equal {config set requirepass (redacted)} [lindex [lindex $slowlog_resp 6] 3] + assert_equal {config set tls-key-file-pass (redacted)} [lindex [lindex $slowlog_resp 5] 3] + assert_equal {config set tls-client-key-file-pass (redacted)} [lindex [lindex $slowlog_resp 4] 3] + assert_equal {acl setuser (redacted) (redacted) (redacted)} [lindex [lindex $slowlog_resp 3] 3] + assert_equal {acl getuser (redacted)} [lindex [lindex $slowlog_resp 2] 3] + assert_equal {acl deluser (redacted) (redacted)} [lindex [lindex $slowlog_resp 1] 3] + assert_equal {config set commandlog-execution-slower-than 0} [lindex [lindex $slowlog_resp 0] 3] + } {} {needs:repl} + + test {COMMANDLOG slow - Some commands can redact sensitive fields} { + r config set commandlog-execution-slower-than 0 + r commandlog reset slow + r migrate [srv 0 host] [srv 0 port] key 9 5000 + r migrate [srv 0 host] [srv 0 port] key 9 5000 AUTH user + r migrate [srv 0 host] [srv 0 port] key 9 5000 AUTH2 user password + r config set commandlog-execution-slower-than -1 + set slowlog_resp [r commandlog get -1 slow] + + # Make sure all 3 commands were logged, but the sensitive fields are omitted + assert_equal 4 [llength $slowlog_resp] + assert_match {* key 9 5000} [lindex [lindex $slowlog_resp 2] 3] + assert_match {* key 9 5000 AUTH (redacted)} [lindex [lindex $slowlog_resp 1] 3] + assert_match {* key 9 5000 AUTH2 (redacted) (redacted)} [lindex [lindex $slowlog_resp 0] 3] + } {} {needs:repl} + + test {COMMANDLOG slow - Rewritten commands are logged as their original command} { + r config set commandlog-execution-slower-than 0 + + # Test rewriting client arguments + r sadd set a b c d e + r commandlog reset slow + + # SPOP is rewritten as DEL when all keys are removed + r spop set 10 + assert_equal {spop set 10} [lindex [lindex [r commandlog get -1 slow] 0] 3] + + # Test replacing client arguments + r commandlog reset slow + + # GEOADD is replicated as ZADD + r geoadd 
cool-cities -122.33207 47.60621 Seattle + assert_equal {geoadd cool-cities -122.33207 47.60621 Seattle} [lindex [lindex [r commandlog get -1 slow] 0] 3] + + # Test replacing a single command argument + r set A 5 + r commandlog reset slow + + # GETSET is replicated as SET + r getset a 5 + assert_equal {getset a 5} [lindex [lindex [r commandlog get -1 slow] 0] 3] + + # INCRBYFLOAT calls rewrite multiple times, so it's a special case + r set A 0 + r commandlog reset slow + + # INCRBYFLOAT is replicated as SET + r INCRBYFLOAT A 1.0 + assert_equal {INCRBYFLOAT A 1.0} [lindex [lindex [r commandlog get -1 slow] 0] 3] + + # blocked BLPOP is replicated as LPOP + set rd [valkey_deferring_client] + $rd blpop l 0 + wait_for_blocked_clients_count 1 50 100 + r multi + r lpush l foo + r commandlog reset slow + r exec + $rd read + $rd close + assert_equal {blpop l 0} [lindex [lindex [r commandlog get -1 slow] 0] 3] + } + + test {COMMANDLOG slow - commands with too many arguments are trimmed} { + r config set commandlog-execution-slower-than 0 + r commandlog reset slow + r sadd set 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 + set e [lindex [r commandlog get -1 slow] end-1] + lindex $e 3 + } {sadd set 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 {... (2 more arguments)}} + + test {COMMANDLOG slow - too long arguments are trimmed} { + r config set commandlog-execution-slower-than 0 + r commandlog reset slow + set arg [string repeat A 129] + r sadd set foo $arg + set e [lindex [r commandlog get -1 slow] end-1] + lindex $e 3 + } {sadd set foo {AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA... (1 more bytes)}} + + test {COMMANDLOG slow - EXEC is not logged, just executed commands} { + r config set commandlog-execution-slower-than 100000 + r commandlog reset slow + assert_equal [r commandlog len slow] 0 + r multi + r debug sleep 0.2 + r exec + assert_equal [r commandlog len slow] 1 + set e [lindex [r commandlog get -1 slow] 0] + assert_equal [lindex $e 3] {debug sleep 0.2} + } {} {needs:debug} + + test {COMMANDLOG slow - can clean older entries} { + r client setname lastentry_client + r config set commandlog-slow-execution-max-len 1 + r debug sleep 0.2 + assert {[llength [r commandlog get -1 slow]] == 1} + set e [lindex [r commandlog get -1 slow] 0] + assert_equal {lastentry_client} [lindex $e 5] + } {} {needs:debug} + + test {COMMANDLOG slow - can be disabled} { + r config set commandlog-slow-execution-max-len 1 + r config set commandlog-execution-slower-than 1 + r commandlog reset slow + r debug sleep 0.2 + assert_equal [r commandlog len slow] 1 + r config set commandlog-execution-slower-than -1 + r commandlog reset slow + r debug sleep 0.2 + assert_equal [r commandlog len slow] 0 + } {} {needs:debug} + + test {COMMANDLOG slow - count must be >= -1} { + assert_error "ERR count should be greater than or equal to -1" {r commandlog get -2 slow} + assert_error "ERR count should be greater than or equal to -1" {r commandlog get -222 slow} + } + + test {COMMANDLOG slow - get all slow logs} { + r config set commandlog-execution-slower-than 0 + r config set commandlog-slow-execution-max-len 3 + r commandlog reset slow + + r set key test + r sadd set a b c + r incr num + r lpush list a + + assert_equal [r commandlog len slow] 3 + assert_equal 0 [llength [r commandlog get 0 slow]] + assert_equal 1 [llength [r commandlog get 1 slow]] + assert_equal 3 [llength [r 
commandlog get -1 slow]] + assert_equal 3 [llength [r commandlog get 3 slow]] + } + + test {COMMANDLOG slow - blocking command is reported only after unblocked} { + # Cleanup first + r del mylist + # create a test client + set rd [valkey_deferring_client] + + # config the slowlog and reset + r config set commandlog-execution-slower-than 0 + r config set commandlog-slow-execution-max-len 110 + r commandlog reset slow + + $rd BLPOP mylist 0 + wait_for_blocked_clients_count 1 50 20 + assert_equal 0 [llength [regexp -all -inline (?=BLPOP) [r commandlog get -1 slow]]] + + r LPUSH mylist 1 + wait_for_blocked_clients_count 0 50 20 + assert_equal 1 [llength [regexp -all -inline (?=BLPOP) [r commandlog get -1 slow]]] + + $rd close + } + + foreach is_eval {0 1} { + test "COMMANDLOG slow - the commands in script are recorded normally - is_eval: $is_eval" { + if {$is_eval == 0} { + r function load replace "#!lua name=mylib \n redis.register_function('myfunc', function(KEYS, ARGS) server.call('ping') end)" + } + + r client setname test-client + r config set commandlog-execution-slower-than 0 + r commandlog reset slow + + if {$is_eval} { + r eval "server.call('ping')" 0 + } else { + r fcall myfunc 0 + } + set slowlog_resp [r commandlog get 2 slow] + assert_equal 2 [llength $slowlog_resp] + + # The first one is the script command, and the second one is the ping command executed in the script + # Each slowlog contains: id, timestamp, execution time, command array, ip:port, client name + set script_cmd [lindex $slowlog_resp 0] + set ping_cmd [lindex $slowlog_resp 1] + + # Make sure the command are logged. + if {$is_eval} { + assert_equal {eval server.call('ping') 0} [lindex $script_cmd 3] + } else { + assert_equal {fcall myfunc 0} [lindex $script_cmd 3] + } + assert_equal {ping} [lindex $ping_cmd 3] + + # Make sure the client info are the logged. + assert_equal [lindex $script_cmd 4] [lindex $ping_cmd 4] + assert_equal {test-client} [lindex $script_cmd 5] + assert_equal {test-client} [lindex $ping_cmd 5] + } + } +} diff --git a/valkey.conf b/valkey.conf index e23aea39de..1838357058 100644 --- a/valkey.conf +++ b/valkey.conf @@ -1969,29 +1969,70 @@ aof-timestamp-enabled no # cluster-announce-port 0 # cluster-announce-bus-port 6380 -################################## SLOW LOG ################################### - -# The server Slow Log is a system to log queries that exceeded a specified -# execution time. The execution time does not include the I/O operations -# like talking with the client, sending the reply and so forth, -# but just the time needed to actually execute the command (this is the only -# stage of command execution where the thread is blocked and can not serve -# other requests in the meantime). -# -# You can configure the slow log with two parameters: one tells the server -# what is the execution time, in microseconds, to exceed in order for the -# command to get logged, and the other parameter is the length of the -# slow log. When a new command is logged the oldest one is removed from the -# queue of logged commands. - -# The following time is expressed in microseconds, so 1000000 is equivalent -# to one second. Note that a negative number disables the slow log, while -# a value of zero forces the logging of every command. 
-slowlog-log-slower-than 10000
-
+################################## COMMAND LOG ###################################
+
+# The Command Log system is used to record commands that consume significant resources
+# during server operation, including CPU, memory, and network bandwidth.
+# These commands and the data they access may lead to abnormal instance behavior;
+# the command log helps users quickly and intuitively locate such issues.
+#
+# Currently, three types of command logs are supported:
+#
+# SLOW: Logs commands that exceed a specified execution time. This excludes time spent
+# on I/O operations like client communication and focuses solely on the command's
+# processing time, where the main thread is blocked.
+#
+# LARGE-REQUEST: Logs commands with requests exceeding a defined size. This helps
+# identify potentially problematic commands that send excessive data to the server.
+#
+# LARGE-REPLY: Logs commands that generate replies exceeding a defined size. This
+# helps identify commands that return unusually large amounts of data, which may
+# impact network performance or client processing.
+#
+# Each log type has two key parameters:
+# 1. A threshold value that determines when a command is logged. This threshold is specific
+# to the type of log (e.g., execution time, request size, or reply size). A negative value disables
+# logging. A value of 0 logs all commands.
+# 2. A maximum length that specifies the number of entries to retain in the log. Increasing
+# the length allows more entries to be stored but consumes additional memory. To clear all
+# entries for a specific log type and reclaim memory, use the `COMMANDLOG RESET`
+# subcommand followed by the log type.
+#
+# SLOW Command Logs
+# The SLOW log records commands that exceed a specified execution time. The execution time
+# does not include I/O operations, such as client communication or sending responses.
+# It only measures the time spent executing the command, during which the thread is blocked
+# and cannot handle other requests.
+#
+# The threshold is measured in microseconds.
+#
+# Backward Compatibility: The parameter `slowlog-log-slower-than` is still supported but
+# deprecated in favor of `commandlog-execution-slower-than`.
+commandlog-execution-slower-than 10000
 # There is no limit to this length. Just be aware that it will consume memory.
-# You can reclaim memory used by the slow log with SLOWLOG RESET.
-slowlog-max-len 128
+# You can reclaim memory used by the slow log with SLOWLOG RESET or COMMANDLOG RESET SLOW.
+commandlog-slow-execution-max-len 128
+#
+# LARGE-REQUEST Command Logs
+# The LARGE-REQUEST log tracks commands with requests exceeding a specified size. The request size
+# includes the command itself and all its arguments. For example, in `SET KEY VALUE`, the size is
+# determined by the combined size of the key and value. Commands that consume excessive network
+# bandwidth or query buffer space are recorded here.
+#
+# The threshold is measured in bytes.
+commandlog-request-larger-than 1048576
+# Maximum number of entries to retain in the large-request log.
+commandlog-large-request-max-len 128
+#
+# LARGE-REPLY Command Logs
+# The LARGE-REPLY log records commands that produce replies exceeding a specified size. These replies
+# may consume significant network bandwidth or client output buffer space. Examples include commands
+# like `KEYS` or `HGETALL` that return large datasets. Even a `GET` command may qualify if the value
+# is substantial.
+#
+# The threshold is measured in bytes.
+commandlog-reply-larger-than 1048576 +commandlog-large-reply-max-len 128 ################################ LATENCY MONITOR ############################## From 9071a5c8e627b4a74d810bfc68fcfb70d0db4dcb Mon Sep 17 00:00:00 2001 From: Harkrishn Patro Date: Fri, 24 Jan 2025 10:47:10 -0800 Subject: [PATCH 10/24] Set GH actions job timeout to a day (#1540) Signed-off-by: Harkrishn Patro --- .github/workflows/daily.yml | 52 +++++++++++++++++----------------- .github/workflows/external.yml | 6 ++-- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index c0e7e4b446..309e8353f7 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -44,7 +44,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'ubuntu') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -87,7 +87,7 @@ jobs: (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'fortify') container: ubuntu:plucky - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -132,7 +132,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'malloc') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -171,7 +171,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'malloc') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -210,7 +210,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, '32bit') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -256,7 +256,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'tls') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -302,7 +302,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 
'unstable'))) && !contains(github.event.inputs.skipjobs, 'tls') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -348,7 +348,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'iothreads') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -382,7 +382,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'tls') && !contains(github.event.inputs.skipjobs, 'iothreads') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -420,7 +420,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'specific') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -496,7 +496,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && !contains(github.event.inputs.skipjobs, 'valgrind') && !contains(github.event.inputs.skiptests, 'valkey') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -528,7 +528,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && !contains(github.event.inputs.skipjobs, 'valgrind') && !(contains(github.event.inputs.skiptests, 'modules') && contains(github.event.inputs.skiptests, 'unittest')) - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -565,7 +565,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && !contains(github.event.inputs.skipjobs, 'valgrind') && !contains(github.event.inputs.skiptests, 'valkey') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -597,7 +597,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && !contains(github.event.inputs.skipjobs, 'valgrind') && !(contains(github.event.inputs.skiptests, 'modules') && contains(github.event.inputs.skiptests, 'unittest')) - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -634,7 +634,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && 
!contains(github.event.inputs.skipjobs, 'sanitizer') - timeout-minutes: 14400 + timeout-minutes: 1440 strategy: fail-fast: false matrix: @@ -684,7 +684,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && !contains(github.event.inputs.skipjobs, 'sanitizer') - timeout-minutes: 14400 + timeout-minutes: 1440 strategy: fail-fast: false matrix: @@ -734,7 +734,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && !contains(github.event.inputs.skipjobs, 'sanitizer') - timeout-minutes: 14400 + timeout-minutes: 1440 strategy: fail-fast: false steps: @@ -801,7 +801,7 @@ jobs: runs-on: ubuntu-latest container: ${{ matrix.container }} - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep @@ -867,7 +867,7 @@ jobs: runs-on: ubuntu-latest container: ${{ matrix.container }} - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep @@ -939,7 +939,7 @@ jobs: runs-on: ubuntu-latest container: ${{ matrix.container }} - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep @@ -990,7 +990,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'macos') && !(contains(github.event.inputs.skiptests, 'valkey') && contains(github.event.inputs.skiptests, 'modules')) - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -1021,7 +1021,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'macos') && !contains(github.event.inputs.skiptests, 'sentinel') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -1049,7 +1049,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'macos') && !contains(github.event.inputs.skiptests, 'cluster') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -1081,7 +1081,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'macos') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - uses: maxim-lobanov/setup-xcode@60606e260d2fc5762a71e64e74b2174e8ea3c8bd # v1.6.0 with: @@ -1109,7 +1109,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && 
!contains(github.event.inputs.skipjobs, 'freebsd') - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - name: prep if: github.event_name == 'workflow_dispatch' @@ -1216,7 +1216,7 @@ jobs: reply-schemas-validator: runs-on: ubuntu-latest - timeout-minutes: 14400 + timeout-minutes: 1440 if: | (github.event_name == 'workflow_dispatch' || (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || diff --git a/.github/workflows/external.yml b/.github/workflows/external.yml index eeacf80609..a9777538fd 100644 --- a/.github/workflows/external.yml +++ b/.github/workflows/external.yml @@ -17,7 +17,7 @@ jobs: test-external-standalone: runs-on: ubuntu-latest if: github.event_name != 'schedule' || github.repository == 'valkey-io/valkey' - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Build @@ -42,7 +42,7 @@ jobs: test-external-cluster: runs-on: ubuntu-latest if: github.event_name != 'schedule' || github.repository == 'valkey-io/valkey' - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Build @@ -70,7 +70,7 @@ jobs: test-external-nodebug: runs-on: ubuntu-latest if: github.event_name != 'schedule' || github.repository == 'valkey-io/valkey' - timeout-minutes: 14400 + timeout-minutes: 1440 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Build From 66577573f207313b8998f4e0e81e32740adf5d6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Mon, 27 Jan 2025 04:38:54 +0100 Subject: [PATCH 11/24] Test coverage for COMMANDLOG HELP (#1617) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes reply-schema-validator test job which needs coverage for all commands. Failing job: https://github.com/valkey-io/valkey/actions/runs/12969591890/job/36173810824 Signed-off-by: Viktor Söderqvist --- tests/unit/other.tcl | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/other.tcl b/tests/unit/other.tcl index 15c0d38136..0793283592 100644 --- a/tests/unit/other.tcl +++ b/tests/unit/other.tcl @@ -11,6 +11,7 @@ start_server {tags {"other"}} { assert_match "*MEMORY *" [r MEMORY HELP] assert_match "*PUBSUB *" [r PUBSUB HELP] assert_match "*SLOWLOG *" [r SLOWLOG HELP] + assert_match "*COMMANDLOG *" [r COMMANDLOG HELP] assert_match "*CLIENT *" [r CLIENT HELP] assert_match "*COMMAND *" [r COMMAND HELP] assert_match "*CONFIG *" [r CONFIG HELP] From a18fcdb371db351b25332ddc0269589e4c5d56c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Mon, 27 Jan 2025 10:13:46 +0100 Subject: [PATCH 12/24] Deflake hashtable random fairness test (#1618) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the unit test for hashtable random fairness intermittent failures when running with the `--accurate` flag. https://github.com/valkey-io/valkey/actions/runs/12969591890/job/36173815884#step:10:105 The test case picks a random element out of 400, repeated 1M times, and then checks that 60% of the elements are picked within 3 standard deviations from the number of times they're expected to be picked. In this test run (with `--accurate`), the expected number is 2500 and the standard deviation is 50, which is only 2% of the expected value. This makes the check too strict and makes the test flaky. 
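(For reference, these figures follow from the binomial distribution:
with m = 400 elements and n = 1,000,000 rounds, each element is picked
with probability p = 1/m per round, so the expected count is
n/m = 2500 and the standard deviation is sqrt(n * p * (1 - p)) =
sqrt(1000000 * (1/400) * (399/400)) ≈ 49.9, about 2% of the mean.)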
As an alternative, we allow 80% of the elements to be picked within 10% of the expected number. With this alternative condition, we can also raise the check for the non-edge case from 60% to 80% of the elements to be within 3 standard deviations. (With fewer repetitions, 3 standard deviations is greater than 10% of the expected value, so this new condition only affects the `--accurate` test run.) Additional change: Set a random seed to the hash function in the test suite. Until now, we only seeded the random number generator. Signed-off-by: Viktor Söderqvist --- src/unit/test_hashtable.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/unit/test_hashtable.c b/src/unit/test_hashtable.c index 71a7dde841..4fcd1f37a3 100644 --- a/src/unit/test_hashtable.c +++ b/src/unit/test_hashtable.c @@ -21,6 +21,9 @@ static void randomSeed(void) { getRandomBytes((void *)&seed, sizeof(seed)); init_genrand64(seed); srandom((unsigned)seed); + uint8_t hashseed[16]; + getRandomBytes(hashseed, sizeof(hashseed)); + hashtableSetHashFunctionSeed(hashseed); } /* An entry holding a string key and a string value in one allocation. */ @@ -749,7 +752,7 @@ int test_random_entry(int argc, char **argv, int flags) { /* With large n, the distribution approaches a normal distribution and we * can use p68 = within 1 std dev, p95 = within 2 std dev, p99.7 = within 3 * std dev. */ - long p68 = 0, p95 = 0, p99 = 0, p4dev = 0, p5dev = 0; + long p68 = 0, p95 = 0, p99 = 0, p4dev = 0, p5dev = 0, p10percent = 0; for (size_t j = 0; j < count; j++) { double dev = expected - times_picked[j]; p68 += (dev >= -std_dev && dev <= std_dev); @@ -757,7 +760,9 @@ int test_random_entry(int argc, char **argv, int flags) { p99 += (dev >= -std_dev * 3 && dev <= std_dev * 3); p4dev += (dev >= -std_dev * 4 && dev <= std_dev * 4); p5dev += (dev >= -std_dev * 5 && dev <= std_dev * 5); + p10percent += (dev >= -0.1 * expected && dev <= 0.1 * expected); } + printf("Random entry fairness test\n"); printf(" Pick one of %zu entries, %ld times.\n", count, num_rounds); printf(" Expecting each entry to be picked %.2lf times, std dev %.3lf.\n", expected, std_dev); @@ -766,12 +771,25 @@ int test_random_entry(int argc, char **argv, int flags) { printf(" Within 3 std dev (p99) = %.2lf%%\n", 100 * p99 / m); printf(" Within 4 std dev = %.2lf%%\n", 100 * p4dev / m); printf(" Within 5 std dev = %.2lf%%\n", 100 * p5dev / m); + printf(" Within 10%% dev = %.2lf%%\n", 100 * p10percent / m); /* Conclusion? The number of trials (n) relative to the probabilities (p and * 1 − p) must be sufficiently large (n * p ≥ 5 and n * (1 − p) ≥ 5) to * approximate a binomial distribution with a normal distribution. */ if (n / m >= 5 && n * (1 - 1 / m) >= 5) { - TEST_ASSERT_MESSAGE("Too unfair randomness", 100 * p99 / m >= 60.0); + /* Check that 80% of the elements are picked within 3 std deviations of + * the expected number. This is a low bar, since typically the 99% of + * the elements are within this range. + * + * There is an edge case. When n is very large and m is very small, the + * std dev of a binomial distribution is very small, which becomes too + * strict for our bucket layout and makes the test flaky. For example + * with m = 400 and n = 1M, we get an expected value of 2500 and a std + * dev of 50, which is just 2% of the expected value. We lower the bar + * for this case and accept that 80% of elements are just within 10% of + * the expected value. 
 */
+        TEST_ASSERT_MESSAGE("Too unfair randomness",
+                            100 * p99 / m >= 80.0 || 100 * p10percent / m >= 80.0);
     } else {
         printf("To uncertain numbers to draw any conclusions about fairness.\n");
     }

From 88a68303c034fcb61a04969f6ac6eeb3fdeca1ee Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Mon, 27 Jan 2025 06:44:48 -0800
Subject: [PATCH 13/24] Make sure to disable pause after fork for dual channel
 test (#1612)

Might close https://github.com/valkey-io/valkey/issues/1484.

I noticed that we don't disable pause after fork on the last test that
was getting executed, so it might get stuck in pause loops after the
test ends if it tries another psync for any reason.

---------

Signed-off-by: Madelyn Olson
---
 tests/integration/dual-channel-replication.tcl | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl
index dc0de5a734..b49b614cd1 100644
--- a/tests/integration/dual-channel-replication.tcl
+++ b/tests/integration/dual-channel-replication.tcl
@@ -848,7 +848,6 @@ start_server {tags {"dual-channel-replication external:skip"}} {
     $replica config set repl-timeout 60
     $primary config set repl-backlog-size 1mb

-    $replica debug pause-after-fork 1
     $primary debug populate 1000 primary 100000
     # Set primary with a slow rdb generation, so that we can easily intercept loading
     # 10ms per key, with 1000 keys is 10 seconds
@@ -856,6 +855,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {

     test "Test dual-channel-replication primary gets cob overrun during replica rdb load" {
         set cur_client_closed_count [s -1 client_output_buffer_limit_disconnections]
+        $replica debug pause-after-fork 1
         $replica replicaof $primary_host $primary_port
         wait_for_condition 500 1000 {
            [s -1 client_output_buffer_limit_disconnections] > $cur_client_closed_count
@@ -868,7 +868,12 @@ start_server {tags {"dual-channel-replication external:skip"}} {
        } else {
            fail "Primary did not free repl buf block after sync failure"
        }
+
+        # Increase the delay to make sure the replica doesn't start another sync
+        # after it resumes after the first one.
+        $primary config set repl-diskless-sync-delay 100
        wait_and_resume_process 0
+        $replica debug pause-after-fork 0
        set res [wait_for_log_messages -1 {"*Unable to partial resync with replica * for lack of backlog*"} $loglines 20000 1]
        set loglines [lindex $res 0]
    }

From 7699a3a94a5d3b2d5f2713893192ba87042fa030 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?=
Date: Mon, 27 Jan 2025 15:45:09 +0100
Subject: [PATCH 14/24] Fix use-after-free in hashtableTwoPhasePopDelete
 (#1626)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use-after-free has been detected by the address sanitizer, such as in
this test run:
https://github.com/valkey-io/valkey/actions/runs/12981530413/job/36200075972?pr=1620#step:5:1339

`hashtableShrinkIfNeeded` may free one of the hash tables and invalidate
the variables used by the `fillBucketHole(ht, b, pos_in_bucket,
table_index)` just after, causing use-after-free.

Filling the bucket hole first and shrinking afterwards is assumed to
solve the issue. (Not reproduced locally.)
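To illustrate the ordering hazard, a minimal sketch (a simplified
fragment, not the exact code; 'b', 'pos_in_bucket' and 'table_index'
are the variables from the diff below):

    /* 'b' points into the bucket array of table 'table_index'. */
    ht->used[table_index]--;
    hashtableShrinkIfNeeded(ht);  /* May rehash and free the table
                                   * that 'b' points into. */
    fillBucketHole(ht, b, pos_in_bucket, table_index); /* 'b' can
                                                        * dangle here. */

With the shrink moved after fillBucketHole(), the hole is compacted
while 'b' is still guaranteed to be valid.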
Signed-off-by: Viktor Söderqvist
---
 src/hashtable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hashtable.c b/src/hashtable.c
index 23097eb246..6b503b637c 100644
--- a/src/hashtable.c
+++ b/src/hashtable.c
@@ -1518,7 +1518,6 @@ void hashtableTwoPhasePopDelete(hashtable *ht, hashtablePosition *pos) {
     assert(isPositionFilled(b, pos_in_bucket));
     b->presence &= ~(1 << pos_in_bucket);
     ht->used[table_index]--;
-    hashtableShrinkIfNeeded(ht);
     hashtableResumeRehashing(ht);
     if (b->chained && !hashtableIsRehashingPaused(ht)) {
         /* Rehashing paused also means bucket chain compaction paused. It is
@@ -1527,6 +1526,7 @@ void hashtableTwoPhasePopDelete(hashtable *ht, hashtablePosition *pos) {
          * we do the compaction in the scan and iterator code instead. */
         fillBucketHole(ht, b, pos_in_bucket, table_index);
     }
+    hashtableShrinkIfNeeded(ht);
 }

 /* Initializes the state for an incremental find operation.

From e9b8970e72754461d92b8bf22055a58540878d59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?=
Date: Mon, 27 Jan 2025 18:44:24 +0100
Subject: [PATCH 15/24] Relaxed RDB version check (#1604)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New config `rdb-version-check` with values:

* `strict`: Reject future RDB versions.
* `relaxed`: Try parsing future RDB versions and fail only when an
  unknown RDB opcode or type is encountered.

This can make it possible for Valkey 8.1 to try to read a dump from,
for example, Valkey 9.0 or later on a best-effort basis. The conditions
for when this is expected to work can be defined when the future Valkey
versions are released. Loading is expected to fail in the following
cases:

* If the data set contains any new key types or other data elements not
  supported by the current version.
* If the RDB contains new representations or encodings of existing key
  types or other data elements.

This change also prepares for the next RDB version bump. A range of RDB
versions (12-79) is reserved, since it's expected to be used by foreign
software RDB versions, so Valkey will not accept versions in this range
even with the `relaxed` version check. The DUMP/RESTORE format has no
magic string; only the RDB version number.

This change also prepares for the magic string to change from REDIS to
VALKEY next time we bump the RDB version.

Related to #1108.
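As a usage sketch (the config is registered as a modifiable enum in the
diff below, so both forms should work; `strict` remains the default):

    valkey-server --rdb-version-check relaxed

    127.0.0.1:6379> CONFIG SET rdb-version-check relaxed
    OK
    127.0.0.1:6379> CONFIG GET rdb-version-check
    1) "rdb-version-check"
    2) "relaxed"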
---------

Signed-off-by: Viktor Söderqvist
Co-authored-by: Madelyn Olson
---
 src/cluster.c                     |  5 ++-
 src/config.c                      |  5 +++
 src/rdb.c                         | 15 ++++++--
 src/rdb.h                         | 20 ++++++++++-
 src/server.h                      |  4 +++
 tests/assets/encodings-rdb987.rdb | Bin 0 -> 675 bytes
 tests/integration/rdb.tcl         | 55 +++++++++++++++++++++---------
 tests/unit/dump.tcl               | 18 ++++++++++
 valkey.conf                       |  9 +++++
 9 files changed, 110 insertions(+), 21 deletions(-)
 create mode 100644 tests/assets/encodings-rdb987.rdb

diff --git a/src/cluster.c b/src/cluster.c
index cedcd9ecb1..caa1e5798b 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -162,7 +162,10 @@ int verifyDumpPayload(unsigned char *p, size_t len, uint16_t *rdbver_ptr) {
     if (rdbver_ptr) {
         *rdbver_ptr = rdbver;
     }
-    if (rdbver > RDB_VERSION) return C_ERR;
+    if ((rdbver >= RDB_FOREIGN_VERSION_MIN && rdbver <= RDB_FOREIGN_VERSION_MAX) ||
+        (rdbver > RDB_VERSION && server.rdb_version_check == RDB_VERSION_CHECK_STRICT)) {
+        return C_ERR;
+    }

     if (server.skip_checksum_validation) return C_OK;

diff --git a/src/config.c b/src/config.c
index 6bbb82bf52..214faea92a 100644
--- a/src/config.c
+++ b/src/config.c
@@ -160,6 +160,10 @@ configEnum log_timestamp_format_enum[] = {{"legacy", LOG_TIMESTAMP_LEGACY},
                                           {"milliseconds", LOG_TIMESTAMP_MILLISECONDS},
                                           {NULL, 0}};

+configEnum rdb_version_check_enum[] = {{"strict", RDB_VERSION_CHECK_STRICT},
+                                       {"relaxed", RDB_VERSION_CHECK_RELAXED},
+                                       {NULL, 0}};
+
 /* Output buffer limits presets. */
 clientBufferLimitsConfig clientBufferLimitsDefaults[CLIENT_TYPE_OBUF_COUNT] = {
     {0, 0, 0}, /* normal */
@@ -3244,6 +3248,7 @@ standardConfig static_configs[] = {
     createEnumConfig("shutdown-on-sigterm", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, shutdown_on_sig_enum, server.shutdown_on_sigterm, 0, isValidShutdownOnSigFlags, NULL),
     createEnumConfig("log-format", NULL, MODIFIABLE_CONFIG, log_format_enum, server.log_format, LOG_FORMAT_LEGACY, NULL, NULL),
     createEnumConfig("log-timestamp-format", NULL, MODIFIABLE_CONFIG, log_timestamp_format_enum, server.log_timestamp_format, LOG_TIMESTAMP_LEGACY, NULL, NULL),
+    createEnumConfig("rdb-version-check", NULL, MODIFIABLE_CONFIG, rdb_version_check_enum, server.rdb_version_check, RDB_VERSION_CHECK_STRICT, NULL, NULL),

     /* Integer configs */
     createIntConfig("databases", NULL, IMMUTABLE_CONFIG, 1, INT_MAX, server.dbnum, 16, INTEGER_CONFIG, NULL, NULL),

diff --git a/src/rdb.c b/src/rdb.c
index 6653e99c3a..149825e96b 100644
--- a/src/rdb.c
+++ b/src/rdb.c
@@ -42,6 +42,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -1418,6 +1419,7 @@ int rdbSaveRio(int req, rio *rdb, int *error, int rdbflags, rdbSaveInfo *rsi) {
     int j;

     if (server.rdb_checksum) rdb->update_cksum = rioGenericUpdateChecksum;
+    /* TODO: Change this to "VALKEY%03d" next time we bump the RDB version. */
     snprintf(magic, sizeof(magic), "REDIS%04d", RDB_VERSION);
     if (rdbWriteRaw(rdb, magic, 9) == -1) goto werr;
     if (rdbSaveInfoAuxFields(rdb, rdbflags, rsi) == -1) goto werr;
@@ -3023,17 +3025,24 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin
     char buf[1024];
     int error;
     long long empty_keys_skipped = 0;
+    bool is_valkey_magic;

     rdb->update_cksum = rdbLoadProgressCallback;
     rdb->max_processing_chunk = server.loading_process_events_interval_bytes;
     if (rioRead(rdb, buf, 9) == 0) goto eoferr;
     buf[9] = '\0';
-    if (memcmp(buf, "REDIS", 5) != 0) {
+    if (memcmp(buf, "REDIS0", 6) == 0) {
+        is_valkey_magic = false;
+    } else if (memcmp(buf, "VALKEY", 6) == 0) {
+        is_valkey_magic = true;
+    } else {
         serverLog(LL_WARNING, "Wrong signature trying to load DB from file");
         return C_ERR;
     }
-    rdbver = atoi(buf + 5);
-    if (rdbver < 1 || rdbver > RDB_VERSION) {
+    rdbver = atoi(buf + 6);
+    if (rdbver < 1 ||
+        (rdbver >= RDB_FOREIGN_VERSION_MIN && !is_valkey_magic) ||
+        (rdbver > RDB_VERSION && server.rdb_version_check == RDB_VERSION_CHECK_STRICT)) {
         serverLog(LL_WARNING, "Can't handle RDB format version %d", rdbver);
         return C_ERR;
     }

diff --git a/src/rdb.h b/src/rdb.h
index 7342a926b5..9f19a3a9ec 100644
--- a/src/rdb.h
+++ b/src/rdb.h
@@ -37,9 +37,27 @@
 #include "server.h"

 /* The current RDB version. When the format changes in a way that is no longer
- * backward compatible this number gets incremented. */
+ * backward compatible this number gets incremented.
+ *
+ * RDB 11 is the last open-source Redis RDB version, used by Valkey 7.x and 8.x.
+ *
+ * RDB 12+ are non-open-source Redis formats.
+ *
+ * Next time we bump the Valkey RDB version, use much higher version to avoid
+ * collisions with non-OSS Redis RDB versions. For example, we could use RDB
+ * version 90 for Valkey 9.0.
+ *
+ * In an RDB file/stream, we also check the magic string REDIS or VALKEY but in
+ * the DUMP/RESTORE format, there is only the RDB version number and no magic
+ * string. */
 #define RDB_VERSION 11

+/* Reserved range for foreign (unsupported, non-OSS) RDB format. */
+#define RDB_FOREIGN_VERSION_MIN 12
+#define RDB_FOREIGN_VERSION_MAX 79
+static_assert(RDB_VERSION < RDB_FOREIGN_VERSION_MIN || RDB_VERSION > RDB_FOREIGN_VERSION_MAX,
+              "RDB version in foreign version range");
+
 /* Defines related to the dump file format. To store 32 bits lengths for short
  * keys requires a lot of space, so we check the most significant 2 bits of
  * the first byte to interpreter the length:

diff --git a/src/server.h b/src/server.h
index f5a4d4951b..37f4f0c27b 100644
--- a/src/server.h
+++ b/src/server.h
@@ -600,6 +600,9 @@ typedef enum { LOG_TIMESTAMP_LEGACY = 0,
                LOG_TIMESTAMP_ISO8601,
                LOG_TIMESTAMP_MILLISECONDS } log_timestamp_type;

+typedef enum { RDB_VERSION_CHECK_STRICT = 0,
+               RDB_VERSION_CHECK_RELAXED } rdb_version_check_type;
+
 /* common sets of actions to pause/unpause */
 #define PAUSE_ACTIONS_CLIENT_WRITE_SET \
     (PAUSE_ACTION_CLIENT_WRITE | PAUSE_ACTION_EXPIRE | PAUSE_ACTION_EVICT | PAUSE_ACTION_REPLICA)
@@ -1768,6 +1771,7 @@ struct valkeyServer {
     int active_defrag_enabled;
     int sanitize_dump_payload;    /* Enables deep sanitization for ziplist and listpack in RDB and RESTORE. */
     int skip_checksum_validation; /* Disable checksum validation for RDB and RESTORE payload. */
+    int rdb_version_check;        /* Try to load RDB produced by a future version. */
     int jemalloc_bg_thread;       /* Enable jemalloc background thread */
     int active_defrag_configuration_changed; /* Config changed; need to recompute active_defrag_cpu_percent. */
     size_t active_defrag_ignore_bytes; /* minimum amount of fragmentation waste to start active defrag */

diff --git a/tests/assets/encodings-rdb987.rdb b/tests/assets/encodings-rdb987.rdb
new file mode 100644
index 0000000000000000000000000000000000000000..2357671597724721d10930f7a2ef378febcb7a36
GIT binary patch
literal 675
zcmbVKu}UOC5UuKJmvv?Z0|U7O+2r6j40>i@UPmmNhA10%sq
xQj0G8;lKlH4!5}VZXq@54g2@xCPA)8){znRCV{O*YEq+Z=35sSKBLpf#gZ)4jaN4
zkt$)W$P^i4C{;=_ny95FgRHfb@qb1;out`PYk8%;iW(E4wMY~iOi61^iBf1WlRVdw
z7bCEF+6e&6NW$*ceX$$=n%coxM)x18ja;>;(GI)F!zUT|;~YC=P3>GA3up*Dh>
D~DV*hFWTx3urug4#DZ|-u=z@LVqtFWR%ln*N<>%N`ej#{jcn`$R87?B_c|N>Ea?ZW-
LSCC@Ntg7(>Xe^=_

literal 0
HcmV?d00001

diff --git a/tests/integration/rdb.tcl b/tests/integration/rdb.tcl
index 61cb0cea7e..1772fee601 100644
--- a/tests/integration/rdb.tcl
+++ b/tests/integration/rdb.tcl
@@ -1,9 +1,21 @@
 tags {"rdb external:skip"} {

+# Helper function to start a server and kill it, just to check the error
+# logged.
+set defaults {}
+proc start_server_and_kill_it {overrides code} {
+    upvar defaults defaults srv srv server_path server_path
+    set config [concat $defaults $overrides]
+    set srv [start_server [list overrides $config keep_persistence true]]
+    uplevel 1 $code
+    kill_server $srv
+}
+
 set server_path [tmpdir "server.rdb-encoding-test"]

 # Copy RDB with different encodings in server path
 exec cp tests/assets/encodings.rdb $server_path
+exec cp tests/assets/encodings-rdb987.rdb $server_path
 exec cp tests/assets/list-quicklist.rdb $server_path

 start_server [list overrides [list "dir" $server_path "dbfilename" "list-quicklist.rdb" save ""]] {
@@ -15,11 +27,7 @@ start_server [list overrides [list "dir" $server_path "dbfilename" "list-quickli
     } {7}
 }

-start_server [list overrides [list "dir" $server_path "dbfilename" "encodings.rdb"]] {
-    test "RDB encoding loading test" {
-        r select 0
-        csvdump r
-    } {"0","compressible","string","aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
+set csv_dump {"0","compressible","string","aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
 "0","hash","hash","a","1","aa","10","aaa","100","b","2","bb","20","bbb","200","c","3","cc","30","ccc","300","ddd","400","eee","5000000000",
 "0","hash_zipped","hash","a","1","b","2","c","3",
 "0","list","list","1","2","3","a","b","c","100000","6000000000","1","2","3","a","b","c","100000","6000000000","1","2","3","a","b","c","100000","6000000000",
 "0","zset","zset","a","1","b","2","c","3","aa","10","bb","20","cc","30","aaa","100","bbb","200","ccc","300","aaaa","1000","cccc","123456789","bbbb","5000000000",
 "0","zset_zipped","zset","a","1","b","2","c","3",
 }
+
+start_server [list overrides [list "dir" $server_path "dbfilename" "encodings.rdb"]] {
+    test "RDB encoding loading test" {
+        r select 0
+        csvdump r
+    } $csv_dump
+}
+
+start_server_and_kill_it [list "dir" $server_path "dbfilename" "encodings-rdb987.rdb"] {
+    test "RDB future version loading, strict version check" {
+        wait_for_condition 50 100 {
+            [string match {*Fatal error loading*} \
+                [exec tail -1 < [dict get $srv stdout]]]
+        } else {
+            fail "Server started even if RDB version check failed"
+        }
+    }
+}
+
+start_server [list overrides [list "dir" $server_path \
+                                  "dbfilename" "encodings-rdb987.rdb" \
+                                  "rdb-version-check" "relaxed"]] {
+    test "RDB future version loading, relaxed version check" {
"RDB future version loading, relaxed version check" { + r select 0 + csvdump r + } $csv_dump } set server_path [tmpdir "server.rdb-startup-test"] @@ -80,17 +114,6 @@ start_server [list overrides [list "dir" $server_path] keep_persistence true] { r del stream } -# Helper function to start a server and kill it, just to check the error -# logged. -set defaults {} -proc start_server_and_kill_it {overrides code} { - upvar defaults defaults srv srv server_path server_path - set config [concat $defaults $overrides] - set srv [start_server [list overrides $config keep_persistence true]] - uplevel 1 $code - kill_server $srv -} - # Make the RDB file unreadable file attributes [file join $server_path dump.rdb] -permissions 0222 diff --git a/tests/unit/dump.tcl b/tests/unit/dump.tcl index e4c0f9d312..5f1de4f6b5 100644 --- a/tests/unit/dump.tcl +++ b/tests/unit/dump.tcl @@ -124,6 +124,24 @@ start_server {tags {"dump"}} { close_replication_stream $repl } {} {needs:repl} + test {RESTORE key with future RDB version, strict version check} { + # str len RDB 222 CRC64 checksum + # | | | | + set bar_dump "\x00\x03bar\xde\x00\x0fYUza\xd3\xec\xe0" + assert_error {ERR DUMP payload version or checksum are wrong} {r restore foo 0 $bar_dump replace} + } + + test {RESTORE key with future RDB version, relaxed version check} { + # str len RDB 222 CRC64 checksum + # | | | | + set bar_dump "\x00\x03bar\xde\x00\x0fYUza\xd3\xec\xe0" + r config set rdb-version-check relaxed + catch {r restore foo 0 $bar_dump replace} e + r config set rdb-version-check strict + assert_equal {bar} [r get foo] + set e + } {OK} + test {DUMP of non existing key returns nil} { r dump nonexisting_key } {} diff --git a/valkey.conf b/valkey.conf index 1838357058..99041fb409 100644 --- a/valkey.conf +++ b/valkey.conf @@ -549,6 +549,15 @@ rdbcompression yes # tell the loading code to skip the check. rdbchecksum yes +# Valkey can try to load an RDB dump produced by a future version of Valkey. +# This can only work on a best-effort basis, because future RDB versions may +# contain information that's not known to the current version. If no new features +# are used, it may be possible to import the data produced by a later version, +# but loading is aborted if unknown information is encountered. Possible values +# are 'strict' and 'relaxed'. This also applies to replication and the RESTORE +# command. +rdb-version-check strict + # Enables or disables full sanitization checks for ziplist and listpack etc when # loading an RDB or RESTORE payload. This reduces the chances of a assertion or # crash later on while processing commands. From 230efa4fbf4f90828c211924c0e02fe3495bd727 Mon Sep 17 00:00:00 2001 From: ranshid <88133677+ranshid@users.noreply.github.com> Date: Tue, 28 Jan 2025 12:35:32 +0200 Subject: [PATCH 16/24] deflake tracking-redir-broken test (#1628) This address 2 issues: 1. It is possible (somehow) that the inner server client (r) was not working resp 3 when entering this test. this makes sure it does. 2. in case the test failed it might leave the redirection client closed. there is a cross test assumption it should be open, so moved most of the assert checks to the end of the test. 
Example failure:
https://github.com/valkey-io/valkey/actions/runs/12979601179/job/36195523412

---------

Signed-off-by: Ran Shidlansik
---
 tests/unit/tracking.tcl | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/tests/unit/tracking.tcl b/tests/unit/tracking.tcl
index 9fdc4b79cd..1d1ad4044b 100644
--- a/tests/unit/tracking.tcl
+++ b/tests/unit/tracking.tcl
@@ -236,20 +236,14 @@ start_server {tags {"tracking network logreqres:skip"}} {
     }

     test {RESP3 Client gets tracking-redir-broken push message after cached key changed when rediretion client is terminated} {
+        # make sure r is working resp 3
+        r HELLO 3
         r CLIENT TRACKING on REDIRECT $redir_id
         $rd_sg SET key1 1
         r GET key1
         $rd_redirection QUIT
         assert_equal OK [$rd_redirection read]
         $rd_sg SET key1 2
-        set MAX_TRIES 100
-        set res -1
-        for {set i 0} {$i <= $MAX_TRIES && $res < 0} {incr i} {
-            set res [lsearch -exact [r PING] "tracking-redir-broken"]
-        }
-        assert {$res >= 0}
-        # Consume PING reply
-        assert_equal PONG [r read]

         # Reinstantiating after QUIT
         set rd_redirection [valkey_deferring_client]
@@ -257,6 +251,15 @@ start_server {tags {"tracking network logreqres:skip"}} {
         set redir_id [$rd_redirection read]
         $rd_redirection SUBSCRIBE __redis__:invalidate
         $rd_redirection read ; # Consume the SUBSCRIBE reply
+
+        # Wait to read the tracking-redir-broken
+        wait_for_condition 1000 50 {
+            [lsearch -exact [r PING] "tracking-redir-broken"]
+        } else {
+            fail "Failed to get redirect broken indication"
+        }
+        # Consume PING reply
+        assert_equal PONG [r read]
     }

     test {Different clients can redirect to the same connection} {

From f695c52acb82519d8b1362d456b2ec64ae401fd4 Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Tue, 28 Jan 2025 06:35:24 -0800
Subject: [PATCH 17/24] Fix timing issue in pause test (#1631)

---
 tests/unit/pause.tcl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/pause.tcl b/tests/unit/pause.tcl
index 9697c3b44e..d7431e4e51 100644
--- a/tests/unit/pause.tcl
+++ b/tests/unit/pause.tcl
@@ -7,7 +7,7 @@ start_server {tags {"pause network"}} {
         assert_equal [s paused_actions] "write"
         after 1000
         set timeout [s paused_timeout_milliseconds]
-        assert {$timeout > 0 && $timeout < 9000}
+        assert {$timeout > 0 && $timeout <= 9000}
         r client unpause

         r multi

From ad60d6b7b36e0599dba8f817c6fcd9bb69c30f37 Mon Sep 17 00:00:00 2001
From: Wen Hui
Date: Tue, 28 Jan 2025 11:37:41 -0500
Subject: [PATCH 18/24] Initialize one variable in struct to avoid risk (#1606)

In C, it is best to initialize every variable in a struct; this PR
fixes one missed variable initialization.
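As a general pattern, C99 designated initializers make this class of bug
harder to introduce at the declaration site, since all unnamed members are
implicitly zeroed (a generic illustration, not valkey code):

    #include <stdio.h>

    struct counters {
        long auth_time;
        long auth_count;
        long auth_sent; /* the kind of member that is easy to forget */
    };

    int main(void) {
        /* Every member not named in the initializer is zero-initialized. */
        struct counters c = {.auth_time = 0};
        printf("%ld %ld %ld\n", c.auth_time, c.auth_count, c.auth_sent); /* 0 0 0 */
        return 0;
    }

This does not apply to clusterInit above, which assigns to an
already-allocated struct field by field, so each field must be set
explicitly.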
---------

Signed-off-by: hwware
---
 src/cluster_legacy.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 5e976d3060..22e69e0a6a 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -1130,6 +1130,7 @@ void clusterInit(void) {
     server.cluster->failover_auth_time = 0;
     server.cluster->failover_auth_count = 0;
    server.cluster->failover_auth_rank = 0;
+    server.cluster->failover_auth_sent = 0;
     server.cluster->failover_failed_primary_rank = 0;
     server.cluster->failover_auth_epoch = 0;
     server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE;

From d72a97edf6eb6c68b0aba99a7669dd521bb0784d Mon Sep 17 00:00:00 2001
From: zhenwei pi
Date: Wed, 29 Jan 2025 01:22:16 +0800
Subject: [PATCH 19/24] RDMA: Protect RDMA memory regions (#1602)

Use the Linux mmap/munmap syscalls to manage an RDMA memory region, so
that we get a guard-page-protected VMA like this (cat /proc/PID/maps):

785018afe000-785018aff000 ---p 00000000 00:00 0   -> top guard page
785018aff000-785018bff000 rw-p 00000000 00:00 0   -> RDMA memory region
785018bff000-785018c00000 ---p 00000000 00:00 0   -> bottom guard page

If any code accesses this memory unexpectedly, a segmentation fault
occurs.

Signed-off-by: zhenwei pi
---
 src/rdma.c | 73 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 51 insertions(+), 22 deletions(-)

diff --git a/src/rdma.c b/src/rdma.c
index 7fe65ad2d2..db918ec797 100644
--- a/src/rdma.c
+++ b/src/rdma.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include

 #define CONN_TYPE_RDMA "rdma"
@@ -134,6 +135,8 @@ static list *pending_list;
 static rdma_listener *rdma_listeners;
 static serverRdmaContextConfig *rdma_config;

+static size_t page_size;
+
 static ConnectionType CT_RDMA;

 static void serverRdmaError(char *err, const char *fmt, ...) {
@@ -191,31 +194,56 @@ static int rdmaPostRecv(RdmaContext *ctx, struct rdma_cm_id *cm_id, ValkeyRdmaCm
     return C_OK;
 }

-/* To make Valkey forkable, buffer which is registered as RDMA
- * memory region should be aligned to page size. And the length
- * also need be aligned to page size.
+/* To make Valkey forkable, buffer which is registered as RDMA memory region should be
+ * aligned to page size. And the length also need be aligned to page size.
  * Random segment-fault case like this:
  * 0x7f2764ac5000 - 0x7f2764ac7000
  * |ptr0 128| ... |ptr1 4096| ... |ptr2 512|
  *
- * After ibv_reg_mr(pd, ptr1, 4096, access), the full range of 8K
- * becomes DONTFORK. And the child process will hit a segment fault
- * during access ptr0/ptr2.
- * Note that the memory can be freed by libc free only.
- * TODO: move it to zmalloc.c if necessary
+ * After ibv_reg_mr(pd, ptr1, 4096, access), the full range of 8K becomes DONTFORK. And
+ * the child process will hit a segment fault during access ptr0/ptr2.
+ *
+ * The portable posix_memalign(&tmp, page_size, aligned_size) would be fine too. However,
+ * RDMA is supported by Linux only, so it would not break anything. Using raw mmap syscall
+ * to allocate a separate virtual memory area(VMA), also make it protected by the 2 guard
+ * pages (a top one and a bottom one). */
-static void *page_aligned_zalloc(size_t size) {
-    void *tmp;
-    size_t aligned_size, page_size = sysconf(_SC_PAGESIZE);
+static void *rdmaMemoryAlloc(size_t size) {
+    size_t real_size, aligned_size = (size + page_size - 1) & (~(page_size - 1));
+    uint8_t *ptr;

-    aligned_size = (size + page_size - 1) & (~(page_size - 1));
-    if (posix_memalign(&tmp, page_size, aligned_size)) {
-        serverPanic("posix_memalign failed");
+    real_size = aligned_size + 2 * page_size;
+    ptr = mmap(NULL, real_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (ptr == MAP_FAILED) {
+        serverPanic("failed to allocate memory for RDMA region");
+    }
+
+    madvise(ptr, real_size, MADV_DONTDUMP);                 /* no need to dump this VMA on coredump */
+    mprotect(ptr, page_size, PROT_NONE);                    /* top page of this VMA */
+    mprotect(ptr + size + page_size, page_size, PROT_NONE); /* bottom page of this VMA */
+
+    return ptr + page_size;
+}
+
+static void rdmaMemoryFree(void *ptr, size_t size) {
+    uint8_t *real_ptr;
+    size_t real_size, aligned_size;
+
+    if (!ptr) {
+        return;
     }

-    memset(tmp, 0x00, aligned_size);
+    if ((unsigned long)ptr & (page_size - 1)) {
+        serverPanic("unaligned memory in use for RDMA region");
+    }

-    return tmp;
+    aligned_size = (size + page_size - 1) & (~(page_size - 1));
+    real_size = aligned_size + 2 * page_size;
+    real_ptr = (uint8_t *)ptr - page_size;
+
+    if (munmap(real_ptr, real_size)) {
+        serverPanic("failed to free memory for RDMA region");
+    }
 }

 static void rdmaDestroyIoBuf(RdmaContext *ctx) {
@@ -224,7 +252,7 @@ static void rdmaDestroyIoBuf(RdmaContext *ctx) {
         ctx->rx.mr = NULL;
     }

-    zlibc_free(ctx->rx.addr);
+    rdmaMemoryFree(ctx->rx.addr, ctx->rx.length);
     ctx->rx.addr = NULL;

     if (ctx->tx.mr) {
@@ -232,7 +260,7 @@ static void rdmaDestroyIoBuf(RdmaContext *ctx) {
         ctx->tx.mr = NULL;
     }

-    zlibc_free(ctx->tx.addr);
+    rdmaMemoryFree(ctx->tx.addr, ctx->tx.length);
     ctx->tx.addr = NULL;

     if (ctx->cmd_mr) {
@@ -240,7 +268,7 @@ static void rdmaDestroyIoBuf(RdmaContext *ctx) {
         ctx->cmd_mr = NULL;
     }

-    zlibc_free(ctx->cmd_buf);
+    rdmaMemoryFree(ctx->cmd_buf, sizeof(ValkeyRdmaCmd) * VALKEY_RDMA_MAX_WQE * 2);
     ctx->cmd_buf = NULL;
 }

@@ -251,7 +279,7 @@ static int rdmaSetupIoBuf(RdmaContext *ctx, struct rdma_cm_id *cm_id) {
     int i;

     /* setup CMD buf & MR */
-    ctx->cmd_buf = page_aligned_zalloc(length);
+    ctx->cmd_buf = rdmaMemoryAlloc(length);
     ctx->cmd_mr = ibv_reg_mr(ctx->pd, ctx->cmd_buf, length, access);
     if (!ctx->cmd_mr) {
         serverLog(LL_WARNING, "RDMA: reg mr for CMD failed");
@@ -275,7 +303,7 @@ static int rdmaSetupIoBuf(RdmaContext *ctx, struct rdma_cm_id *cm_id) {
     /* setup recv buf & MR */
     access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE;
     length = rdma_config->rx_size;
-    ctx->rx.addr = page_aligned_zalloc(length);
+    ctx->rx.addr = rdmaMemoryAlloc(length);
     ctx->rx.length = length;
     ctx->rx.mr = ibv_reg_mr(ctx->pd, ctx->rx.addr, length, access);
     if (!ctx->rx.mr) {
@@ -387,7 +415,7 @@ static int rdmaAdjustSendbuf(RdmaContext *ctx, unsigned int length) {
     }

     /* create a new buffer & MR */
-    ctx->tx.addr = page_aligned_zalloc(length);
+    ctx->tx.addr = rdmaMemoryAlloc(length);
     ctx->tx_length = length;
     ctx->tx.mr = ibv_reg_mr(ctx->pd, ctx->tx.addr, length, access);
     if (!ctx->tx.mr) {
@@ -1705,6 +1733,7 @@ static int connRdmaAddr(connection *conn, char *ip, size_t ip_len, int *port, in

 static void rdmaInit(void) {
     pending_list = listCreate();
+    page_size = sysconf(_SC_PAGESIZE);

     VALKEY_BUILD_BUG_ON(sizeof(ValkeyRdmaFeature) != 32);
     VALKEY_BUILD_BUG_ON(sizeof(ValkeyRdmaKeepalive) != 32);
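The guard-page technique in the patch above generalizes beyond RDMA. A
minimal self-contained sketch (hypothetical helper, Linux-only, error
handling simplified to NULL returns) of the same pattern:

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    /* Anonymous mapping with an inaccessible page on each side, so any
     * out-of-bounds access faults immediately instead of corrupting
     * neighboring allocations. */
    static void *guarded_alloc(size_t size) {
        size_t page = (size_t)sysconf(_SC_PAGESIZE);
        size_t aligned = (size + page - 1) & ~(page - 1);
        uint8_t *p = mmap(NULL, aligned + 2 * page, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) return NULL;
        mprotect(p, page, PROT_NONE);                  /* guard below */
        mprotect(p + page + aligned, page, PROT_NONE); /* guard above */
        return p + page;
    }

    int main(void) {
        char *buf = guarded_alloc(100);
        if (!buf) return 1;
        buf[0] = 'x';  /* fine */
        /* buf[-1] = 'x'; would SIGSEGV: it lands in the guard page */
        printf("%c\n", buf[0]);
        return 0;
    }

A side benefit used by the patch: the mapping is its own VMA, so marking
it MADV_DONTFORK/DONTDUMP cannot spill onto unrelated heap allocations.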
From 4b8f3ed9acf3103349b6368407fc48bef55b3327 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Thu, 30 Jan 2025 01:06:13 +0800
Subject: [PATCH 20/24] Do command existence and arity checks when loading AOF
 to avoid crash (#1614)

Do command existence and arity checks when loading AOF to avoid a crash.

Currently, loading commands such as `cluster` or `cluster slots xxx`
from AOF will cause the server to crash.

1. `cluster` is a container command, and executing its proc will cause a
   crash because we do not check subcommand and arity.
2. `cluster slots xxx` fails the arity check, replies with an error from
   the AOF client, and triggers a panic.

Of course, there are many other ways for a problematic AOF to cause the
panic, but it is still necessary to do some basic checks before
executing. In this way, in these basic cases, we can print useful error
messages instead of crashing directly.

Signed-off-by: Binbin
---
 src/aof.c                 | 10 ++++-----
 src/server.c              |  3 +--
 tests/integration/aof.tcl | 43 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/src/aof.c b/src/aof.c
index c6828d4b6e..d4c2a523c2 100644
--- a/src/aof.c
+++ b/src/aof.c
@@ -1532,10 +1532,11 @@ int loadSingleAppendOnlyFile(char *filename) {
         }

         /* Command lookup */
-        cmd = lookupCommand(argv, argc);
-        if (!cmd) {
-            serverLog(LL_WARNING, "Unknown command '%s' reading the append only file %s", (char *)argv[0]->ptr,
-                      filename);
+        sds err = NULL;
+        fakeClient->cmd = fakeClient->lastcmd = cmd = lookupCommand(argv, argc);
+        if ((!cmd && !commandCheckExistence(fakeClient, &err)) || (cmd && !commandCheckArity(cmd, argc, &err))) {
+            serverLog(LL_WARNING, "Error reading the append only file %s, error: %s", filename, err);
+            sdsfree(err);
             freeClientArgv(fakeClient);
             ret = AOF_FAILED;
             goto cleanup;
@@ -1544,7 +1545,6 @@ int loadSingleAppendOnlyFile(char *filename) {
         if (cmd->proc == multiCommand) valid_before_multi = valid_up_to;

         /* Run the command in the context of a fake client */
-        fakeClient->cmd = fakeClient->lastcmd = cmd;
         if (fakeClient->flag.multi && fakeClient->cmd->proc != execCommand) {
             /* Note: we don't have to attempt calling evalGetCommandFlags,
              * since this is AOF, the checks in processCommand are not made

diff --git a/src/server.c b/src/server.c
index 898675ed04..495247e36a 100644
--- a/src/server.c
+++ b/src/server.c
@@ -3911,7 +3911,7 @@ void afterCommand(client *c) {
 int commandCheckExistence(client *c, sds *err) {
     if (c->cmd) return 1;
     if (!err) return 0;
-    if (isContainerCommandBySds(c->argv[0]->ptr)) {
+    if (isContainerCommandBySds(c->argv[0]->ptr) && c->argc >= 2) {
         /* If we can't find the command but argv[0] by itself is a command
          * it means we're dealing with an invalid subcommand. Print Help. */
         sds cmd = sdsnew((char *)c->argv[0]->ptr);
@@ -4025,7 +4025,6 @@ int processCommand(client *c) {
         return C_OK;
     }

-
     /* Check if the command is marked as protected and the relevant configuration allows it */
     if (c->cmd->flags & CMD_PROTECTED) {
         if ((c->cmd->proc == debugCommand && !allowProtectedAction(server.enable_debug_cmd, c)) ||

diff --git a/tests/integration/aof.tcl b/tests/integration/aof.tcl
index 3a666bbd15..02df4bb4f8 100644
--- a/tests/integration/aof.tcl
+++ b/tests/integration/aof.tcl
@@ -259,7 +259,7 @@ tags {"aof external:skip"} {

     start_server_aof_ex [list dir $server_path aof-load-truncated yes] [list wait_ready false] {
         test "Unknown command: Server should have logged an error" {
-            wait_for_log_messages 0 {"*Unknown command 'bla' reading the append only file*"} 0 10 1000
+            wait_for_log_messages 0 {"*unknown command 'bla'*"} 0 10 1000
         }
     }
@@ -693,6 +693,47 @@ tags {"aof cluster external:skip"} {
             assert_equal [r ping] {PONG}
         }
     }
+
+    test {Test command check in aof won't crash} {
+        # cluster, wrong number of arguments for 'cluster' command
+        create_aof $aof_dirpath $aof_file { append_to_aof [formatCommand cluster] }
+        create_aof_manifest $aof_dirpath $aof_manifest_file { append_to_manifest "file appendonly.aof.1.incr.aof seq 1 type i\n" }
+        start_server_aof_ex [list dir $server_path] [list wait_ready false] {
+            wait_for_condition 100 50 {
+                ! [is_alive [srv pid]]
+            } else {
+                fail "AOF loading didn't fail"
+            }
+            assert_equal 1 [count_message_lines $server_path/stdout "wrong number of arguments for 'cluster' command"]
+        }
+        clean_aof_persistence $aof_dirpath
+
+        # cluster slots-xxx, unknown subcommand 'slots-xxx'
+        create_aof $aof_dirpath $aof_file { append_to_aof [formatCommand cluster slots-xxx] }
+        create_aof_manifest $aof_dirpath $aof_manifest_file { append_to_manifest "file appendonly.aof.1.incr.aof seq 1 type i\n" }
+        start_server_aof_ex [list dir $server_path] [list wait_ready false] {
+            wait_for_condition 100 50 {
+                ! [is_alive [srv pid]]
+            } else {
+                fail "AOF loading didn't fail"
+            }
+            assert_equal 1 [count_message_lines $server_path/stdout "unknown subcommand 'slots-xxx'"]
+        }
+        clean_aof_persistence $aof_dirpath
+
+        # cluster slots xxx, wrong number of arguments for 'cluster|slots' command
+        create_aof $aof_dirpath $aof_file { append_to_aof [formatCommand cluster slots xxx] }
+        create_aof_manifest $aof_dirpath $aof_manifest_file { append_to_manifest "file appendonly.aof.1.incr.aof seq 1 type i\n" }
+        start_server_aof_ex [list dir $server_path] [list wait_ready false] {
+            wait_for_condition 100 50 {
+                ![is_alive [srv pid]]
+            } else {
+                fail "AOF loading didn't fail"
+            }
+            assert_equal 1 [count_message_lines $server_path/stdout "wrong number of arguments for 'cluster|slots' command"]
+        }
+        clean_aof_persistence $aof_dirpath
+    }
 }

 set ::singledb $old_singledb

From ff8a528fd6353310b2d09206f543485f691b8c7d Mon Sep 17 00:00:00 2001
From: xingbowang
Date: Wed, 29 Jan 2025 13:13:40 -0800
Subject: [PATCH 21/24] Fix a heap-use-after-free bug in cluster bus (#1643)

https://github.com/valkey-io/valkey/issues/1642

Avoid a heap-use-after-free in the cluster bus node cleanup code.

freeClusterNode frees the human_nodename:
https://github.com/valkey-io/valkey/blob/unstable/src/cluster_legacy.c#L1725

Then it calls freeClusterLink to free the links:
https://github.com/valkey-io/valkey/blob/unstable/src/cluster_legacy.c#L1730

freeClusterLink prints human_nodename here, which was just freed by its
caller, freeClusterNode:
https://github.com/valkey-io/valkey/blob/unstable/src/cluster_legacy.c#L1383

Signed-off-by: xingbowang
---
 src/cluster_legacy.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 22e69e0a6a..78db1aa641 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -1721,14 +1721,16 @@ void freeClusterNode(clusterNode *n) {
     nodename = sdsnewlen(n->name, CLUSTER_NAMELEN);
     serverAssert(dictDelete(server.cluster->nodes, nodename) == DICT_OK);
     sdsfree(nodename);
-    sdsfree(n->hostname);
-    sdsfree(n->human_nodename);
-    sdsfree(n->announce_client_ipv4);
-    sdsfree(n->announce_client_ipv6);

     /* Release links and associated data structures. */
     if (n->link) freeClusterLink(n->link);
     if (n->inbound_link) freeClusterLink(n->inbound_link);
+
+    /* Free these members after links are freed, as freeClusterLink may access them. */
+    sdsfree(n->hostname);
+    sdsfree(n->human_nodename);
+    sdsfree(n->announce_client_ipv4);
+    sdsfree(n->announce_client_ipv6);
     listRelease(n->fail_reports);
     zfree(n->replicas);
     zfree(n);

From d3aabd7f13a8ad4d6b45767cce04dc284dce5b0f Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Wed, 29 Jan 2025 13:20:38 -0800
Subject: [PATCH 22/24] Hex encode the data in dump test (#1637)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses the failure here:
https://github.com/valkey-io/valkey/actions/runs/13000845302/job/36259016156#step:5:7272

This change does three things:
1. For some reason TCL 8.5 (which is used on macOS) is handling `\x03ba`
   as `\xba`, according to
   https://www.tcl-lang.org/man/tcl8.5/TclCmd/Tcl.htm#M27, so we encode
   "bar" using hex escapes too.
2. Fix a spacing issue.
3. Make it so that if the restore fails, it immediately errors.
---------

Signed-off-by: Madelyn Olson
Signed-off-by: Viktor Söderqvist
Co-authored-by: Viktor Söderqvist
---
 tests/unit/dump.tcl | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/tests/unit/dump.tcl b/tests/unit/dump.tcl
index 5f1de4f6b5..37eaffc0c5 100644
--- a/tests/unit/dump.tcl
+++ b/tests/unit/dump.tcl
@@ -125,22 +125,20 @@ start_server {tags {"dump"}} {
     } {} {needs:repl}

     test {RESTORE key with future RDB version, strict version check} {
-        # str len      RDB 222   CRC64 checksum
-        # |   |        |         |
-        set bar_dump "\x00\x03bar\xde\x00\x0fYUza\xd3\xec\xe0"
-        assert_error {ERR DUMP payload version or checksum are wrong} {r restore foo 0 $bar_dump replace}
+        # str len "bar"           RDB 222   CRC64 checksum
+        # |   |   |               |         |
+        set bar_dump "\x00\x03\x62\x61\x72\xde\x00\x0fYUza\xd3\xec\xe0"
+        assert_error {ERR DUMP payload version or checksum are wrong} {r restore foo 0 $bar_dump replace}
     }

     test {RESTORE key with future RDB version, relaxed version check} {
-        # str len      RDB 222   CRC64 checksum
-        # |   |        |         |
-        set bar_dump "\x00\x03bar\xde\x00\x0fYUza\xd3\xec\xe0"
-        r config set rdb-version-check relaxed
-        catch {r restore foo 0 $bar_dump replace} e
+        r config set rdb-version-check relaxed
+        # |type|len|       | RDB   | CRC64    |
+        # |str | 3 | "bar" | 222   | checksum |
+        r restore foo 0 "\x00\x03\x62\x61\x72\xde\x00\x0fYUza\xd3\xec\xe0" replace
         r config set rdb-version-check strict
         assert_equal {bar} [r get foo]
-        set e
-    } {OK}
+    }

     test {DUMP of non existing key returns nil} {
         r dump nonexisting_key

From 12ec3d59327166e112cd8e4650cbdf978adbaaf2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?=
Date: Wed, 29 Jan 2025 22:29:35 +0100
Subject: [PATCH 23/24] Increase timeout for cross-version-replication test
 (#1644)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes #1641

Signed-off-by: Viktor Söderqvist
---
 tests/integration/cross-version-replication.tcl | 2 +-
 tests/support/util.tcl                          | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/integration/cross-version-replication.tcl b/tests/integration/cross-version-replication.tcl
index 6ec3d0db80..104e821460 100644
--- a/tests/integration/cross-version-replication.tcl
+++ b/tests/integration/cross-version-replication.tcl
@@ -18,7 +18,7 @@ start_server {tags {"repl needs:other-server external:skip"} start-other-server
 start_server {} {
     test "Start replication from $primary_name_and_version" {
         r replicaof [srv -1 host] [srv -1 port]
-        wait_for_sync r
+        wait_for_sync r 500 100
         # The key has been transferred.
         assert_equal bar [r get foo]
         assert_equal up [s master_link_status]

diff --git a/tests/support/util.tcl b/tests/support/util.tcl
index 8f3cda3a4c..5ba1f8ea55 100644
--- a/tests/support/util.tcl
+++ b/tests/support/util.tcl
@@ -110,8 +110,8 @@ proc waitForBgrewriteaof r {
     }
 }

-proc wait_for_sync r {
-    wait_for_condition 50 100 {
+proc wait_for_sync {r {maxtries 50} {delay 100}} {
+    wait_for_condition $maxtries $delay {
         [status $r master_link_status] eq "up"
     } else {
         fail "replica didn't sync in time"

From 78bcc0a2cfb33ec940bfb14cb12ea10efa3af93a Mon Sep 17 00:00:00 2001
From: Harkrishn Patro
Date: Thu, 30 Jan 2025 15:21:31 -0800
Subject: [PATCH 24/24] Update daily failure notification job list (#1648)

Two jobs were missing from the job list for failure notification:

* test-ubuntu-tls-io-threads
* test-sanitizer-force-defrag

Signed-off-by: Harkrishn Patro
---
 .github/workflows/daily.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml
index 309e8353f7..31979a82e1 100644
--- a/.github/workflows/daily.yml
+++ b/.github/workflows/daily.yml
@@ -1262,7 +1262,7 @@ jobs:
   notify-about-job-results:
     runs-on: ubuntu-latest
     if: always() && github.event_name == 'schedule' && github.repository == 'valkey-io/valkey'
-    needs: [test-ubuntu-jemalloc, test-ubuntu-jemalloc-fortify, test-ubuntu-libc-malloc, test-ubuntu-no-malloc-usable-size, test-ubuntu-32bit, test-ubuntu-tls, test-ubuntu-tls-no-tls, test-ubuntu-io-threads, test-ubuntu-reclaim-cache, test-valgrind-test, test-valgrind-misc, test-valgrind-no-malloc-usable-size-test, test-valgrind-no-malloc-usable-size-misc, test-sanitizer-address, test-sanitizer-undefined, test-rpm-distros-jemalloc, test-rpm-distros-tls-module, test-rpm-distros-tls-module-no-tls, test-macos-latest, test-macos-latest-sentinel, test-macos-latest-cluster, build-macos, test-freebsd, test-alpine-jemalloc, test-alpine-libc-malloc, reply-schemas-validator]
+    needs: [test-ubuntu-jemalloc, test-ubuntu-jemalloc-fortify, test-ubuntu-libc-malloc, test-ubuntu-no-malloc-usable-size, test-ubuntu-32bit, test-ubuntu-tls, test-ubuntu-tls-no-tls, test-ubuntu-io-threads, test-ubuntu-tls-io-threads, test-ubuntu-reclaim-cache, test-valgrind-test, test-valgrind-misc, test-valgrind-no-malloc-usable-size-test, test-valgrind-no-malloc-usable-size-misc, test-sanitizer-address, test-sanitizer-undefined, test-sanitizer-force-defrag, test-rpm-distros-jemalloc, test-rpm-distros-tls-module, test-rpm-distros-tls-module-no-tls, test-macos-latest, test-macos-latest-sentinel, test-macos-latest-cluster, build-macos, test-freebsd, test-alpine-jemalloc, test-alpine-libc-malloc, reply-schemas-validator]
     steps:
       - name: Collect job status
         run: |